diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,85221 @@ +{ + "best_metric": 0.5537328124046326, + "best_model_checkpoint": "gpt_light_model_unpaired/model_outputs/full_new_tokenizer_gpt2_light_seqs_unp_lr_5e-4_wd_0.1_bs_32_epochs_500_/checkpoint-6058816", + "epoch": 41.0, + "eval_steps": 500, + "global_step": 6058816, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0033834993503681246, + "grad_norm": 1.8880629539489746, + "learning_rate": 4.9999661650064965e-05, + "loss": 1.7484, + "step": 500 + }, + { + "epoch": 0.006766998700736249, + "grad_norm": 1.0062955617904663, + "learning_rate": 4.999932330012993e-05, + "loss": 0.7633, + "step": 1000 + }, + { + "epoch": 0.010150498051104373, + "grad_norm": 0.930011510848999, + "learning_rate": 4.9998984950194896e-05, + "loss": 0.6464, + "step": 1500 + }, + { + "epoch": 0.013533997401472498, + "grad_norm": 0.5899236798286438, + "learning_rate": 4.999864660025986e-05, + "loss": 0.6073, + "step": 2000 + }, + { + "epoch": 0.016917496751840625, + "grad_norm": 0.6071146726608276, + "learning_rate": 4.999830825032481e-05, + "loss": 0.587, + "step": 2500 + }, + { + "epoch": 0.020300996102208747, + "grad_norm": 0.48228690028190613, + "learning_rate": 4.999796990038978e-05, + "loss": 0.5754, + "step": 3000 + }, + { + "epoch": 0.02368449545257687, + "grad_norm": 0.46376076340675354, + "learning_rate": 4.9997631550454744e-05, + "loss": 0.564, + "step": 3500 + }, + { + "epoch": 0.027067994802944997, + "grad_norm": 0.40634822845458984, + "learning_rate": 4.9997293200519706e-05, + "loss": 0.5585, + "step": 4000 + }, + { + "epoch": 0.03045149415331312, + "grad_norm": 0.4346064627170563, + "learning_rate": 4.999695485058467e-05, + "loss": 0.5528, + "step": 4500 + }, + { + "epoch": 0.03383499350368125, + "grad_norm": 0.41330981254577637, + "learning_rate": 4.999661650064964e-05, + "loss": 0.5488, + "step": 5000 + }, + { + "epoch": 0.03721849285404937, + "grad_norm": 0.40927866101264954, + "learning_rate": 4.99962781507146e-05, + "loss": 0.5455, + "step": 5500 + }, + { + "epoch": 0.04060199220441749, + "grad_norm": 0.4025146961212158, + "learning_rate": 4.999593980077956e-05, + "loss": 0.5395, + "step": 6000 + }, + { + "epoch": 0.04398549155478562, + "grad_norm": 0.40866559743881226, + "learning_rate": 4.9995601450844524e-05, + "loss": 0.5381, + "step": 6500 + }, + { + "epoch": 0.04736899090515374, + "grad_norm": 0.35923299193382263, + "learning_rate": 4.999526310090949e-05, + "loss": 0.533, + "step": 7000 + }, + { + "epoch": 0.05075249025552187, + "grad_norm": 0.3413761556148529, + "learning_rate": 4.999492475097445e-05, + "loss": 0.5299, + "step": 7500 + }, + { + "epoch": 0.05413598960588999, + "grad_norm": 0.3142264187335968, + "learning_rate": 4.999458640103941e-05, + "loss": 0.5287, + "step": 8000 + }, + { + "epoch": 0.05751948895625812, + "grad_norm": 0.31593766808509827, + "learning_rate": 4.999424805110437e-05, + "loss": 0.5263, + "step": 8500 + }, + { + "epoch": 0.06090298830662624, + "grad_norm": 0.2999120056629181, + "learning_rate": 4.999390970116934e-05, + "loss": 0.5227, + "step": 9000 + }, + { + "epoch": 0.06428648765699436, + "grad_norm": 0.27803242206573486, + "learning_rate": 4.99935713512343e-05, + "loss": 0.5191, + "step": 9500 + }, + { + "epoch": 0.0676699870073625, + "grad_norm": 0.2870517671108246, + "learning_rate": 4.9993233001299265e-05, + "loss": 0.5187, + "step": 10000 + }, + { + "epoch": 0.07105348635773062, + "grad_norm": 0.28629007935523987, + "learning_rate": 4.999289465136423e-05, + "loss": 0.5169, + "step": 10500 + }, + { + "epoch": 0.07443698570809874, + "grad_norm": 0.2766497731208801, + "learning_rate": 4.9992556301429196e-05, + "loss": 0.5142, + "step": 11000 + }, + { + "epoch": 0.07782048505846687, + "grad_norm": 0.28909796476364136, + "learning_rate": 4.999221795149416e-05, + "loss": 0.5113, + "step": 11500 + }, + { + "epoch": 0.08120398440883499, + "grad_norm": 0.26921311020851135, + "learning_rate": 4.9991879601559114e-05, + "loss": 0.5107, + "step": 12000 + }, + { + "epoch": 0.08458748375920312, + "grad_norm": 0.26146847009658813, + "learning_rate": 4.999154125162408e-05, + "loss": 0.5068, + "step": 12500 + }, + { + "epoch": 0.08797098310957124, + "grad_norm": 0.27655014395713806, + "learning_rate": 4.9991202901689045e-05, + "loss": 0.5062, + "step": 13000 + }, + { + "epoch": 0.09135448245993937, + "grad_norm": 0.28565794229507446, + "learning_rate": 4.999086455175401e-05, + "loss": 0.5019, + "step": 13500 + }, + { + "epoch": 0.09473798181030749, + "grad_norm": 0.26864224672317505, + "learning_rate": 4.999052620181897e-05, + "loss": 0.5038, + "step": 14000 + }, + { + "epoch": 0.09812148116067562, + "grad_norm": 0.2848644256591797, + "learning_rate": 4.999018785188394e-05, + "loss": 0.5009, + "step": 14500 + }, + { + "epoch": 0.10150498051104374, + "grad_norm": 0.27758416533470154, + "learning_rate": 4.99898495019489e-05, + "loss": 0.5009, + "step": 15000 + }, + { + "epoch": 0.10488847986141187, + "grad_norm": 0.24842596054077148, + "learning_rate": 4.998951115201386e-05, + "loss": 0.4975, + "step": 15500 + }, + { + "epoch": 0.10827197921177999, + "grad_norm": 0.24530164897441864, + "learning_rate": 4.9989172802078824e-05, + "loss": 0.4981, + "step": 16000 + }, + { + "epoch": 0.11165547856214812, + "grad_norm": 0.25260645151138306, + "learning_rate": 4.998883445214379e-05, + "loss": 0.4946, + "step": 16500 + }, + { + "epoch": 0.11503897791251624, + "grad_norm": 0.2753775119781494, + "learning_rate": 4.998849610220875e-05, + "loss": 0.4951, + "step": 17000 + }, + { + "epoch": 0.11842247726288437, + "grad_norm": 0.2859707772731781, + "learning_rate": 4.998815775227371e-05, + "loss": 0.4932, + "step": 17500 + }, + { + "epoch": 0.12180597661325249, + "grad_norm": 0.2658611834049225, + "learning_rate": 4.998781940233867e-05, + "loss": 0.4898, + "step": 18000 + }, + { + "epoch": 0.12518947596362062, + "grad_norm": 0.2859034836292267, + "learning_rate": 4.998748105240364e-05, + "loss": 0.4898, + "step": 18500 + }, + { + "epoch": 0.12857297531398873, + "grad_norm": 0.24946685135364532, + "learning_rate": 4.9987142702468604e-05, + "loss": 0.4872, + "step": 19000 + }, + { + "epoch": 0.13195647466435687, + "grad_norm": 0.29766103625297546, + "learning_rate": 4.9986804352533566e-05, + "loss": 0.484, + "step": 19500 + }, + { + "epoch": 0.135339974014725, + "grad_norm": 0.26545271277427673, + "learning_rate": 4.998646600259853e-05, + "loss": 0.4857, + "step": 20000 + }, + { + "epoch": 0.1387234733650931, + "grad_norm": 0.257906436920166, + "learning_rate": 4.99861276526635e-05, + "loss": 0.4845, + "step": 20500 + }, + { + "epoch": 0.14210697271546124, + "grad_norm": 0.3082759976387024, + "learning_rate": 4.998578930272846e-05, + "loss": 0.4819, + "step": 21000 + }, + { + "epoch": 0.14549047206582935, + "grad_norm": 0.2744785249233246, + "learning_rate": 4.9985450952793414e-05, + "loss": 0.4814, + "step": 21500 + }, + { + "epoch": 0.1488739714161975, + "grad_norm": 0.26781973242759705, + "learning_rate": 4.998511260285838e-05, + "loss": 0.4814, + "step": 22000 + }, + { + "epoch": 0.15225747076656562, + "grad_norm": 0.26577427983283997, + "learning_rate": 4.9984774252923345e-05, + "loss": 0.4794, + "step": 22500 + }, + { + "epoch": 0.15564097011693373, + "grad_norm": 0.2738116681575775, + "learning_rate": 4.998443590298831e-05, + "loss": 0.478, + "step": 23000 + }, + { + "epoch": 0.15902446946730187, + "grad_norm": 0.2903014123439789, + "learning_rate": 4.998409755305327e-05, + "loss": 0.4771, + "step": 23500 + }, + { + "epoch": 0.16240796881766997, + "grad_norm": 0.2572200298309326, + "learning_rate": 4.998375920311824e-05, + "loss": 0.4768, + "step": 24000 + }, + { + "epoch": 0.1657914681680381, + "grad_norm": 0.2693314254283905, + "learning_rate": 4.99834208531832e-05, + "loss": 0.4759, + "step": 24500 + }, + { + "epoch": 0.16917496751840624, + "grad_norm": 0.27925193309783936, + "learning_rate": 4.998308250324816e-05, + "loss": 0.4748, + "step": 25000 + }, + { + "epoch": 0.17255846686877435, + "grad_norm": 0.3076331913471222, + "learning_rate": 4.9982744153313125e-05, + "loss": 0.4731, + "step": 25500 + }, + { + "epoch": 0.1759419662191425, + "grad_norm": 0.2727000415325165, + "learning_rate": 4.9982405803378094e-05, + "loss": 0.4719, + "step": 26000 + }, + { + "epoch": 0.17932546556951062, + "grad_norm": 0.2653730809688568, + "learning_rate": 4.998206745344305e-05, + "loss": 0.4706, + "step": 26500 + }, + { + "epoch": 0.18270896491987873, + "grad_norm": 0.25557219982147217, + "learning_rate": 4.998172910350801e-05, + "loss": 0.4707, + "step": 27000 + }, + { + "epoch": 0.18609246427024687, + "grad_norm": 0.25986531376838684, + "learning_rate": 4.998139075357297e-05, + "loss": 0.47, + "step": 27500 + }, + { + "epoch": 0.18947596362061497, + "grad_norm": 0.25300464034080505, + "learning_rate": 4.998105240363794e-05, + "loss": 0.4687, + "step": 28000 + }, + { + "epoch": 0.1928594629709831, + "grad_norm": 0.26704490184783936, + "learning_rate": 4.9980714053702904e-05, + "loss": 0.4677, + "step": 28500 + }, + { + "epoch": 0.19624296232135124, + "grad_norm": 0.2561919689178467, + "learning_rate": 4.9980375703767866e-05, + "loss": 0.4671, + "step": 29000 + }, + { + "epoch": 0.19962646167171935, + "grad_norm": 0.26999321579933167, + "learning_rate": 4.998003735383283e-05, + "loss": 0.4669, + "step": 29500 + }, + { + "epoch": 0.2030099610220875, + "grad_norm": 0.2723313570022583, + "learning_rate": 4.99796990038978e-05, + "loss": 0.4656, + "step": 30000 + }, + { + "epoch": 0.2063934603724556, + "grad_norm": 0.28622591495513916, + "learning_rate": 4.997936065396276e-05, + "loss": 0.4655, + "step": 30500 + }, + { + "epoch": 0.20977695972282373, + "grad_norm": 0.2644602954387665, + "learning_rate": 4.997902230402772e-05, + "loss": 0.4636, + "step": 31000 + }, + { + "epoch": 0.21316045907319187, + "grad_norm": 0.2647627294063568, + "learning_rate": 4.9978683954092684e-05, + "loss": 0.4619, + "step": 31500 + }, + { + "epoch": 0.21654395842355997, + "grad_norm": 0.28980907797813416, + "learning_rate": 4.9978345604157646e-05, + "loss": 0.4635, + "step": 32000 + }, + { + "epoch": 0.2199274577739281, + "grad_norm": 0.24661865830421448, + "learning_rate": 4.997800725422261e-05, + "loss": 0.462, + "step": 32500 + }, + { + "epoch": 0.22331095712429624, + "grad_norm": 0.2632274329662323, + "learning_rate": 4.997766890428757e-05, + "loss": 0.4606, + "step": 33000 + }, + { + "epoch": 0.22669445647466435, + "grad_norm": 0.2585084140300751, + "learning_rate": 4.997733055435254e-05, + "loss": 0.4608, + "step": 33500 + }, + { + "epoch": 0.2300779558250325, + "grad_norm": 0.2850891053676605, + "learning_rate": 4.99769922044175e-05, + "loss": 0.4603, + "step": 34000 + }, + { + "epoch": 0.2334614551754006, + "grad_norm": 0.2871512174606323, + "learning_rate": 4.997665385448246e-05, + "loss": 0.4604, + "step": 34500 + }, + { + "epoch": 0.23684495452576873, + "grad_norm": 0.27349814772605896, + "learning_rate": 4.9976315504547425e-05, + "loss": 0.4582, + "step": 35000 + }, + { + "epoch": 0.24022845387613687, + "grad_norm": 0.25539854168891907, + "learning_rate": 4.9975977154612394e-05, + "loss": 0.4581, + "step": 35500 + }, + { + "epoch": 0.24361195322650497, + "grad_norm": 0.2622606158256531, + "learning_rate": 4.997563880467735e-05, + "loss": 0.4568, + "step": 36000 + }, + { + "epoch": 0.2469954525768731, + "grad_norm": 0.27912938594818115, + "learning_rate": 4.997530045474231e-05, + "loss": 0.4575, + "step": 36500 + }, + { + "epoch": 0.25037895192724124, + "grad_norm": 0.2705869972705841, + "learning_rate": 4.9974962104807274e-05, + "loss": 0.456, + "step": 37000 + }, + { + "epoch": 0.2537624512776094, + "grad_norm": 0.2651195824146271, + "learning_rate": 4.997462375487224e-05, + "loss": 0.4561, + "step": 37500 + }, + { + "epoch": 0.25714595062797746, + "grad_norm": 0.2819836735725403, + "learning_rate": 4.9974285404937205e-05, + "loss": 0.4549, + "step": 38000 + }, + { + "epoch": 0.2605294499783456, + "grad_norm": 0.27624449133872986, + "learning_rate": 4.997394705500217e-05, + "loss": 0.4544, + "step": 38500 + }, + { + "epoch": 0.26391294932871373, + "grad_norm": 0.26522350311279297, + "learning_rate": 4.997360870506713e-05, + "loss": 0.4544, + "step": 39000 + }, + { + "epoch": 0.26729644867908187, + "grad_norm": 0.3110775649547577, + "learning_rate": 4.99732703551321e-05, + "loss": 0.4514, + "step": 39500 + }, + { + "epoch": 0.27067994802945, + "grad_norm": 0.2998841404914856, + "learning_rate": 4.997293200519706e-05, + "loss": 0.4505, + "step": 40000 + }, + { + "epoch": 0.2740634473798181, + "grad_norm": 0.2854876220226288, + "learning_rate": 4.997259365526202e-05, + "loss": 0.4514, + "step": 40500 + }, + { + "epoch": 0.2774469467301862, + "grad_norm": 0.2567809522151947, + "learning_rate": 4.9972255305326984e-05, + "loss": 0.452, + "step": 41000 + }, + { + "epoch": 0.28083044608055435, + "grad_norm": 0.2791685461997986, + "learning_rate": 4.9971916955391947e-05, + "loss": 0.4511, + "step": 41500 + }, + { + "epoch": 0.2842139454309225, + "grad_norm": 0.29572752118110657, + "learning_rate": 4.997157860545691e-05, + "loss": 0.4517, + "step": 42000 + }, + { + "epoch": 0.2875974447812906, + "grad_norm": 0.27020877599716187, + "learning_rate": 4.997124025552187e-05, + "loss": 0.4483, + "step": 42500 + }, + { + "epoch": 0.2909809441316587, + "grad_norm": 0.2704961597919464, + "learning_rate": 4.997090190558684e-05, + "loss": 0.4482, + "step": 43000 + }, + { + "epoch": 0.29436444348202684, + "grad_norm": 0.2908715009689331, + "learning_rate": 4.99705635556518e-05, + "loss": 0.4477, + "step": 43500 + }, + { + "epoch": 0.297747942832395, + "grad_norm": 0.28041768074035645, + "learning_rate": 4.9970225205716764e-05, + "loss": 0.4494, + "step": 44000 + }, + { + "epoch": 0.3011314421827631, + "grad_norm": 0.23975065350532532, + "learning_rate": 4.9969886855781726e-05, + "loss": 0.4483, + "step": 44500 + }, + { + "epoch": 0.30451494153313124, + "grad_norm": 0.2862926125526428, + "learning_rate": 4.9969548505846695e-05, + "loss": 0.4463, + "step": 45000 + }, + { + "epoch": 0.3078984408834993, + "grad_norm": 0.26498425006866455, + "learning_rate": 4.996921015591165e-05, + "loss": 0.4455, + "step": 45500 + }, + { + "epoch": 0.31128194023386746, + "grad_norm": 0.27339521050453186, + "learning_rate": 4.996887180597661e-05, + "loss": 0.447, + "step": 46000 + }, + { + "epoch": 0.3146654395842356, + "grad_norm": 0.25587713718414307, + "learning_rate": 4.9968533456041575e-05, + "loss": 0.4481, + "step": 46500 + }, + { + "epoch": 0.31804893893460373, + "grad_norm": 0.290996789932251, + "learning_rate": 4.9968195106106543e-05, + "loss": 0.4471, + "step": 47000 + }, + { + "epoch": 0.32143243828497187, + "grad_norm": 0.2602178752422333, + "learning_rate": 4.9967856756171506e-05, + "loss": 0.4451, + "step": 47500 + }, + { + "epoch": 0.32481593763533995, + "grad_norm": 0.27312254905700684, + "learning_rate": 4.996751840623647e-05, + "loss": 0.445, + "step": 48000 + }, + { + "epoch": 0.3281994369857081, + "grad_norm": 0.29018092155456543, + "learning_rate": 4.996718005630143e-05, + "loss": 0.4445, + "step": 48500 + }, + { + "epoch": 0.3315829363360762, + "grad_norm": 0.27921053767204285, + "learning_rate": 4.99668417063664e-05, + "loss": 0.4418, + "step": 49000 + }, + { + "epoch": 0.33496643568644435, + "grad_norm": 0.2919737994670868, + "learning_rate": 4.996650335643136e-05, + "loss": 0.4432, + "step": 49500 + }, + { + "epoch": 0.3383499350368125, + "grad_norm": 0.22763541340827942, + "learning_rate": 4.996616500649632e-05, + "loss": 0.4409, + "step": 50000 + }, + { + "epoch": 0.3417334343871806, + "grad_norm": 0.2642669975757599, + "learning_rate": 4.9965826656561285e-05, + "loss": 0.4427, + "step": 50500 + }, + { + "epoch": 0.3451169337375487, + "grad_norm": 0.27928030490875244, + "learning_rate": 4.996548830662625e-05, + "loss": 0.4444, + "step": 51000 + }, + { + "epoch": 0.34850043308791684, + "grad_norm": 0.27971315383911133, + "learning_rate": 4.996514995669121e-05, + "loss": 0.4413, + "step": 51500 + }, + { + "epoch": 0.351883932438285, + "grad_norm": 0.2908726632595062, + "learning_rate": 4.996481160675617e-05, + "loss": 0.4409, + "step": 52000 + }, + { + "epoch": 0.3552674317886531, + "grad_norm": 0.24951176345348358, + "learning_rate": 4.996447325682114e-05, + "loss": 0.4403, + "step": 52500 + }, + { + "epoch": 0.35865093113902125, + "grad_norm": 0.2689545452594757, + "learning_rate": 4.99641349068861e-05, + "loss": 0.4392, + "step": 53000 + }, + { + "epoch": 0.3620344304893893, + "grad_norm": 0.2819485366344452, + "learning_rate": 4.9963796556951065e-05, + "loss": 0.4398, + "step": 53500 + }, + { + "epoch": 0.36541792983975746, + "grad_norm": 0.2707166373729706, + "learning_rate": 4.996345820701603e-05, + "loss": 0.4395, + "step": 54000 + }, + { + "epoch": 0.3688014291901256, + "grad_norm": 0.27707841992378235, + "learning_rate": 4.996311985708099e-05, + "loss": 0.4381, + "step": 54500 + }, + { + "epoch": 0.37218492854049373, + "grad_norm": 0.2961883842945099, + "learning_rate": 4.996278150714595e-05, + "loss": 0.4397, + "step": 55000 + }, + { + "epoch": 0.37556842789086187, + "grad_norm": 0.2749854624271393, + "learning_rate": 4.996244315721091e-05, + "loss": 0.438, + "step": 55500 + }, + { + "epoch": 0.37895192724122995, + "grad_norm": 0.2799800932407379, + "learning_rate": 4.9962104807275875e-05, + "loss": 0.4367, + "step": 56000 + }, + { + "epoch": 0.3823354265915981, + "grad_norm": 0.27645984292030334, + "learning_rate": 4.9961766457340844e-05, + "loss": 0.4396, + "step": 56500 + }, + { + "epoch": 0.3857189259419662, + "grad_norm": 0.2911885678768158, + "learning_rate": 4.9961428107405806e-05, + "loss": 0.4379, + "step": 57000 + }, + { + "epoch": 0.38910242529233435, + "grad_norm": 0.29465240240097046, + "learning_rate": 4.996108975747077e-05, + "loss": 0.4384, + "step": 57500 + }, + { + "epoch": 0.3924859246427025, + "grad_norm": 0.28913426399230957, + "learning_rate": 4.996075140753573e-05, + "loss": 0.4376, + "step": 58000 + }, + { + "epoch": 0.39586942399307057, + "grad_norm": 0.27211660146713257, + "learning_rate": 4.99604130576007e-05, + "loss": 0.4353, + "step": 58500 + }, + { + "epoch": 0.3992529233434387, + "grad_norm": 0.2998749017715454, + "learning_rate": 4.996007470766566e-05, + "loss": 0.4363, + "step": 59000 + }, + { + "epoch": 0.40263642269380684, + "grad_norm": 0.2559037208557129, + "learning_rate": 4.9959736357730624e-05, + "loss": 0.4371, + "step": 59500 + }, + { + "epoch": 0.406019922044175, + "grad_norm": 0.3002610206604004, + "learning_rate": 4.9959398007795586e-05, + "loss": 0.4357, + "step": 60000 + }, + { + "epoch": 0.4094034213945431, + "grad_norm": 0.29734936356544495, + "learning_rate": 4.995905965786055e-05, + "loss": 0.4354, + "step": 60500 + }, + { + "epoch": 0.4127869207449112, + "grad_norm": 0.2875003218650818, + "learning_rate": 4.995872130792551e-05, + "loss": 0.435, + "step": 61000 + }, + { + "epoch": 0.4161704200952793, + "grad_norm": 0.3090741038322449, + "learning_rate": 4.995838295799047e-05, + "loss": 0.4342, + "step": 61500 + }, + { + "epoch": 0.41955391944564746, + "grad_norm": 0.2710956931114197, + "learning_rate": 4.995804460805544e-05, + "loss": 0.4357, + "step": 62000 + }, + { + "epoch": 0.4229374187960156, + "grad_norm": 0.2595633566379547, + "learning_rate": 4.99577062581204e-05, + "loss": 0.4342, + "step": 62500 + }, + { + "epoch": 0.42632091814638373, + "grad_norm": 0.2667919993400574, + "learning_rate": 4.9957367908185365e-05, + "loss": 0.4326, + "step": 63000 + }, + { + "epoch": 0.42970441749675187, + "grad_norm": 0.27533194422721863, + "learning_rate": 4.995702955825033e-05, + "loss": 0.4336, + "step": 63500 + }, + { + "epoch": 0.43308791684711995, + "grad_norm": 0.26632460951805115, + "learning_rate": 4.995669120831529e-05, + "loss": 0.4347, + "step": 64000 + }, + { + "epoch": 0.4364714161974881, + "grad_norm": 0.2682251036167145, + "learning_rate": 4.995635285838025e-05, + "loss": 0.432, + "step": 64500 + }, + { + "epoch": 0.4398549155478562, + "grad_norm": 0.2722899317741394, + "learning_rate": 4.9956014508445214e-05, + "loss": 0.4329, + "step": 65000 + }, + { + "epoch": 0.44323841489822435, + "grad_norm": 0.28562718629837036, + "learning_rate": 4.9955676158510176e-05, + "loss": 0.4328, + "step": 65500 + }, + { + "epoch": 0.4466219142485925, + "grad_norm": 0.2702755630016327, + "learning_rate": 4.9955337808575145e-05, + "loss": 0.4313, + "step": 66000 + }, + { + "epoch": 0.45000541359896057, + "grad_norm": 0.30107203125953674, + "learning_rate": 4.995499945864011e-05, + "loss": 0.4317, + "step": 66500 + }, + { + "epoch": 0.4533889129493287, + "grad_norm": 0.268795907497406, + "learning_rate": 4.995466110870507e-05, + "loss": 0.4314, + "step": 67000 + }, + { + "epoch": 0.45677241229969684, + "grad_norm": 0.2796587646007538, + "learning_rate": 4.995432275877003e-05, + "loss": 0.4297, + "step": 67500 + }, + { + "epoch": 0.460155911650065, + "grad_norm": 0.29196685552597046, + "learning_rate": 4.9953984408835e-05, + "loss": 0.4325, + "step": 68000 + }, + { + "epoch": 0.4635394110004331, + "grad_norm": 0.26869258284568787, + "learning_rate": 4.995364605889996e-05, + "loss": 0.4312, + "step": 68500 + }, + { + "epoch": 0.4669229103508012, + "grad_norm": 0.3043369948863983, + "learning_rate": 4.9953307708964924e-05, + "loss": 0.432, + "step": 69000 + }, + { + "epoch": 0.4703064097011693, + "grad_norm": 0.2701905071735382, + "learning_rate": 4.9952969359029886e-05, + "loss": 0.4307, + "step": 69500 + }, + { + "epoch": 0.47368990905153746, + "grad_norm": 0.2781127393245697, + "learning_rate": 4.995263100909485e-05, + "loss": 0.4313, + "step": 70000 + }, + { + "epoch": 0.4770734084019056, + "grad_norm": 0.25713086128234863, + "learning_rate": 4.995229265915981e-05, + "loss": 0.4297, + "step": 70500 + }, + { + "epoch": 0.48045690775227373, + "grad_norm": 0.28825071454048157, + "learning_rate": 4.995195430922477e-05, + "loss": 0.4311, + "step": 71000 + }, + { + "epoch": 0.4838404071026418, + "grad_norm": 0.27780482172966003, + "learning_rate": 4.9951615959289735e-05, + "loss": 0.4293, + "step": 71500 + }, + { + "epoch": 0.48722390645300995, + "grad_norm": 0.2669151723384857, + "learning_rate": 4.9951277609354704e-05, + "loss": 0.428, + "step": 72000 + }, + { + "epoch": 0.4906074058033781, + "grad_norm": 0.28188133239746094, + "learning_rate": 4.9950939259419666e-05, + "loss": 0.4287, + "step": 72500 + }, + { + "epoch": 0.4939909051537462, + "grad_norm": 0.29270079731941223, + "learning_rate": 4.995060090948463e-05, + "loss": 0.4287, + "step": 73000 + }, + { + "epoch": 0.49737440450411435, + "grad_norm": 0.27843374013900757, + "learning_rate": 4.995026255954959e-05, + "loss": 0.428, + "step": 73500 + }, + { + "epoch": 0.5007579038544825, + "grad_norm": 0.28133416175842285, + "learning_rate": 4.994992420961455e-05, + "loss": 0.4275, + "step": 74000 + }, + { + "epoch": 0.5041414032048506, + "grad_norm": 0.2791186273097992, + "learning_rate": 4.9949585859679514e-05, + "loss": 0.4286, + "step": 74500 + }, + { + "epoch": 0.5075249025552188, + "grad_norm": 0.2686724364757538, + "learning_rate": 4.9949247509744476e-05, + "loss": 0.427, + "step": 75000 + }, + { + "epoch": 0.5109084019055868, + "grad_norm": 0.29482367634773254, + "learning_rate": 4.9948909159809445e-05, + "loss": 0.4269, + "step": 75500 + }, + { + "epoch": 0.5142919012559549, + "grad_norm": 0.3145529329776764, + "learning_rate": 4.994857080987441e-05, + "loss": 0.4274, + "step": 76000 + }, + { + "epoch": 0.5176754006063231, + "grad_norm": 0.28418517112731934, + "learning_rate": 4.994823245993937e-05, + "loss": 0.4268, + "step": 76500 + }, + { + "epoch": 0.5210588999566912, + "grad_norm": 0.3213294446468353, + "learning_rate": 4.994789411000433e-05, + "loss": 0.4273, + "step": 77000 + }, + { + "epoch": 0.5244423993070594, + "grad_norm": 0.2876501679420471, + "learning_rate": 4.99475557600693e-05, + "loss": 0.4279, + "step": 77500 + }, + { + "epoch": 0.5278258986574275, + "grad_norm": 0.2928789258003235, + "learning_rate": 4.994721741013426e-05, + "loss": 0.4261, + "step": 78000 + }, + { + "epoch": 0.5312093980077955, + "grad_norm": 0.3037525713443756, + "learning_rate": 4.9946879060199225e-05, + "loss": 0.4253, + "step": 78500 + }, + { + "epoch": 0.5345928973581637, + "grad_norm": 0.2588423788547516, + "learning_rate": 4.994654071026418e-05, + "loss": 0.4259, + "step": 79000 + }, + { + "epoch": 0.5379763967085318, + "grad_norm": 0.30956318974494934, + "learning_rate": 4.994620236032915e-05, + "loss": 0.4243, + "step": 79500 + }, + { + "epoch": 0.5413598960589, + "grad_norm": 0.2818774878978729, + "learning_rate": 4.994586401039411e-05, + "loss": 0.4262, + "step": 80000 + }, + { + "epoch": 0.5447433954092681, + "grad_norm": 0.27260732650756836, + "learning_rate": 4.994552566045907e-05, + "loss": 0.4257, + "step": 80500 + }, + { + "epoch": 0.5481268947596362, + "grad_norm": 0.28293049335479736, + "learning_rate": 4.9945187310524035e-05, + "loss": 0.4244, + "step": 81000 + }, + { + "epoch": 0.5515103941100044, + "grad_norm": 0.299513041973114, + "learning_rate": 4.9944848960589004e-05, + "loss": 0.4249, + "step": 81500 + }, + { + "epoch": 0.5548938934603724, + "grad_norm": 0.2706651985645294, + "learning_rate": 4.9944510610653966e-05, + "loss": 0.4247, + "step": 82000 + }, + { + "epoch": 0.5582773928107406, + "grad_norm": 0.3018222153186798, + "learning_rate": 4.994417226071893e-05, + "loss": 0.4256, + "step": 82500 + }, + { + "epoch": 0.5616608921611087, + "grad_norm": 0.3136172294616699, + "learning_rate": 4.994383391078389e-05, + "loss": 0.4241, + "step": 83000 + }, + { + "epoch": 0.5650443915114768, + "grad_norm": 0.27541613578796387, + "learning_rate": 4.994349556084885e-05, + "loss": 0.4239, + "step": 83500 + }, + { + "epoch": 0.568427890861845, + "grad_norm": 0.2760767936706543, + "learning_rate": 4.9943157210913815e-05, + "loss": 0.425, + "step": 84000 + }, + { + "epoch": 0.571811390212213, + "grad_norm": 0.2719828188419342, + "learning_rate": 4.994281886097878e-05, + "loss": 0.4239, + "step": 84500 + }, + { + "epoch": 0.5751948895625812, + "grad_norm": 0.2611558437347412, + "learning_rate": 4.9942480511043746e-05, + "loss": 0.4215, + "step": 85000 + }, + { + "epoch": 0.5785783889129493, + "grad_norm": 0.28882431983947754, + "learning_rate": 4.994214216110871e-05, + "loss": 0.423, + "step": 85500 + }, + { + "epoch": 0.5819618882633174, + "grad_norm": 0.2750629186630249, + "learning_rate": 4.994180381117367e-05, + "loss": 0.4211, + "step": 86000 + }, + { + "epoch": 0.5853453876136856, + "grad_norm": 0.34259527921676636, + "learning_rate": 4.994146546123863e-05, + "loss": 0.4222, + "step": 86500 + }, + { + "epoch": 0.5887288869640537, + "grad_norm": 0.28423207998275757, + "learning_rate": 4.99411271113036e-05, + "loss": 0.424, + "step": 87000 + }, + { + "epoch": 0.5921123863144219, + "grad_norm": 0.27727535367012024, + "learning_rate": 4.994078876136856e-05, + "loss": 0.4232, + "step": 87500 + }, + { + "epoch": 0.59549588566479, + "grad_norm": 0.29033538699150085, + "learning_rate": 4.9940450411433525e-05, + "loss": 0.4223, + "step": 88000 + }, + { + "epoch": 0.598879385015158, + "grad_norm": 0.3040529489517212, + "learning_rate": 4.994011206149848e-05, + "loss": 0.4222, + "step": 88500 + }, + { + "epoch": 0.6022628843655262, + "grad_norm": 0.29766690731048584, + "learning_rate": 4.993977371156345e-05, + "loss": 0.4223, + "step": 89000 + }, + { + "epoch": 0.6056463837158943, + "grad_norm": 0.28429678082466125, + "learning_rate": 4.993943536162841e-05, + "loss": 0.4233, + "step": 89500 + }, + { + "epoch": 0.6090298830662625, + "grad_norm": 0.2714273929595947, + "learning_rate": 4.9939097011693374e-05, + "loss": 0.4216, + "step": 90000 + }, + { + "epoch": 0.6124133824166306, + "grad_norm": 0.30011844635009766, + "learning_rate": 4.9938758661758336e-05, + "loss": 0.421, + "step": 90500 + }, + { + "epoch": 0.6157968817669986, + "grad_norm": 0.28375932574272156, + "learning_rate": 4.9938420311823305e-05, + "loss": 0.4226, + "step": 91000 + }, + { + "epoch": 0.6191803811173668, + "grad_norm": 0.27526459097862244, + "learning_rate": 4.993808196188827e-05, + "loss": 0.422, + "step": 91500 + }, + { + "epoch": 0.6225638804677349, + "grad_norm": 0.2931526303291321, + "learning_rate": 4.993774361195323e-05, + "loss": 0.4208, + "step": 92000 + }, + { + "epoch": 0.6259473798181031, + "grad_norm": 0.27956876158714294, + "learning_rate": 4.993740526201819e-05, + "loss": 0.4189, + "step": 92500 + }, + { + "epoch": 0.6293308791684712, + "grad_norm": 0.29397526383399963, + "learning_rate": 4.993706691208316e-05, + "loss": 0.4211, + "step": 93000 + }, + { + "epoch": 0.6327143785188393, + "grad_norm": 0.26474645733833313, + "learning_rate": 4.9936728562148116e-05, + "loss": 0.4212, + "step": 93500 + }, + { + "epoch": 0.6360978778692075, + "grad_norm": 0.27761274576187134, + "learning_rate": 4.993639021221308e-05, + "loss": 0.4215, + "step": 94000 + }, + { + "epoch": 0.6394813772195755, + "grad_norm": 0.2934247553348541, + "learning_rate": 4.993605186227805e-05, + "loss": 0.4205, + "step": 94500 + }, + { + "epoch": 0.6428648765699437, + "grad_norm": 0.2508888840675354, + "learning_rate": 4.993571351234301e-05, + "loss": 0.4183, + "step": 95000 + }, + { + "epoch": 0.6462483759203118, + "grad_norm": 0.28504717350006104, + "learning_rate": 4.993537516240797e-05, + "loss": 0.4189, + "step": 95500 + }, + { + "epoch": 0.6496318752706799, + "grad_norm": 0.27248984575271606, + "learning_rate": 4.993503681247293e-05, + "loss": 0.4203, + "step": 96000 + }, + { + "epoch": 0.6530153746210481, + "grad_norm": 0.2794642150402069, + "learning_rate": 4.99346984625379e-05, + "loss": 0.4215, + "step": 96500 + }, + { + "epoch": 0.6563988739714162, + "grad_norm": 0.30702054500579834, + "learning_rate": 4.9934360112602864e-05, + "loss": 0.4185, + "step": 97000 + }, + { + "epoch": 0.6597823733217844, + "grad_norm": 0.27618154883384705, + "learning_rate": 4.9934021762667826e-05, + "loss": 0.4195, + "step": 97500 + }, + { + "epoch": 0.6631658726721524, + "grad_norm": 0.28443071246147156, + "learning_rate": 4.993368341273278e-05, + "loss": 0.4183, + "step": 98000 + }, + { + "epoch": 0.6665493720225206, + "grad_norm": 0.2913377583026886, + "learning_rate": 4.993334506279775e-05, + "loss": 0.4193, + "step": 98500 + }, + { + "epoch": 0.6699328713728887, + "grad_norm": 0.31212741136550903, + "learning_rate": 4.993300671286271e-05, + "loss": 0.419, + "step": 99000 + }, + { + "epoch": 0.6733163707232568, + "grad_norm": 0.28324469923973083, + "learning_rate": 4.9932668362927675e-05, + "loss": 0.4181, + "step": 99500 + }, + { + "epoch": 0.676699870073625, + "grad_norm": 0.2820169925689697, + "learning_rate": 4.993233001299264e-05, + "loss": 0.4192, + "step": 100000 + }, + { + "epoch": 0.6800833694239931, + "grad_norm": 0.3011641800403595, + "learning_rate": 4.9931991663057606e-05, + "loss": 0.4182, + "step": 100500 + }, + { + "epoch": 0.6834668687743612, + "grad_norm": 0.27722039818763733, + "learning_rate": 4.993165331312257e-05, + "loss": 0.4189, + "step": 101000 + }, + { + "epoch": 0.6868503681247293, + "grad_norm": 0.28944921493530273, + "learning_rate": 4.993131496318753e-05, + "loss": 0.4196, + "step": 101500 + }, + { + "epoch": 0.6902338674750974, + "grad_norm": 0.29599529504776, + "learning_rate": 4.993097661325249e-05, + "loss": 0.4176, + "step": 102000 + }, + { + "epoch": 0.6936173668254656, + "grad_norm": 0.2635329067707062, + "learning_rate": 4.993063826331746e-05, + "loss": 0.4191, + "step": 102500 + }, + { + "epoch": 0.6970008661758337, + "grad_norm": 0.26696333289146423, + "learning_rate": 4.9930299913382416e-05, + "loss": 0.4192, + "step": 103000 + }, + { + "epoch": 0.7003843655262019, + "grad_norm": 0.30064094066619873, + "learning_rate": 4.992996156344738e-05, + "loss": 0.4176, + "step": 103500 + }, + { + "epoch": 0.70376786487657, + "grad_norm": 0.27610304951667786, + "learning_rate": 4.992962321351235e-05, + "loss": 0.4178, + "step": 104000 + }, + { + "epoch": 0.707151364226938, + "grad_norm": 0.3050728142261505, + "learning_rate": 4.992928486357731e-05, + "loss": 0.4185, + "step": 104500 + }, + { + "epoch": 0.7105348635773062, + "grad_norm": 0.28778210282325745, + "learning_rate": 4.992894651364227e-05, + "loss": 0.4169, + "step": 105000 + }, + { + "epoch": 0.7139183629276743, + "grad_norm": 0.2820018231868744, + "learning_rate": 4.9928608163707234e-05, + "loss": 0.4159, + "step": 105500 + }, + { + "epoch": 0.7173018622780425, + "grad_norm": 0.32361045479774475, + "learning_rate": 4.99282698137722e-05, + "loss": 0.4166, + "step": 106000 + }, + { + "epoch": 0.7206853616284106, + "grad_norm": 0.2677634656429291, + "learning_rate": 4.9927931463837165e-05, + "loss": 0.4156, + "step": 106500 + }, + { + "epoch": 0.7240688609787787, + "grad_norm": 0.30490702390670776, + "learning_rate": 4.992759311390213e-05, + "loss": 0.4163, + "step": 107000 + }, + { + "epoch": 0.7274523603291468, + "grad_norm": 0.2492278665304184, + "learning_rate": 4.992725476396708e-05, + "loss": 0.4169, + "step": 107500 + }, + { + "epoch": 0.7308358596795149, + "grad_norm": 0.27243173122406006, + "learning_rate": 4.992691641403205e-05, + "loss": 0.4169, + "step": 108000 + }, + { + "epoch": 0.7342193590298831, + "grad_norm": 0.2796129882335663, + "learning_rate": 4.992657806409701e-05, + "loss": 0.4166, + "step": 108500 + }, + { + "epoch": 0.7376028583802512, + "grad_norm": 0.2759961783885956, + "learning_rate": 4.9926239714161975e-05, + "loss": 0.4166, + "step": 109000 + }, + { + "epoch": 0.7409863577306193, + "grad_norm": 0.2797967195510864, + "learning_rate": 4.992590136422694e-05, + "loss": 0.4141, + "step": 109500 + }, + { + "epoch": 0.7443698570809875, + "grad_norm": 0.28543514013290405, + "learning_rate": 4.9925563014291906e-05, + "loss": 0.4154, + "step": 110000 + }, + { + "epoch": 0.7477533564313555, + "grad_norm": 0.2752548158168793, + "learning_rate": 4.992522466435687e-05, + "loss": 0.4149, + "step": 110500 + }, + { + "epoch": 0.7511368557817237, + "grad_norm": 0.2887478768825531, + "learning_rate": 4.992488631442183e-05, + "loss": 0.4147, + "step": 111000 + }, + { + "epoch": 0.7545203551320918, + "grad_norm": 0.2966802716255188, + "learning_rate": 4.992454796448679e-05, + "loss": 0.4159, + "step": 111500 + }, + { + "epoch": 0.7579038544824599, + "grad_norm": 0.2996438443660736, + "learning_rate": 4.992420961455176e-05, + "loss": 0.4165, + "step": 112000 + }, + { + "epoch": 0.7612873538328281, + "grad_norm": 0.2674398422241211, + "learning_rate": 4.992387126461672e-05, + "loss": 0.4161, + "step": 112500 + }, + { + "epoch": 0.7646708531831962, + "grad_norm": 0.2995375990867615, + "learning_rate": 4.992353291468168e-05, + "loss": 0.4155, + "step": 113000 + }, + { + "epoch": 0.7680543525335644, + "grad_norm": 0.2994774580001831, + "learning_rate": 4.992319456474665e-05, + "loss": 0.4135, + "step": 113500 + }, + { + "epoch": 0.7714378518839324, + "grad_norm": 0.32188859581947327, + "learning_rate": 4.992285621481161e-05, + "loss": 0.4144, + "step": 114000 + }, + { + "epoch": 0.7748213512343005, + "grad_norm": 0.26384779810905457, + "learning_rate": 4.992251786487657e-05, + "loss": 0.4144, + "step": 114500 + }, + { + "epoch": 0.7782048505846687, + "grad_norm": 0.3245967924594879, + "learning_rate": 4.9922179514941534e-05, + "loss": 0.4143, + "step": 115000 + }, + { + "epoch": 0.7815883499350368, + "grad_norm": 0.2749451696872711, + "learning_rate": 4.99218411650065e-05, + "loss": 0.4157, + "step": 115500 + }, + { + "epoch": 0.784971849285405, + "grad_norm": 0.2730276882648468, + "learning_rate": 4.9921502815071465e-05, + "loss": 0.4135, + "step": 116000 + }, + { + "epoch": 0.7883553486357731, + "grad_norm": 0.3006286919116974, + "learning_rate": 4.992116446513643e-05, + "loss": 0.4141, + "step": 116500 + }, + { + "epoch": 0.7917388479861411, + "grad_norm": 0.29308584332466125, + "learning_rate": 4.992082611520138e-05, + "loss": 0.4148, + "step": 117000 + }, + { + "epoch": 0.7951223473365093, + "grad_norm": 0.27468234300613403, + "learning_rate": 4.992048776526635e-05, + "loss": 0.4142, + "step": 117500 + }, + { + "epoch": 0.7985058466868774, + "grad_norm": 0.31991246342658997, + "learning_rate": 4.9920149415331314e-05, + "loss": 0.4131, + "step": 118000 + }, + { + "epoch": 0.8018893460372456, + "grad_norm": 0.2824453115463257, + "learning_rate": 4.9919811065396276e-05, + "loss": 0.4134, + "step": 118500 + }, + { + "epoch": 0.8052728453876137, + "grad_norm": 0.27071237564086914, + "learning_rate": 4.991947271546124e-05, + "loss": 0.414, + "step": 119000 + }, + { + "epoch": 0.8086563447379818, + "grad_norm": 0.27571454644203186, + "learning_rate": 4.991913436552621e-05, + "loss": 0.4115, + "step": 119500 + }, + { + "epoch": 0.81203984408835, + "grad_norm": 0.27670493721961975, + "learning_rate": 4.991879601559117e-05, + "loss": 0.4125, + "step": 120000 + }, + { + "epoch": 0.815423343438718, + "grad_norm": 0.2901179790496826, + "learning_rate": 4.991845766565613e-05, + "loss": 0.4142, + "step": 120500 + }, + { + "epoch": 0.8188068427890862, + "grad_norm": 0.27062419056892395, + "learning_rate": 4.991811931572109e-05, + "loss": 0.4154, + "step": 121000 + }, + { + "epoch": 0.8221903421394543, + "grad_norm": 0.2876355051994324, + "learning_rate": 4.991778096578606e-05, + "loss": 0.4121, + "step": 121500 + }, + { + "epoch": 0.8255738414898224, + "grad_norm": 0.27911514043807983, + "learning_rate": 4.991744261585102e-05, + "loss": 0.4108, + "step": 122000 + }, + { + "epoch": 0.8289573408401906, + "grad_norm": 0.32272857427597046, + "learning_rate": 4.991710426591598e-05, + "loss": 0.4114, + "step": 122500 + }, + { + "epoch": 0.8323408401905587, + "grad_norm": 0.28503257036209106, + "learning_rate": 4.991676591598095e-05, + "loss": 0.413, + "step": 123000 + }, + { + "epoch": 0.8357243395409268, + "grad_norm": 0.30182546377182007, + "learning_rate": 4.991642756604591e-05, + "loss": 0.4115, + "step": 123500 + }, + { + "epoch": 0.8391078388912949, + "grad_norm": 0.31456178426742554, + "learning_rate": 4.991608921611087e-05, + "loss": 0.4128, + "step": 124000 + }, + { + "epoch": 0.8424913382416631, + "grad_norm": 0.2838102877140045, + "learning_rate": 4.9915750866175835e-05, + "loss": 0.4105, + "step": 124500 + }, + { + "epoch": 0.8458748375920312, + "grad_norm": 0.3066151738166809, + "learning_rate": 4.9915412516240804e-05, + "loss": 0.4131, + "step": 125000 + }, + { + "epoch": 0.8492583369423993, + "grad_norm": 0.2823828458786011, + "learning_rate": 4.9915074166305766e-05, + "loss": 0.4127, + "step": 125500 + }, + { + "epoch": 0.8526418362927675, + "grad_norm": 0.27775952219963074, + "learning_rate": 4.991473581637073e-05, + "loss": 0.4128, + "step": 126000 + }, + { + "epoch": 0.8560253356431355, + "grad_norm": 0.2724365293979645, + "learning_rate": 4.991439746643568e-05, + "loss": 0.4124, + "step": 126500 + }, + { + "epoch": 0.8594088349935037, + "grad_norm": 0.29520806670188904, + "learning_rate": 4.991405911650065e-05, + "loss": 0.4114, + "step": 127000 + }, + { + "epoch": 0.8627923343438718, + "grad_norm": 0.2807687819004059, + "learning_rate": 4.9913720766565614e-05, + "loss": 0.413, + "step": 127500 + }, + { + "epoch": 0.8661758336942399, + "grad_norm": 0.2868216633796692, + "learning_rate": 4.9913382416630577e-05, + "loss": 0.4107, + "step": 128000 + }, + { + "epoch": 0.8695593330446081, + "grad_norm": 0.28542953729629517, + "learning_rate": 4.991304406669554e-05, + "loss": 0.4128, + "step": 128500 + }, + { + "epoch": 0.8729428323949762, + "grad_norm": 0.29025155305862427, + "learning_rate": 4.991270571676051e-05, + "loss": 0.4122, + "step": 129000 + }, + { + "epoch": 0.8763263317453444, + "grad_norm": 0.29234567284584045, + "learning_rate": 4.991236736682547e-05, + "loss": 0.4099, + "step": 129500 + }, + { + "epoch": 0.8797098310957124, + "grad_norm": 0.27568700909614563, + "learning_rate": 4.991202901689043e-05, + "loss": 0.4118, + "step": 130000 + }, + { + "epoch": 0.8830933304460805, + "grad_norm": 0.3259067237377167, + "learning_rate": 4.9911690666955394e-05, + "loss": 0.4123, + "step": 130500 + }, + { + "epoch": 0.8864768297964487, + "grad_norm": 0.33408623933792114, + "learning_rate": 4.991135231702036e-05, + "loss": 0.4119, + "step": 131000 + }, + { + "epoch": 0.8898603291468168, + "grad_norm": 0.2826260030269623, + "learning_rate": 4.991101396708532e-05, + "loss": 0.411, + "step": 131500 + }, + { + "epoch": 0.893243828497185, + "grad_norm": 0.27461233735084534, + "learning_rate": 4.991067561715028e-05, + "loss": 0.4105, + "step": 132000 + }, + { + "epoch": 0.8966273278475531, + "grad_norm": 0.3100557327270508, + "learning_rate": 4.991033726721525e-05, + "loss": 0.4121, + "step": 132500 + }, + { + "epoch": 0.9000108271979211, + "grad_norm": 0.30305910110473633, + "learning_rate": 4.990999891728021e-05, + "loss": 0.411, + "step": 133000 + }, + { + "epoch": 0.9033943265482893, + "grad_norm": 0.31239059567451477, + "learning_rate": 4.9909660567345173e-05, + "loss": 0.4091, + "step": 133500 + }, + { + "epoch": 0.9067778258986574, + "grad_norm": 0.332537442445755, + "learning_rate": 4.9909322217410136e-05, + "loss": 0.4092, + "step": 134000 + }, + { + "epoch": 0.9101613252490256, + "grad_norm": 0.3206341564655304, + "learning_rate": 4.99089838674751e-05, + "loss": 0.4109, + "step": 134500 + }, + { + "epoch": 0.9135448245993937, + "grad_norm": 0.27052798867225647, + "learning_rate": 4.9908645517540067e-05, + "loss": 0.4096, + "step": 135000 + }, + { + "epoch": 0.9169283239497618, + "grad_norm": 0.31399449706077576, + "learning_rate": 4.990830716760503e-05, + "loss": 0.4096, + "step": 135500 + }, + { + "epoch": 0.92031182330013, + "grad_norm": 0.2710965871810913, + "learning_rate": 4.9907968817669984e-05, + "loss": 0.411, + "step": 136000 + }, + { + "epoch": 0.923695322650498, + "grad_norm": 0.281416118144989, + "learning_rate": 4.990763046773495e-05, + "loss": 0.4109, + "step": 136500 + }, + { + "epoch": 0.9270788220008662, + "grad_norm": 0.29734212160110474, + "learning_rate": 4.9907292117799915e-05, + "loss": 0.4111, + "step": 137000 + }, + { + "epoch": 0.9304623213512343, + "grad_norm": 0.28775766491889954, + "learning_rate": 4.990695376786488e-05, + "loss": 0.4101, + "step": 137500 + }, + { + "epoch": 0.9338458207016024, + "grad_norm": 0.28395044803619385, + "learning_rate": 4.990661541792984e-05, + "loss": 0.4093, + "step": 138000 + }, + { + "epoch": 0.9372293200519706, + "grad_norm": 0.28168410062789917, + "learning_rate": 4.990627706799481e-05, + "loss": 0.4103, + "step": 138500 + }, + { + "epoch": 0.9406128194023387, + "grad_norm": 0.2947444021701813, + "learning_rate": 4.990593871805977e-05, + "loss": 0.4099, + "step": 139000 + }, + { + "epoch": 0.9439963187527068, + "grad_norm": 0.299245685338974, + "learning_rate": 4.990560036812473e-05, + "loss": 0.4098, + "step": 139500 + }, + { + "epoch": 0.9473798181030749, + "grad_norm": 0.28602391481399536, + "learning_rate": 4.9905262018189695e-05, + "loss": 0.4094, + "step": 140000 + }, + { + "epoch": 0.950763317453443, + "grad_norm": 0.3072488307952881, + "learning_rate": 4.9904923668254663e-05, + "loss": 0.4099, + "step": 140500 + }, + { + "epoch": 0.9541468168038112, + "grad_norm": 0.28334489464759827, + "learning_rate": 4.990458531831962e-05, + "loss": 0.4078, + "step": 141000 + }, + { + "epoch": 0.9575303161541793, + "grad_norm": 0.28181710839271545, + "learning_rate": 4.990424696838458e-05, + "loss": 0.4091, + "step": 141500 + }, + { + "epoch": 0.9609138155045475, + "grad_norm": 0.285423219203949, + "learning_rate": 4.990390861844954e-05, + "loss": 0.4104, + "step": 142000 + }, + { + "epoch": 0.9642973148549155, + "grad_norm": 0.3231546878814697, + "learning_rate": 4.990357026851451e-05, + "loss": 0.4093, + "step": 142500 + }, + { + "epoch": 0.9676808142052836, + "grad_norm": 0.2778891324996948, + "learning_rate": 4.9903231918579474e-05, + "loss": 0.4104, + "step": 143000 + }, + { + "epoch": 0.9710643135556518, + "grad_norm": 0.31177300214767456, + "learning_rate": 4.9902893568644436e-05, + "loss": 0.4077, + "step": 143500 + }, + { + "epoch": 0.9744478129060199, + "grad_norm": 0.2938098907470703, + "learning_rate": 4.99025552187094e-05, + "loss": 0.4088, + "step": 144000 + }, + { + "epoch": 0.9778313122563881, + "grad_norm": 0.3233024775981903, + "learning_rate": 4.990221686877437e-05, + "loss": 0.407, + "step": 144500 + }, + { + "epoch": 0.9812148116067562, + "grad_norm": 0.31348568201065063, + "learning_rate": 4.990187851883933e-05, + "loss": 0.4091, + "step": 145000 + }, + { + "epoch": 0.9845983109571242, + "grad_norm": 0.29528912901878357, + "learning_rate": 4.990154016890429e-05, + "loss": 0.4101, + "step": 145500 + }, + { + "epoch": 0.9879818103074924, + "grad_norm": 0.30985161662101746, + "learning_rate": 4.9901201818969254e-05, + "loss": 0.409, + "step": 146000 + }, + { + "epoch": 0.9913653096578605, + "grad_norm": 0.3055655360221863, + "learning_rate": 4.9900863469034216e-05, + "loss": 0.4072, + "step": 146500 + }, + { + "epoch": 0.9947488090082287, + "grad_norm": 0.2524847090244293, + "learning_rate": 4.990052511909918e-05, + "loss": 0.4079, + "step": 147000 + }, + { + "epoch": 0.9981323083585968, + "grad_norm": 0.3176727294921875, + "learning_rate": 4.990018676916414e-05, + "loss": 0.4089, + "step": 147500 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.8459713252862398, + "eval_loss": 0.6243861317634583, + "eval_runtime": 3360.1882, + "eval_samples_per_second": 86.526, + "eval_steps_per_second": 5.408, + "step": 147776 + }, + { + "epoch": 1.001515807708965, + "grad_norm": 0.2944328188896179, + "learning_rate": 4.989984841922911e-05, + "loss": 0.4063, + "step": 148000 + }, + { + "epoch": 1.004899307059333, + "grad_norm": 0.29024773836135864, + "learning_rate": 4.989951006929407e-05, + "loss": 0.4039, + "step": 148500 + }, + { + "epoch": 1.0082828064097011, + "grad_norm": 0.27989470958709717, + "learning_rate": 4.989917171935903e-05, + "loss": 0.4064, + "step": 149000 + }, + { + "epoch": 1.0116663057600692, + "grad_norm": 0.28938227891921997, + "learning_rate": 4.9898833369423995e-05, + "loss": 0.407, + "step": 149500 + }, + { + "epoch": 1.0150498051104375, + "grad_norm": 0.295976459980011, + "learning_rate": 4.9898495019488964e-05, + "loss": 0.406, + "step": 150000 + }, + { + "epoch": 1.0184333044608056, + "grad_norm": 0.31487059593200684, + "learning_rate": 4.989815666955392e-05, + "loss": 0.4051, + "step": 150500 + }, + { + "epoch": 1.0218168038111737, + "grad_norm": 0.26871323585510254, + "learning_rate": 4.989781831961888e-05, + "loss": 0.4059, + "step": 151000 + }, + { + "epoch": 1.0252003031615418, + "grad_norm": 0.28550779819488525, + "learning_rate": 4.9897479969683844e-05, + "loss": 0.4039, + "step": 151500 + }, + { + "epoch": 1.0285838025119098, + "grad_norm": 0.2694486975669861, + "learning_rate": 4.989714161974881e-05, + "loss": 0.4062, + "step": 152000 + }, + { + "epoch": 1.0319673018622781, + "grad_norm": 0.2790544629096985, + "learning_rate": 4.9896803269813775e-05, + "loss": 0.4051, + "step": 152500 + }, + { + "epoch": 1.0353508012126462, + "grad_norm": 0.2629123628139496, + "learning_rate": 4.989646491987874e-05, + "loss": 0.4059, + "step": 153000 + }, + { + "epoch": 1.0387343005630143, + "grad_norm": 0.3039059340953827, + "learning_rate": 4.98961265699437e-05, + "loss": 0.4053, + "step": 153500 + }, + { + "epoch": 1.0421177999133824, + "grad_norm": 0.3079100549221039, + "learning_rate": 4.989578822000867e-05, + "loss": 0.4058, + "step": 154000 + }, + { + "epoch": 1.0455012992637505, + "grad_norm": 0.3271750807762146, + "learning_rate": 4.989544987007363e-05, + "loss": 0.405, + "step": 154500 + }, + { + "epoch": 1.0488847986141188, + "grad_norm": 0.30280107259750366, + "learning_rate": 4.989511152013859e-05, + "loss": 0.4049, + "step": 155000 + }, + { + "epoch": 1.0522682979644868, + "grad_norm": 0.3045520484447479, + "learning_rate": 4.9894773170203554e-05, + "loss": 0.4041, + "step": 155500 + }, + { + "epoch": 1.055651797314855, + "grad_norm": 0.2965322732925415, + "learning_rate": 4.9894434820268516e-05, + "loss": 0.4065, + "step": 156000 + }, + { + "epoch": 1.059035296665223, + "grad_norm": 0.28401070833206177, + "learning_rate": 4.989409647033348e-05, + "loss": 0.4066, + "step": 156500 + }, + { + "epoch": 1.062418796015591, + "grad_norm": 0.3080543577671051, + "learning_rate": 4.989375812039844e-05, + "loss": 0.4074, + "step": 157000 + }, + { + "epoch": 1.0658022953659594, + "grad_norm": 0.28200146555900574, + "learning_rate": 4.989341977046341e-05, + "loss": 0.4048, + "step": 157500 + }, + { + "epoch": 1.0691857947163275, + "grad_norm": 0.28583553433418274, + "learning_rate": 4.989308142052837e-05, + "loss": 0.4048, + "step": 158000 + }, + { + "epoch": 1.0725692940666955, + "grad_norm": 0.29166433215141296, + "learning_rate": 4.9892743070593334e-05, + "loss": 0.405, + "step": 158500 + }, + { + "epoch": 1.0759527934170636, + "grad_norm": 0.2707166075706482, + "learning_rate": 4.9892404720658296e-05, + "loss": 0.4047, + "step": 159000 + }, + { + "epoch": 1.0793362927674317, + "grad_norm": 0.2985897362232208, + "learning_rate": 4.9892066370723265e-05, + "loss": 0.4048, + "step": 159500 + }, + { + "epoch": 1.0827197921178, + "grad_norm": 0.32868626713752747, + "learning_rate": 4.989172802078822e-05, + "loss": 0.4075, + "step": 160000 + }, + { + "epoch": 1.086103291468168, + "grad_norm": 0.3031218349933624, + "learning_rate": 4.989138967085318e-05, + "loss": 0.4049, + "step": 160500 + }, + { + "epoch": 1.0894867908185362, + "grad_norm": 0.277643084526062, + "learning_rate": 4.9891051320918144e-05, + "loss": 0.4065, + "step": 161000 + }, + { + "epoch": 1.0928702901689042, + "grad_norm": 0.2969783842563629, + "learning_rate": 4.989071297098311e-05, + "loss": 0.4044, + "step": 161500 + }, + { + "epoch": 1.0962537895192723, + "grad_norm": 0.30704718828201294, + "learning_rate": 4.9890374621048075e-05, + "loss": 0.4035, + "step": 162000 + }, + { + "epoch": 1.0996372888696406, + "grad_norm": 0.303273469209671, + "learning_rate": 4.989003627111304e-05, + "loss": 0.4044, + "step": 162500 + }, + { + "epoch": 1.1030207882200087, + "grad_norm": 0.30890023708343506, + "learning_rate": 4.9889697921178e-05, + "loss": 0.4044, + "step": 163000 + }, + { + "epoch": 1.1064042875703768, + "grad_norm": 0.3068098723888397, + "learning_rate": 4.988935957124297e-05, + "loss": 0.4049, + "step": 163500 + }, + { + "epoch": 1.1097877869207449, + "grad_norm": 0.2811026871204376, + "learning_rate": 4.988902122130793e-05, + "loss": 0.4032, + "step": 164000 + }, + { + "epoch": 1.113171286271113, + "grad_norm": 0.2884727716445923, + "learning_rate": 4.988868287137289e-05, + "loss": 0.4056, + "step": 164500 + }, + { + "epoch": 1.1165547856214812, + "grad_norm": 0.296905517578125, + "learning_rate": 4.9888344521437855e-05, + "loss": 0.4044, + "step": 165000 + }, + { + "epoch": 1.1199382849718493, + "grad_norm": 0.28965866565704346, + "learning_rate": 4.988800617150282e-05, + "loss": 0.403, + "step": 165500 + }, + { + "epoch": 1.1233217843222174, + "grad_norm": 0.2720777094364166, + "learning_rate": 4.988766782156778e-05, + "loss": 0.4028, + "step": 166000 + }, + { + "epoch": 1.1267052836725855, + "grad_norm": 0.320305734872818, + "learning_rate": 4.988732947163274e-05, + "loss": 0.4057, + "step": 166500 + }, + { + "epoch": 1.1300887830229536, + "grad_norm": 0.33288514614105225, + "learning_rate": 4.988699112169771e-05, + "loss": 0.4038, + "step": 167000 + }, + { + "epoch": 1.1334722823733219, + "grad_norm": 0.3267863392829895, + "learning_rate": 4.988665277176267e-05, + "loss": 0.405, + "step": 167500 + }, + { + "epoch": 1.13685578172369, + "grad_norm": 0.2968672513961792, + "learning_rate": 4.9886314421827634e-05, + "loss": 0.4043, + "step": 168000 + }, + { + "epoch": 1.140239281074058, + "grad_norm": 0.3072211742401123, + "learning_rate": 4.9885976071892596e-05, + "loss": 0.4031, + "step": 168500 + }, + { + "epoch": 1.143622780424426, + "grad_norm": 0.28575554490089417, + "learning_rate": 4.9885637721957565e-05, + "loss": 0.4053, + "step": 169000 + }, + { + "epoch": 1.1470062797747942, + "grad_norm": 0.3024563193321228, + "learning_rate": 4.988529937202252e-05, + "loss": 0.4051, + "step": 169500 + }, + { + "epoch": 1.1503897791251625, + "grad_norm": 0.27619415521621704, + "learning_rate": 4.988496102208748e-05, + "loss": 0.4046, + "step": 170000 + }, + { + "epoch": 1.1537732784755306, + "grad_norm": 0.31210121512413025, + "learning_rate": 4.9884622672152445e-05, + "loss": 0.4035, + "step": 170500 + }, + { + "epoch": 1.1571567778258987, + "grad_norm": 0.2941502630710602, + "learning_rate": 4.9884284322217414e-05, + "loss": 0.4045, + "step": 171000 + }, + { + "epoch": 1.1605402771762667, + "grad_norm": 0.29555612802505493, + "learning_rate": 4.9883945972282376e-05, + "loss": 0.4059, + "step": 171500 + }, + { + "epoch": 1.1639237765266348, + "grad_norm": 0.30727556347846985, + "learning_rate": 4.988360762234734e-05, + "loss": 0.4031, + "step": 172000 + }, + { + "epoch": 1.1673072758770031, + "grad_norm": 0.274087131023407, + "learning_rate": 4.98832692724123e-05, + "loss": 0.4037, + "step": 172500 + }, + { + "epoch": 1.1706907752273712, + "grad_norm": 0.2855257987976074, + "learning_rate": 4.988293092247727e-05, + "loss": 0.4029, + "step": 173000 + }, + { + "epoch": 1.1740742745777393, + "grad_norm": 0.29047608375549316, + "learning_rate": 4.988259257254223e-05, + "loss": 0.4054, + "step": 173500 + }, + { + "epoch": 1.1774577739281074, + "grad_norm": 0.3169862926006317, + "learning_rate": 4.988225422260719e-05, + "loss": 0.4039, + "step": 174000 + }, + { + "epoch": 1.1808412732784754, + "grad_norm": 0.286082923412323, + "learning_rate": 4.9881915872672155e-05, + "loss": 0.402, + "step": 174500 + }, + { + "epoch": 1.1842247726288437, + "grad_norm": 0.3077320158481598, + "learning_rate": 4.988157752273712e-05, + "loss": 0.4038, + "step": 175000 + }, + { + "epoch": 1.1876082719792118, + "grad_norm": 0.2859615683555603, + "learning_rate": 4.988123917280208e-05, + "loss": 0.4048, + "step": 175500 + }, + { + "epoch": 1.19099177132958, + "grad_norm": 0.2765771746635437, + "learning_rate": 4.988090082286704e-05, + "loss": 0.4033, + "step": 176000 + }, + { + "epoch": 1.194375270679948, + "grad_norm": 0.2984127700328827, + "learning_rate": 4.988056247293201e-05, + "loss": 0.4024, + "step": 176500 + }, + { + "epoch": 1.1977587700303163, + "grad_norm": 0.28113898634910583, + "learning_rate": 4.988022412299697e-05, + "loss": 0.4023, + "step": 177000 + }, + { + "epoch": 1.2011422693806844, + "grad_norm": 0.2798595130443573, + "learning_rate": 4.9879885773061935e-05, + "loss": 0.4034, + "step": 177500 + }, + { + "epoch": 1.2045257687310524, + "grad_norm": 0.3147580027580261, + "learning_rate": 4.98795474231269e-05, + "loss": 0.4043, + "step": 178000 + }, + { + "epoch": 1.2079092680814205, + "grad_norm": 0.316378653049469, + "learning_rate": 4.9879209073191866e-05, + "loss": 0.4018, + "step": 178500 + }, + { + "epoch": 1.2112927674317886, + "grad_norm": 0.30602577328681946, + "learning_rate": 4.987887072325682e-05, + "loss": 0.4039, + "step": 179000 + }, + { + "epoch": 1.2146762667821567, + "grad_norm": 0.3135761022567749, + "learning_rate": 4.9878532373321783e-05, + "loss": 0.4022, + "step": 179500 + }, + { + "epoch": 1.218059766132525, + "grad_norm": 0.3081457018852234, + "learning_rate": 4.9878194023386746e-05, + "loss": 0.402, + "step": 180000 + }, + { + "epoch": 1.221443265482893, + "grad_norm": 0.3091464638710022, + "learning_rate": 4.9877855673451714e-05, + "loss": 0.4013, + "step": 180500 + }, + { + "epoch": 1.2248267648332611, + "grad_norm": 0.2828048765659332, + "learning_rate": 4.9877517323516677e-05, + "loss": 0.4016, + "step": 181000 + }, + { + "epoch": 1.2282102641836292, + "grad_norm": 0.3356885612010956, + "learning_rate": 4.987717897358164e-05, + "loss": 0.4033, + "step": 181500 + }, + { + "epoch": 1.2315937635339975, + "grad_norm": 0.2955090403556824, + "learning_rate": 4.98768406236466e-05, + "loss": 0.4022, + "step": 182000 + }, + { + "epoch": 1.2349772628843656, + "grad_norm": 0.3026246130466461, + "learning_rate": 4.987650227371157e-05, + "loss": 0.4013, + "step": 182500 + }, + { + "epoch": 1.2383607622347337, + "grad_norm": 0.30111822485923767, + "learning_rate": 4.987616392377653e-05, + "loss": 0.4028, + "step": 183000 + }, + { + "epoch": 1.2417442615851018, + "grad_norm": 0.30120477080345154, + "learning_rate": 4.9875825573841494e-05, + "loss": 0.402, + "step": 183500 + }, + { + "epoch": 1.2451277609354698, + "grad_norm": 0.3235791027545929, + "learning_rate": 4.9875487223906456e-05, + "loss": 0.4034, + "step": 184000 + }, + { + "epoch": 1.248511260285838, + "grad_norm": 0.2712317109107971, + "learning_rate": 4.987514887397142e-05, + "loss": 0.4011, + "step": 184500 + }, + { + "epoch": 1.2518947596362062, + "grad_norm": 0.2865242063999176, + "learning_rate": 4.987481052403638e-05, + "loss": 0.4022, + "step": 185000 + }, + { + "epoch": 1.2552782589865743, + "grad_norm": 0.27379170060157776, + "learning_rate": 4.987447217410134e-05, + "loss": 0.4007, + "step": 185500 + }, + { + "epoch": 1.2586617583369424, + "grad_norm": 0.2991056442260742, + "learning_rate": 4.987413382416631e-05, + "loss": 0.403, + "step": 186000 + }, + { + "epoch": 1.2620452576873105, + "grad_norm": 0.27812930941581726, + "learning_rate": 4.9873795474231273e-05, + "loss": 0.4011, + "step": 186500 + }, + { + "epoch": 1.2654287570376788, + "grad_norm": 0.3062492609024048, + "learning_rate": 4.9873457124296236e-05, + "loss": 0.4022, + "step": 187000 + }, + { + "epoch": 1.2688122563880468, + "grad_norm": 0.2890298366546631, + "learning_rate": 4.98731187743612e-05, + "loss": 0.4017, + "step": 187500 + }, + { + "epoch": 1.272195755738415, + "grad_norm": 0.3117908835411072, + "learning_rate": 4.987278042442616e-05, + "loss": 0.4005, + "step": 188000 + }, + { + "epoch": 1.275579255088783, + "grad_norm": 0.30386489629745483, + "learning_rate": 4.987244207449112e-05, + "loss": 0.4026, + "step": 188500 + }, + { + "epoch": 1.278962754439151, + "grad_norm": 0.3161843419075012, + "learning_rate": 4.9872103724556084e-05, + "loss": 0.401, + "step": 189000 + }, + { + "epoch": 1.2823462537895192, + "grad_norm": 0.28740108013153076, + "learning_rate": 4.9871765374621046e-05, + "loss": 0.4016, + "step": 189500 + }, + { + "epoch": 1.2857297531398875, + "grad_norm": 0.29486459493637085, + "learning_rate": 4.9871427024686015e-05, + "loss": 0.3998, + "step": 190000 + }, + { + "epoch": 1.2891132524902555, + "grad_norm": 0.2807752788066864, + "learning_rate": 4.987108867475098e-05, + "loss": 0.402, + "step": 190500 + }, + { + "epoch": 1.2924967518406236, + "grad_norm": 0.30910518765449524, + "learning_rate": 4.987075032481594e-05, + "loss": 0.4004, + "step": 191000 + }, + { + "epoch": 1.2958802511909917, + "grad_norm": 0.302749365568161, + "learning_rate": 4.98704119748809e-05, + "loss": 0.4002, + "step": 191500 + }, + { + "epoch": 1.29926375054136, + "grad_norm": 0.297519713640213, + "learning_rate": 4.987007362494587e-05, + "loss": 0.4026, + "step": 192000 + }, + { + "epoch": 1.302647249891728, + "grad_norm": 0.289521187543869, + "learning_rate": 4.986973527501083e-05, + "loss": 0.4012, + "step": 192500 + }, + { + "epoch": 1.3060307492420962, + "grad_norm": 0.3190580904483795, + "learning_rate": 4.9869396925075795e-05, + "loss": 0.4013, + "step": 193000 + }, + { + "epoch": 1.3094142485924642, + "grad_norm": 0.2650038003921509, + "learning_rate": 4.986905857514076e-05, + "loss": 0.3998, + "step": 193500 + }, + { + "epoch": 1.3127977479428323, + "grad_norm": 0.2761973440647125, + "learning_rate": 4.986872022520572e-05, + "loss": 0.401, + "step": 194000 + }, + { + "epoch": 1.3161812472932004, + "grad_norm": 0.2967272400856018, + "learning_rate": 4.986838187527068e-05, + "loss": 0.4031, + "step": 194500 + }, + { + "epoch": 1.3195647466435687, + "grad_norm": 0.29815414547920227, + "learning_rate": 4.986804352533564e-05, + "loss": 0.4023, + "step": 195000 + }, + { + "epoch": 1.3229482459939368, + "grad_norm": 0.3200174868106842, + "learning_rate": 4.986770517540061e-05, + "loss": 0.401, + "step": 195500 + }, + { + "epoch": 1.3263317453443049, + "grad_norm": 0.29795876145362854, + "learning_rate": 4.9867366825465574e-05, + "loss": 0.4003, + "step": 196000 + }, + { + "epoch": 1.329715244694673, + "grad_norm": 0.2837540805339813, + "learning_rate": 4.9867028475530536e-05, + "loss": 0.402, + "step": 196500 + }, + { + "epoch": 1.3330987440450413, + "grad_norm": 0.32125329971313477, + "learning_rate": 4.98666901255955e-05, + "loss": 0.4018, + "step": 197000 + }, + { + "epoch": 1.3364822433954093, + "grad_norm": 0.2916601896286011, + "learning_rate": 4.986635177566046e-05, + "loss": 0.4005, + "step": 197500 + }, + { + "epoch": 1.3398657427457774, + "grad_norm": 0.3081722557544708, + "learning_rate": 4.986601342572542e-05, + "loss": 0.3996, + "step": 198000 + }, + { + "epoch": 1.3432492420961455, + "grad_norm": 0.33962398767471313, + "learning_rate": 4.9865675075790385e-05, + "loss": 0.4019, + "step": 198500 + }, + { + "epoch": 1.3466327414465136, + "grad_norm": 0.29512572288513184, + "learning_rate": 4.986533672585535e-05, + "loss": 0.401, + "step": 199000 + }, + { + "epoch": 1.3500162407968817, + "grad_norm": 0.3279782831668854, + "learning_rate": 4.9864998375920316e-05, + "loss": 0.4017, + "step": 199500 + }, + { + "epoch": 1.35339974014725, + "grad_norm": 0.3008809983730316, + "learning_rate": 4.986466002598528e-05, + "loss": 0.4008, + "step": 200000 + }, + { + "epoch": 1.356783239497618, + "grad_norm": 0.2847321629524231, + "learning_rate": 4.986432167605024e-05, + "loss": 0.3978, + "step": 200500 + }, + { + "epoch": 1.3601667388479861, + "grad_norm": 0.29812660813331604, + "learning_rate": 4.98639833261152e-05, + "loss": 0.3993, + "step": 201000 + }, + { + "epoch": 1.3635502381983542, + "grad_norm": 0.3119332194328308, + "learning_rate": 4.986364497618017e-05, + "loss": 0.4002, + "step": 201500 + }, + { + "epoch": 1.3669337375487225, + "grad_norm": 0.29071831703186035, + "learning_rate": 4.986330662624513e-05, + "loss": 0.399, + "step": 202000 + }, + { + "epoch": 1.3703172368990906, + "grad_norm": 0.2947494089603424, + "learning_rate": 4.9862968276310095e-05, + "loss": 0.401, + "step": 202500 + }, + { + "epoch": 1.3737007362494587, + "grad_norm": 0.3152572810649872, + "learning_rate": 4.986262992637506e-05, + "loss": 0.4003, + "step": 203000 + }, + { + "epoch": 1.3770842355998267, + "grad_norm": 0.3083915710449219, + "learning_rate": 4.986229157644002e-05, + "loss": 0.4017, + "step": 203500 + }, + { + "epoch": 1.3804677349501948, + "grad_norm": 0.3038440942764282, + "learning_rate": 4.986195322650498e-05, + "loss": 0.3995, + "step": 204000 + }, + { + "epoch": 1.383851234300563, + "grad_norm": 0.3062540590763092, + "learning_rate": 4.9861614876569944e-05, + "loss": 0.401, + "step": 204500 + }, + { + "epoch": 1.3872347336509312, + "grad_norm": 0.3120565116405487, + "learning_rate": 4.9861276526634906e-05, + "loss": 0.3996, + "step": 205000 + }, + { + "epoch": 1.3906182330012993, + "grad_norm": 0.2866579294204712, + "learning_rate": 4.9860938176699875e-05, + "loss": 0.4006, + "step": 205500 + }, + { + "epoch": 1.3940017323516674, + "grad_norm": 0.2914845943450928, + "learning_rate": 4.986059982676484e-05, + "loss": 0.3988, + "step": 206000 + }, + { + "epoch": 1.3973852317020354, + "grad_norm": 0.2740603983402252, + "learning_rate": 4.98602614768298e-05, + "loss": 0.399, + "step": 206500 + }, + { + "epoch": 1.4007687310524037, + "grad_norm": 0.289460152387619, + "learning_rate": 4.985992312689476e-05, + "loss": 0.4, + "step": 207000 + }, + { + "epoch": 1.4041522304027718, + "grad_norm": 0.29983991384506226, + "learning_rate": 4.985958477695973e-05, + "loss": 0.3996, + "step": 207500 + }, + { + "epoch": 1.40753572975314, + "grad_norm": 0.3190790116786957, + "learning_rate": 4.9859246427024685e-05, + "loss": 0.4007, + "step": 208000 + }, + { + "epoch": 1.410919229103508, + "grad_norm": 0.2991366982460022, + "learning_rate": 4.985890807708965e-05, + "loss": 0.4003, + "step": 208500 + }, + { + "epoch": 1.414302728453876, + "grad_norm": 0.29199305176734924, + "learning_rate": 4.9858569727154616e-05, + "loss": 0.3994, + "step": 209000 + }, + { + "epoch": 1.4176862278042441, + "grad_norm": 0.32636138796806335, + "learning_rate": 4.985823137721958e-05, + "loss": 0.4004, + "step": 209500 + }, + { + "epoch": 1.4210697271546124, + "grad_norm": 0.3044842481613159, + "learning_rate": 4.985789302728454e-05, + "loss": 0.3996, + "step": 210000 + }, + { + "epoch": 1.4244532265049805, + "grad_norm": 0.28607505559921265, + "learning_rate": 4.98575546773495e-05, + "loss": 0.3985, + "step": 210500 + }, + { + "epoch": 1.4278367258553486, + "grad_norm": 0.3226557970046997, + "learning_rate": 4.985721632741447e-05, + "loss": 0.3993, + "step": 211000 + }, + { + "epoch": 1.4312202252057167, + "grad_norm": 0.31989920139312744, + "learning_rate": 4.9856877977479434e-05, + "loss": 0.4005, + "step": 211500 + }, + { + "epoch": 1.434603724556085, + "grad_norm": 0.2904207408428192, + "learning_rate": 4.9856539627544396e-05, + "loss": 0.4013, + "step": 212000 + }, + { + "epoch": 1.437987223906453, + "grad_norm": 0.28892791271209717, + "learning_rate": 4.985620127760935e-05, + "loss": 0.4, + "step": 212500 + }, + { + "epoch": 1.4413707232568211, + "grad_norm": 0.2917577028274536, + "learning_rate": 4.985586292767432e-05, + "loss": 0.3991, + "step": 213000 + }, + { + "epoch": 1.4447542226071892, + "grad_norm": 0.3064013123512268, + "learning_rate": 4.985552457773928e-05, + "loss": 0.3971, + "step": 213500 + }, + { + "epoch": 1.4481377219575573, + "grad_norm": 0.30525708198547363, + "learning_rate": 4.9855186227804244e-05, + "loss": 0.4017, + "step": 214000 + }, + { + "epoch": 1.4515212213079254, + "grad_norm": 0.26815348863601685, + "learning_rate": 4.9854847877869206e-05, + "loss": 0.3983, + "step": 214500 + }, + { + "epoch": 1.4549047206582937, + "grad_norm": 0.3009331524372101, + "learning_rate": 4.9854509527934175e-05, + "loss": 0.3989, + "step": 215000 + }, + { + "epoch": 1.4582882200086618, + "grad_norm": 0.3506813049316406, + "learning_rate": 4.985417117799914e-05, + "loss": 0.4001, + "step": 215500 + }, + { + "epoch": 1.4616717193590298, + "grad_norm": 0.30114126205444336, + "learning_rate": 4.98538328280641e-05, + "loss": 0.4005, + "step": 216000 + }, + { + "epoch": 1.4650552187093981, + "grad_norm": 0.27080991864204407, + "learning_rate": 4.985349447812906e-05, + "loss": 0.3972, + "step": 216500 + }, + { + "epoch": 1.4684387180597662, + "grad_norm": 0.32722705602645874, + "learning_rate": 4.985315612819403e-05, + "loss": 0.4001, + "step": 217000 + }, + { + "epoch": 1.4718222174101343, + "grad_norm": 0.32709217071533203, + "learning_rate": 4.9852817778258986e-05, + "loss": 0.3984, + "step": 217500 + }, + { + "epoch": 1.4752057167605024, + "grad_norm": 0.30302122235298157, + "learning_rate": 4.985247942832395e-05, + "loss": 0.3968, + "step": 218000 + }, + { + "epoch": 1.4785892161108705, + "grad_norm": 0.30743733048439026, + "learning_rate": 4.985214107838892e-05, + "loss": 0.3999, + "step": 218500 + }, + { + "epoch": 1.4819727154612385, + "grad_norm": 0.2864611744880676, + "learning_rate": 4.985180272845388e-05, + "loss": 0.3998, + "step": 219000 + }, + { + "epoch": 1.4853562148116066, + "grad_norm": 0.33113473653793335, + "learning_rate": 4.985146437851884e-05, + "loss": 0.3991, + "step": 219500 + }, + { + "epoch": 1.488739714161975, + "grad_norm": 0.30229973793029785, + "learning_rate": 4.98511260285838e-05, + "loss": 0.3981, + "step": 220000 + }, + { + "epoch": 1.492123213512343, + "grad_norm": 0.30289003252983093, + "learning_rate": 4.985078767864877e-05, + "loss": 0.3991, + "step": 220500 + }, + { + "epoch": 1.495506712862711, + "grad_norm": 0.32852068543434143, + "learning_rate": 4.9850449328713734e-05, + "loss": 0.3999, + "step": 221000 + }, + { + "epoch": 1.4988902122130794, + "grad_norm": 0.3039749562740326, + "learning_rate": 4.9850110978778696e-05, + "loss": 0.3986, + "step": 221500 + }, + { + "epoch": 1.5022737115634475, + "grad_norm": 0.30670079588890076, + "learning_rate": 4.984977262884365e-05, + "loss": 0.3992, + "step": 222000 + }, + { + "epoch": 1.5056572109138155, + "grad_norm": 0.27520325779914856, + "learning_rate": 4.984943427890862e-05, + "loss": 0.3998, + "step": 222500 + }, + { + "epoch": 1.5090407102641836, + "grad_norm": 0.28639331459999084, + "learning_rate": 4.984909592897358e-05, + "loss": 0.3985, + "step": 223000 + }, + { + "epoch": 1.5124242096145517, + "grad_norm": 0.3116671144962311, + "learning_rate": 4.9848757579038545e-05, + "loss": 0.3987, + "step": 223500 + }, + { + "epoch": 1.5158077089649198, + "grad_norm": 0.31099647283554077, + "learning_rate": 4.984841922910351e-05, + "loss": 0.3991, + "step": 224000 + }, + { + "epoch": 1.5191912083152879, + "grad_norm": 0.3141653537750244, + "learning_rate": 4.9848080879168476e-05, + "loss": 0.3993, + "step": 224500 + }, + { + "epoch": 1.5225747076656562, + "grad_norm": 0.27565282583236694, + "learning_rate": 4.984774252923344e-05, + "loss": 0.3973, + "step": 225000 + }, + { + "epoch": 1.5259582070160242, + "grad_norm": 0.33048033714294434, + "learning_rate": 4.98474041792984e-05, + "loss": 0.3977, + "step": 225500 + }, + { + "epoch": 1.5293417063663923, + "grad_norm": 0.3040478527545929, + "learning_rate": 4.984706582936336e-05, + "loss": 0.3989, + "step": 226000 + }, + { + "epoch": 1.5327252057167606, + "grad_norm": 0.3027140200138092, + "learning_rate": 4.984672747942833e-05, + "loss": 0.3993, + "step": 226500 + }, + { + "epoch": 1.5361087050671287, + "grad_norm": 0.2882574498653412, + "learning_rate": 4.9846389129493287e-05, + "loss": 0.4007, + "step": 227000 + }, + { + "epoch": 1.5394922044174968, + "grad_norm": 0.3169984221458435, + "learning_rate": 4.984605077955825e-05, + "loss": 0.3964, + "step": 227500 + }, + { + "epoch": 1.5428757037678649, + "grad_norm": 0.26678523421287537, + "learning_rate": 4.984571242962322e-05, + "loss": 0.3981, + "step": 228000 + }, + { + "epoch": 1.546259203118233, + "grad_norm": 0.30455484986305237, + "learning_rate": 4.984537407968818e-05, + "loss": 0.3974, + "step": 228500 + }, + { + "epoch": 1.549642702468601, + "grad_norm": 0.30882003903388977, + "learning_rate": 4.984503572975314e-05, + "loss": 0.3983, + "step": 229000 + }, + { + "epoch": 1.553026201818969, + "grad_norm": 0.29881536960601807, + "learning_rate": 4.9844697379818104e-05, + "loss": 0.3965, + "step": 229500 + }, + { + "epoch": 1.5564097011693374, + "grad_norm": 0.30941638350486755, + "learning_rate": 4.984435902988307e-05, + "loss": 0.3958, + "step": 230000 + }, + { + "epoch": 1.5597932005197055, + "grad_norm": 0.28484824299812317, + "learning_rate": 4.9844020679948035e-05, + "loss": 0.3972, + "step": 230500 + }, + { + "epoch": 1.5631766998700736, + "grad_norm": 0.27541229128837585, + "learning_rate": 4.9843682330013e-05, + "loss": 0.3974, + "step": 231000 + }, + { + "epoch": 1.5665601992204419, + "grad_norm": 0.2993621230125427, + "learning_rate": 4.984334398007795e-05, + "loss": 0.3989, + "step": 231500 + }, + { + "epoch": 1.56994369857081, + "grad_norm": 0.28241676092147827, + "learning_rate": 4.984300563014292e-05, + "loss": 0.3983, + "step": 232000 + }, + { + "epoch": 1.573327197921178, + "grad_norm": 0.2912180423736572, + "learning_rate": 4.9842667280207883e-05, + "loss": 0.3972, + "step": 232500 + }, + { + "epoch": 1.5767106972715461, + "grad_norm": 0.26539215445518494, + "learning_rate": 4.9842328930272846e-05, + "loss": 0.3978, + "step": 233000 + }, + { + "epoch": 1.5800941966219142, + "grad_norm": 0.29652032256126404, + "learning_rate": 4.984199058033781e-05, + "loss": 0.3972, + "step": 233500 + }, + { + "epoch": 1.5834776959722823, + "grad_norm": 0.31173455715179443, + "learning_rate": 4.984165223040278e-05, + "loss": 0.3987, + "step": 234000 + }, + { + "epoch": 1.5868611953226504, + "grad_norm": 0.3123958110809326, + "learning_rate": 4.984131388046774e-05, + "loss": 0.3975, + "step": 234500 + }, + { + "epoch": 1.5902446946730187, + "grad_norm": 0.3172590732574463, + "learning_rate": 4.98409755305327e-05, + "loss": 0.3962, + "step": 235000 + }, + { + "epoch": 1.5936281940233867, + "grad_norm": 0.2888542413711548, + "learning_rate": 4.984063718059766e-05, + "loss": 0.399, + "step": 235500 + }, + { + "epoch": 1.597011693373755, + "grad_norm": 0.3058955669403076, + "learning_rate": 4.984029883066263e-05, + "loss": 0.3965, + "step": 236000 + }, + { + "epoch": 1.6003951927241231, + "grad_norm": 0.2985432744026184, + "learning_rate": 4.983996048072759e-05, + "loss": 0.3978, + "step": 236500 + }, + { + "epoch": 1.6037786920744912, + "grad_norm": 0.3273903429508209, + "learning_rate": 4.983962213079255e-05, + "loss": 0.398, + "step": 237000 + }, + { + "epoch": 1.6071621914248593, + "grad_norm": 0.32129883766174316, + "learning_rate": 4.983928378085752e-05, + "loss": 0.3969, + "step": 237500 + }, + { + "epoch": 1.6105456907752274, + "grad_norm": 0.30501073598861694, + "learning_rate": 4.983894543092248e-05, + "loss": 0.3978, + "step": 238000 + }, + { + "epoch": 1.6139291901255954, + "grad_norm": 0.3106651306152344, + "learning_rate": 4.983860708098744e-05, + "loss": 0.3971, + "step": 238500 + }, + { + "epoch": 1.6173126894759635, + "grad_norm": 0.2970840036869049, + "learning_rate": 4.9838268731052405e-05, + "loss": 0.3965, + "step": 239000 + }, + { + "epoch": 1.6206961888263316, + "grad_norm": 0.31059491634368896, + "learning_rate": 4.9837930381117374e-05, + "loss": 0.3998, + "step": 239500 + }, + { + "epoch": 1.6240796881767, + "grad_norm": 0.3194037675857544, + "learning_rate": 4.9837592031182336e-05, + "loss": 0.3978, + "step": 240000 + }, + { + "epoch": 1.627463187527068, + "grad_norm": 0.2921430766582489, + "learning_rate": 4.98372536812473e-05, + "loss": 0.3974, + "step": 240500 + }, + { + "epoch": 1.6308466868774363, + "grad_norm": 0.36188748478889465, + "learning_rate": 4.983691533131225e-05, + "loss": 0.3976, + "step": 241000 + }, + { + "epoch": 1.6342301862278044, + "grad_norm": 0.266747385263443, + "learning_rate": 4.983657698137722e-05, + "loss": 0.3961, + "step": 241500 + }, + { + "epoch": 1.6376136855781724, + "grad_norm": 0.3226276636123657, + "learning_rate": 4.9836238631442184e-05, + "loss": 0.3961, + "step": 242000 + }, + { + "epoch": 1.6409971849285405, + "grad_norm": 0.273739218711853, + "learning_rate": 4.9835900281507146e-05, + "loss": 0.3969, + "step": 242500 + }, + { + "epoch": 1.6443806842789086, + "grad_norm": 0.2933656871318817, + "learning_rate": 4.983556193157211e-05, + "loss": 0.3966, + "step": 243000 + }, + { + "epoch": 1.6477641836292767, + "grad_norm": 0.3296910226345062, + "learning_rate": 4.983522358163708e-05, + "loss": 0.3973, + "step": 243500 + }, + { + "epoch": 1.6511476829796448, + "grad_norm": 0.3250204622745514, + "learning_rate": 4.983488523170204e-05, + "loss": 0.3948, + "step": 244000 + }, + { + "epoch": 1.6545311823300128, + "grad_norm": 0.30639970302581787, + "learning_rate": 4.9834546881767e-05, + "loss": 0.3957, + "step": 244500 + }, + { + "epoch": 1.6579146816803811, + "grad_norm": 0.31183937191963196, + "learning_rate": 4.9834208531831964e-05, + "loss": 0.3956, + "step": 245000 + }, + { + "epoch": 1.6612981810307492, + "grad_norm": 0.3072659969329834, + "learning_rate": 4.983387018189693e-05, + "loss": 0.3954, + "step": 245500 + }, + { + "epoch": 1.6646816803811175, + "grad_norm": 0.31615200638771057, + "learning_rate": 4.983353183196189e-05, + "loss": 0.3969, + "step": 246000 + }, + { + "epoch": 1.6680651797314856, + "grad_norm": 0.292656272649765, + "learning_rate": 4.983319348202685e-05, + "loss": 0.398, + "step": 246500 + }, + { + "epoch": 1.6714486790818537, + "grad_norm": 0.27612897753715515, + "learning_rate": 4.983285513209182e-05, + "loss": 0.3957, + "step": 247000 + }, + { + "epoch": 1.6748321784322218, + "grad_norm": 0.3081589937210083, + "learning_rate": 4.983251678215678e-05, + "loss": 0.3952, + "step": 247500 + }, + { + "epoch": 1.6782156777825898, + "grad_norm": 0.2990529239177704, + "learning_rate": 4.983217843222174e-05, + "loss": 0.3969, + "step": 248000 + }, + { + "epoch": 1.681599177132958, + "grad_norm": 0.34440121054649353, + "learning_rate": 4.9831840082286705e-05, + "loss": 0.3971, + "step": 248500 + }, + { + "epoch": 1.684982676483326, + "grad_norm": 0.2921518385410309, + "learning_rate": 4.9831501732351674e-05, + "loss": 0.3959, + "step": 249000 + }, + { + "epoch": 1.688366175833694, + "grad_norm": 0.296103835105896, + "learning_rate": 4.9831163382416636e-05, + "loss": 0.3966, + "step": 249500 + }, + { + "epoch": 1.6917496751840624, + "grad_norm": 0.3215673565864563, + "learning_rate": 4.98308250324816e-05, + "loss": 0.3963, + "step": 250000 + }, + { + "epoch": 1.6951331745344305, + "grad_norm": 0.2866440713405609, + "learning_rate": 4.9830486682546554e-05, + "loss": 0.3959, + "step": 250500 + }, + { + "epoch": 1.6985166738847988, + "grad_norm": 0.33129727840423584, + "learning_rate": 4.983014833261152e-05, + "loss": 0.3975, + "step": 251000 + }, + { + "epoch": 1.7019001732351668, + "grad_norm": 0.2761145830154419, + "learning_rate": 4.9829809982676485e-05, + "loss": 0.3966, + "step": 251500 + }, + { + "epoch": 1.705283672585535, + "grad_norm": 0.29251420497894287, + "learning_rate": 4.982947163274145e-05, + "loss": 0.3984, + "step": 252000 + }, + { + "epoch": 1.708667171935903, + "grad_norm": 0.34012719988822937, + "learning_rate": 4.982913328280641e-05, + "loss": 0.3985, + "step": 252500 + }, + { + "epoch": 1.712050671286271, + "grad_norm": 0.2798214554786682, + "learning_rate": 4.982879493287138e-05, + "loss": 0.3971, + "step": 253000 + }, + { + "epoch": 1.7154341706366392, + "grad_norm": 0.30469515919685364, + "learning_rate": 4.982845658293634e-05, + "loss": 0.3976, + "step": 253500 + }, + { + "epoch": 1.7188176699870072, + "grad_norm": 0.28688544034957886, + "learning_rate": 4.98281182330013e-05, + "loss": 0.396, + "step": 254000 + }, + { + "epoch": 1.7222011693373753, + "grad_norm": 0.2807927429676056, + "learning_rate": 4.9827779883066264e-05, + "loss": 0.3953, + "step": 254500 + }, + { + "epoch": 1.7255846686877436, + "grad_norm": 0.3119663596153259, + "learning_rate": 4.982744153313123e-05, + "loss": 0.3974, + "step": 255000 + }, + { + "epoch": 1.7289681680381117, + "grad_norm": 0.3375037610530853, + "learning_rate": 4.982710318319619e-05, + "loss": 0.3963, + "step": 255500 + }, + { + "epoch": 1.73235166738848, + "grad_norm": 0.30482643842697144, + "learning_rate": 4.982676483326115e-05, + "loss": 0.3958, + "step": 256000 + }, + { + "epoch": 1.735735166738848, + "grad_norm": 0.2730487287044525, + "learning_rate": 4.982642648332612e-05, + "loss": 0.3954, + "step": 256500 + }, + { + "epoch": 1.7391186660892162, + "grad_norm": 0.2868850529193878, + "learning_rate": 4.982608813339108e-05, + "loss": 0.3941, + "step": 257000 + }, + { + "epoch": 1.7425021654395842, + "grad_norm": 0.31031176447868347, + "learning_rate": 4.9825749783456044e-05, + "loss": 0.3962, + "step": 257500 + }, + { + "epoch": 1.7458856647899523, + "grad_norm": 0.2912202775478363, + "learning_rate": 4.9825411433521006e-05, + "loss": 0.3962, + "step": 258000 + }, + { + "epoch": 1.7492691641403204, + "grad_norm": 0.32004088163375854, + "learning_rate": 4.982507308358597e-05, + "loss": 0.3966, + "step": 258500 + }, + { + "epoch": 1.7526526634906885, + "grad_norm": 0.2900611162185669, + "learning_rate": 4.982473473365094e-05, + "loss": 0.3972, + "step": 259000 + }, + { + "epoch": 1.7560361628410566, + "grad_norm": 0.2974969446659088, + "learning_rate": 4.98243963837159e-05, + "loss": 0.3974, + "step": 259500 + }, + { + "epoch": 1.7594196621914249, + "grad_norm": 0.2999788522720337, + "learning_rate": 4.982405803378086e-05, + "loss": 0.3964, + "step": 260000 + }, + { + "epoch": 1.762803161541793, + "grad_norm": 0.3285123407840729, + "learning_rate": 4.982371968384582e-05, + "loss": 0.3955, + "step": 260500 + }, + { + "epoch": 1.7661866608921613, + "grad_norm": 0.30693507194519043, + "learning_rate": 4.9823381333910785e-05, + "loss": 0.395, + "step": 261000 + }, + { + "epoch": 1.7695701602425293, + "grad_norm": 0.2624507546424866, + "learning_rate": 4.982304298397575e-05, + "loss": 0.3971, + "step": 261500 + }, + { + "epoch": 1.7729536595928974, + "grad_norm": 0.3088163137435913, + "learning_rate": 4.982270463404071e-05, + "loss": 0.3956, + "step": 262000 + }, + { + "epoch": 1.7763371589432655, + "grad_norm": 0.3004806935787201, + "learning_rate": 4.982236628410568e-05, + "loss": 0.396, + "step": 262500 + }, + { + "epoch": 1.7797206582936336, + "grad_norm": 0.3055258095264435, + "learning_rate": 4.982202793417064e-05, + "loss": 0.3967, + "step": 263000 + }, + { + "epoch": 1.7831041576440017, + "grad_norm": 0.29731282591819763, + "learning_rate": 4.98216895842356e-05, + "loss": 0.3951, + "step": 263500 + }, + { + "epoch": 1.7864876569943697, + "grad_norm": 0.3096189498901367, + "learning_rate": 4.9821351234300565e-05, + "loss": 0.3968, + "step": 264000 + }, + { + "epoch": 1.789871156344738, + "grad_norm": 0.30249008536338806, + "learning_rate": 4.9821012884365534e-05, + "loss": 0.3965, + "step": 264500 + }, + { + "epoch": 1.7932546556951061, + "grad_norm": 0.28994259238243103, + "learning_rate": 4.982067453443049e-05, + "loss": 0.3958, + "step": 265000 + }, + { + "epoch": 1.7966381550454742, + "grad_norm": 0.3007451295852661, + "learning_rate": 4.982033618449545e-05, + "loss": 0.3953, + "step": 265500 + }, + { + "epoch": 1.8000216543958425, + "grad_norm": 0.3501751124858856, + "learning_rate": 4.981999783456042e-05, + "loss": 0.3946, + "step": 266000 + }, + { + "epoch": 1.8034051537462106, + "grad_norm": 0.33922311663627625, + "learning_rate": 4.981965948462538e-05, + "loss": 0.3955, + "step": 266500 + }, + { + "epoch": 1.8067886530965787, + "grad_norm": 0.3255097270011902, + "learning_rate": 4.9819321134690344e-05, + "loss": 0.3952, + "step": 267000 + }, + { + "epoch": 1.8101721524469467, + "grad_norm": 0.3020866811275482, + "learning_rate": 4.9818982784755307e-05, + "loss": 0.3954, + "step": 267500 + }, + { + "epoch": 1.8135556517973148, + "grad_norm": 0.3044784665107727, + "learning_rate": 4.981864443482027e-05, + "loss": 0.3946, + "step": 268000 + }, + { + "epoch": 1.816939151147683, + "grad_norm": 0.3110942840576172, + "learning_rate": 4.981830608488524e-05, + "loss": 0.3946, + "step": 268500 + }, + { + "epoch": 1.820322650498051, + "grad_norm": 0.3102254271507263, + "learning_rate": 4.98179677349502e-05, + "loss": 0.395, + "step": 269000 + }, + { + "epoch": 1.8237061498484193, + "grad_norm": 0.29582446813583374, + "learning_rate": 4.981762938501516e-05, + "loss": 0.3942, + "step": 269500 + }, + { + "epoch": 1.8270896491987874, + "grad_norm": 0.30391183495521545, + "learning_rate": 4.9817291035080124e-05, + "loss": 0.3962, + "step": 270000 + }, + { + "epoch": 1.8304731485491554, + "grad_norm": 0.3510082960128784, + "learning_rate": 4.9816952685145086e-05, + "loss": 0.3936, + "step": 270500 + }, + { + "epoch": 1.8338566478995237, + "grad_norm": 0.31101343035697937, + "learning_rate": 4.981661433521005e-05, + "loss": 0.3962, + "step": 271000 + }, + { + "epoch": 1.8372401472498918, + "grad_norm": 0.295521080493927, + "learning_rate": 4.981627598527501e-05, + "loss": 0.3963, + "step": 271500 + }, + { + "epoch": 1.84062364660026, + "grad_norm": 0.2972811758518219, + "learning_rate": 4.981593763533998e-05, + "loss": 0.3953, + "step": 272000 + }, + { + "epoch": 1.844007145950628, + "grad_norm": 0.29879501461982727, + "learning_rate": 4.981559928540494e-05, + "loss": 0.3944, + "step": 272500 + }, + { + "epoch": 1.847390645300996, + "grad_norm": 0.29525479674339294, + "learning_rate": 4.9815260935469903e-05, + "loss": 0.395, + "step": 273000 + }, + { + "epoch": 1.8507741446513641, + "grad_norm": 0.3053932189941406, + "learning_rate": 4.9814922585534866e-05, + "loss": 0.3944, + "step": 273500 + }, + { + "epoch": 1.8541576440017322, + "grad_norm": 0.30265629291534424, + "learning_rate": 4.9814584235599834e-05, + "loss": 0.3969, + "step": 274000 + }, + { + "epoch": 1.8575411433521005, + "grad_norm": 0.30741703510284424, + "learning_rate": 4.981424588566479e-05, + "loss": 0.3948, + "step": 274500 + }, + { + "epoch": 1.8609246427024686, + "grad_norm": 0.33393386006355286, + "learning_rate": 4.981390753572975e-05, + "loss": 0.3967, + "step": 275000 + }, + { + "epoch": 1.8643081420528367, + "grad_norm": 0.28982001543045044, + "learning_rate": 4.9813569185794714e-05, + "loss": 0.3959, + "step": 275500 + }, + { + "epoch": 1.867691641403205, + "grad_norm": 0.32962051033973694, + "learning_rate": 4.981323083585968e-05, + "loss": 0.3954, + "step": 276000 + }, + { + "epoch": 1.871075140753573, + "grad_norm": 0.3257775902748108, + "learning_rate": 4.9812892485924645e-05, + "loss": 0.395, + "step": 276500 + }, + { + "epoch": 1.8744586401039411, + "grad_norm": 0.3227112889289856, + "learning_rate": 4.981255413598961e-05, + "loss": 0.3957, + "step": 277000 + }, + { + "epoch": 1.8778421394543092, + "grad_norm": 0.3045206665992737, + "learning_rate": 4.981221578605457e-05, + "loss": 0.3946, + "step": 277500 + }, + { + "epoch": 1.8812256388046773, + "grad_norm": 0.33415108919143677, + "learning_rate": 4.981187743611954e-05, + "loss": 0.3946, + "step": 278000 + }, + { + "epoch": 1.8846091381550454, + "grad_norm": 0.28261253237724304, + "learning_rate": 4.98115390861845e-05, + "loss": 0.3956, + "step": 278500 + }, + { + "epoch": 1.8879926375054135, + "grad_norm": 0.29565319418907166, + "learning_rate": 4.981120073624946e-05, + "loss": 0.3949, + "step": 279000 + }, + { + "epoch": 1.8913761368557818, + "grad_norm": 0.33516836166381836, + "learning_rate": 4.9810862386314425e-05, + "loss": 0.394, + "step": 279500 + }, + { + "epoch": 1.8947596362061498, + "grad_norm": 0.3220950663089752, + "learning_rate": 4.981052403637939e-05, + "loss": 0.3936, + "step": 280000 + }, + { + "epoch": 1.898143135556518, + "grad_norm": 0.28635403513908386, + "learning_rate": 4.981018568644435e-05, + "loss": 0.3956, + "step": 280500 + }, + { + "epoch": 1.9015266349068862, + "grad_norm": 0.317091703414917, + "learning_rate": 4.980984733650931e-05, + "loss": 0.3954, + "step": 281000 + }, + { + "epoch": 1.9049101342572543, + "grad_norm": 0.3175191283226013, + "learning_rate": 4.980950898657428e-05, + "loss": 0.3944, + "step": 281500 + }, + { + "epoch": 1.9082936336076224, + "grad_norm": 0.29234206676483154, + "learning_rate": 4.980917063663924e-05, + "loss": 0.3933, + "step": 282000 + }, + { + "epoch": 1.9116771329579905, + "grad_norm": 0.30621030926704407, + "learning_rate": 4.9808832286704204e-05, + "loss": 0.3951, + "step": 282500 + }, + { + "epoch": 1.9150606323083585, + "grad_norm": 0.30070793628692627, + "learning_rate": 4.9808493936769166e-05, + "loss": 0.3943, + "step": 283000 + }, + { + "epoch": 1.9184441316587266, + "grad_norm": 0.3219392001628876, + "learning_rate": 4.9808155586834135e-05, + "loss": 0.3927, + "step": 283500 + }, + { + "epoch": 1.9218276310090947, + "grad_norm": 0.31432652473449707, + "learning_rate": 4.980781723689909e-05, + "loss": 0.3948, + "step": 284000 + }, + { + "epoch": 1.925211130359463, + "grad_norm": 0.3412748873233795, + "learning_rate": 4.980747888696405e-05, + "loss": 0.3937, + "step": 284500 + }, + { + "epoch": 1.928594629709831, + "grad_norm": 0.3151736259460449, + "learning_rate": 4.9807140537029015e-05, + "loss": 0.3944, + "step": 285000 + }, + { + "epoch": 1.9319781290601992, + "grad_norm": 0.3102365732192993, + "learning_rate": 4.9806802187093984e-05, + "loss": 0.3928, + "step": 285500 + }, + { + "epoch": 1.9353616284105675, + "grad_norm": 0.29650557041168213, + "learning_rate": 4.9806463837158946e-05, + "loss": 0.3931, + "step": 286000 + }, + { + "epoch": 1.9387451277609355, + "grad_norm": 0.30163249373435974, + "learning_rate": 4.980612548722391e-05, + "loss": 0.395, + "step": 286500 + }, + { + "epoch": 1.9421286271113036, + "grad_norm": 0.3102872967720032, + "learning_rate": 4.980578713728887e-05, + "loss": 0.3932, + "step": 287000 + }, + { + "epoch": 1.9455121264616717, + "grad_norm": 0.33082282543182373, + "learning_rate": 4.980544878735384e-05, + "loss": 0.3938, + "step": 287500 + }, + { + "epoch": 1.9488956258120398, + "grad_norm": 0.3110976219177246, + "learning_rate": 4.98051104374188e-05, + "loss": 0.3952, + "step": 288000 + }, + { + "epoch": 1.9522791251624079, + "grad_norm": 0.33474001288414, + "learning_rate": 4.980477208748376e-05, + "loss": 0.3951, + "step": 288500 + }, + { + "epoch": 1.955662624512776, + "grad_norm": 0.30204835534095764, + "learning_rate": 4.9804433737548725e-05, + "loss": 0.3932, + "step": 289000 + }, + { + "epoch": 1.9590461238631443, + "grad_norm": 0.2943308651447296, + "learning_rate": 4.980409538761369e-05, + "loss": 0.393, + "step": 289500 + }, + { + "epoch": 1.9624296232135123, + "grad_norm": 0.3059576749801636, + "learning_rate": 4.980375703767865e-05, + "loss": 0.3941, + "step": 290000 + }, + { + "epoch": 1.9658131225638806, + "grad_norm": 0.2936910390853882, + "learning_rate": 4.980341868774361e-05, + "loss": 0.3943, + "step": 290500 + }, + { + "epoch": 1.9691966219142487, + "grad_norm": 0.3129732608795166, + "learning_rate": 4.980308033780858e-05, + "loss": 0.3943, + "step": 291000 + }, + { + "epoch": 1.9725801212646168, + "grad_norm": 0.32157155871391296, + "learning_rate": 4.980274198787354e-05, + "loss": 0.3937, + "step": 291500 + }, + { + "epoch": 1.9759636206149849, + "grad_norm": 0.30379122495651245, + "learning_rate": 4.9802403637938505e-05, + "loss": 0.3934, + "step": 292000 + }, + { + "epoch": 1.979347119965353, + "grad_norm": 0.31935739517211914, + "learning_rate": 4.980206528800347e-05, + "loss": 0.3946, + "step": 292500 + }, + { + "epoch": 1.982730619315721, + "grad_norm": 0.3241221010684967, + "learning_rate": 4.9801726938068436e-05, + "loss": 0.3928, + "step": 293000 + }, + { + "epoch": 1.9861141186660891, + "grad_norm": 0.30251502990722656, + "learning_rate": 4.980138858813339e-05, + "loss": 0.3938, + "step": 293500 + }, + { + "epoch": 1.9894976180164572, + "grad_norm": 0.28550073504447937, + "learning_rate": 4.980105023819835e-05, + "loss": 0.3943, + "step": 294000 + }, + { + "epoch": 1.9928811173668255, + "grad_norm": 0.33502620458602905, + "learning_rate": 4.9800711888263315e-05, + "loss": 0.393, + "step": 294500 + }, + { + "epoch": 1.9962646167171936, + "grad_norm": 0.30803167819976807, + "learning_rate": 4.9800373538328284e-05, + "loss": 0.3915, + "step": 295000 + }, + { + "epoch": 1.9996481160675619, + "grad_norm": 0.3257239758968353, + "learning_rate": 4.9800035188393246e-05, + "loss": 0.3932, + "step": 295500 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.8510164531812664, + "eval_loss": 0.6058459877967834, + "eval_runtime": 3354.2545, + "eval_samples_per_second": 86.679, + "eval_steps_per_second": 5.418, + "step": 295552 + }, + { + "epoch": 2.00303161541793, + "grad_norm": 0.33320993185043335, + "learning_rate": 4.979969683845821e-05, + "loss": 0.3923, + "step": 296000 + }, + { + "epoch": 2.006415114768298, + "grad_norm": 0.31498461961746216, + "learning_rate": 4.979935848852317e-05, + "loss": 0.3928, + "step": 296500 + }, + { + "epoch": 2.009798614118666, + "grad_norm": 0.28952863812446594, + "learning_rate": 4.979902013858814e-05, + "loss": 0.3926, + "step": 297000 + }, + { + "epoch": 2.013182113469034, + "grad_norm": 0.272152304649353, + "learning_rate": 4.97986817886531e-05, + "loss": 0.3907, + "step": 297500 + }, + { + "epoch": 2.0165656128194023, + "grad_norm": 0.31934404373168945, + "learning_rate": 4.9798343438718064e-05, + "loss": 0.391, + "step": 298000 + }, + { + "epoch": 2.0199491121697704, + "grad_norm": 0.3041870594024658, + "learning_rate": 4.9798005088783026e-05, + "loss": 0.3902, + "step": 298500 + }, + { + "epoch": 2.0233326115201384, + "grad_norm": 0.3236134350299835, + "learning_rate": 4.979766673884799e-05, + "loss": 0.3927, + "step": 299000 + }, + { + "epoch": 2.0267161108705065, + "grad_norm": 0.2894350290298462, + "learning_rate": 4.979732838891295e-05, + "loss": 0.3906, + "step": 299500 + }, + { + "epoch": 2.030099610220875, + "grad_norm": 0.2992889881134033, + "learning_rate": 4.979699003897791e-05, + "loss": 0.3929, + "step": 300000 + }, + { + "epoch": 2.033483109571243, + "grad_norm": 0.29241079092025757, + "learning_rate": 4.979665168904288e-05, + "loss": 0.3923, + "step": 300500 + }, + { + "epoch": 2.036866608921611, + "grad_norm": 0.2949068546295166, + "learning_rate": 4.979631333910784e-05, + "loss": 0.3917, + "step": 301000 + }, + { + "epoch": 2.0402501082719793, + "grad_norm": 0.3157097101211548, + "learning_rate": 4.9795974989172805e-05, + "loss": 0.392, + "step": 301500 + }, + { + "epoch": 2.0436336076223474, + "grad_norm": 0.31389084458351135, + "learning_rate": 4.979563663923777e-05, + "loss": 0.3926, + "step": 302000 + }, + { + "epoch": 2.0470171069727154, + "grad_norm": 0.3211916387081146, + "learning_rate": 4.9795298289302736e-05, + "loss": 0.3931, + "step": 302500 + }, + { + "epoch": 2.0504006063230835, + "grad_norm": 0.33857420086860657, + "learning_rate": 4.979495993936769e-05, + "loss": 0.3892, + "step": 303000 + }, + { + "epoch": 2.0537841056734516, + "grad_norm": 0.30625852942466736, + "learning_rate": 4.9794621589432654e-05, + "loss": 0.3897, + "step": 303500 + }, + { + "epoch": 2.0571676050238197, + "grad_norm": 0.32721972465515137, + "learning_rate": 4.9794283239497616e-05, + "loss": 0.3922, + "step": 304000 + }, + { + "epoch": 2.0605511043741878, + "grad_norm": 0.3397035002708435, + "learning_rate": 4.9793944889562585e-05, + "loss": 0.3926, + "step": 304500 + }, + { + "epoch": 2.0639346037245563, + "grad_norm": 0.3127745985984802, + "learning_rate": 4.979360653962755e-05, + "loss": 0.3907, + "step": 305000 + }, + { + "epoch": 2.0673181030749244, + "grad_norm": 0.3054274618625641, + "learning_rate": 4.979326818969251e-05, + "loss": 0.3931, + "step": 305500 + }, + { + "epoch": 2.0707016024252924, + "grad_norm": 0.3376386761665344, + "learning_rate": 4.979292983975747e-05, + "loss": 0.3932, + "step": 306000 + }, + { + "epoch": 2.0740851017756605, + "grad_norm": 0.30187106132507324, + "learning_rate": 4.979259148982244e-05, + "loss": 0.3915, + "step": 306500 + }, + { + "epoch": 2.0774686011260286, + "grad_norm": 0.2863011956214905, + "learning_rate": 4.97922531398874e-05, + "loss": 0.3887, + "step": 307000 + }, + { + "epoch": 2.0808521004763967, + "grad_norm": 0.29724055528640747, + "learning_rate": 4.9791914789952364e-05, + "loss": 0.3926, + "step": 307500 + }, + { + "epoch": 2.0842355998267648, + "grad_norm": 0.3097030222415924, + "learning_rate": 4.9791576440017326e-05, + "loss": 0.3916, + "step": 308000 + }, + { + "epoch": 2.087619099177133, + "grad_norm": 0.30876824259757996, + "learning_rate": 4.979123809008229e-05, + "loss": 0.3917, + "step": 308500 + }, + { + "epoch": 2.091002598527501, + "grad_norm": 0.33456218242645264, + "learning_rate": 4.979089974014725e-05, + "loss": 0.3917, + "step": 309000 + }, + { + "epoch": 2.094386097877869, + "grad_norm": 0.33212965726852417, + "learning_rate": 4.979056139021221e-05, + "loss": 0.3916, + "step": 309500 + }, + { + "epoch": 2.0977695972282375, + "grad_norm": 0.33890166878700256, + "learning_rate": 4.979022304027718e-05, + "loss": 0.3933, + "step": 310000 + }, + { + "epoch": 2.1011530965786056, + "grad_norm": 0.2876567840576172, + "learning_rate": 4.9789884690342144e-05, + "loss": 0.3922, + "step": 310500 + }, + { + "epoch": 2.1045365959289737, + "grad_norm": 0.314324289560318, + "learning_rate": 4.9789546340407106e-05, + "loss": 0.3923, + "step": 311000 + }, + { + "epoch": 2.1079200952793418, + "grad_norm": 0.3218834698200226, + "learning_rate": 4.978920799047207e-05, + "loss": 0.393, + "step": 311500 + }, + { + "epoch": 2.11130359462971, + "grad_norm": 0.3074497878551483, + "learning_rate": 4.978886964053704e-05, + "loss": 0.3911, + "step": 312000 + }, + { + "epoch": 2.114687093980078, + "grad_norm": 0.3323233723640442, + "learning_rate": 4.978853129060199e-05, + "loss": 0.3913, + "step": 312500 + }, + { + "epoch": 2.118070593330446, + "grad_norm": 0.3084787130355835, + "learning_rate": 4.9788192940666954e-05, + "loss": 0.3914, + "step": 313000 + }, + { + "epoch": 2.121454092680814, + "grad_norm": 0.3199438154697418, + "learning_rate": 4.9787854590731917e-05, + "loss": 0.3918, + "step": 313500 + }, + { + "epoch": 2.124837592031182, + "grad_norm": 0.2899732291698456, + "learning_rate": 4.9787516240796885e-05, + "loss": 0.3919, + "step": 314000 + }, + { + "epoch": 2.1282210913815502, + "grad_norm": 0.3069060146808624, + "learning_rate": 4.978717789086185e-05, + "loss": 0.3917, + "step": 314500 + }, + { + "epoch": 2.1316045907319188, + "grad_norm": 0.328224241733551, + "learning_rate": 4.978683954092681e-05, + "loss": 0.3915, + "step": 315000 + }, + { + "epoch": 2.134988090082287, + "grad_norm": 0.29223302006721497, + "learning_rate": 4.978650119099177e-05, + "loss": 0.3912, + "step": 315500 + }, + { + "epoch": 2.138371589432655, + "grad_norm": 0.297137588262558, + "learning_rate": 4.978616284105674e-05, + "loss": 0.3896, + "step": 316000 + }, + { + "epoch": 2.141755088783023, + "grad_norm": 0.3235442042350769, + "learning_rate": 4.97858244911217e-05, + "loss": 0.3925, + "step": 316500 + }, + { + "epoch": 2.145138588133391, + "grad_norm": 0.35218313336372375, + "learning_rate": 4.9785486141186665e-05, + "loss": 0.3908, + "step": 317000 + }, + { + "epoch": 2.148522087483759, + "grad_norm": 0.32196253538131714, + "learning_rate": 4.978514779125163e-05, + "loss": 0.3924, + "step": 317500 + }, + { + "epoch": 2.1519055868341272, + "grad_norm": 0.2944926917552948, + "learning_rate": 4.978480944131659e-05, + "loss": 0.3916, + "step": 318000 + }, + { + "epoch": 2.1552890861844953, + "grad_norm": 0.30309149622917175, + "learning_rate": 4.978447109138155e-05, + "loss": 0.3937, + "step": 318500 + }, + { + "epoch": 2.1586725855348634, + "grad_norm": 0.3118036389350891, + "learning_rate": 4.9784132741446513e-05, + "loss": 0.3907, + "step": 319000 + }, + { + "epoch": 2.162056084885232, + "grad_norm": 0.3191807270050049, + "learning_rate": 4.978379439151148e-05, + "loss": 0.3922, + "step": 319500 + }, + { + "epoch": 2.1654395842356, + "grad_norm": 0.32771632075309753, + "learning_rate": 4.9783456041576444e-05, + "loss": 0.3912, + "step": 320000 + }, + { + "epoch": 2.168823083585968, + "grad_norm": 0.31377536058425903, + "learning_rate": 4.9783117691641407e-05, + "loss": 0.392, + "step": 320500 + }, + { + "epoch": 2.172206582936336, + "grad_norm": 0.3084983825683594, + "learning_rate": 4.978277934170637e-05, + "loss": 0.3912, + "step": 321000 + }, + { + "epoch": 2.1755900822867043, + "grad_norm": 0.3182586431503296, + "learning_rate": 4.978244099177133e-05, + "loss": 0.3922, + "step": 321500 + }, + { + "epoch": 2.1789735816370723, + "grad_norm": 0.2851950228214264, + "learning_rate": 4.97821026418363e-05, + "loss": 0.3911, + "step": 322000 + }, + { + "epoch": 2.1823570809874404, + "grad_norm": 0.31247711181640625, + "learning_rate": 4.9781764291901255e-05, + "loss": 0.3913, + "step": 322500 + }, + { + "epoch": 2.1857405803378085, + "grad_norm": 0.30481797456741333, + "learning_rate": 4.978142594196622e-05, + "loss": 0.3904, + "step": 323000 + }, + { + "epoch": 2.1891240796881766, + "grad_norm": 0.3104479908943176, + "learning_rate": 4.9781087592031186e-05, + "loss": 0.3907, + "step": 323500 + }, + { + "epoch": 2.1925075790385447, + "grad_norm": 0.32999172806739807, + "learning_rate": 4.978074924209615e-05, + "loss": 0.391, + "step": 324000 + }, + { + "epoch": 2.1958910783889127, + "grad_norm": 0.32124051451683044, + "learning_rate": 4.978041089216111e-05, + "loss": 0.3908, + "step": 324500 + }, + { + "epoch": 2.1992745777392813, + "grad_norm": 0.3054026663303375, + "learning_rate": 4.978007254222607e-05, + "loss": 0.3899, + "step": 325000 + }, + { + "epoch": 2.2026580770896493, + "grad_norm": 0.31517350673675537, + "learning_rate": 4.977973419229104e-05, + "loss": 0.3917, + "step": 325500 + }, + { + "epoch": 2.2060415764400174, + "grad_norm": 0.3233264684677124, + "learning_rate": 4.9779395842356003e-05, + "loss": 0.3928, + "step": 326000 + }, + { + "epoch": 2.2094250757903855, + "grad_norm": 0.3129175007343292, + "learning_rate": 4.9779057492420966e-05, + "loss": 0.39, + "step": 326500 + }, + { + "epoch": 2.2128085751407536, + "grad_norm": 0.3183519244194031, + "learning_rate": 4.977871914248593e-05, + "loss": 0.392, + "step": 327000 + }, + { + "epoch": 2.2161920744911217, + "grad_norm": 0.30780667066574097, + "learning_rate": 4.977838079255089e-05, + "loss": 0.3926, + "step": 327500 + }, + { + "epoch": 2.2195755738414897, + "grad_norm": 0.3175506591796875, + "learning_rate": 4.977804244261585e-05, + "loss": 0.3917, + "step": 328000 + }, + { + "epoch": 2.222959073191858, + "grad_norm": 0.3365371823310852, + "learning_rate": 4.9777704092680814e-05, + "loss": 0.3905, + "step": 328500 + }, + { + "epoch": 2.226342572542226, + "grad_norm": 0.32077744603157043, + "learning_rate": 4.9777365742745776e-05, + "loss": 0.3922, + "step": 329000 + }, + { + "epoch": 2.2297260718925944, + "grad_norm": 0.3266448676586151, + "learning_rate": 4.9777027392810745e-05, + "loss": 0.3902, + "step": 329500 + }, + { + "epoch": 2.2331095712429625, + "grad_norm": 0.30548393726348877, + "learning_rate": 4.977668904287571e-05, + "loss": 0.3906, + "step": 330000 + }, + { + "epoch": 2.2364930705933306, + "grad_norm": 0.31031087040901184, + "learning_rate": 4.977635069294067e-05, + "loss": 0.3913, + "step": 330500 + }, + { + "epoch": 2.2398765699436987, + "grad_norm": 0.2958206534385681, + "learning_rate": 4.977601234300563e-05, + "loss": 0.3923, + "step": 331000 + }, + { + "epoch": 2.2432600692940667, + "grad_norm": 0.333795964717865, + "learning_rate": 4.97756739930706e-05, + "loss": 0.39, + "step": 331500 + }, + { + "epoch": 2.246643568644435, + "grad_norm": 0.30800214409828186, + "learning_rate": 4.9775335643135556e-05, + "loss": 0.3914, + "step": 332000 + }, + { + "epoch": 2.250027067994803, + "grad_norm": 0.31785306334495544, + "learning_rate": 4.977499729320052e-05, + "loss": 0.3915, + "step": 332500 + }, + { + "epoch": 2.253410567345171, + "grad_norm": 0.33681735396385193, + "learning_rate": 4.977465894326549e-05, + "loss": 0.3925, + "step": 333000 + }, + { + "epoch": 2.256794066695539, + "grad_norm": 0.30837705731391907, + "learning_rate": 4.977432059333045e-05, + "loss": 0.3906, + "step": 333500 + }, + { + "epoch": 2.260177566045907, + "grad_norm": 0.3389814794063568, + "learning_rate": 4.977398224339541e-05, + "loss": 0.3903, + "step": 334000 + }, + { + "epoch": 2.263561065396275, + "grad_norm": 0.338090717792511, + "learning_rate": 4.977364389346037e-05, + "loss": 0.3911, + "step": 334500 + }, + { + "epoch": 2.2669445647466437, + "grad_norm": 0.32688185572624207, + "learning_rate": 4.977330554352534e-05, + "loss": 0.3912, + "step": 335000 + }, + { + "epoch": 2.270328064097012, + "grad_norm": 0.35603490471839905, + "learning_rate": 4.9772967193590304e-05, + "loss": 0.3898, + "step": 335500 + }, + { + "epoch": 2.27371156344738, + "grad_norm": 0.3290579915046692, + "learning_rate": 4.9772628843655266e-05, + "loss": 0.3918, + "step": 336000 + }, + { + "epoch": 2.277095062797748, + "grad_norm": 0.32932424545288086, + "learning_rate": 4.977229049372023e-05, + "loss": 0.3913, + "step": 336500 + }, + { + "epoch": 2.280478562148116, + "grad_norm": 0.34741368889808655, + "learning_rate": 4.977195214378519e-05, + "loss": 0.3899, + "step": 337000 + }, + { + "epoch": 2.283862061498484, + "grad_norm": 0.33639004826545715, + "learning_rate": 4.977161379385015e-05, + "loss": 0.3902, + "step": 337500 + }, + { + "epoch": 2.287245560848852, + "grad_norm": 0.30797648429870605, + "learning_rate": 4.9771275443915115e-05, + "loss": 0.3905, + "step": 338000 + }, + { + "epoch": 2.2906290601992203, + "grad_norm": 0.31171470880508423, + "learning_rate": 4.977093709398008e-05, + "loss": 0.3893, + "step": 338500 + }, + { + "epoch": 2.2940125595495884, + "grad_norm": 0.34038031101226807, + "learning_rate": 4.9770598744045046e-05, + "loss": 0.3917, + "step": 339000 + }, + { + "epoch": 2.297396058899957, + "grad_norm": 0.30429357290267944, + "learning_rate": 4.977026039411001e-05, + "loss": 0.3902, + "step": 339500 + }, + { + "epoch": 2.300779558250325, + "grad_norm": 0.30760350823402405, + "learning_rate": 4.976992204417497e-05, + "loss": 0.3902, + "step": 340000 + }, + { + "epoch": 2.304163057600693, + "grad_norm": 0.2932508885860443, + "learning_rate": 4.976958369423993e-05, + "loss": 0.3918, + "step": 340500 + }, + { + "epoch": 2.307546556951061, + "grad_norm": 0.3048866093158722, + "learning_rate": 4.97692453443049e-05, + "loss": 0.3913, + "step": 341000 + }, + { + "epoch": 2.3109300563014292, + "grad_norm": 0.3053203821182251, + "learning_rate": 4.9768906994369856e-05, + "loss": 0.3903, + "step": 341500 + }, + { + "epoch": 2.3143135556517973, + "grad_norm": 0.3155381977558136, + "learning_rate": 4.976856864443482e-05, + "loss": 0.3904, + "step": 342000 + }, + { + "epoch": 2.3176970550021654, + "grad_norm": 0.3031008243560791, + "learning_rate": 4.976823029449979e-05, + "loss": 0.3896, + "step": 342500 + }, + { + "epoch": 2.3210805543525335, + "grad_norm": 0.32542043924331665, + "learning_rate": 4.976789194456475e-05, + "loss": 0.3896, + "step": 343000 + }, + { + "epoch": 2.3244640537029015, + "grad_norm": 0.3127189576625824, + "learning_rate": 4.976755359462971e-05, + "loss": 0.3899, + "step": 343500 + }, + { + "epoch": 2.3278475530532696, + "grad_norm": 0.327363520860672, + "learning_rate": 4.9767215244694674e-05, + "loss": 0.3906, + "step": 344000 + }, + { + "epoch": 2.3312310524036377, + "grad_norm": 0.3157011866569519, + "learning_rate": 4.976687689475964e-05, + "loss": 0.3912, + "step": 344500 + }, + { + "epoch": 2.3346145517540062, + "grad_norm": 0.3201100826263428, + "learning_rate": 4.9766538544824605e-05, + "loss": 0.3908, + "step": 345000 + }, + { + "epoch": 2.3379980511043743, + "grad_norm": 0.3359414041042328, + "learning_rate": 4.976620019488957e-05, + "loss": 0.3901, + "step": 345500 + }, + { + "epoch": 2.3413815504547424, + "grad_norm": 0.3239155411720276, + "learning_rate": 4.976586184495452e-05, + "loss": 0.3905, + "step": 346000 + }, + { + "epoch": 2.3447650498051105, + "grad_norm": 0.31146302819252014, + "learning_rate": 4.976552349501949e-05, + "loss": 0.3897, + "step": 346500 + }, + { + "epoch": 2.3481485491554785, + "grad_norm": 0.32907629013061523, + "learning_rate": 4.976518514508445e-05, + "loss": 0.3919, + "step": 347000 + }, + { + "epoch": 2.3515320485058466, + "grad_norm": 0.3361744284629822, + "learning_rate": 4.9764846795149415e-05, + "loss": 0.3903, + "step": 347500 + }, + { + "epoch": 2.3549155478562147, + "grad_norm": 0.319180428981781, + "learning_rate": 4.976450844521438e-05, + "loss": 0.3905, + "step": 348000 + }, + { + "epoch": 2.358299047206583, + "grad_norm": 0.3432812988758087, + "learning_rate": 4.9764170095279346e-05, + "loss": 0.3885, + "step": 348500 + }, + { + "epoch": 2.361682546556951, + "grad_norm": 0.33523839712142944, + "learning_rate": 4.976383174534431e-05, + "loss": 0.3898, + "step": 349000 + }, + { + "epoch": 2.3650660459073194, + "grad_norm": 0.3066440522670746, + "learning_rate": 4.976349339540927e-05, + "loss": 0.3905, + "step": 349500 + }, + { + "epoch": 2.3684495452576875, + "grad_norm": 0.3280928134918213, + "learning_rate": 4.976315504547423e-05, + "loss": 0.3888, + "step": 350000 + }, + { + "epoch": 2.3718330446080556, + "grad_norm": 0.28813743591308594, + "learning_rate": 4.97628166955392e-05, + "loss": 0.3909, + "step": 350500 + }, + { + "epoch": 2.3752165439584236, + "grad_norm": 0.3382049798965454, + "learning_rate": 4.976247834560416e-05, + "loss": 0.3918, + "step": 351000 + }, + { + "epoch": 2.3786000433087917, + "grad_norm": 0.32492849230766296, + "learning_rate": 4.976213999566912e-05, + "loss": 0.3902, + "step": 351500 + }, + { + "epoch": 2.38198354265916, + "grad_norm": 0.3084128797054291, + "learning_rate": 4.976180164573409e-05, + "loss": 0.3887, + "step": 352000 + }, + { + "epoch": 2.385367042009528, + "grad_norm": 0.3256911337375641, + "learning_rate": 4.976146329579905e-05, + "loss": 0.3897, + "step": 352500 + }, + { + "epoch": 2.388750541359896, + "grad_norm": 0.29890650510787964, + "learning_rate": 4.976112494586401e-05, + "loss": 0.3915, + "step": 353000 + }, + { + "epoch": 2.392134040710264, + "grad_norm": 0.34316855669021606, + "learning_rate": 4.9760786595928974e-05, + "loss": 0.3901, + "step": 353500 + }, + { + "epoch": 2.3955175400606326, + "grad_norm": 0.30501681566238403, + "learning_rate": 4.976044824599394e-05, + "loss": 0.3896, + "step": 354000 + }, + { + "epoch": 2.398901039411, + "grad_norm": 0.3112809360027313, + "learning_rate": 4.9760109896058905e-05, + "loss": 0.3895, + "step": 354500 + }, + { + "epoch": 2.4022845387613687, + "grad_norm": 0.3149002492427826, + "learning_rate": 4.975977154612387e-05, + "loss": 0.3898, + "step": 355000 + }, + { + "epoch": 2.405668038111737, + "grad_norm": 0.3182199001312256, + "learning_rate": 4.975943319618882e-05, + "loss": 0.3887, + "step": 355500 + }, + { + "epoch": 2.409051537462105, + "grad_norm": 0.3300721049308777, + "learning_rate": 4.975909484625379e-05, + "loss": 0.3898, + "step": 356000 + }, + { + "epoch": 2.412435036812473, + "grad_norm": 0.34610435366630554, + "learning_rate": 4.9758756496318754e-05, + "loss": 0.3897, + "step": 356500 + }, + { + "epoch": 2.415818536162841, + "grad_norm": 0.29432740807533264, + "learning_rate": 4.9758418146383716e-05, + "loss": 0.3893, + "step": 357000 + }, + { + "epoch": 2.419202035513209, + "grad_norm": 0.3141118586063385, + "learning_rate": 4.975807979644868e-05, + "loss": 0.3912, + "step": 357500 + }, + { + "epoch": 2.422585534863577, + "grad_norm": 0.2943814992904663, + "learning_rate": 4.975774144651365e-05, + "loss": 0.3905, + "step": 358000 + }, + { + "epoch": 2.4259690342139453, + "grad_norm": 0.34122055768966675, + "learning_rate": 4.975740309657861e-05, + "loss": 0.3906, + "step": 358500 + }, + { + "epoch": 2.4293525335643134, + "grad_norm": 0.34175580739974976, + "learning_rate": 4.975706474664357e-05, + "loss": 0.3903, + "step": 359000 + }, + { + "epoch": 2.432736032914682, + "grad_norm": 0.31737181544303894, + "learning_rate": 4.975672639670853e-05, + "loss": 0.3881, + "step": 359500 + }, + { + "epoch": 2.43611953226505, + "grad_norm": 0.2970747947692871, + "learning_rate": 4.97563880467735e-05, + "loss": 0.3912, + "step": 360000 + }, + { + "epoch": 2.439503031615418, + "grad_norm": 0.2924173176288605, + "learning_rate": 4.975604969683846e-05, + "loss": 0.3896, + "step": 360500 + }, + { + "epoch": 2.442886530965786, + "grad_norm": 0.35092678666114807, + "learning_rate": 4.975571134690342e-05, + "loss": 0.3885, + "step": 361000 + }, + { + "epoch": 2.446270030316154, + "grad_norm": 0.3329818844795227, + "learning_rate": 4.975537299696839e-05, + "loss": 0.3906, + "step": 361500 + }, + { + "epoch": 2.4496535296665223, + "grad_norm": 0.32808420062065125, + "learning_rate": 4.975503464703335e-05, + "loss": 0.3923, + "step": 362000 + }, + { + "epoch": 2.4530370290168904, + "grad_norm": 0.3235306739807129, + "learning_rate": 4.975469629709831e-05, + "loss": 0.3896, + "step": 362500 + }, + { + "epoch": 2.4564205283672584, + "grad_norm": 0.3019697666168213, + "learning_rate": 4.9754357947163275e-05, + "loss": 0.3904, + "step": 363000 + }, + { + "epoch": 2.4598040277176265, + "grad_norm": 0.2973634600639343, + "learning_rate": 4.9754019597228244e-05, + "loss": 0.3893, + "step": 363500 + }, + { + "epoch": 2.463187527067995, + "grad_norm": 0.30464956164360046, + "learning_rate": 4.9753681247293206e-05, + "loss": 0.3901, + "step": 364000 + }, + { + "epoch": 2.466571026418363, + "grad_norm": 0.28501152992248535, + "learning_rate": 4.975334289735817e-05, + "loss": 0.3904, + "step": 364500 + }, + { + "epoch": 2.469954525768731, + "grad_norm": 0.2966424524784088, + "learning_rate": 4.9753004547423123e-05, + "loss": 0.3918, + "step": 365000 + }, + { + "epoch": 2.4733380251190993, + "grad_norm": 0.30633342266082764, + "learning_rate": 4.975266619748809e-05, + "loss": 0.3905, + "step": 365500 + }, + { + "epoch": 2.4767215244694674, + "grad_norm": 0.31464701890945435, + "learning_rate": 4.9752327847553054e-05, + "loss": 0.3895, + "step": 366000 + }, + { + "epoch": 2.4801050238198354, + "grad_norm": 0.3180805444717407, + "learning_rate": 4.9751989497618017e-05, + "loss": 0.3879, + "step": 366500 + }, + { + "epoch": 2.4834885231702035, + "grad_norm": 0.3277004659175873, + "learning_rate": 4.975165114768298e-05, + "loss": 0.3898, + "step": 367000 + }, + { + "epoch": 2.4868720225205716, + "grad_norm": 0.3120267987251282, + "learning_rate": 4.975131279774795e-05, + "loss": 0.3896, + "step": 367500 + }, + { + "epoch": 2.4902555218709397, + "grad_norm": 0.3504237234592438, + "learning_rate": 4.975097444781291e-05, + "loss": 0.3889, + "step": 368000 + }, + { + "epoch": 2.4936390212213078, + "grad_norm": 0.31472906470298767, + "learning_rate": 4.975063609787787e-05, + "loss": 0.3902, + "step": 368500 + }, + { + "epoch": 2.497022520571676, + "grad_norm": 0.2980063557624817, + "learning_rate": 4.9750297747942834e-05, + "loss": 0.3893, + "step": 369000 + }, + { + "epoch": 2.5004060199220444, + "grad_norm": 0.31815919280052185, + "learning_rate": 4.97499593980078e-05, + "loss": 0.39, + "step": 369500 + }, + { + "epoch": 2.5037895192724124, + "grad_norm": 0.34098586440086365, + "learning_rate": 4.974962104807276e-05, + "loss": 0.3891, + "step": 370000 + }, + { + "epoch": 2.5071730186227805, + "grad_norm": 0.32201772928237915, + "learning_rate": 4.974928269813772e-05, + "loss": 0.3883, + "step": 370500 + }, + { + "epoch": 2.5105565179731486, + "grad_norm": 0.32348373532295227, + "learning_rate": 4.974894434820269e-05, + "loss": 0.3895, + "step": 371000 + }, + { + "epoch": 2.5139400173235167, + "grad_norm": 0.32830971479415894, + "learning_rate": 4.974860599826765e-05, + "loss": 0.3898, + "step": 371500 + }, + { + "epoch": 2.5173235166738848, + "grad_norm": 0.28930607438087463, + "learning_rate": 4.9748267648332613e-05, + "loss": 0.3888, + "step": 372000 + }, + { + "epoch": 2.520707016024253, + "grad_norm": 0.3103958070278168, + "learning_rate": 4.9747929298397576e-05, + "loss": 0.3899, + "step": 372500 + }, + { + "epoch": 2.524090515374621, + "grad_norm": 0.3025512099266052, + "learning_rate": 4.9747590948462545e-05, + "loss": 0.3915, + "step": 373000 + }, + { + "epoch": 2.527474014724989, + "grad_norm": 0.3118777871131897, + "learning_rate": 4.974725259852751e-05, + "loss": 0.3903, + "step": 373500 + }, + { + "epoch": 2.5308575140753575, + "grad_norm": 0.30235418677330017, + "learning_rate": 4.974691424859247e-05, + "loss": 0.3883, + "step": 374000 + }, + { + "epoch": 2.534241013425725, + "grad_norm": 0.31868302822113037, + "learning_rate": 4.9746575898657424e-05, + "loss": 0.3892, + "step": 374500 + }, + { + "epoch": 2.5376245127760937, + "grad_norm": 0.2960646152496338, + "learning_rate": 4.974623754872239e-05, + "loss": 0.3906, + "step": 375000 + }, + { + "epoch": 2.5410080121264618, + "grad_norm": 0.3299707770347595, + "learning_rate": 4.9745899198787355e-05, + "loss": 0.3909, + "step": 375500 + }, + { + "epoch": 2.54439151147683, + "grad_norm": 0.3465183675289154, + "learning_rate": 4.974556084885232e-05, + "loss": 0.3893, + "step": 376000 + }, + { + "epoch": 2.547775010827198, + "grad_norm": 0.29224497079849243, + "learning_rate": 4.974522249891728e-05, + "loss": 0.3886, + "step": 376500 + }, + { + "epoch": 2.551158510177566, + "grad_norm": 0.3109583556652069, + "learning_rate": 4.974488414898225e-05, + "loss": 0.389, + "step": 377000 + }, + { + "epoch": 2.554542009527934, + "grad_norm": 0.3018151819705963, + "learning_rate": 4.974454579904721e-05, + "loss": 0.3884, + "step": 377500 + }, + { + "epoch": 2.557925508878302, + "grad_norm": 0.2968336045742035, + "learning_rate": 4.974420744911217e-05, + "loss": 0.3891, + "step": 378000 + }, + { + "epoch": 2.5613090082286707, + "grad_norm": 0.33824700117111206, + "learning_rate": 4.9743869099177135e-05, + "loss": 0.3884, + "step": 378500 + }, + { + "epoch": 2.5646925075790383, + "grad_norm": 0.2871198058128357, + "learning_rate": 4.9743530749242104e-05, + "loss": 0.3902, + "step": 379000 + }, + { + "epoch": 2.568076006929407, + "grad_norm": 0.33174142241477966, + "learning_rate": 4.974319239930706e-05, + "loss": 0.3888, + "step": 379500 + }, + { + "epoch": 2.571459506279775, + "grad_norm": 0.29837650060653687, + "learning_rate": 4.974285404937202e-05, + "loss": 0.387, + "step": 380000 + }, + { + "epoch": 2.574843005630143, + "grad_norm": 0.3502374291419983, + "learning_rate": 4.974251569943699e-05, + "loss": 0.3907, + "step": 380500 + }, + { + "epoch": 2.578226504980511, + "grad_norm": 0.34483498334884644, + "learning_rate": 4.974217734950195e-05, + "loss": 0.3896, + "step": 381000 + }, + { + "epoch": 2.581610004330879, + "grad_norm": 0.31655412912368774, + "learning_rate": 4.9741838999566914e-05, + "loss": 0.3902, + "step": 381500 + }, + { + "epoch": 2.5849935036812473, + "grad_norm": 0.3395337164402008, + "learning_rate": 4.9741500649631876e-05, + "loss": 0.389, + "step": 382000 + }, + { + "epoch": 2.5883770030316153, + "grad_norm": 0.3357096016407013, + "learning_rate": 4.9741162299696845e-05, + "loss": 0.3899, + "step": 382500 + }, + { + "epoch": 2.5917605023819834, + "grad_norm": 0.306362509727478, + "learning_rate": 4.974082394976181e-05, + "loss": 0.3884, + "step": 383000 + }, + { + "epoch": 2.5951440017323515, + "grad_norm": 0.3453750014305115, + "learning_rate": 4.974048559982677e-05, + "loss": 0.3908, + "step": 383500 + }, + { + "epoch": 2.59852750108272, + "grad_norm": 0.28299570083618164, + "learning_rate": 4.974014724989173e-05, + "loss": 0.3898, + "step": 384000 + }, + { + "epoch": 2.6019110004330877, + "grad_norm": 0.34128639101982117, + "learning_rate": 4.9739808899956694e-05, + "loss": 0.388, + "step": 384500 + }, + { + "epoch": 2.605294499783456, + "grad_norm": 0.2998855710029602, + "learning_rate": 4.9739470550021656e-05, + "loss": 0.3886, + "step": 385000 + }, + { + "epoch": 2.6086779991338243, + "grad_norm": 0.332438200712204, + "learning_rate": 4.973913220008662e-05, + "loss": 0.389, + "step": 385500 + }, + { + "epoch": 2.6120614984841923, + "grad_norm": 0.30915123224258423, + "learning_rate": 4.973879385015158e-05, + "loss": 0.3885, + "step": 386000 + }, + { + "epoch": 2.6154449978345604, + "grad_norm": 0.31033918261528015, + "learning_rate": 4.973845550021655e-05, + "loss": 0.3895, + "step": 386500 + }, + { + "epoch": 2.6188284971849285, + "grad_norm": 0.29367709159851074, + "learning_rate": 4.973811715028151e-05, + "loss": 0.3884, + "step": 387000 + }, + { + "epoch": 2.6222119965352966, + "grad_norm": 0.3094245195388794, + "learning_rate": 4.973777880034647e-05, + "loss": 0.3898, + "step": 387500 + }, + { + "epoch": 2.6255954958856647, + "grad_norm": 0.34055420756340027, + "learning_rate": 4.9737440450411435e-05, + "loss": 0.3884, + "step": 388000 + }, + { + "epoch": 2.628978995236033, + "grad_norm": 0.3231176733970642, + "learning_rate": 4.9737102100476404e-05, + "loss": 0.3885, + "step": 388500 + }, + { + "epoch": 2.632362494586401, + "grad_norm": 0.32180097699165344, + "learning_rate": 4.973676375054136e-05, + "loss": 0.3882, + "step": 389000 + }, + { + "epoch": 2.6357459939367693, + "grad_norm": 0.29383426904678345, + "learning_rate": 4.973642540060632e-05, + "loss": 0.3901, + "step": 389500 + }, + { + "epoch": 2.6391294932871374, + "grad_norm": 0.31776711344718933, + "learning_rate": 4.973608705067129e-05, + "loss": 0.388, + "step": 390000 + }, + { + "epoch": 2.6425129926375055, + "grad_norm": 0.3176041543483734, + "learning_rate": 4.973574870073625e-05, + "loss": 0.3898, + "step": 390500 + }, + { + "epoch": 2.6458964919878736, + "grad_norm": 0.3469124734401703, + "learning_rate": 4.9735410350801215e-05, + "loss": 0.3894, + "step": 391000 + }, + { + "epoch": 2.6492799913382417, + "grad_norm": 0.3145827651023865, + "learning_rate": 4.973507200086618e-05, + "loss": 0.3876, + "step": 391500 + }, + { + "epoch": 2.6526634906886097, + "grad_norm": 0.35601696372032166, + "learning_rate": 4.973473365093114e-05, + "loss": 0.3893, + "step": 392000 + }, + { + "epoch": 2.656046990038978, + "grad_norm": 0.326543927192688, + "learning_rate": 4.973439530099611e-05, + "loss": 0.3902, + "step": 392500 + }, + { + "epoch": 2.659430489389346, + "grad_norm": 0.3303530216217041, + "learning_rate": 4.973405695106107e-05, + "loss": 0.3878, + "step": 393000 + }, + { + "epoch": 2.662813988739714, + "grad_norm": 0.3083018362522125, + "learning_rate": 4.973371860112603e-05, + "loss": 0.3876, + "step": 393500 + }, + { + "epoch": 2.6661974880900825, + "grad_norm": 0.31711965799331665, + "learning_rate": 4.9733380251190994e-05, + "loss": 0.3881, + "step": 394000 + }, + { + "epoch": 2.66958098744045, + "grad_norm": 0.3374720811843872, + "learning_rate": 4.9733041901255956e-05, + "loss": 0.3879, + "step": 394500 + }, + { + "epoch": 2.6729644867908187, + "grad_norm": 0.3124774098396301, + "learning_rate": 4.973270355132092e-05, + "loss": 0.3875, + "step": 395000 + }, + { + "epoch": 2.6763479861411867, + "grad_norm": 0.33346521854400635, + "learning_rate": 4.973236520138588e-05, + "loss": 0.3882, + "step": 395500 + }, + { + "epoch": 2.679731485491555, + "grad_norm": 0.2966834008693695, + "learning_rate": 4.973202685145085e-05, + "loss": 0.3883, + "step": 396000 + }, + { + "epoch": 2.683114984841923, + "grad_norm": 0.3253791332244873, + "learning_rate": 4.973168850151581e-05, + "loss": 0.3906, + "step": 396500 + }, + { + "epoch": 2.686498484192291, + "grad_norm": 0.34003323316574097, + "learning_rate": 4.9731350151580774e-05, + "loss": 0.3887, + "step": 397000 + }, + { + "epoch": 2.689881983542659, + "grad_norm": 0.31468504667282104, + "learning_rate": 4.9731011801645736e-05, + "loss": 0.3888, + "step": 397500 + }, + { + "epoch": 2.693265482893027, + "grad_norm": 0.3268261253833771, + "learning_rate": 4.9730673451710705e-05, + "loss": 0.389, + "step": 398000 + }, + { + "epoch": 2.6966489822433957, + "grad_norm": 0.37251192331314087, + "learning_rate": 4.973033510177566e-05, + "loss": 0.3908, + "step": 398500 + }, + { + "epoch": 2.7000324815937633, + "grad_norm": 0.3105468451976776, + "learning_rate": 4.972999675184062e-05, + "loss": 0.3887, + "step": 399000 + }, + { + "epoch": 2.703415980944132, + "grad_norm": 0.30996009707450867, + "learning_rate": 4.972965840190559e-05, + "loss": 0.3891, + "step": 399500 + }, + { + "epoch": 2.7067994802945, + "grad_norm": 0.3323148488998413, + "learning_rate": 4.972932005197055e-05, + "loss": 0.3878, + "step": 400000 + }, + { + "epoch": 2.710182979644868, + "grad_norm": 0.3385847508907318, + "learning_rate": 4.9728981702035515e-05, + "loss": 0.3887, + "step": 400500 + }, + { + "epoch": 2.713566478995236, + "grad_norm": 0.38123631477355957, + "learning_rate": 4.972864335210048e-05, + "loss": 0.3894, + "step": 401000 + }, + { + "epoch": 2.716949978345604, + "grad_norm": 0.35343465209007263, + "learning_rate": 4.972830500216544e-05, + "loss": 0.3862, + "step": 401500 + }, + { + "epoch": 2.7203334776959722, + "grad_norm": 0.3679075837135315, + "learning_rate": 4.972796665223041e-05, + "loss": 0.3872, + "step": 402000 + }, + { + "epoch": 2.7237169770463403, + "grad_norm": 0.3437575697898865, + "learning_rate": 4.972762830229537e-05, + "loss": 0.3895, + "step": 402500 + }, + { + "epoch": 2.7271004763967084, + "grad_norm": 0.35861557722091675, + "learning_rate": 4.972728995236033e-05, + "loss": 0.3882, + "step": 403000 + }, + { + "epoch": 2.7304839757470765, + "grad_norm": 0.31313803791999817, + "learning_rate": 4.9726951602425295e-05, + "loss": 0.3875, + "step": 403500 + }, + { + "epoch": 2.733867475097445, + "grad_norm": 0.31771916151046753, + "learning_rate": 4.972661325249026e-05, + "loss": 0.3893, + "step": 404000 + }, + { + "epoch": 2.737250974447813, + "grad_norm": 0.3310674726963043, + "learning_rate": 4.972627490255522e-05, + "loss": 0.3879, + "step": 404500 + }, + { + "epoch": 2.740634473798181, + "grad_norm": 0.32085293531417847, + "learning_rate": 4.972593655262018e-05, + "loss": 0.39, + "step": 405000 + }, + { + "epoch": 2.7440179731485492, + "grad_norm": 0.30057665705680847, + "learning_rate": 4.972559820268515e-05, + "loss": 0.3886, + "step": 405500 + }, + { + "epoch": 2.7474014724989173, + "grad_norm": 0.31213104724884033, + "learning_rate": 4.972525985275011e-05, + "loss": 0.3874, + "step": 406000 + }, + { + "epoch": 2.7507849718492854, + "grad_norm": 0.35481685400009155, + "learning_rate": 4.9724921502815074e-05, + "loss": 0.389, + "step": 406500 + }, + { + "epoch": 2.7541684711996535, + "grad_norm": 0.33751875162124634, + "learning_rate": 4.9724583152880037e-05, + "loss": 0.388, + "step": 407000 + }, + { + "epoch": 2.7575519705500215, + "grad_norm": 0.32072505354881287, + "learning_rate": 4.9724244802945005e-05, + "loss": 0.3878, + "step": 407500 + }, + { + "epoch": 2.7609354699003896, + "grad_norm": 0.29553598165512085, + "learning_rate": 4.972390645300996e-05, + "loss": 0.3886, + "step": 408000 + }, + { + "epoch": 2.764318969250758, + "grad_norm": 0.3134274482727051, + "learning_rate": 4.972356810307492e-05, + "loss": 0.3863, + "step": 408500 + }, + { + "epoch": 2.767702468601126, + "grad_norm": 0.3523977994918823, + "learning_rate": 4.9723229753139885e-05, + "loss": 0.39, + "step": 409000 + }, + { + "epoch": 2.7710859679514943, + "grad_norm": 0.33423638343811035, + "learning_rate": 4.9722891403204854e-05, + "loss": 0.3879, + "step": 409500 + }, + { + "epoch": 2.7744694673018624, + "grad_norm": 0.3362995386123657, + "learning_rate": 4.9722553053269816e-05, + "loss": 0.3883, + "step": 410000 + }, + { + "epoch": 2.7778529666522305, + "grad_norm": 0.3276742696762085, + "learning_rate": 4.972221470333478e-05, + "loss": 0.3868, + "step": 410500 + }, + { + "epoch": 2.7812364660025986, + "grad_norm": 0.323776513338089, + "learning_rate": 4.972187635339974e-05, + "loss": 0.3875, + "step": 411000 + }, + { + "epoch": 2.7846199653529666, + "grad_norm": 0.32547879219055176, + "learning_rate": 4.972153800346471e-05, + "loss": 0.3884, + "step": 411500 + }, + { + "epoch": 2.7880034647033347, + "grad_norm": 0.31964075565338135, + "learning_rate": 4.972119965352967e-05, + "loss": 0.3879, + "step": 412000 + }, + { + "epoch": 2.791386964053703, + "grad_norm": 0.2977871894836426, + "learning_rate": 4.9720861303594633e-05, + "loss": 0.3893, + "step": 412500 + }, + { + "epoch": 2.794770463404071, + "grad_norm": 0.3149718642234802, + "learning_rate": 4.9720522953659596e-05, + "loss": 0.3867, + "step": 413000 + }, + { + "epoch": 2.798153962754439, + "grad_norm": 0.33092522621154785, + "learning_rate": 4.972018460372456e-05, + "loss": 0.3881, + "step": 413500 + }, + { + "epoch": 2.8015374621048075, + "grad_norm": 0.32213321328163147, + "learning_rate": 4.971984625378952e-05, + "loss": 0.389, + "step": 414000 + }, + { + "epoch": 2.8049209614551756, + "grad_norm": 0.31457120180130005, + "learning_rate": 4.971950790385448e-05, + "loss": 0.3865, + "step": 414500 + }, + { + "epoch": 2.8083044608055436, + "grad_norm": 0.3207741975784302, + "learning_rate": 4.971916955391945e-05, + "loss": 0.3898, + "step": 415000 + }, + { + "epoch": 2.8116879601559117, + "grad_norm": 0.3099098801612854, + "learning_rate": 4.971883120398441e-05, + "loss": 0.388, + "step": 415500 + }, + { + "epoch": 2.81507145950628, + "grad_norm": 0.33796587586402893, + "learning_rate": 4.9718492854049375e-05, + "loss": 0.3867, + "step": 416000 + }, + { + "epoch": 2.818454958856648, + "grad_norm": 0.34027165174484253, + "learning_rate": 4.971815450411434e-05, + "loss": 0.388, + "step": 416500 + }, + { + "epoch": 2.821838458207016, + "grad_norm": 0.3038021922111511, + "learning_rate": 4.9717816154179306e-05, + "loss": 0.3871, + "step": 417000 + }, + { + "epoch": 2.825221957557384, + "grad_norm": 0.30802983045578003, + "learning_rate": 4.971747780424426e-05, + "loss": 0.387, + "step": 417500 + }, + { + "epoch": 2.828605456907752, + "grad_norm": 0.3201408088207245, + "learning_rate": 4.9717139454309224e-05, + "loss": 0.3882, + "step": 418000 + }, + { + "epoch": 2.8319889562581206, + "grad_norm": 0.3160160183906555, + "learning_rate": 4.9716801104374186e-05, + "loss": 0.3883, + "step": 418500 + }, + { + "epoch": 2.8353724556084883, + "grad_norm": 0.3320191204547882, + "learning_rate": 4.9716462754439155e-05, + "loss": 0.3867, + "step": 419000 + }, + { + "epoch": 2.838755954958857, + "grad_norm": 0.29257920384407043, + "learning_rate": 4.971612440450412e-05, + "loss": 0.3868, + "step": 419500 + }, + { + "epoch": 2.842139454309225, + "grad_norm": 0.3090723156929016, + "learning_rate": 4.971578605456908e-05, + "loss": 0.3878, + "step": 420000 + }, + { + "epoch": 2.845522953659593, + "grad_norm": 0.3091406524181366, + "learning_rate": 4.971544770463404e-05, + "loss": 0.3885, + "step": 420500 + }, + { + "epoch": 2.848906453009961, + "grad_norm": 0.31975674629211426, + "learning_rate": 4.971510935469901e-05, + "loss": 0.3878, + "step": 421000 + }, + { + "epoch": 2.852289952360329, + "grad_norm": 0.3032001256942749, + "learning_rate": 4.971477100476397e-05, + "loss": 0.3873, + "step": 421500 + }, + { + "epoch": 2.855673451710697, + "grad_norm": 0.29951295256614685, + "learning_rate": 4.9714432654828934e-05, + "loss": 0.3882, + "step": 422000 + }, + { + "epoch": 2.8590569510610653, + "grad_norm": 0.29293006658554077, + "learning_rate": 4.9714094304893896e-05, + "loss": 0.3874, + "step": 422500 + }, + { + "epoch": 2.8624404504114334, + "grad_norm": 0.35580480098724365, + "learning_rate": 4.971375595495886e-05, + "loss": 0.3883, + "step": 423000 + }, + { + "epoch": 2.8658239497618014, + "grad_norm": 0.3143445551395416, + "learning_rate": 4.971341760502382e-05, + "loss": 0.3873, + "step": 423500 + }, + { + "epoch": 2.86920744911217, + "grad_norm": 0.3491705358028412, + "learning_rate": 4.971307925508878e-05, + "loss": 0.3868, + "step": 424000 + }, + { + "epoch": 2.872590948462538, + "grad_norm": 0.3095901310443878, + "learning_rate": 4.971274090515375e-05, + "loss": 0.3887, + "step": 424500 + }, + { + "epoch": 2.875974447812906, + "grad_norm": 0.3482496440410614, + "learning_rate": 4.9712402555218714e-05, + "loss": 0.3881, + "step": 425000 + }, + { + "epoch": 2.879357947163274, + "grad_norm": 0.3041991889476776, + "learning_rate": 4.9712064205283676e-05, + "loss": 0.3867, + "step": 425500 + }, + { + "epoch": 2.8827414465136423, + "grad_norm": 0.2977460026741028, + "learning_rate": 4.971172585534864e-05, + "loss": 0.388, + "step": 426000 + }, + { + "epoch": 2.8861249458640104, + "grad_norm": 0.3527410328388214, + "learning_rate": 4.971138750541361e-05, + "loss": 0.389, + "step": 426500 + }, + { + "epoch": 2.8895084452143784, + "grad_norm": 0.31773945689201355, + "learning_rate": 4.971104915547856e-05, + "loss": 0.3875, + "step": 427000 + }, + { + "epoch": 2.8928919445647465, + "grad_norm": 0.3466269075870514, + "learning_rate": 4.9710710805543524e-05, + "loss": 0.3864, + "step": 427500 + }, + { + "epoch": 2.8962754439151146, + "grad_norm": 0.3149116039276123, + "learning_rate": 4.9710372455608486e-05, + "loss": 0.388, + "step": 428000 + }, + { + "epoch": 2.899658943265483, + "grad_norm": 0.3208950161933899, + "learning_rate": 4.9710034105673455e-05, + "loss": 0.3871, + "step": 428500 + }, + { + "epoch": 2.9030424426158508, + "grad_norm": 0.31378859281539917, + "learning_rate": 4.970969575573842e-05, + "loss": 0.3877, + "step": 429000 + }, + { + "epoch": 2.9064259419662193, + "grad_norm": 0.3283815383911133, + "learning_rate": 4.970935740580338e-05, + "loss": 0.388, + "step": 429500 + }, + { + "epoch": 2.9098094413165874, + "grad_norm": 0.34402093291282654, + "learning_rate": 4.970901905586834e-05, + "loss": 0.3861, + "step": 430000 + }, + { + "epoch": 2.9131929406669554, + "grad_norm": 0.3222994804382324, + "learning_rate": 4.970868070593331e-05, + "loss": 0.3868, + "step": 430500 + }, + { + "epoch": 2.9165764400173235, + "grad_norm": 0.349680632352829, + "learning_rate": 4.970834235599827e-05, + "loss": 0.3874, + "step": 431000 + }, + { + "epoch": 2.9199599393676916, + "grad_norm": 0.34699881076812744, + "learning_rate": 4.9708004006063235e-05, + "loss": 0.388, + "step": 431500 + }, + { + "epoch": 2.9233434387180597, + "grad_norm": 0.29213935136795044, + "learning_rate": 4.97076656561282e-05, + "loss": 0.3882, + "step": 432000 + }, + { + "epoch": 2.9267269380684278, + "grad_norm": 0.33256059885025024, + "learning_rate": 4.970732730619316e-05, + "loss": 0.3877, + "step": 432500 + }, + { + "epoch": 2.9301104374187963, + "grad_norm": 0.29620301723480225, + "learning_rate": 4.970698895625812e-05, + "loss": 0.3881, + "step": 433000 + }, + { + "epoch": 2.933493936769164, + "grad_norm": 0.3183768689632416, + "learning_rate": 4.970665060632308e-05, + "loss": 0.3871, + "step": 433500 + }, + { + "epoch": 2.9368774361195324, + "grad_norm": 0.3184598386287689, + "learning_rate": 4.970631225638805e-05, + "loss": 0.3879, + "step": 434000 + }, + { + "epoch": 2.9402609354699005, + "grad_norm": 0.33179497718811035, + "learning_rate": 4.9705973906453014e-05, + "loss": 0.386, + "step": 434500 + }, + { + "epoch": 2.9436444348202686, + "grad_norm": 0.32176026701927185, + "learning_rate": 4.9705635556517976e-05, + "loss": 0.3874, + "step": 435000 + }, + { + "epoch": 2.9470279341706367, + "grad_norm": 0.3503253161907196, + "learning_rate": 4.970529720658294e-05, + "loss": 0.3884, + "step": 435500 + }, + { + "epoch": 2.9504114335210048, + "grad_norm": 0.35486873984336853, + "learning_rate": 4.970495885664791e-05, + "loss": 0.3884, + "step": 436000 + }, + { + "epoch": 2.953794932871373, + "grad_norm": 0.3227147161960602, + "learning_rate": 4.970462050671287e-05, + "loss": 0.3862, + "step": 436500 + }, + { + "epoch": 2.957178432221741, + "grad_norm": 0.31731894612312317, + "learning_rate": 4.9704282156777825e-05, + "loss": 0.386, + "step": 437000 + }, + { + "epoch": 2.960561931572109, + "grad_norm": 0.31829655170440674, + "learning_rate": 4.970394380684279e-05, + "loss": 0.387, + "step": 437500 + }, + { + "epoch": 2.963945430922477, + "grad_norm": 0.3109760284423828, + "learning_rate": 4.9703605456907756e-05, + "loss": 0.3866, + "step": 438000 + }, + { + "epoch": 2.9673289302728456, + "grad_norm": 0.3356626033782959, + "learning_rate": 4.970326710697272e-05, + "loss": 0.3896, + "step": 438500 + }, + { + "epoch": 2.9707124296232132, + "grad_norm": 0.2957267761230469, + "learning_rate": 4.970292875703768e-05, + "loss": 0.3871, + "step": 439000 + }, + { + "epoch": 2.9740959289735818, + "grad_norm": 0.3141108453273773, + "learning_rate": 4.970259040710264e-05, + "loss": 0.3869, + "step": 439500 + }, + { + "epoch": 2.97747942832395, + "grad_norm": 0.30219319462776184, + "learning_rate": 4.970225205716761e-05, + "loss": 0.3866, + "step": 440000 + }, + { + "epoch": 2.980862927674318, + "grad_norm": 0.3704169988632202, + "learning_rate": 4.970191370723257e-05, + "loss": 0.3868, + "step": 440500 + }, + { + "epoch": 2.984246427024686, + "grad_norm": 0.3687022626399994, + "learning_rate": 4.9701575357297535e-05, + "loss": 0.3876, + "step": 441000 + }, + { + "epoch": 2.987629926375054, + "grad_norm": 0.3400663435459137, + "learning_rate": 4.97012370073625e-05, + "loss": 0.3897, + "step": 441500 + }, + { + "epoch": 2.991013425725422, + "grad_norm": 0.3268395960330963, + "learning_rate": 4.970089865742746e-05, + "loss": 0.3875, + "step": 442000 + }, + { + "epoch": 2.9943969250757903, + "grad_norm": 0.32051482796669006, + "learning_rate": 4.970056030749242e-05, + "loss": 0.3868, + "step": 442500 + }, + { + "epoch": 2.9977804244261588, + "grad_norm": 0.35251978039741516, + "learning_rate": 4.9700221957557384e-05, + "loss": 0.3878, + "step": 443000 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.8527967040650408, + "eval_loss": 0.5995772480964661, + "eval_runtime": 3342.0055, + "eval_samples_per_second": 86.997, + "eval_steps_per_second": 5.437, + "step": 443328 + }, + { + "epoch": 3.001163923776527, + "grad_norm": 0.3209432065486908, + "learning_rate": 4.969988360762235e-05, + "loss": 0.3848, + "step": 443500 + }, + { + "epoch": 3.004547423126895, + "grad_norm": 0.3334997892379761, + "learning_rate": 4.9699545257687315e-05, + "loss": 0.3848, + "step": 444000 + }, + { + "epoch": 3.007930922477263, + "grad_norm": 0.3184662163257599, + "learning_rate": 4.969920690775228e-05, + "loss": 0.3849, + "step": 444500 + }, + { + "epoch": 3.011314421827631, + "grad_norm": 0.3202681541442871, + "learning_rate": 4.969886855781724e-05, + "loss": 0.3838, + "step": 445000 + }, + { + "epoch": 3.014697921177999, + "grad_norm": 0.34588322043418884, + "learning_rate": 4.969853020788221e-05, + "loss": 0.3854, + "step": 445500 + }, + { + "epoch": 3.0180814205283673, + "grad_norm": 0.3345364034175873, + "learning_rate": 4.969819185794717e-05, + "loss": 0.3858, + "step": 446000 + }, + { + "epoch": 3.0214649198787353, + "grad_norm": 0.32423135638237, + "learning_rate": 4.9697853508012125e-05, + "loss": 0.385, + "step": 446500 + }, + { + "epoch": 3.0248484192291034, + "grad_norm": 0.38436517119407654, + "learning_rate": 4.969751515807709e-05, + "loss": 0.3861, + "step": 447000 + }, + { + "epoch": 3.0282319185794715, + "grad_norm": 0.34340643882751465, + "learning_rate": 4.9697176808142056e-05, + "loss": 0.3861, + "step": 447500 + }, + { + "epoch": 3.0316154179298396, + "grad_norm": 0.3383859395980835, + "learning_rate": 4.969683845820702e-05, + "loss": 0.3848, + "step": 448000 + }, + { + "epoch": 3.034998917280208, + "grad_norm": 0.34863921999931335, + "learning_rate": 4.969650010827198e-05, + "loss": 0.3848, + "step": 448500 + }, + { + "epoch": 3.038382416630576, + "grad_norm": 0.345042884349823, + "learning_rate": 4.969616175833694e-05, + "loss": 0.3876, + "step": 449000 + }, + { + "epoch": 3.0417659159809443, + "grad_norm": 0.3602176904678345, + "learning_rate": 4.969582340840191e-05, + "loss": 0.385, + "step": 449500 + }, + { + "epoch": 3.0451494153313123, + "grad_norm": 0.3208034634590149, + "learning_rate": 4.9695485058466874e-05, + "loss": 0.3865, + "step": 450000 + }, + { + "epoch": 3.0485329146816804, + "grad_norm": 0.35154417157173157, + "learning_rate": 4.9695146708531836e-05, + "loss": 0.3854, + "step": 450500 + }, + { + "epoch": 3.0519164140320485, + "grad_norm": 0.3550427258014679, + "learning_rate": 4.96948083585968e-05, + "loss": 0.3855, + "step": 451000 + }, + { + "epoch": 3.0552999133824166, + "grad_norm": 0.3407672643661499, + "learning_rate": 4.969447000866176e-05, + "loss": 0.3847, + "step": 451500 + }, + { + "epoch": 3.0586834127327847, + "grad_norm": 0.30612611770629883, + "learning_rate": 4.969413165872672e-05, + "loss": 0.3849, + "step": 452000 + }, + { + "epoch": 3.0620669120831527, + "grad_norm": 0.3213786482810974, + "learning_rate": 4.9693793308791684e-05, + "loss": 0.3855, + "step": 452500 + }, + { + "epoch": 3.065450411433521, + "grad_norm": 0.36959215998649597, + "learning_rate": 4.969345495885665e-05, + "loss": 0.3847, + "step": 453000 + }, + { + "epoch": 3.0688339107838893, + "grad_norm": 0.33155548572540283, + "learning_rate": 4.9693116608921615e-05, + "loss": 0.3843, + "step": 453500 + }, + { + "epoch": 3.0722174101342574, + "grad_norm": 0.324441134929657, + "learning_rate": 4.969277825898658e-05, + "loss": 0.3846, + "step": 454000 + }, + { + "epoch": 3.0756009094846255, + "grad_norm": 0.3025881350040436, + "learning_rate": 4.969243990905154e-05, + "loss": 0.3881, + "step": 454500 + }, + { + "epoch": 3.0789844088349936, + "grad_norm": 0.3309617340564728, + "learning_rate": 4.96921015591165e-05, + "loss": 0.3838, + "step": 455000 + }, + { + "epoch": 3.0823679081853617, + "grad_norm": 0.3241705298423767, + "learning_rate": 4.969176320918147e-05, + "loss": 0.385, + "step": 455500 + }, + { + "epoch": 3.0857514075357297, + "grad_norm": 0.3889763057231903, + "learning_rate": 4.9691424859246426e-05, + "loss": 0.3865, + "step": 456000 + }, + { + "epoch": 3.089134906886098, + "grad_norm": 0.310674250125885, + "learning_rate": 4.969108650931139e-05, + "loss": 0.3866, + "step": 456500 + }, + { + "epoch": 3.092518406236466, + "grad_norm": 0.3215824067592621, + "learning_rate": 4.969074815937636e-05, + "loss": 0.3863, + "step": 457000 + }, + { + "epoch": 3.095901905586834, + "grad_norm": 0.3177349865436554, + "learning_rate": 4.969040980944132e-05, + "loss": 0.3854, + "step": 457500 + }, + { + "epoch": 3.099285404937202, + "grad_norm": 0.3278276026248932, + "learning_rate": 4.969007145950628e-05, + "loss": 0.385, + "step": 458000 + }, + { + "epoch": 3.1026689042875706, + "grad_norm": 0.30803415179252625, + "learning_rate": 4.9689733109571243e-05, + "loss": 0.3843, + "step": 458500 + }, + { + "epoch": 3.1060524036379387, + "grad_norm": 0.32083410024642944, + "learning_rate": 4.968939475963621e-05, + "loss": 0.3845, + "step": 459000 + }, + { + "epoch": 3.1094359029883067, + "grad_norm": 0.3132006824016571, + "learning_rate": 4.9689056409701174e-05, + "loss": 0.3848, + "step": 459500 + }, + { + "epoch": 3.112819402338675, + "grad_norm": 0.3279072642326355, + "learning_rate": 4.9688718059766137e-05, + "loss": 0.3864, + "step": 460000 + }, + { + "epoch": 3.116202901689043, + "grad_norm": 0.3340642750263214, + "learning_rate": 4.96883797098311e-05, + "loss": 0.3847, + "step": 460500 + }, + { + "epoch": 3.119586401039411, + "grad_norm": 0.33182334899902344, + "learning_rate": 4.968804135989606e-05, + "loss": 0.3864, + "step": 461000 + }, + { + "epoch": 3.122969900389779, + "grad_norm": 0.34803441166877747, + "learning_rate": 4.968770300996102e-05, + "loss": 0.3845, + "step": 461500 + }, + { + "epoch": 3.126353399740147, + "grad_norm": 0.3448106646537781, + "learning_rate": 4.9687364660025985e-05, + "loss": 0.3842, + "step": 462000 + }, + { + "epoch": 3.1297368990905152, + "grad_norm": 0.34930822253227234, + "learning_rate": 4.968702631009095e-05, + "loss": 0.3858, + "step": 462500 + }, + { + "epoch": 3.1331203984408833, + "grad_norm": 0.3045295774936676, + "learning_rate": 4.9686687960155916e-05, + "loss": 0.3852, + "step": 463000 + }, + { + "epoch": 3.136503897791252, + "grad_norm": 0.34032517671585083, + "learning_rate": 4.968634961022088e-05, + "loss": 0.3864, + "step": 463500 + }, + { + "epoch": 3.13988739714162, + "grad_norm": 0.32840994000434875, + "learning_rate": 4.968601126028584e-05, + "loss": 0.3862, + "step": 464000 + }, + { + "epoch": 3.143270896491988, + "grad_norm": 0.33663713932037354, + "learning_rate": 4.96856729103508e-05, + "loss": 0.385, + "step": 464500 + }, + { + "epoch": 3.146654395842356, + "grad_norm": 0.3208543360233307, + "learning_rate": 4.968533456041577e-05, + "loss": 0.3858, + "step": 465000 + }, + { + "epoch": 3.150037895192724, + "grad_norm": 0.3440122604370117, + "learning_rate": 4.968499621048073e-05, + "loss": 0.3853, + "step": 465500 + }, + { + "epoch": 3.1534213945430922, + "grad_norm": 0.3290427625179291, + "learning_rate": 4.968465786054569e-05, + "loss": 0.3852, + "step": 466000 + }, + { + "epoch": 3.1568048938934603, + "grad_norm": 0.3532296121120453, + "learning_rate": 4.968431951061066e-05, + "loss": 0.3843, + "step": 466500 + }, + { + "epoch": 3.1601883932438284, + "grad_norm": 0.2814158499240875, + "learning_rate": 4.968398116067562e-05, + "loss": 0.3851, + "step": 467000 + }, + { + "epoch": 3.1635718925941965, + "grad_norm": 0.3247326910495758, + "learning_rate": 4.968364281074058e-05, + "loss": 0.3846, + "step": 467500 + }, + { + "epoch": 3.1669553919445645, + "grad_norm": 0.3341606855392456, + "learning_rate": 4.9683304460805544e-05, + "loss": 0.3853, + "step": 468000 + }, + { + "epoch": 3.170338891294933, + "grad_norm": 0.3080476224422455, + "learning_rate": 4.968296611087051e-05, + "loss": 0.3871, + "step": 468500 + }, + { + "epoch": 3.173722390645301, + "grad_norm": 0.36451050639152527, + "learning_rate": 4.9682627760935475e-05, + "loss": 0.3855, + "step": 469000 + }, + { + "epoch": 3.1771058899956692, + "grad_norm": 0.31912335753440857, + "learning_rate": 4.968228941100044e-05, + "loss": 0.385, + "step": 469500 + }, + { + "epoch": 3.1804893893460373, + "grad_norm": 0.3288278877735138, + "learning_rate": 4.96819510610654e-05, + "loss": 0.3844, + "step": 470000 + }, + { + "epoch": 3.1838728886964054, + "grad_norm": 0.3190681040287018, + "learning_rate": 4.968161271113036e-05, + "loss": 0.383, + "step": 470500 + }, + { + "epoch": 3.1872563880467735, + "grad_norm": 0.3208576440811157, + "learning_rate": 4.9681274361195324e-05, + "loss": 0.3859, + "step": 471000 + }, + { + "epoch": 3.1906398873971415, + "grad_norm": 0.3483971953392029, + "learning_rate": 4.9680936011260286e-05, + "loss": 0.3843, + "step": 471500 + }, + { + "epoch": 3.1940233867475096, + "grad_norm": 0.337174654006958, + "learning_rate": 4.968059766132525e-05, + "loss": 0.3844, + "step": 472000 + }, + { + "epoch": 3.1974068860978777, + "grad_norm": 0.3170009255409241, + "learning_rate": 4.968025931139022e-05, + "loss": 0.3841, + "step": 472500 + }, + { + "epoch": 3.200790385448246, + "grad_norm": 0.342808336019516, + "learning_rate": 4.967992096145518e-05, + "loss": 0.3842, + "step": 473000 + }, + { + "epoch": 3.2041738847986143, + "grad_norm": 0.33495399355888367, + "learning_rate": 4.967958261152014e-05, + "loss": 0.3856, + "step": 473500 + }, + { + "epoch": 3.2075573841489824, + "grad_norm": 0.317581444978714, + "learning_rate": 4.96792442615851e-05, + "loss": 0.3853, + "step": 474000 + }, + { + "epoch": 3.2109408834993505, + "grad_norm": 0.31482943892478943, + "learning_rate": 4.967890591165007e-05, + "loss": 0.3843, + "step": 474500 + }, + { + "epoch": 3.2143243828497186, + "grad_norm": 0.3111674189567566, + "learning_rate": 4.967856756171503e-05, + "loss": 0.3861, + "step": 475000 + }, + { + "epoch": 3.2177078822000866, + "grad_norm": 0.31386885046958923, + "learning_rate": 4.967822921177999e-05, + "loss": 0.3849, + "step": 475500 + }, + { + "epoch": 3.2210913815504547, + "grad_norm": 0.33925771713256836, + "learning_rate": 4.967789086184496e-05, + "loss": 0.3859, + "step": 476000 + }, + { + "epoch": 3.224474880900823, + "grad_norm": 0.3127824366092682, + "learning_rate": 4.967755251190992e-05, + "loss": 0.386, + "step": 476500 + }, + { + "epoch": 3.227858380251191, + "grad_norm": 0.3213912546634674, + "learning_rate": 4.967721416197488e-05, + "loss": 0.3845, + "step": 477000 + }, + { + "epoch": 3.231241879601559, + "grad_norm": 0.31241416931152344, + "learning_rate": 4.9676875812039845e-05, + "loss": 0.3857, + "step": 477500 + }, + { + "epoch": 3.234625378951927, + "grad_norm": 0.336775541305542, + "learning_rate": 4.9676537462104814e-05, + "loss": 0.385, + "step": 478000 + }, + { + "epoch": 3.2380088783022956, + "grad_norm": 0.34088069200515747, + "learning_rate": 4.9676199112169776e-05, + "loss": 0.3858, + "step": 478500 + }, + { + "epoch": 3.2413923776526636, + "grad_norm": 0.340310662984848, + "learning_rate": 4.967586076223474e-05, + "loss": 0.3853, + "step": 479000 + }, + { + "epoch": 3.2447758770030317, + "grad_norm": 0.27655091881752014, + "learning_rate": 4.967552241229969e-05, + "loss": 0.3844, + "step": 479500 + }, + { + "epoch": 3.2481593763534, + "grad_norm": 0.34276095032691956, + "learning_rate": 4.967518406236466e-05, + "loss": 0.3848, + "step": 480000 + }, + { + "epoch": 3.251542875703768, + "grad_norm": 0.33317485451698303, + "learning_rate": 4.9674845712429624e-05, + "loss": 0.3863, + "step": 480500 + }, + { + "epoch": 3.254926375054136, + "grad_norm": 0.2947559654712677, + "learning_rate": 4.9674507362494586e-05, + "loss": 0.3851, + "step": 481000 + }, + { + "epoch": 3.258309874404504, + "grad_norm": 0.33836454153060913, + "learning_rate": 4.967416901255955e-05, + "loss": 0.3848, + "step": 481500 + }, + { + "epoch": 3.261693373754872, + "grad_norm": 0.32393962144851685, + "learning_rate": 4.967383066262452e-05, + "loss": 0.3865, + "step": 482000 + }, + { + "epoch": 3.26507687310524, + "grad_norm": 0.34863951802253723, + "learning_rate": 4.967349231268948e-05, + "loss": 0.3839, + "step": 482500 + }, + { + "epoch": 3.2684603724556087, + "grad_norm": 0.35544320940971375, + "learning_rate": 4.967315396275444e-05, + "loss": 0.3859, + "step": 483000 + }, + { + "epoch": 3.271843871805977, + "grad_norm": 0.32499048113822937, + "learning_rate": 4.9672815612819404e-05, + "loss": 0.3859, + "step": 483500 + }, + { + "epoch": 3.275227371156345, + "grad_norm": 0.3342566192150116, + "learning_rate": 4.967247726288437e-05, + "loss": 0.385, + "step": 484000 + }, + { + "epoch": 3.278610870506713, + "grad_norm": 0.32709258794784546, + "learning_rate": 4.967213891294933e-05, + "loss": 0.3852, + "step": 484500 + }, + { + "epoch": 3.281994369857081, + "grad_norm": 0.34438183903694153, + "learning_rate": 4.967180056301429e-05, + "loss": 0.3836, + "step": 485000 + }, + { + "epoch": 3.285377869207449, + "grad_norm": 0.3213898241519928, + "learning_rate": 4.967146221307926e-05, + "loss": 0.3859, + "step": 485500 + }, + { + "epoch": 3.288761368557817, + "grad_norm": 0.3340561091899872, + "learning_rate": 4.967112386314422e-05, + "loss": 0.3853, + "step": 486000 + }, + { + "epoch": 3.2921448679081853, + "grad_norm": 0.31760358810424805, + "learning_rate": 4.967078551320918e-05, + "loss": 0.3852, + "step": 486500 + }, + { + "epoch": 3.2955283672585534, + "grad_norm": 0.31598398089408875, + "learning_rate": 4.9670447163274145e-05, + "loss": 0.3849, + "step": 487000 + }, + { + "epoch": 3.2989118666089214, + "grad_norm": 0.326427698135376, + "learning_rate": 4.9670108813339114e-05, + "loss": 0.3854, + "step": 487500 + }, + { + "epoch": 3.3022953659592895, + "grad_norm": 0.3454003930091858, + "learning_rate": 4.9669770463404076e-05, + "loss": 0.3844, + "step": 488000 + }, + { + "epoch": 3.305678865309658, + "grad_norm": 0.32165205478668213, + "learning_rate": 4.966943211346904e-05, + "loss": 0.3841, + "step": 488500 + }, + { + "epoch": 3.309062364660026, + "grad_norm": 0.3449212610721588, + "learning_rate": 4.9669093763533994e-05, + "loss": 0.3868, + "step": 489000 + }, + { + "epoch": 3.312445864010394, + "grad_norm": 0.3350136876106262, + "learning_rate": 4.966875541359896e-05, + "loss": 0.3839, + "step": 489500 + }, + { + "epoch": 3.3158293633607623, + "grad_norm": 0.30893436074256897, + "learning_rate": 4.9668417063663925e-05, + "loss": 0.3844, + "step": 490000 + }, + { + "epoch": 3.3192128627111304, + "grad_norm": 0.33487969636917114, + "learning_rate": 4.966807871372889e-05, + "loss": 0.386, + "step": 490500 + }, + { + "epoch": 3.3225963620614984, + "grad_norm": 0.3271050453186035, + "learning_rate": 4.966774036379385e-05, + "loss": 0.3851, + "step": 491000 + }, + { + "epoch": 3.3259798614118665, + "grad_norm": 0.3512710928916931, + "learning_rate": 4.966740201385882e-05, + "loss": 0.3859, + "step": 491500 + }, + { + "epoch": 3.3293633607622346, + "grad_norm": 0.3383200466632843, + "learning_rate": 4.966706366392378e-05, + "loss": 0.3838, + "step": 492000 + }, + { + "epoch": 3.3327468601126027, + "grad_norm": 0.34418985247612, + "learning_rate": 4.966672531398874e-05, + "loss": 0.3848, + "step": 492500 + }, + { + "epoch": 3.336130359462971, + "grad_norm": 0.3254546523094177, + "learning_rate": 4.9666386964053704e-05, + "loss": 0.385, + "step": 493000 + }, + { + "epoch": 3.3395138588133393, + "grad_norm": 0.3146657645702362, + "learning_rate": 4.966604861411867e-05, + "loss": 0.386, + "step": 493500 + }, + { + "epoch": 3.3428973581637074, + "grad_norm": 0.34255629777908325, + "learning_rate": 4.966571026418363e-05, + "loss": 0.3846, + "step": 494000 + }, + { + "epoch": 3.3462808575140754, + "grad_norm": 0.33413317799568176, + "learning_rate": 4.966537191424859e-05, + "loss": 0.3861, + "step": 494500 + }, + { + "epoch": 3.3496643568644435, + "grad_norm": 0.31402289867401123, + "learning_rate": 4.966503356431356e-05, + "loss": 0.3848, + "step": 495000 + }, + { + "epoch": 3.3530478562148116, + "grad_norm": 0.3087741732597351, + "learning_rate": 4.966469521437852e-05, + "loss": 0.3868, + "step": 495500 + }, + { + "epoch": 3.3564313555651797, + "grad_norm": 0.2981843054294586, + "learning_rate": 4.9664356864443484e-05, + "loss": 0.3836, + "step": 496000 + }, + { + "epoch": 3.3598148549155478, + "grad_norm": 0.3750145137310028, + "learning_rate": 4.9664018514508446e-05, + "loss": 0.3858, + "step": 496500 + }, + { + "epoch": 3.363198354265916, + "grad_norm": 0.33225759863853455, + "learning_rate": 4.9663680164573415e-05, + "loss": 0.3845, + "step": 497000 + }, + { + "epoch": 3.366581853616284, + "grad_norm": 0.3978593647480011, + "learning_rate": 4.966334181463838e-05, + "loss": 0.3836, + "step": 497500 + }, + { + "epoch": 3.369965352966652, + "grad_norm": 0.3188680112361908, + "learning_rate": 4.966300346470334e-05, + "loss": 0.3846, + "step": 498000 + }, + { + "epoch": 3.3733488523170205, + "grad_norm": 0.3641412854194641, + "learning_rate": 4.96626651147683e-05, + "loss": 0.3852, + "step": 498500 + }, + { + "epoch": 3.3767323516673886, + "grad_norm": 0.3012436628341675, + "learning_rate": 4.966232676483326e-05, + "loss": 0.387, + "step": 499000 + }, + { + "epoch": 3.3801158510177567, + "grad_norm": 0.33627980947494507, + "learning_rate": 4.9661988414898225e-05, + "loss": 0.3851, + "step": 499500 + }, + { + "epoch": 3.3834993503681248, + "grad_norm": 0.3410860598087311, + "learning_rate": 4.966165006496319e-05, + "loss": 0.3847, + "step": 500000 + }, + { + "epoch": 3.386882849718493, + "grad_norm": 0.3147096633911133, + "learning_rate": 4.966131171502815e-05, + "loss": 0.3857, + "step": 500500 + }, + { + "epoch": 3.390266349068861, + "grad_norm": 0.3315429389476776, + "learning_rate": 4.966097336509312e-05, + "loss": 0.3842, + "step": 501000 + }, + { + "epoch": 3.393649848419229, + "grad_norm": 0.316974014043808, + "learning_rate": 4.966063501515808e-05, + "loss": 0.3845, + "step": 501500 + }, + { + "epoch": 3.397033347769597, + "grad_norm": 0.29767683148384094, + "learning_rate": 4.966029666522304e-05, + "loss": 0.3857, + "step": 502000 + }, + { + "epoch": 3.400416847119965, + "grad_norm": 0.31953442096710205, + "learning_rate": 4.9659958315288005e-05, + "loss": 0.3857, + "step": 502500 + }, + { + "epoch": 3.4038003464703337, + "grad_norm": 0.3329973816871643, + "learning_rate": 4.9659619965352974e-05, + "loss": 0.3857, + "step": 503000 + }, + { + "epoch": 3.4071838458207018, + "grad_norm": 0.35081276297569275, + "learning_rate": 4.965928161541793e-05, + "loss": 0.3842, + "step": 503500 + }, + { + "epoch": 3.41056734517107, + "grad_norm": 0.34732785820961, + "learning_rate": 4.965894326548289e-05, + "loss": 0.3859, + "step": 504000 + }, + { + "epoch": 3.413950844521438, + "grad_norm": 0.3713749647140503, + "learning_rate": 4.965860491554786e-05, + "loss": 0.3839, + "step": 504500 + }, + { + "epoch": 3.417334343871806, + "grad_norm": 0.33542150259017944, + "learning_rate": 4.965826656561282e-05, + "loss": 0.3861, + "step": 505000 + }, + { + "epoch": 3.420717843222174, + "grad_norm": 0.29569828510284424, + "learning_rate": 4.9657928215677784e-05, + "loss": 0.3851, + "step": 505500 + }, + { + "epoch": 3.424101342572542, + "grad_norm": 0.33000361919403076, + "learning_rate": 4.965758986574275e-05, + "loss": 0.3846, + "step": 506000 + }, + { + "epoch": 3.4274848419229103, + "grad_norm": 0.3097745180130005, + "learning_rate": 4.9657251515807716e-05, + "loss": 0.3842, + "step": 506500 + }, + { + "epoch": 3.4308683412732783, + "grad_norm": 0.33550727367401123, + "learning_rate": 4.965691316587268e-05, + "loss": 0.3859, + "step": 507000 + }, + { + "epoch": 3.4342518406236464, + "grad_norm": 0.32614001631736755, + "learning_rate": 4.965657481593764e-05, + "loss": 0.3856, + "step": 507500 + }, + { + "epoch": 3.4376353399740145, + "grad_norm": 0.38159069418907166, + "learning_rate": 4.96562364660026e-05, + "loss": 0.3855, + "step": 508000 + }, + { + "epoch": 3.441018839324383, + "grad_norm": 0.33263829350471497, + "learning_rate": 4.9655898116067564e-05, + "loss": 0.3849, + "step": 508500 + }, + { + "epoch": 3.444402338674751, + "grad_norm": 0.317388653755188, + "learning_rate": 4.9655559766132526e-05, + "loss": 0.383, + "step": 509000 + }, + { + "epoch": 3.447785838025119, + "grad_norm": 0.34153813123703003, + "learning_rate": 4.965522141619749e-05, + "loss": 0.3836, + "step": 509500 + }, + { + "epoch": 3.4511693373754873, + "grad_norm": 0.3225274384021759, + "learning_rate": 4.965488306626245e-05, + "loss": 0.384, + "step": 510000 + }, + { + "epoch": 3.4545528367258553, + "grad_norm": 0.3087396025657654, + "learning_rate": 4.965454471632742e-05, + "loss": 0.385, + "step": 510500 + }, + { + "epoch": 3.4579363360762234, + "grad_norm": 0.3249145448207855, + "learning_rate": 4.965420636639238e-05, + "loss": 0.3857, + "step": 511000 + }, + { + "epoch": 3.4613198354265915, + "grad_norm": 0.34861987829208374, + "learning_rate": 4.9653868016457344e-05, + "loss": 0.3838, + "step": 511500 + }, + { + "epoch": 3.4647033347769596, + "grad_norm": 0.3348096013069153, + "learning_rate": 4.9653529666522306e-05, + "loss": 0.3846, + "step": 512000 + }, + { + "epoch": 3.4680868341273277, + "grad_norm": 0.3486187756061554, + "learning_rate": 4.9653191316587275e-05, + "loss": 0.3833, + "step": 512500 + }, + { + "epoch": 3.471470333477696, + "grad_norm": 0.31958213448524475, + "learning_rate": 4.965285296665223e-05, + "loss": 0.3843, + "step": 513000 + }, + { + "epoch": 3.4748538328280643, + "grad_norm": 0.35458359122276306, + "learning_rate": 4.965251461671719e-05, + "loss": 0.3845, + "step": 513500 + }, + { + "epoch": 3.4782373321784323, + "grad_norm": 0.31329068541526794, + "learning_rate": 4.965217626678216e-05, + "loss": 0.3843, + "step": 514000 + }, + { + "epoch": 3.4816208315288004, + "grad_norm": 0.3021182715892792, + "learning_rate": 4.965183791684712e-05, + "loss": 0.3855, + "step": 514500 + }, + { + "epoch": 3.4850043308791685, + "grad_norm": 0.3269021809101105, + "learning_rate": 4.9651499566912085e-05, + "loss": 0.3851, + "step": 515000 + }, + { + "epoch": 3.4883878302295366, + "grad_norm": 0.3389667868614197, + "learning_rate": 4.965116121697705e-05, + "loss": 0.3859, + "step": 515500 + }, + { + "epoch": 3.4917713295799047, + "grad_norm": 0.33493050932884216, + "learning_rate": 4.9650822867042016e-05, + "loss": 0.3839, + "step": 516000 + }, + { + "epoch": 3.4951548289302727, + "grad_norm": 0.326436311006546, + "learning_rate": 4.965048451710698e-05, + "loss": 0.3849, + "step": 516500 + }, + { + "epoch": 3.498538328280641, + "grad_norm": 0.3414977490901947, + "learning_rate": 4.965014616717194e-05, + "loss": 0.3847, + "step": 517000 + }, + { + "epoch": 3.5019218276310093, + "grad_norm": 0.31117111444473267, + "learning_rate": 4.96498078172369e-05, + "loss": 0.384, + "step": 517500 + }, + { + "epoch": 3.505305326981377, + "grad_norm": 0.31108585000038147, + "learning_rate": 4.9649469467301865e-05, + "loss": 0.3855, + "step": 518000 + }, + { + "epoch": 3.5086888263317455, + "grad_norm": 0.3296654522418976, + "learning_rate": 4.964913111736683e-05, + "loss": 0.3841, + "step": 518500 + }, + { + "epoch": 3.5120723256821136, + "grad_norm": 0.34887850284576416, + "learning_rate": 4.964879276743179e-05, + "loss": 0.3848, + "step": 519000 + }, + { + "epoch": 3.5154558250324817, + "grad_norm": 0.3279761075973511, + "learning_rate": 4.964845441749675e-05, + "loss": 0.3842, + "step": 519500 + }, + { + "epoch": 3.5188393243828497, + "grad_norm": 0.3656323254108429, + "learning_rate": 4.964811606756172e-05, + "loss": 0.3846, + "step": 520000 + }, + { + "epoch": 3.522222823733218, + "grad_norm": 0.30262666940689087, + "learning_rate": 4.964777771762668e-05, + "loss": 0.3848, + "step": 520500 + }, + { + "epoch": 3.525606323083586, + "grad_norm": 0.3520102798938751, + "learning_rate": 4.9647439367691644e-05, + "loss": 0.3823, + "step": 521000 + }, + { + "epoch": 3.528989822433954, + "grad_norm": 0.31039148569107056, + "learning_rate": 4.9647101017756606e-05, + "loss": 0.3834, + "step": 521500 + }, + { + "epoch": 3.532373321784322, + "grad_norm": 0.36634209752082825, + "learning_rate": 4.9646762667821575e-05, + "loss": 0.3854, + "step": 522000 + }, + { + "epoch": 3.53575682113469, + "grad_norm": 0.3379027247428894, + "learning_rate": 4.964642431788653e-05, + "loss": 0.386, + "step": 522500 + }, + { + "epoch": 3.5391403204850587, + "grad_norm": 0.37038281559944153, + "learning_rate": 4.964608596795149e-05, + "loss": 0.3854, + "step": 523000 + }, + { + "epoch": 3.5425238198354267, + "grad_norm": 0.35718047618865967, + "learning_rate": 4.964574761801646e-05, + "loss": 0.3852, + "step": 523500 + }, + { + "epoch": 3.545907319185795, + "grad_norm": 0.3311220705509186, + "learning_rate": 4.9645409268081424e-05, + "loss": 0.3843, + "step": 524000 + }, + { + "epoch": 3.549290818536163, + "grad_norm": 0.34186434745788574, + "learning_rate": 4.9645070918146386e-05, + "loss": 0.3837, + "step": 524500 + }, + { + "epoch": 3.552674317886531, + "grad_norm": 0.34989869594573975, + "learning_rate": 4.964473256821135e-05, + "loss": 0.3843, + "step": 525000 + }, + { + "epoch": 3.556057817236899, + "grad_norm": 0.36629295349121094, + "learning_rate": 4.964439421827631e-05, + "loss": 0.3852, + "step": 525500 + }, + { + "epoch": 3.559441316587267, + "grad_norm": 0.2990732192993164, + "learning_rate": 4.964405586834128e-05, + "loss": 0.3834, + "step": 526000 + }, + { + "epoch": 3.5628248159376352, + "grad_norm": 0.3365938663482666, + "learning_rate": 4.964371751840624e-05, + "loss": 0.3839, + "step": 526500 + }, + { + "epoch": 3.5662083152880033, + "grad_norm": 0.3220049738883972, + "learning_rate": 4.96433791684712e-05, + "loss": 0.3841, + "step": 527000 + }, + { + "epoch": 3.569591814638372, + "grad_norm": 0.309627890586853, + "learning_rate": 4.9643040818536165e-05, + "loss": 0.3862, + "step": 527500 + }, + { + "epoch": 3.5729753139887395, + "grad_norm": 0.3281790316104889, + "learning_rate": 4.964270246860113e-05, + "loss": 0.3832, + "step": 528000 + }, + { + "epoch": 3.576358813339108, + "grad_norm": 0.34404656291007996, + "learning_rate": 4.964236411866609e-05, + "loss": 0.3846, + "step": 528500 + }, + { + "epoch": 3.579742312689476, + "grad_norm": 0.3608299791812897, + "learning_rate": 4.964202576873105e-05, + "loss": 0.3852, + "step": 529000 + }, + { + "epoch": 3.583125812039844, + "grad_norm": 0.3281520903110504, + "learning_rate": 4.964168741879602e-05, + "loss": 0.3834, + "step": 529500 + }, + { + "epoch": 3.5865093113902122, + "grad_norm": 0.3279350697994232, + "learning_rate": 4.964134906886098e-05, + "loss": 0.3862, + "step": 530000 + }, + { + "epoch": 3.5898928107405803, + "grad_norm": 0.30105507373809814, + "learning_rate": 4.9641010718925945e-05, + "loss": 0.3857, + "step": 530500 + }, + { + "epoch": 3.5932763100909484, + "grad_norm": 0.33526769280433655, + "learning_rate": 4.964067236899091e-05, + "loss": 0.3844, + "step": 531000 + }, + { + "epoch": 3.5966598094413165, + "grad_norm": 0.3471981883049011, + "learning_rate": 4.9640334019055876e-05, + "loss": 0.3844, + "step": 531500 + }, + { + "epoch": 3.6000433087916845, + "grad_norm": 0.32439425587654114, + "learning_rate": 4.963999566912083e-05, + "loss": 0.3836, + "step": 532000 + }, + { + "epoch": 3.6034268081420526, + "grad_norm": 0.28566959500312805, + "learning_rate": 4.963965731918579e-05, + "loss": 0.3852, + "step": 532500 + }, + { + "epoch": 3.606810307492421, + "grad_norm": 0.32242149114608765, + "learning_rate": 4.9639318969250755e-05, + "loss": 0.3842, + "step": 533000 + }, + { + "epoch": 3.6101938068427892, + "grad_norm": 0.35472533106803894, + "learning_rate": 4.9638980619315724e-05, + "loss": 0.3819, + "step": 533500 + }, + { + "epoch": 3.6135773061931573, + "grad_norm": 0.3436533510684967, + "learning_rate": 4.9638642269380686e-05, + "loss": 0.3846, + "step": 534000 + }, + { + "epoch": 3.6169608055435254, + "grad_norm": 0.3572445511817932, + "learning_rate": 4.963830391944565e-05, + "loss": 0.3858, + "step": 534500 + }, + { + "epoch": 3.6203443048938935, + "grad_norm": 0.3280385434627533, + "learning_rate": 4.963796556951061e-05, + "loss": 0.385, + "step": 535000 + }, + { + "epoch": 3.6237278042442616, + "grad_norm": 0.34210699796676636, + "learning_rate": 4.963762721957558e-05, + "loss": 0.3843, + "step": 535500 + }, + { + "epoch": 3.6271113035946296, + "grad_norm": 0.312200129032135, + "learning_rate": 4.963728886964054e-05, + "loss": 0.3861, + "step": 536000 + }, + { + "epoch": 3.6304948029449977, + "grad_norm": 0.32678160071372986, + "learning_rate": 4.9636950519705504e-05, + "loss": 0.3859, + "step": 536500 + }, + { + "epoch": 3.633878302295366, + "grad_norm": 0.3201853930950165, + "learning_rate": 4.9636612169770466e-05, + "loss": 0.3848, + "step": 537000 + }, + { + "epoch": 3.6372618016457343, + "grad_norm": 0.33350878953933716, + "learning_rate": 4.963627381983543e-05, + "loss": 0.3833, + "step": 537500 + }, + { + "epoch": 3.640645300996102, + "grad_norm": 0.32399922609329224, + "learning_rate": 4.963593546990039e-05, + "loss": 0.3843, + "step": 538000 + }, + { + "epoch": 3.6440288003464705, + "grad_norm": 0.3093265891075134, + "learning_rate": 4.963559711996535e-05, + "loss": 0.3855, + "step": 538500 + }, + { + "epoch": 3.6474122996968386, + "grad_norm": 0.35990044474601746, + "learning_rate": 4.963525877003032e-05, + "loss": 0.3828, + "step": 539000 + }, + { + "epoch": 3.6507957990472066, + "grad_norm": 0.34472528100013733, + "learning_rate": 4.963492042009528e-05, + "loss": 0.3828, + "step": 539500 + }, + { + "epoch": 3.6541792983975747, + "grad_norm": 0.3378034830093384, + "learning_rate": 4.9634582070160245e-05, + "loss": 0.3842, + "step": 540000 + }, + { + "epoch": 3.657562797747943, + "grad_norm": 0.3017686903476715, + "learning_rate": 4.963424372022521e-05, + "loss": 0.3845, + "step": 540500 + }, + { + "epoch": 3.660946297098311, + "grad_norm": 0.3325106203556061, + "learning_rate": 4.9633905370290176e-05, + "loss": 0.3847, + "step": 541000 + }, + { + "epoch": 3.664329796448679, + "grad_norm": 0.37856626510620117, + "learning_rate": 4.963356702035513e-05, + "loss": 0.3842, + "step": 541500 + }, + { + "epoch": 3.6677132957990475, + "grad_norm": 0.36542338132858276, + "learning_rate": 4.9633228670420094e-05, + "loss": 0.3855, + "step": 542000 + }, + { + "epoch": 3.671096795149415, + "grad_norm": 0.3596114218235016, + "learning_rate": 4.9632890320485056e-05, + "loss": 0.3832, + "step": 542500 + }, + { + "epoch": 3.6744802944997836, + "grad_norm": 0.344609797000885, + "learning_rate": 4.9632551970550025e-05, + "loss": 0.3838, + "step": 543000 + }, + { + "epoch": 3.6778637938501517, + "grad_norm": 0.4544999897480011, + "learning_rate": 4.963221362061499e-05, + "loss": 0.3851, + "step": 543500 + }, + { + "epoch": 3.68124729320052, + "grad_norm": 0.3322232961654663, + "learning_rate": 4.963187527067995e-05, + "loss": 0.3837, + "step": 544000 + }, + { + "epoch": 3.684630792550888, + "grad_norm": 0.3328924775123596, + "learning_rate": 4.963153692074491e-05, + "loss": 0.3842, + "step": 544500 + }, + { + "epoch": 3.688014291901256, + "grad_norm": 0.3372010290622711, + "learning_rate": 4.963119857080988e-05, + "loss": 0.3856, + "step": 545000 + }, + { + "epoch": 3.691397791251624, + "grad_norm": 0.29983511567115784, + "learning_rate": 4.963086022087484e-05, + "loss": 0.3859, + "step": 545500 + }, + { + "epoch": 3.694781290601992, + "grad_norm": 0.32538631558418274, + "learning_rate": 4.9630521870939804e-05, + "loss": 0.3855, + "step": 546000 + }, + { + "epoch": 3.69816478995236, + "grad_norm": 0.3350183963775635, + "learning_rate": 4.9630183521004767e-05, + "loss": 0.3848, + "step": 546500 + }, + { + "epoch": 3.7015482893027283, + "grad_norm": 0.3626324534416199, + "learning_rate": 4.962984517106973e-05, + "loss": 0.3841, + "step": 547000 + }, + { + "epoch": 3.704931788653097, + "grad_norm": 0.32910341024398804, + "learning_rate": 4.962950682113469e-05, + "loss": 0.3824, + "step": 547500 + }, + { + "epoch": 3.7083152880034644, + "grad_norm": 0.3035507798194885, + "learning_rate": 4.962916847119965e-05, + "loss": 0.3841, + "step": 548000 + }, + { + "epoch": 3.711698787353833, + "grad_norm": 0.30795642733573914, + "learning_rate": 4.962883012126462e-05, + "loss": 0.3823, + "step": 548500 + }, + { + "epoch": 3.715082286704201, + "grad_norm": 0.2978059947490692, + "learning_rate": 4.9628491771329584e-05, + "loss": 0.384, + "step": 549000 + }, + { + "epoch": 3.718465786054569, + "grad_norm": 0.38770484924316406, + "learning_rate": 4.9628153421394546e-05, + "loss": 0.3851, + "step": 549500 + }, + { + "epoch": 3.721849285404937, + "grad_norm": 0.33769622445106506, + "learning_rate": 4.962781507145951e-05, + "loss": 0.3843, + "step": 550000 + }, + { + "epoch": 3.7252327847553053, + "grad_norm": 0.32749757170677185, + "learning_rate": 4.962747672152448e-05, + "loss": 0.3845, + "step": 550500 + }, + { + "epoch": 3.7286162841056734, + "grad_norm": 0.30986374616622925, + "learning_rate": 4.962713837158944e-05, + "loss": 0.3858, + "step": 551000 + }, + { + "epoch": 3.7319997834560414, + "grad_norm": 0.3359217345714569, + "learning_rate": 4.9626800021654395e-05, + "loss": 0.3848, + "step": 551500 + }, + { + "epoch": 3.73538328280641, + "grad_norm": 0.3180493414402008, + "learning_rate": 4.962646167171936e-05, + "loss": 0.3842, + "step": 552000 + }, + { + "epoch": 3.7387667821567776, + "grad_norm": 0.34266242384910583, + "learning_rate": 4.9626123321784326e-05, + "loss": 0.3842, + "step": 552500 + }, + { + "epoch": 3.742150281507146, + "grad_norm": 0.3216932713985443, + "learning_rate": 4.962578497184929e-05, + "loss": 0.3834, + "step": 553000 + }, + { + "epoch": 3.745533780857514, + "grad_norm": 0.3520362079143524, + "learning_rate": 4.962544662191425e-05, + "loss": 0.3859, + "step": 553500 + }, + { + "epoch": 3.7489172802078823, + "grad_norm": 0.3680345118045807, + "learning_rate": 4.962510827197921e-05, + "loss": 0.3844, + "step": 554000 + }, + { + "epoch": 3.7523007795582504, + "grad_norm": 0.34057295322418213, + "learning_rate": 4.962476992204418e-05, + "loss": 0.3835, + "step": 554500 + }, + { + "epoch": 3.7556842789086184, + "grad_norm": 0.3277778625488281, + "learning_rate": 4.962443157210914e-05, + "loss": 0.3859, + "step": 555000 + }, + { + "epoch": 3.7590677782589865, + "grad_norm": 0.3285628855228424, + "learning_rate": 4.9624093222174105e-05, + "loss": 0.3852, + "step": 555500 + }, + { + "epoch": 3.7624512776093546, + "grad_norm": 0.311026394367218, + "learning_rate": 4.962375487223907e-05, + "loss": 0.3839, + "step": 556000 + }, + { + "epoch": 3.7658347769597227, + "grad_norm": 0.33761194348335266, + "learning_rate": 4.962341652230403e-05, + "loss": 0.3837, + "step": 556500 + }, + { + "epoch": 3.7692182763100908, + "grad_norm": 0.30678847432136536, + "learning_rate": 4.962307817236899e-05, + "loss": 0.3831, + "step": 557000 + }, + { + "epoch": 3.7726017756604593, + "grad_norm": 0.32656219601631165, + "learning_rate": 4.9622739822433954e-05, + "loss": 0.3839, + "step": 557500 + }, + { + "epoch": 3.775985275010827, + "grad_norm": 0.3577435314655304, + "learning_rate": 4.962240147249892e-05, + "loss": 0.3847, + "step": 558000 + }, + { + "epoch": 3.7793687743611954, + "grad_norm": 0.3245508372783661, + "learning_rate": 4.9622063122563885e-05, + "loss": 0.3837, + "step": 558500 + }, + { + "epoch": 3.7827522737115635, + "grad_norm": 0.33766868710517883, + "learning_rate": 4.962172477262885e-05, + "loss": 0.3849, + "step": 559000 + }, + { + "epoch": 3.7861357730619316, + "grad_norm": 0.3148922324180603, + "learning_rate": 4.962138642269381e-05, + "loss": 0.3822, + "step": 559500 + }, + { + "epoch": 3.7895192724122997, + "grad_norm": 0.34013688564300537, + "learning_rate": 4.962104807275878e-05, + "loss": 0.3852, + "step": 560000 + }, + { + "epoch": 3.7929027717626678, + "grad_norm": 0.3306371867656708, + "learning_rate": 4.962070972282374e-05, + "loss": 0.3835, + "step": 560500 + }, + { + "epoch": 3.796286271113036, + "grad_norm": 0.30493348836898804, + "learning_rate": 4.9620371372888695e-05, + "loss": 0.3837, + "step": 561000 + }, + { + "epoch": 3.799669770463404, + "grad_norm": 0.3560996353626251, + "learning_rate": 4.962003302295366e-05, + "loss": 0.3833, + "step": 561500 + }, + { + "epoch": 3.8030532698137725, + "grad_norm": 0.3450899124145508, + "learning_rate": 4.9619694673018626e-05, + "loss": 0.3852, + "step": 562000 + }, + { + "epoch": 3.80643676916414, + "grad_norm": 0.3246195912361145, + "learning_rate": 4.961935632308359e-05, + "loss": 0.3848, + "step": 562500 + }, + { + "epoch": 3.8098202685145086, + "grad_norm": 0.3165920674800873, + "learning_rate": 4.961901797314855e-05, + "loss": 0.3836, + "step": 563000 + }, + { + "epoch": 3.8132037678648767, + "grad_norm": 0.3178715705871582, + "learning_rate": 4.961867962321351e-05, + "loss": 0.3836, + "step": 563500 + }, + { + "epoch": 3.8165872672152448, + "grad_norm": 0.3456692397594452, + "learning_rate": 4.961834127327848e-05, + "loss": 0.3842, + "step": 564000 + }, + { + "epoch": 3.819970766565613, + "grad_norm": 0.34803831577301025, + "learning_rate": 4.9618002923343444e-05, + "loss": 0.3841, + "step": 564500 + }, + { + "epoch": 3.823354265915981, + "grad_norm": 0.3458116948604584, + "learning_rate": 4.9617664573408406e-05, + "loss": 0.3831, + "step": 565000 + }, + { + "epoch": 3.826737765266349, + "grad_norm": 0.3486628234386444, + "learning_rate": 4.961732622347337e-05, + "loss": 0.3867, + "step": 565500 + }, + { + "epoch": 3.830121264616717, + "grad_norm": 0.34568849205970764, + "learning_rate": 4.961698787353833e-05, + "loss": 0.3834, + "step": 566000 + }, + { + "epoch": 3.833504763967085, + "grad_norm": 0.34365609288215637, + "learning_rate": 4.961664952360329e-05, + "loss": 0.3836, + "step": 566500 + }, + { + "epoch": 3.8368882633174533, + "grad_norm": 0.5501734614372253, + "learning_rate": 4.9616311173668254e-05, + "loss": 0.3841, + "step": 567000 + }, + { + "epoch": 3.8402717626678218, + "grad_norm": 0.3909134268760681, + "learning_rate": 4.961597282373322e-05, + "loss": 0.3838, + "step": 567500 + }, + { + "epoch": 3.84365526201819, + "grad_norm": 0.34609946608543396, + "learning_rate": 4.9615634473798185e-05, + "loss": 0.3841, + "step": 568000 + }, + { + "epoch": 3.847038761368558, + "grad_norm": 0.33834394812583923, + "learning_rate": 4.961529612386315e-05, + "loss": 0.384, + "step": 568500 + }, + { + "epoch": 3.850422260718926, + "grad_norm": 0.32245519757270813, + "learning_rate": 4.961495777392811e-05, + "loss": 0.3839, + "step": 569000 + }, + { + "epoch": 3.853805760069294, + "grad_norm": 0.3729417622089386, + "learning_rate": 4.961461942399308e-05, + "loss": 0.383, + "step": 569500 + }, + { + "epoch": 3.857189259419662, + "grad_norm": 0.34652623534202576, + "learning_rate": 4.961428107405804e-05, + "loss": 0.3832, + "step": 570000 + }, + { + "epoch": 3.8605727587700303, + "grad_norm": 0.33548983931541443, + "learning_rate": 4.9613942724122996e-05, + "loss": 0.3832, + "step": 570500 + }, + { + "epoch": 3.8639562581203983, + "grad_norm": 0.3460247814655304, + "learning_rate": 4.961360437418796e-05, + "loss": 0.3841, + "step": 571000 + }, + { + "epoch": 3.8673397574707664, + "grad_norm": 0.3622657358646393, + "learning_rate": 4.961326602425293e-05, + "loss": 0.3851, + "step": 571500 + }, + { + "epoch": 3.870723256821135, + "grad_norm": 0.32590335607528687, + "learning_rate": 4.961292767431789e-05, + "loss": 0.3848, + "step": 572000 + }, + { + "epoch": 3.8741067561715026, + "grad_norm": 0.3382300138473511, + "learning_rate": 4.961258932438285e-05, + "loss": 0.3819, + "step": 572500 + }, + { + "epoch": 3.877490255521871, + "grad_norm": 0.33236709237098694, + "learning_rate": 4.961225097444781e-05, + "loss": 0.3841, + "step": 573000 + }, + { + "epoch": 3.880873754872239, + "grad_norm": 0.3273625075817108, + "learning_rate": 4.961191262451278e-05, + "loss": 0.382, + "step": 573500 + }, + { + "epoch": 3.8842572542226073, + "grad_norm": 0.3772786259651184, + "learning_rate": 4.9611574274577744e-05, + "loss": 0.383, + "step": 574000 + }, + { + "epoch": 3.8876407535729753, + "grad_norm": 0.32407230138778687, + "learning_rate": 4.9611235924642706e-05, + "loss": 0.3854, + "step": 574500 + }, + { + "epoch": 3.8910242529233434, + "grad_norm": 0.326904296875, + "learning_rate": 4.961089757470767e-05, + "loss": 0.3824, + "step": 575000 + }, + { + "epoch": 3.8944077522737115, + "grad_norm": 0.2998929023742676, + "learning_rate": 4.961055922477263e-05, + "loss": 0.3827, + "step": 575500 + }, + { + "epoch": 3.8977912516240796, + "grad_norm": 0.3163350224494934, + "learning_rate": 4.961022087483759e-05, + "loss": 0.3833, + "step": 576000 + }, + { + "epoch": 3.9011747509744477, + "grad_norm": 0.34520888328552246, + "learning_rate": 4.9609882524902555e-05, + "loss": 0.3829, + "step": 576500 + }, + { + "epoch": 3.9045582503248157, + "grad_norm": 0.31207075715065, + "learning_rate": 4.9609544174967524e-05, + "loss": 0.3825, + "step": 577000 + }, + { + "epoch": 3.9079417496751843, + "grad_norm": 0.3276323676109314, + "learning_rate": 4.9609205825032486e-05, + "loss": 0.3828, + "step": 577500 + }, + { + "epoch": 3.9113252490255523, + "grad_norm": 0.30505678057670593, + "learning_rate": 4.960886747509745e-05, + "loss": 0.384, + "step": 578000 + }, + { + "epoch": 3.9147087483759204, + "grad_norm": 0.32663464546203613, + "learning_rate": 4.960852912516241e-05, + "loss": 0.3835, + "step": 578500 + }, + { + "epoch": 3.9180922477262885, + "grad_norm": 0.32040658593177795, + "learning_rate": 4.960819077522737e-05, + "loss": 0.3822, + "step": 579000 + }, + { + "epoch": 3.9214757470766566, + "grad_norm": 0.32430753111839294, + "learning_rate": 4.960785242529234e-05, + "loss": 0.3832, + "step": 579500 + }, + { + "epoch": 3.9248592464270247, + "grad_norm": 0.31457093358039856, + "learning_rate": 4.9607514075357296e-05, + "loss": 0.3837, + "step": 580000 + }, + { + "epoch": 3.9282427457773927, + "grad_norm": 0.33417144417762756, + "learning_rate": 4.960717572542226e-05, + "loss": 0.3838, + "step": 580500 + }, + { + "epoch": 3.931626245127761, + "grad_norm": 0.3194616436958313, + "learning_rate": 4.960683737548723e-05, + "loss": 0.3832, + "step": 581000 + }, + { + "epoch": 3.935009744478129, + "grad_norm": 0.3265133798122406, + "learning_rate": 4.960649902555219e-05, + "loss": 0.3832, + "step": 581500 + }, + { + "epoch": 3.9383932438284974, + "grad_norm": 0.3196572959423065, + "learning_rate": 4.960616067561715e-05, + "loss": 0.3836, + "step": 582000 + }, + { + "epoch": 3.941776743178865, + "grad_norm": 0.36828818917274475, + "learning_rate": 4.9605822325682114e-05, + "loss": 0.3837, + "step": 582500 + }, + { + "epoch": 3.9451602425292336, + "grad_norm": 0.3626387119293213, + "learning_rate": 4.960548397574708e-05, + "loss": 0.3833, + "step": 583000 + }, + { + "epoch": 3.9485437418796017, + "grad_norm": 0.3399355709552765, + "learning_rate": 4.9605145625812045e-05, + "loss": 0.385, + "step": 583500 + }, + { + "epoch": 3.9519272412299697, + "grad_norm": 0.3553299307823181, + "learning_rate": 4.960480727587701e-05, + "loss": 0.3848, + "step": 584000 + }, + { + "epoch": 3.955310740580338, + "grad_norm": 0.3709312379360199, + "learning_rate": 4.960446892594197e-05, + "loss": 0.3847, + "step": 584500 + }, + { + "epoch": 3.958694239930706, + "grad_norm": 0.3104972541332245, + "learning_rate": 4.960413057600693e-05, + "loss": 0.3853, + "step": 585000 + }, + { + "epoch": 3.962077739281074, + "grad_norm": 0.3424404263496399, + "learning_rate": 4.960379222607189e-05, + "loss": 0.3837, + "step": 585500 + }, + { + "epoch": 3.965461238631442, + "grad_norm": 0.3378540277481079, + "learning_rate": 4.9603453876136855e-05, + "loss": 0.3846, + "step": 586000 + }, + { + "epoch": 3.96884473798181, + "grad_norm": 0.30796873569488525, + "learning_rate": 4.9603115526201824e-05, + "loss": 0.3844, + "step": 586500 + }, + { + "epoch": 3.9722282373321782, + "grad_norm": 0.38140997290611267, + "learning_rate": 4.9602777176266786e-05, + "loss": 0.3844, + "step": 587000 + }, + { + "epoch": 3.9756117366825467, + "grad_norm": 0.34148746728897095, + "learning_rate": 4.960243882633175e-05, + "loss": 0.386, + "step": 587500 + }, + { + "epoch": 3.978995236032915, + "grad_norm": 0.3145066201686859, + "learning_rate": 4.960210047639671e-05, + "loss": 0.3833, + "step": 588000 + }, + { + "epoch": 3.982378735383283, + "grad_norm": 0.3204039931297302, + "learning_rate": 4.960176212646167e-05, + "loss": 0.3828, + "step": 588500 + }, + { + "epoch": 3.985762234733651, + "grad_norm": 0.3088099956512451, + "learning_rate": 4.960142377652664e-05, + "loss": 0.3829, + "step": 589000 + }, + { + "epoch": 3.989145734084019, + "grad_norm": 0.3536892831325531, + "learning_rate": 4.96010854265916e-05, + "loss": 0.3833, + "step": 589500 + }, + { + "epoch": 3.992529233434387, + "grad_norm": 0.36354970932006836, + "learning_rate": 4.960074707665656e-05, + "loss": 0.3847, + "step": 590000 + }, + { + "epoch": 3.9959127327847552, + "grad_norm": 0.34147948026657104, + "learning_rate": 4.960040872672153e-05, + "loss": 0.3833, + "step": 590500 + }, + { + "epoch": 3.9992962321351233, + "grad_norm": 0.32976600527763367, + "learning_rate": 4.960007037678649e-05, + "loss": 0.384, + "step": 591000 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.8540332559426399, + "eval_loss": 0.5933773517608643, + "eval_runtime": 3392.2025, + "eval_samples_per_second": 85.71, + "eval_steps_per_second": 5.357, + "step": 591104 + }, + { + "epoch": 4.002679731485491, + "grad_norm": 0.3177777826786041, + "learning_rate": 4.959973202685145e-05, + "loss": 0.3825, + "step": 591500 + }, + { + "epoch": 4.00606323083586, + "grad_norm": 0.3040407598018646, + "learning_rate": 4.9599393676916414e-05, + "loss": 0.3808, + "step": 592000 + }, + { + "epoch": 4.0094467301862275, + "grad_norm": 0.3417772948741913, + "learning_rate": 4.959905532698138e-05, + "loss": 0.3814, + "step": 592500 + }, + { + "epoch": 4.012830229536596, + "grad_norm": 0.32137414813041687, + "learning_rate": 4.9598716977046345e-05, + "loss": 0.3813, + "step": 593000 + }, + { + "epoch": 4.016213728886964, + "grad_norm": 0.3459019362926483, + "learning_rate": 4.959837862711131e-05, + "loss": 0.3809, + "step": 593500 + }, + { + "epoch": 4.019597228237332, + "grad_norm": 0.33652251958847046, + "learning_rate": 4.959804027717627e-05, + "loss": 0.3815, + "step": 594000 + }, + { + "epoch": 4.0229807275877, + "grad_norm": 0.28225409984588623, + "learning_rate": 4.959770192724123e-05, + "loss": 0.3816, + "step": 594500 + }, + { + "epoch": 4.026364226938068, + "grad_norm": 0.32062068581581116, + "learning_rate": 4.9597363577306194e-05, + "loss": 0.3821, + "step": 595000 + }, + { + "epoch": 4.029747726288437, + "grad_norm": 0.3513525128364563, + "learning_rate": 4.9597025227371156e-05, + "loss": 0.3815, + "step": 595500 + }, + { + "epoch": 4.0331312256388046, + "grad_norm": 0.32809221744537354, + "learning_rate": 4.959668687743612e-05, + "loss": 0.381, + "step": 596000 + }, + { + "epoch": 4.036514724989173, + "grad_norm": 0.33766239881515503, + "learning_rate": 4.959634852750109e-05, + "loss": 0.3808, + "step": 596500 + }, + { + "epoch": 4.039898224339541, + "grad_norm": 0.311460018157959, + "learning_rate": 4.959601017756605e-05, + "loss": 0.381, + "step": 597000 + }, + { + "epoch": 4.043281723689909, + "grad_norm": 0.3221346437931061, + "learning_rate": 4.959567182763101e-05, + "loss": 0.381, + "step": 597500 + }, + { + "epoch": 4.046665223040277, + "grad_norm": 0.330782413482666, + "learning_rate": 4.9595333477695973e-05, + "loss": 0.3805, + "step": 598000 + }, + { + "epoch": 4.050048722390645, + "grad_norm": 0.33255091309547424, + "learning_rate": 4.959499512776094e-05, + "loss": 0.3794, + "step": 598500 + }, + { + "epoch": 4.053432221741013, + "grad_norm": 0.33032065629959106, + "learning_rate": 4.95946567778259e-05, + "loss": 0.3821, + "step": 599000 + }, + { + "epoch": 4.0568157210913816, + "grad_norm": 0.35562410950660706, + "learning_rate": 4.959431842789086e-05, + "loss": 0.3819, + "step": 599500 + }, + { + "epoch": 4.06019922044175, + "grad_norm": 0.331027626991272, + "learning_rate": 4.959398007795583e-05, + "loss": 0.383, + "step": 600000 + }, + { + "epoch": 4.063582719792118, + "grad_norm": 0.31970423460006714, + "learning_rate": 4.959364172802079e-05, + "loss": 0.383, + "step": 600500 + }, + { + "epoch": 4.066966219142486, + "grad_norm": 0.36062178015708923, + "learning_rate": 4.959330337808575e-05, + "loss": 0.3816, + "step": 601000 + }, + { + "epoch": 4.070349718492854, + "grad_norm": 0.3241523802280426, + "learning_rate": 4.9592965028150715e-05, + "loss": 0.3824, + "step": 601500 + }, + { + "epoch": 4.073733217843222, + "grad_norm": 0.3760012686252594, + "learning_rate": 4.9592626678215684e-05, + "loss": 0.3816, + "step": 602000 + }, + { + "epoch": 4.07711671719359, + "grad_norm": 0.29808470606803894, + "learning_rate": 4.9592288328280646e-05, + "loss": 0.3821, + "step": 602500 + }, + { + "epoch": 4.080500216543959, + "grad_norm": 0.3388458490371704, + "learning_rate": 4.959194997834561e-05, + "loss": 0.3824, + "step": 603000 + }, + { + "epoch": 4.083883715894326, + "grad_norm": 0.34288451075553894, + "learning_rate": 4.959161162841057e-05, + "loss": 0.3826, + "step": 603500 + }, + { + "epoch": 4.087267215244695, + "grad_norm": 0.35253462195396423, + "learning_rate": 4.959127327847553e-05, + "loss": 0.3822, + "step": 604000 + }, + { + "epoch": 4.090650714595062, + "grad_norm": 0.346996009349823, + "learning_rate": 4.9590934928540495e-05, + "loss": 0.3813, + "step": 604500 + }, + { + "epoch": 4.094034213945431, + "grad_norm": 0.36608654260635376, + "learning_rate": 4.959059657860546e-05, + "loss": 0.3808, + "step": 605000 + }, + { + "epoch": 4.097417713295799, + "grad_norm": 0.38393664360046387, + "learning_rate": 4.959025822867042e-05, + "loss": 0.3808, + "step": 605500 + }, + { + "epoch": 4.100801212646167, + "grad_norm": 0.33853015303611755, + "learning_rate": 4.958991987873539e-05, + "loss": 0.3823, + "step": 606000 + }, + { + "epoch": 4.104184711996536, + "grad_norm": 0.3143649101257324, + "learning_rate": 4.958958152880035e-05, + "loss": 0.3818, + "step": 606500 + }, + { + "epoch": 4.107568211346903, + "grad_norm": 0.365041583776474, + "learning_rate": 4.958924317886531e-05, + "loss": 0.3809, + "step": 607000 + }, + { + "epoch": 4.110951710697272, + "grad_norm": 0.36500734090805054, + "learning_rate": 4.9588904828930274e-05, + "loss": 0.3817, + "step": 607500 + }, + { + "epoch": 4.114335210047639, + "grad_norm": 0.33343392610549927, + "learning_rate": 4.958856647899524e-05, + "loss": 0.3812, + "step": 608000 + }, + { + "epoch": 4.117718709398008, + "grad_norm": 0.2994712293148041, + "learning_rate": 4.95882281290602e-05, + "loss": 0.3821, + "step": 608500 + }, + { + "epoch": 4.1211022087483755, + "grad_norm": 0.32676804065704346, + "learning_rate": 4.958788977912516e-05, + "loss": 0.3823, + "step": 609000 + }, + { + "epoch": 4.124485708098744, + "grad_norm": 0.3354164659976959, + "learning_rate": 4.958755142919013e-05, + "loss": 0.3796, + "step": 609500 + }, + { + "epoch": 4.127869207449113, + "grad_norm": 0.3428443968296051, + "learning_rate": 4.958721307925509e-05, + "loss": 0.3812, + "step": 610000 + }, + { + "epoch": 4.13125270679948, + "grad_norm": 0.34081506729125977, + "learning_rate": 4.9586874729320054e-05, + "loss": 0.3825, + "step": 610500 + }, + { + "epoch": 4.134636206149849, + "grad_norm": 0.3499056100845337, + "learning_rate": 4.9586536379385016e-05, + "loss": 0.3819, + "step": 611000 + }, + { + "epoch": 4.138019705500216, + "grad_norm": 0.32771143317222595, + "learning_rate": 4.9586198029449985e-05, + "loss": 0.3824, + "step": 611500 + }, + { + "epoch": 4.141403204850585, + "grad_norm": 0.327528178691864, + "learning_rate": 4.958585967951495e-05, + "loss": 0.3827, + "step": 612000 + }, + { + "epoch": 4.1447867042009525, + "grad_norm": 0.3181435465812683, + "learning_rate": 4.958552132957991e-05, + "loss": 0.381, + "step": 612500 + }, + { + "epoch": 4.148170203551321, + "grad_norm": 0.3226792514324188, + "learning_rate": 4.958518297964487e-05, + "loss": 0.3826, + "step": 613000 + }, + { + "epoch": 4.151553702901689, + "grad_norm": 0.3568819463253021, + "learning_rate": 4.958484462970983e-05, + "loss": 0.3814, + "step": 613500 + }, + { + "epoch": 4.154937202252057, + "grad_norm": 0.33348801732063293, + "learning_rate": 4.9584506279774795e-05, + "loss": 0.3847, + "step": 614000 + }, + { + "epoch": 4.158320701602426, + "grad_norm": 0.35622504353523254, + "learning_rate": 4.958416792983976e-05, + "loss": 0.3809, + "step": 614500 + }, + { + "epoch": 4.161704200952793, + "grad_norm": 0.35121893882751465, + "learning_rate": 4.958382957990472e-05, + "loss": 0.3818, + "step": 615000 + }, + { + "epoch": 4.165087700303162, + "grad_norm": 0.34467947483062744, + "learning_rate": 4.958349122996969e-05, + "loss": 0.3806, + "step": 615500 + }, + { + "epoch": 4.1684711996535295, + "grad_norm": 0.34680140018463135, + "learning_rate": 4.958315288003465e-05, + "loss": 0.3813, + "step": 616000 + }, + { + "epoch": 4.171854699003898, + "grad_norm": 0.31994232535362244, + "learning_rate": 4.958281453009961e-05, + "loss": 0.3817, + "step": 616500 + }, + { + "epoch": 4.175238198354266, + "grad_norm": 0.32718199491500854, + "learning_rate": 4.9582476180164575e-05, + "loss": 0.3822, + "step": 617000 + }, + { + "epoch": 4.178621697704634, + "grad_norm": 0.31718477606773376, + "learning_rate": 4.9582137830229544e-05, + "loss": 0.3822, + "step": 617500 + }, + { + "epoch": 4.182005197055002, + "grad_norm": 0.32777708768844604, + "learning_rate": 4.95817994802945e-05, + "loss": 0.3817, + "step": 618000 + }, + { + "epoch": 4.18538869640537, + "grad_norm": 0.32051903009414673, + "learning_rate": 4.958146113035946e-05, + "loss": 0.381, + "step": 618500 + }, + { + "epoch": 4.188772195755738, + "grad_norm": 0.33153292536735535, + "learning_rate": 4.958112278042443e-05, + "loss": 0.3816, + "step": 619000 + }, + { + "epoch": 4.1921556951061065, + "grad_norm": 0.30980491638183594, + "learning_rate": 4.958078443048939e-05, + "loss": 0.3821, + "step": 619500 + }, + { + "epoch": 4.195539194456475, + "grad_norm": 0.3164099454879761, + "learning_rate": 4.9580446080554354e-05, + "loss": 0.3796, + "step": 620000 + }, + { + "epoch": 4.198922693806843, + "grad_norm": 0.33522167801856995, + "learning_rate": 4.9580107730619316e-05, + "loss": 0.382, + "step": 620500 + }, + { + "epoch": 4.202306193157211, + "grad_norm": 0.3143502175807953, + "learning_rate": 4.9579769380684285e-05, + "loss": 0.3818, + "step": 621000 + }, + { + "epoch": 4.205689692507579, + "grad_norm": 0.36703556776046753, + "learning_rate": 4.957943103074925e-05, + "loss": 0.3823, + "step": 621500 + }, + { + "epoch": 4.209073191857947, + "grad_norm": 0.32491037249565125, + "learning_rate": 4.957909268081421e-05, + "loss": 0.3809, + "step": 622000 + }, + { + "epoch": 4.212456691208315, + "grad_norm": 0.34407299757003784, + "learning_rate": 4.957875433087917e-05, + "loss": 0.3825, + "step": 622500 + }, + { + "epoch": 4.2158401905586835, + "grad_norm": 0.3241789937019348, + "learning_rate": 4.9578415980944134e-05, + "loss": 0.3816, + "step": 623000 + }, + { + "epoch": 4.219223689909051, + "grad_norm": 0.31797224283218384, + "learning_rate": 4.9578077631009096e-05, + "loss": 0.3823, + "step": 623500 + }, + { + "epoch": 4.22260718925942, + "grad_norm": 0.3514759838581085, + "learning_rate": 4.957773928107406e-05, + "loss": 0.3826, + "step": 624000 + }, + { + "epoch": 4.225990688609788, + "grad_norm": 0.34980130195617676, + "learning_rate": 4.957740093113902e-05, + "loss": 0.384, + "step": 624500 + }, + { + "epoch": 4.229374187960156, + "grad_norm": 0.31096556782722473, + "learning_rate": 4.957706258120399e-05, + "loss": 0.3817, + "step": 625000 + }, + { + "epoch": 4.232757687310524, + "grad_norm": 0.34254008531570435, + "learning_rate": 4.957672423126895e-05, + "loss": 0.3807, + "step": 625500 + }, + { + "epoch": 4.236141186660892, + "grad_norm": 0.3574650287628174, + "learning_rate": 4.957638588133391e-05, + "loss": 0.3809, + "step": 626000 + }, + { + "epoch": 4.2395246860112605, + "grad_norm": 0.3042188882827759, + "learning_rate": 4.9576047531398875e-05, + "loss": 0.3815, + "step": 626500 + }, + { + "epoch": 4.242908185361628, + "grad_norm": 0.3633319139480591, + "learning_rate": 4.9575709181463844e-05, + "loss": 0.3816, + "step": 627000 + }, + { + "epoch": 4.246291684711997, + "grad_norm": 0.3306909203529358, + "learning_rate": 4.95753708315288e-05, + "loss": 0.3817, + "step": 627500 + }, + { + "epoch": 4.249675184062364, + "grad_norm": 0.3724987506866455, + "learning_rate": 4.957503248159376e-05, + "loss": 0.3822, + "step": 628000 + }, + { + "epoch": 4.253058683412733, + "grad_norm": 0.36483970284461975, + "learning_rate": 4.957469413165873e-05, + "loss": 0.3817, + "step": 628500 + }, + { + "epoch": 4.2564421827631005, + "grad_norm": 0.3343786895275116, + "learning_rate": 4.957435578172369e-05, + "loss": 0.3822, + "step": 629000 + }, + { + "epoch": 4.259825682113469, + "grad_norm": 0.3305645287036896, + "learning_rate": 4.9574017431788655e-05, + "loss": 0.3818, + "step": 629500 + }, + { + "epoch": 4.2632091814638375, + "grad_norm": 0.32104507088661194, + "learning_rate": 4.957367908185362e-05, + "loss": 0.3821, + "step": 630000 + }, + { + "epoch": 4.266592680814205, + "grad_norm": 0.3260861337184906, + "learning_rate": 4.9573340731918586e-05, + "loss": 0.3824, + "step": 630500 + }, + { + "epoch": 4.269976180164574, + "grad_norm": 0.31556007266044617, + "learning_rate": 4.957300238198355e-05, + "loss": 0.3824, + "step": 631000 + }, + { + "epoch": 4.273359679514941, + "grad_norm": 0.3143795430660248, + "learning_rate": 4.957266403204851e-05, + "loss": 0.3811, + "step": 631500 + }, + { + "epoch": 4.27674317886531, + "grad_norm": 0.3627665936946869, + "learning_rate": 4.957232568211347e-05, + "loss": 0.3836, + "step": 632000 + }, + { + "epoch": 4.2801266782156775, + "grad_norm": 0.34783482551574707, + "learning_rate": 4.9571987332178434e-05, + "loss": 0.3831, + "step": 632500 + }, + { + "epoch": 4.283510177566046, + "grad_norm": 0.3361124098300934, + "learning_rate": 4.9571648982243396e-05, + "loss": 0.3832, + "step": 633000 + }, + { + "epoch": 4.286893676916414, + "grad_norm": 0.3182179927825928, + "learning_rate": 4.957131063230836e-05, + "loss": 0.3819, + "step": 633500 + }, + { + "epoch": 4.290277176266782, + "grad_norm": 0.33180859684944153, + "learning_rate": 4.957097228237332e-05, + "loss": 0.3833, + "step": 634000 + }, + { + "epoch": 4.293660675617151, + "grad_norm": 0.33058416843414307, + "learning_rate": 4.957063393243829e-05, + "loss": 0.3831, + "step": 634500 + }, + { + "epoch": 4.297044174967518, + "grad_norm": 0.3244820535182953, + "learning_rate": 4.957029558250325e-05, + "loss": 0.3803, + "step": 635000 + }, + { + "epoch": 4.300427674317887, + "grad_norm": 0.3454670011997223, + "learning_rate": 4.9569957232568214e-05, + "loss": 0.3809, + "step": 635500 + }, + { + "epoch": 4.3038111736682545, + "grad_norm": 0.3536366820335388, + "learning_rate": 4.9569618882633176e-05, + "loss": 0.3802, + "step": 636000 + }, + { + "epoch": 4.307194673018623, + "grad_norm": 0.31699424982070923, + "learning_rate": 4.9569280532698145e-05, + "loss": 0.3812, + "step": 636500 + }, + { + "epoch": 4.310578172368991, + "grad_norm": 0.3276674151420593, + "learning_rate": 4.95689421827631e-05, + "loss": 0.3839, + "step": 637000 + }, + { + "epoch": 4.313961671719359, + "grad_norm": 0.32476815581321716, + "learning_rate": 4.956860383282806e-05, + "loss": 0.382, + "step": 637500 + }, + { + "epoch": 4.317345171069727, + "grad_norm": 0.36478739976882935, + "learning_rate": 4.956826548289303e-05, + "loss": 0.3825, + "step": 638000 + }, + { + "epoch": 4.320728670420095, + "grad_norm": 0.39364731311798096, + "learning_rate": 4.956792713295799e-05, + "loss": 0.3834, + "step": 638500 + }, + { + "epoch": 4.324112169770464, + "grad_norm": 0.33854612708091736, + "learning_rate": 4.9567588783022955e-05, + "loss": 0.3816, + "step": 639000 + }, + { + "epoch": 4.3274956691208315, + "grad_norm": 0.34368497133255005, + "learning_rate": 4.956725043308792e-05, + "loss": 0.3825, + "step": 639500 + }, + { + "epoch": 4.3308791684712, + "grad_norm": 0.3576470911502838, + "learning_rate": 4.9566912083152887e-05, + "loss": 0.3817, + "step": 640000 + }, + { + "epoch": 4.334262667821568, + "grad_norm": 0.30935782194137573, + "learning_rate": 4.956657373321785e-05, + "loss": 0.3825, + "step": 640500 + }, + { + "epoch": 4.337646167171936, + "grad_norm": 0.32550904154777527, + "learning_rate": 4.956623538328281e-05, + "loss": 0.382, + "step": 641000 + }, + { + "epoch": 4.341029666522304, + "grad_norm": 0.339445024728775, + "learning_rate": 4.956589703334777e-05, + "loss": 0.3815, + "step": 641500 + }, + { + "epoch": 4.344413165872672, + "grad_norm": 0.3440740406513214, + "learning_rate": 4.9565558683412735e-05, + "loss": 0.3813, + "step": 642000 + }, + { + "epoch": 4.34779666522304, + "grad_norm": 0.37303054332733154, + "learning_rate": 4.95652203334777e-05, + "loss": 0.3824, + "step": 642500 + }, + { + "epoch": 4.3511801645734085, + "grad_norm": 0.31598225235939026, + "learning_rate": 4.956488198354266e-05, + "loss": 0.3818, + "step": 643000 + }, + { + "epoch": 4.354563663923776, + "grad_norm": 0.32695379853248596, + "learning_rate": 4.956454363360762e-05, + "loss": 0.3833, + "step": 643500 + }, + { + "epoch": 4.357947163274145, + "grad_norm": 0.38666465878486633, + "learning_rate": 4.956420528367259e-05, + "loss": 0.3804, + "step": 644000 + }, + { + "epoch": 4.361330662624513, + "grad_norm": 0.31230244040489197, + "learning_rate": 4.956386693373755e-05, + "loss": 0.3817, + "step": 644500 + }, + { + "epoch": 4.364714161974881, + "grad_norm": 0.3382791578769684, + "learning_rate": 4.9563528583802515e-05, + "loss": 0.3808, + "step": 645000 + }, + { + "epoch": 4.368097661325249, + "grad_norm": 0.3403199315071106, + "learning_rate": 4.956319023386748e-05, + "loss": 0.3806, + "step": 645500 + }, + { + "epoch": 4.371481160675617, + "grad_norm": 0.33164751529693604, + "learning_rate": 4.9562851883932446e-05, + "loss": 0.3832, + "step": 646000 + }, + { + "epoch": 4.3748646600259855, + "grad_norm": 0.3421335518360138, + "learning_rate": 4.95625135339974e-05, + "loss": 0.382, + "step": 646500 + }, + { + "epoch": 4.378248159376353, + "grad_norm": 0.32325634360313416, + "learning_rate": 4.956217518406236e-05, + "loss": 0.3797, + "step": 647000 + }, + { + "epoch": 4.381631658726722, + "grad_norm": 0.3200243413448334, + "learning_rate": 4.956183683412733e-05, + "loss": 0.3803, + "step": 647500 + }, + { + "epoch": 4.385015158077089, + "grad_norm": 0.320699006319046, + "learning_rate": 4.9561498484192294e-05, + "loss": 0.381, + "step": 648000 + }, + { + "epoch": 4.388398657427458, + "grad_norm": 0.32693010568618774, + "learning_rate": 4.9561160134257256e-05, + "loss": 0.3803, + "step": 648500 + }, + { + "epoch": 4.3917821567778255, + "grad_norm": 0.317874550819397, + "learning_rate": 4.956082178432222e-05, + "loss": 0.3809, + "step": 649000 + }, + { + "epoch": 4.395165656128194, + "grad_norm": 0.34446126222610474, + "learning_rate": 4.956048343438719e-05, + "loss": 0.3806, + "step": 649500 + }, + { + "epoch": 4.3985491554785625, + "grad_norm": 0.3517157733440399, + "learning_rate": 4.956014508445215e-05, + "loss": 0.3819, + "step": 650000 + }, + { + "epoch": 4.40193265482893, + "grad_norm": 0.35229623317718506, + "learning_rate": 4.955980673451711e-05, + "loss": 0.3828, + "step": 650500 + }, + { + "epoch": 4.405316154179299, + "grad_norm": 0.33478739857673645, + "learning_rate": 4.9559468384582074e-05, + "loss": 0.3833, + "step": 651000 + }, + { + "epoch": 4.408699653529666, + "grad_norm": 0.324706494808197, + "learning_rate": 4.9559130034647036e-05, + "loss": 0.3822, + "step": 651500 + }, + { + "epoch": 4.412083152880035, + "grad_norm": 0.36632421612739563, + "learning_rate": 4.9558791684712e-05, + "loss": 0.38, + "step": 652000 + }, + { + "epoch": 4.4154666522304025, + "grad_norm": 0.3096342980861664, + "learning_rate": 4.955845333477696e-05, + "loss": 0.3823, + "step": 652500 + }, + { + "epoch": 4.418850151580771, + "grad_norm": 0.35870158672332764, + "learning_rate": 4.955811498484192e-05, + "loss": 0.3814, + "step": 653000 + }, + { + "epoch": 4.422233650931139, + "grad_norm": 0.3230837881565094, + "learning_rate": 4.955777663490689e-05, + "loss": 0.3817, + "step": 653500 + }, + { + "epoch": 4.425617150281507, + "grad_norm": 0.35571718215942383, + "learning_rate": 4.955743828497185e-05, + "loss": 0.3805, + "step": 654000 + }, + { + "epoch": 4.429000649631876, + "grad_norm": 0.32466378808021545, + "learning_rate": 4.9557099935036815e-05, + "loss": 0.3818, + "step": 654500 + }, + { + "epoch": 4.432384148982243, + "grad_norm": 0.313757985830307, + "learning_rate": 4.955676158510178e-05, + "loss": 0.3822, + "step": 655000 + }, + { + "epoch": 4.435767648332612, + "grad_norm": 0.33871760964393616, + "learning_rate": 4.9556423235166746e-05, + "loss": 0.3816, + "step": 655500 + }, + { + "epoch": 4.4391511476829795, + "grad_norm": 0.34044113755226135, + "learning_rate": 4.95560848852317e-05, + "loss": 0.3828, + "step": 656000 + }, + { + "epoch": 4.442534647033348, + "grad_norm": 0.33314165472984314, + "learning_rate": 4.9555746535296664e-05, + "loss": 0.3809, + "step": 656500 + }, + { + "epoch": 4.445918146383716, + "grad_norm": 0.34168577194213867, + "learning_rate": 4.955540818536163e-05, + "loss": 0.3824, + "step": 657000 + }, + { + "epoch": 4.449301645734084, + "grad_norm": 0.3416721522808075, + "learning_rate": 4.9555069835426595e-05, + "loss": 0.3803, + "step": 657500 + }, + { + "epoch": 4.452685145084452, + "grad_norm": 0.3214699625968933, + "learning_rate": 4.955473148549156e-05, + "loss": 0.379, + "step": 658000 + }, + { + "epoch": 4.45606864443482, + "grad_norm": 0.3384753465652466, + "learning_rate": 4.955439313555652e-05, + "loss": 0.382, + "step": 658500 + }, + { + "epoch": 4.459452143785189, + "grad_norm": 0.3610079884529114, + "learning_rate": 4.955405478562148e-05, + "loss": 0.3822, + "step": 659000 + }, + { + "epoch": 4.4628356431355565, + "grad_norm": 0.31960728764533997, + "learning_rate": 4.955371643568645e-05, + "loss": 0.3825, + "step": 659500 + }, + { + "epoch": 4.466219142485925, + "grad_norm": 0.3292570114135742, + "learning_rate": 4.955337808575141e-05, + "loss": 0.3806, + "step": 660000 + }, + { + "epoch": 4.469602641836293, + "grad_norm": 0.32012736797332764, + "learning_rate": 4.9553039735816374e-05, + "loss": 0.3817, + "step": 660500 + }, + { + "epoch": 4.472986141186661, + "grad_norm": 0.33299851417541504, + "learning_rate": 4.9552701385881336e-05, + "loss": 0.3808, + "step": 661000 + }, + { + "epoch": 4.476369640537029, + "grad_norm": 0.3335835933685303, + "learning_rate": 4.95523630359463e-05, + "loss": 0.3826, + "step": 661500 + }, + { + "epoch": 4.479753139887397, + "grad_norm": 0.32194557785987854, + "learning_rate": 4.955202468601126e-05, + "loss": 0.3832, + "step": 662000 + }, + { + "epoch": 4.483136639237765, + "grad_norm": 0.3210223615169525, + "learning_rate": 4.955168633607622e-05, + "loss": 0.3815, + "step": 662500 + }, + { + "epoch": 4.4865201385881335, + "grad_norm": 0.3858987092971802, + "learning_rate": 4.955134798614119e-05, + "loss": 0.3813, + "step": 663000 + }, + { + "epoch": 4.489903637938501, + "grad_norm": 0.3279559314250946, + "learning_rate": 4.9551009636206154e-05, + "loss": 0.3812, + "step": 663500 + }, + { + "epoch": 4.49328713728887, + "grad_norm": 0.31878533959388733, + "learning_rate": 4.9550671286271116e-05, + "loss": 0.3802, + "step": 664000 + }, + { + "epoch": 4.496670636639238, + "grad_norm": 0.3142237067222595, + "learning_rate": 4.955033293633608e-05, + "loss": 0.382, + "step": 664500 + }, + { + "epoch": 4.500054135989606, + "grad_norm": 0.30280283093452454, + "learning_rate": 4.954999458640105e-05, + "loss": 0.3824, + "step": 665000 + }, + { + "epoch": 4.503437635339974, + "grad_norm": 0.3333982229232788, + "learning_rate": 4.954965623646601e-05, + "loss": 0.3809, + "step": 665500 + }, + { + "epoch": 4.506821134690342, + "grad_norm": 0.34333544969558716, + "learning_rate": 4.9549317886530964e-05, + "loss": 0.3826, + "step": 666000 + }, + { + "epoch": 4.5102046340407105, + "grad_norm": 0.3583124577999115, + "learning_rate": 4.9548979536595926e-05, + "loss": 0.3831, + "step": 666500 + }, + { + "epoch": 4.513588133391078, + "grad_norm": 0.31179922819137573, + "learning_rate": 4.9548641186660895e-05, + "loss": 0.3808, + "step": 667000 + }, + { + "epoch": 4.516971632741447, + "grad_norm": 0.37017133831977844, + "learning_rate": 4.954830283672586e-05, + "loss": 0.3808, + "step": 667500 + }, + { + "epoch": 4.520355132091814, + "grad_norm": 0.34011372923851013, + "learning_rate": 4.954796448679082e-05, + "loss": 0.3815, + "step": 668000 + }, + { + "epoch": 4.523738631442183, + "grad_norm": 0.33286014199256897, + "learning_rate": 4.954762613685578e-05, + "loss": 0.382, + "step": 668500 + }, + { + "epoch": 4.52712213079255, + "grad_norm": 0.3390562832355499, + "learning_rate": 4.954728778692075e-05, + "loss": 0.3818, + "step": 669000 + }, + { + "epoch": 4.530505630142919, + "grad_norm": 0.3356575667858124, + "learning_rate": 4.954694943698571e-05, + "loss": 0.3815, + "step": 669500 + }, + { + "epoch": 4.5338891294932875, + "grad_norm": 0.34956133365631104, + "learning_rate": 4.9546611087050675e-05, + "loss": 0.3833, + "step": 670000 + }, + { + "epoch": 4.537272628843655, + "grad_norm": 0.34817448258399963, + "learning_rate": 4.954627273711564e-05, + "loss": 0.3799, + "step": 670500 + }, + { + "epoch": 4.540656128194024, + "grad_norm": 0.36447280645370483, + "learning_rate": 4.95459343871806e-05, + "loss": 0.3823, + "step": 671000 + }, + { + "epoch": 4.544039627544391, + "grad_norm": 0.3624734580516815, + "learning_rate": 4.954559603724556e-05, + "loss": 0.3822, + "step": 671500 + }, + { + "epoch": 4.54742312689476, + "grad_norm": 0.3400898277759552, + "learning_rate": 4.954525768731052e-05, + "loss": 0.3806, + "step": 672000 + }, + { + "epoch": 4.550806626245127, + "grad_norm": 0.3510250747203827, + "learning_rate": 4.954491933737549e-05, + "loss": 0.3818, + "step": 672500 + }, + { + "epoch": 4.554190125595496, + "grad_norm": 0.32453998923301697, + "learning_rate": 4.9544580987440454e-05, + "loss": 0.3825, + "step": 673000 + }, + { + "epoch": 4.557573624945864, + "grad_norm": 0.3194526731967926, + "learning_rate": 4.9544242637505416e-05, + "loss": 0.3804, + "step": 673500 + }, + { + "epoch": 4.560957124296232, + "grad_norm": 0.320198655128479, + "learning_rate": 4.954390428757038e-05, + "loss": 0.3817, + "step": 674000 + }, + { + "epoch": 4.564340623646601, + "grad_norm": 0.3281102776527405, + "learning_rate": 4.954356593763535e-05, + "loss": 0.38, + "step": 674500 + }, + { + "epoch": 4.567724122996968, + "grad_norm": 0.3195473849773407, + "learning_rate": 4.954322758770031e-05, + "loss": 0.3822, + "step": 675000 + }, + { + "epoch": 4.571107622347337, + "grad_norm": 0.31520745158195496, + "learning_rate": 4.9542889237765265e-05, + "loss": 0.3816, + "step": 675500 + }, + { + "epoch": 4.574491121697704, + "grad_norm": 0.3593721389770508, + "learning_rate": 4.954255088783023e-05, + "loss": 0.3809, + "step": 676000 + }, + { + "epoch": 4.577874621048073, + "grad_norm": 0.3455953598022461, + "learning_rate": 4.9542212537895196e-05, + "loss": 0.3816, + "step": 676500 + }, + { + "epoch": 4.581258120398441, + "grad_norm": 0.3602738082408905, + "learning_rate": 4.954187418796016e-05, + "loss": 0.3813, + "step": 677000 + }, + { + "epoch": 4.584641619748809, + "grad_norm": 0.33302927017211914, + "learning_rate": 4.954153583802512e-05, + "loss": 0.3809, + "step": 677500 + }, + { + "epoch": 4.588025119099177, + "grad_norm": 0.31621211767196655, + "learning_rate": 4.954119748809008e-05, + "loss": 0.3806, + "step": 678000 + }, + { + "epoch": 4.591408618449545, + "grad_norm": 0.3371010422706604, + "learning_rate": 4.954085913815505e-05, + "loss": 0.3813, + "step": 678500 + }, + { + "epoch": 4.594792117799914, + "grad_norm": 0.3501533269882202, + "learning_rate": 4.954052078822001e-05, + "loss": 0.3819, + "step": 679000 + }, + { + "epoch": 4.5981756171502814, + "grad_norm": 0.32012107968330383, + "learning_rate": 4.9540182438284975e-05, + "loss": 0.3792, + "step": 679500 + }, + { + "epoch": 4.60155911650065, + "grad_norm": 0.33262699842453003, + "learning_rate": 4.953984408834994e-05, + "loss": 0.3805, + "step": 680000 + }, + { + "epoch": 4.604942615851018, + "grad_norm": 0.3417918086051941, + "learning_rate": 4.95395057384149e-05, + "loss": 0.3801, + "step": 680500 + }, + { + "epoch": 4.608326115201386, + "grad_norm": 0.3096560537815094, + "learning_rate": 4.953916738847986e-05, + "loss": 0.3806, + "step": 681000 + }, + { + "epoch": 4.611709614551754, + "grad_norm": 0.34804731607437134, + "learning_rate": 4.9538829038544824e-05, + "loss": 0.3811, + "step": 681500 + }, + { + "epoch": 4.615093113902122, + "grad_norm": 0.3319779932498932, + "learning_rate": 4.953849068860979e-05, + "loss": 0.3816, + "step": 682000 + }, + { + "epoch": 4.61847661325249, + "grad_norm": 0.31977346539497375, + "learning_rate": 4.9538152338674755e-05, + "loss": 0.3818, + "step": 682500 + }, + { + "epoch": 4.6218601126028585, + "grad_norm": 0.32984334230422974, + "learning_rate": 4.953781398873972e-05, + "loss": 0.3806, + "step": 683000 + }, + { + "epoch": 4.625243611953227, + "grad_norm": 0.3309282064437866, + "learning_rate": 4.953747563880468e-05, + "loss": 0.3818, + "step": 683500 + }, + { + "epoch": 4.628627111303595, + "grad_norm": 0.3470269441604614, + "learning_rate": 4.953713728886965e-05, + "loss": 0.3814, + "step": 684000 + }, + { + "epoch": 4.632010610653963, + "grad_norm": 0.35394591093063354, + "learning_rate": 4.953679893893461e-05, + "loss": 0.3798, + "step": 684500 + }, + { + "epoch": 4.635394110004331, + "grad_norm": 0.3420203924179077, + "learning_rate": 4.9536460588999566e-05, + "loss": 0.381, + "step": 685000 + }, + { + "epoch": 4.638777609354699, + "grad_norm": 0.32648688554763794, + "learning_rate": 4.953612223906453e-05, + "loss": 0.3823, + "step": 685500 + }, + { + "epoch": 4.642161108705067, + "grad_norm": 0.34044334292411804, + "learning_rate": 4.9535783889129497e-05, + "loss": 0.3815, + "step": 686000 + }, + { + "epoch": 4.6455446080554355, + "grad_norm": 0.34604334831237793, + "learning_rate": 4.953544553919446e-05, + "loss": 0.3833, + "step": 686500 + }, + { + "epoch": 4.648928107405803, + "grad_norm": 0.3389892578125, + "learning_rate": 4.953510718925942e-05, + "loss": 0.3823, + "step": 687000 + }, + { + "epoch": 4.652311606756172, + "grad_norm": 0.34680214524269104, + "learning_rate": 4.953476883932438e-05, + "loss": 0.3818, + "step": 687500 + }, + { + "epoch": 4.655695106106539, + "grad_norm": 0.3397471308708191, + "learning_rate": 4.953443048938935e-05, + "loss": 0.3821, + "step": 688000 + }, + { + "epoch": 4.659078605456908, + "grad_norm": 0.31688883900642395, + "learning_rate": 4.9534092139454314e-05, + "loss": 0.3824, + "step": 688500 + }, + { + "epoch": 4.662462104807275, + "grad_norm": 0.32283297181129456, + "learning_rate": 4.9533753789519276e-05, + "loss": 0.3811, + "step": 689000 + }, + { + "epoch": 4.665845604157644, + "grad_norm": 0.3226644992828369, + "learning_rate": 4.953341543958424e-05, + "loss": 0.3808, + "step": 689500 + }, + { + "epoch": 4.6692291035080125, + "grad_norm": 0.2814815640449524, + "learning_rate": 4.95330770896492e-05, + "loss": 0.3821, + "step": 690000 + }, + { + "epoch": 4.67261260285838, + "grad_norm": 0.33735325932502747, + "learning_rate": 4.953273873971416e-05, + "loss": 0.3805, + "step": 690500 + }, + { + "epoch": 4.675996102208749, + "grad_norm": 0.3787975013256073, + "learning_rate": 4.9532400389779125e-05, + "loss": 0.3817, + "step": 691000 + }, + { + "epoch": 4.679379601559116, + "grad_norm": 0.3654099702835083, + "learning_rate": 4.9532062039844093e-05, + "loss": 0.3819, + "step": 691500 + }, + { + "epoch": 4.682763100909485, + "grad_norm": 0.33603522181510925, + "learning_rate": 4.9531723689909056e-05, + "loss": 0.3818, + "step": 692000 + }, + { + "epoch": 4.686146600259852, + "grad_norm": 0.3216695487499237, + "learning_rate": 4.953138533997402e-05, + "loss": 0.3814, + "step": 692500 + }, + { + "epoch": 4.689530099610221, + "grad_norm": 0.37610581517219543, + "learning_rate": 4.953104699003898e-05, + "loss": 0.3821, + "step": 693000 + }, + { + "epoch": 4.692913598960589, + "grad_norm": 0.3684719502925873, + "learning_rate": 4.953070864010395e-05, + "loss": 0.3813, + "step": 693500 + }, + { + "epoch": 4.696297098310957, + "grad_norm": 0.32597842812538147, + "learning_rate": 4.953037029016891e-05, + "loss": 0.3803, + "step": 694000 + }, + { + "epoch": 4.699680597661326, + "grad_norm": 0.34688663482666016, + "learning_rate": 4.9530031940233866e-05, + "loss": 0.3814, + "step": 694500 + }, + { + "epoch": 4.703064097011693, + "grad_norm": 0.32867276668548584, + "learning_rate": 4.952969359029883e-05, + "loss": 0.3796, + "step": 695000 + }, + { + "epoch": 4.706447596362062, + "grad_norm": 0.36180058121681213, + "learning_rate": 4.95293552403638e-05, + "loss": 0.3805, + "step": 695500 + }, + { + "epoch": 4.709831095712429, + "grad_norm": 0.3275575041770935, + "learning_rate": 4.952901689042876e-05, + "loss": 0.3819, + "step": 696000 + }, + { + "epoch": 4.713214595062798, + "grad_norm": 0.3100377917289734, + "learning_rate": 4.952867854049372e-05, + "loss": 0.3802, + "step": 696500 + }, + { + "epoch": 4.716598094413166, + "grad_norm": 0.3666843771934509, + "learning_rate": 4.9528340190558684e-05, + "loss": 0.3811, + "step": 697000 + }, + { + "epoch": 4.719981593763534, + "grad_norm": 0.34636420011520386, + "learning_rate": 4.952800184062365e-05, + "loss": 0.3795, + "step": 697500 + }, + { + "epoch": 4.723365093113902, + "grad_norm": 0.3316555917263031, + "learning_rate": 4.9527663490688615e-05, + "loss": 0.3804, + "step": 698000 + }, + { + "epoch": 4.72674859246427, + "grad_norm": 0.3498135209083557, + "learning_rate": 4.952732514075358e-05, + "loss": 0.3812, + "step": 698500 + }, + { + "epoch": 4.730132091814639, + "grad_norm": 0.29856714606285095, + "learning_rate": 4.952698679081854e-05, + "loss": 0.3816, + "step": 699000 + }, + { + "epoch": 4.733515591165006, + "grad_norm": 0.32720449566841125, + "learning_rate": 4.95266484408835e-05, + "loss": 0.3817, + "step": 699500 + }, + { + "epoch": 4.736899090515375, + "grad_norm": 0.2867276668548584, + "learning_rate": 4.952631009094846e-05, + "loss": 0.3808, + "step": 700000 + }, + { + "epoch": 4.740282589865743, + "grad_norm": 0.36703354120254517, + "learning_rate": 4.9525971741013425e-05, + "loss": 0.3811, + "step": 700500 + }, + { + "epoch": 4.743666089216111, + "grad_norm": 0.3417615592479706, + "learning_rate": 4.9525633391078394e-05, + "loss": 0.3839, + "step": 701000 + }, + { + "epoch": 4.747049588566479, + "grad_norm": 0.32866930961608887, + "learning_rate": 4.9525295041143356e-05, + "loss": 0.3828, + "step": 701500 + }, + { + "epoch": 4.750433087916847, + "grad_norm": 0.30230823159217834, + "learning_rate": 4.952495669120832e-05, + "loss": 0.3818, + "step": 702000 + }, + { + "epoch": 4.753816587267215, + "grad_norm": 0.2984265387058258, + "learning_rate": 4.952461834127328e-05, + "loss": 0.3819, + "step": 702500 + }, + { + "epoch": 4.757200086617583, + "grad_norm": 0.3164541721343994, + "learning_rate": 4.952427999133825e-05, + "loss": 0.38, + "step": 703000 + }, + { + "epoch": 4.760583585967952, + "grad_norm": 0.3470172882080078, + "learning_rate": 4.952394164140321e-05, + "loss": 0.3822, + "step": 703500 + }, + { + "epoch": 4.76396708531832, + "grad_norm": 0.34124675393104553, + "learning_rate": 4.952360329146817e-05, + "loss": 0.3818, + "step": 704000 + }, + { + "epoch": 4.767350584668688, + "grad_norm": 0.36501240730285645, + "learning_rate": 4.952326494153313e-05, + "loss": 0.3804, + "step": 704500 + }, + { + "epoch": 4.770734084019056, + "grad_norm": 0.3430570363998413, + "learning_rate": 4.95229265915981e-05, + "loss": 0.3819, + "step": 705000 + }, + { + "epoch": 4.774117583369424, + "grad_norm": 0.33082473278045654, + "learning_rate": 4.952258824166306e-05, + "loss": 0.3813, + "step": 705500 + }, + { + "epoch": 4.777501082719792, + "grad_norm": 0.344489723443985, + "learning_rate": 4.952224989172802e-05, + "loss": 0.3816, + "step": 706000 + }, + { + "epoch": 4.78088458207016, + "grad_norm": 0.3429262638092041, + "learning_rate": 4.9521911541792984e-05, + "loss": 0.382, + "step": 706500 + }, + { + "epoch": 4.784268081420528, + "grad_norm": 0.32940617203712463, + "learning_rate": 4.952157319185795e-05, + "loss": 0.3813, + "step": 707000 + }, + { + "epoch": 4.787651580770897, + "grad_norm": 0.3384110629558563, + "learning_rate": 4.9521234841922915e-05, + "loss": 0.3819, + "step": 707500 + }, + { + "epoch": 4.791035080121265, + "grad_norm": 0.3162197172641754, + "learning_rate": 4.952089649198788e-05, + "loss": 0.3806, + "step": 708000 + }, + { + "epoch": 4.794418579471633, + "grad_norm": 0.3531859815120697, + "learning_rate": 4.952055814205284e-05, + "loss": 0.3809, + "step": 708500 + }, + { + "epoch": 4.797802078822, + "grad_norm": 0.34763363003730774, + "learning_rate": 4.95202197921178e-05, + "loss": 0.3807, + "step": 709000 + }, + { + "epoch": 4.801185578172369, + "grad_norm": 0.34678885340690613, + "learning_rate": 4.9519881442182764e-05, + "loss": 0.3794, + "step": 709500 + }, + { + "epoch": 4.804569077522737, + "grad_norm": 0.33274465799331665, + "learning_rate": 4.9519543092247726e-05, + "loss": 0.3799, + "step": 710000 + }, + { + "epoch": 4.807952576873105, + "grad_norm": 0.339770644903183, + "learning_rate": 4.9519204742312695e-05, + "loss": 0.3803, + "step": 710500 + }, + { + "epoch": 4.811336076223474, + "grad_norm": 0.3638891577720642, + "learning_rate": 4.951886639237766e-05, + "loss": 0.382, + "step": 711000 + }, + { + "epoch": 4.814719575573841, + "grad_norm": 0.3298690617084503, + "learning_rate": 4.951852804244262e-05, + "loss": 0.3799, + "step": 711500 + }, + { + "epoch": 4.81810307492421, + "grad_norm": 0.3221777677536011, + "learning_rate": 4.951818969250758e-05, + "loss": 0.3823, + "step": 712000 + }, + { + "epoch": 4.821486574274577, + "grad_norm": 0.3054807186126709, + "learning_rate": 4.951785134257254e-05, + "loss": 0.3826, + "step": 712500 + }, + { + "epoch": 4.824870073624946, + "grad_norm": 0.32414326071739197, + "learning_rate": 4.951751299263751e-05, + "loss": 0.3811, + "step": 713000 + }, + { + "epoch": 4.8282535729753135, + "grad_norm": 0.3311294615268707, + "learning_rate": 4.951717464270247e-05, + "loss": 0.382, + "step": 713500 + }, + { + "epoch": 4.831637072325682, + "grad_norm": 0.3301956355571747, + "learning_rate": 4.951683629276743e-05, + "loss": 0.3806, + "step": 714000 + }, + { + "epoch": 4.835020571676051, + "grad_norm": 0.2924399673938751, + "learning_rate": 4.95164979428324e-05, + "loss": 0.3812, + "step": 714500 + }, + { + "epoch": 4.838404071026418, + "grad_norm": 0.3669053912162781, + "learning_rate": 4.951615959289736e-05, + "loss": 0.3808, + "step": 715000 + }, + { + "epoch": 4.841787570376787, + "grad_norm": 0.3688805401325226, + "learning_rate": 4.951582124296232e-05, + "loss": 0.381, + "step": 715500 + }, + { + "epoch": 4.845171069727154, + "grad_norm": 0.3344734311103821, + "learning_rate": 4.9515482893027285e-05, + "loss": 0.3815, + "step": 716000 + }, + { + "epoch": 4.848554569077523, + "grad_norm": 0.31872445344924927, + "learning_rate": 4.9515144543092254e-05, + "loss": 0.3799, + "step": 716500 + }, + { + "epoch": 4.8519380684278905, + "grad_norm": 0.3189932405948639, + "learning_rate": 4.9514806193157216e-05, + "loss": 0.3794, + "step": 717000 + }, + { + "epoch": 4.855321567778259, + "grad_norm": 0.3555068075656891, + "learning_rate": 4.951446784322218e-05, + "loss": 0.3816, + "step": 717500 + }, + { + "epoch": 4.858705067128627, + "grad_norm": 0.34589362144470215, + "learning_rate": 4.951412949328714e-05, + "loss": 0.381, + "step": 718000 + }, + { + "epoch": 4.862088566478995, + "grad_norm": 0.3822649121284485, + "learning_rate": 4.95137911433521e-05, + "loss": 0.3804, + "step": 718500 + }, + { + "epoch": 4.865472065829364, + "grad_norm": 0.3114178478717804, + "learning_rate": 4.9513452793417064e-05, + "loss": 0.3813, + "step": 719000 + }, + { + "epoch": 4.868855565179731, + "grad_norm": 0.3315790891647339, + "learning_rate": 4.9513114443482026e-05, + "loss": 0.3806, + "step": 719500 + }, + { + "epoch": 4.8722390645301, + "grad_norm": 0.3110921084880829, + "learning_rate": 4.9512776093546995e-05, + "loss": 0.3812, + "step": 720000 + }, + { + "epoch": 4.8756225638804676, + "grad_norm": 0.3719049394130707, + "learning_rate": 4.951243774361196e-05, + "loss": 0.3804, + "step": 720500 + }, + { + "epoch": 4.879006063230836, + "grad_norm": 0.31962937116622925, + "learning_rate": 4.951209939367692e-05, + "loss": 0.3824, + "step": 721000 + }, + { + "epoch": 4.882389562581204, + "grad_norm": 0.34589603543281555, + "learning_rate": 4.951176104374188e-05, + "loss": 0.3803, + "step": 721500 + }, + { + "epoch": 4.885773061931572, + "grad_norm": 0.3213373124599457, + "learning_rate": 4.9511422693806844e-05, + "loss": 0.3817, + "step": 722000 + }, + { + "epoch": 4.88915656128194, + "grad_norm": 0.33251869678497314, + "learning_rate": 4.951108434387181e-05, + "loss": 0.379, + "step": 722500 + }, + { + "epoch": 4.892540060632308, + "grad_norm": 0.3400231897830963, + "learning_rate": 4.951074599393677e-05, + "loss": 0.3815, + "step": 723000 + }, + { + "epoch": 4.895923559982677, + "grad_norm": 0.33018866181373596, + "learning_rate": 4.951040764400173e-05, + "loss": 0.3818, + "step": 723500 + }, + { + "epoch": 4.899307059333045, + "grad_norm": 0.3175853490829468, + "learning_rate": 4.95100692940667e-05, + "loss": 0.3809, + "step": 724000 + }, + { + "epoch": 4.902690558683413, + "grad_norm": 0.3687296509742737, + "learning_rate": 4.950973094413166e-05, + "loss": 0.3807, + "step": 724500 + }, + { + "epoch": 4.906074058033781, + "grad_norm": 0.3353497087955475, + "learning_rate": 4.950939259419662e-05, + "loss": 0.3795, + "step": 725000 + }, + { + "epoch": 4.909457557384149, + "grad_norm": 0.3679102063179016, + "learning_rate": 4.9509054244261585e-05, + "loss": 0.3797, + "step": 725500 + }, + { + "epoch": 4.912841056734517, + "grad_norm": 0.3353082537651062, + "learning_rate": 4.9508715894326554e-05, + "loss": 0.3807, + "step": 726000 + }, + { + "epoch": 4.916224556084885, + "grad_norm": 0.332936555147171, + "learning_rate": 4.9508377544391516e-05, + "loss": 0.381, + "step": 726500 + }, + { + "epoch": 4.919608055435253, + "grad_norm": 0.33889758586883545, + "learning_rate": 4.950803919445648e-05, + "loss": 0.3804, + "step": 727000 + }, + { + "epoch": 4.922991554785622, + "grad_norm": 0.29593414068222046, + "learning_rate": 4.950770084452144e-05, + "loss": 0.3808, + "step": 727500 + }, + { + "epoch": 4.92637505413599, + "grad_norm": 0.329371839761734, + "learning_rate": 4.95073624945864e-05, + "loss": 0.3807, + "step": 728000 + }, + { + "epoch": 4.929758553486358, + "grad_norm": 0.3529195487499237, + "learning_rate": 4.9507024144651365e-05, + "loss": 0.3803, + "step": 728500 + }, + { + "epoch": 4.933142052836726, + "grad_norm": 0.35925522446632385, + "learning_rate": 4.950668579471633e-05, + "loss": 0.3812, + "step": 729000 + }, + { + "epoch": 4.936525552187094, + "grad_norm": 0.2887399196624756, + "learning_rate": 4.950634744478129e-05, + "loss": 0.3813, + "step": 729500 + }, + { + "epoch": 4.939909051537462, + "grad_norm": 0.3366541564464569, + "learning_rate": 4.950600909484626e-05, + "loss": 0.3813, + "step": 730000 + }, + { + "epoch": 4.94329255088783, + "grad_norm": 0.336927592754364, + "learning_rate": 4.950567074491122e-05, + "loss": 0.3804, + "step": 730500 + }, + { + "epoch": 4.946676050238199, + "grad_norm": 0.3061399757862091, + "learning_rate": 4.950533239497618e-05, + "loss": 0.3812, + "step": 731000 + }, + { + "epoch": 4.950059549588566, + "grad_norm": 0.38005000352859497, + "learning_rate": 4.9504994045041144e-05, + "loss": 0.3807, + "step": 731500 + }, + { + "epoch": 4.953443048938935, + "grad_norm": 0.3589371144771576, + "learning_rate": 4.950465569510611e-05, + "loss": 0.3807, + "step": 732000 + }, + { + "epoch": 4.956826548289302, + "grad_norm": 0.3375893831253052, + "learning_rate": 4.950431734517107e-05, + "loss": 0.3817, + "step": 732500 + }, + { + "epoch": 4.960210047639671, + "grad_norm": 0.31873199343681335, + "learning_rate": 4.950397899523603e-05, + "loss": 0.3809, + "step": 733000 + }, + { + "epoch": 4.9635935469900385, + "grad_norm": 0.33864834904670715, + "learning_rate": 4.9503640645301e-05, + "loss": 0.3794, + "step": 733500 + }, + { + "epoch": 4.966977046340407, + "grad_norm": 0.3273870646953583, + "learning_rate": 4.950330229536596e-05, + "loss": 0.3807, + "step": 734000 + }, + { + "epoch": 4.970360545690776, + "grad_norm": 0.33497095108032227, + "learning_rate": 4.9502963945430924e-05, + "loss": 0.3824, + "step": 734500 + }, + { + "epoch": 4.973744045041143, + "grad_norm": 0.33082765340805054, + "learning_rate": 4.9502625595495886e-05, + "loss": 0.3795, + "step": 735000 + }, + { + "epoch": 4.977127544391512, + "grad_norm": 0.32591503858566284, + "learning_rate": 4.9502287245560855e-05, + "loss": 0.3794, + "step": 735500 + }, + { + "epoch": 4.980511043741879, + "grad_norm": 0.3750314712524414, + "learning_rate": 4.950194889562582e-05, + "loss": 0.3795, + "step": 736000 + }, + { + "epoch": 4.983894543092248, + "grad_norm": 0.37751951813697815, + "learning_rate": 4.950161054569078e-05, + "loss": 0.3816, + "step": 736500 + }, + { + "epoch": 4.9872780424426155, + "grad_norm": 0.33533382415771484, + "learning_rate": 4.950127219575574e-05, + "loss": 0.3809, + "step": 737000 + }, + { + "epoch": 4.990661541792984, + "grad_norm": 0.3547668755054474, + "learning_rate": 4.9500933845820703e-05, + "loss": 0.3802, + "step": 737500 + }, + { + "epoch": 4.994045041143352, + "grad_norm": 0.3214530646800995, + "learning_rate": 4.9500595495885666e-05, + "loss": 0.3816, + "step": 738000 + }, + { + "epoch": 4.99742854049372, + "grad_norm": 0.3437938392162323, + "learning_rate": 4.950025714595063e-05, + "loss": 0.3818, + "step": 738500 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.8552021551800539, + "eval_loss": 0.5871427655220032, + "eval_runtime": 3408.6048, + "eval_samples_per_second": 85.297, + "eval_steps_per_second": 5.331, + "step": 738880 + }, + { + "epoch": 5.000812039844089, + "grad_norm": 0.3535323441028595, + "learning_rate": 4.949991879601559e-05, + "loss": 0.3779, + "step": 739000 + }, + { + "epoch": 5.004195539194456, + "grad_norm": 0.34933435916900635, + "learning_rate": 4.949958044608056e-05, + "loss": 0.3781, + "step": 739500 + }, + { + "epoch": 5.007579038544825, + "grad_norm": 0.34405067563056946, + "learning_rate": 4.949924209614552e-05, + "loss": 0.3774, + "step": 740000 + }, + { + "epoch": 5.0109625378951925, + "grad_norm": 0.3053302764892578, + "learning_rate": 4.949890374621048e-05, + "loss": 0.3785, + "step": 740500 + }, + { + "epoch": 5.014346037245561, + "grad_norm": 0.3137829601764679, + "learning_rate": 4.9498565396275445e-05, + "loss": 0.3784, + "step": 741000 + }, + { + "epoch": 5.017729536595929, + "grad_norm": 0.35309338569641113, + "learning_rate": 4.9498227046340414e-05, + "loss": 0.3794, + "step": 741500 + }, + { + "epoch": 5.021113035946297, + "grad_norm": 0.3512505292892456, + "learning_rate": 4.949788869640537e-05, + "loss": 0.3785, + "step": 742000 + }, + { + "epoch": 5.024496535296665, + "grad_norm": 0.33110466599464417, + "learning_rate": 4.949755034647033e-05, + "loss": 0.3792, + "step": 742500 + }, + { + "epoch": 5.027880034647033, + "grad_norm": 0.30952101945877075, + "learning_rate": 4.94972119965353e-05, + "loss": 0.3788, + "step": 743000 + }, + { + "epoch": 5.031263533997402, + "grad_norm": 0.3484048843383789, + "learning_rate": 4.949687364660026e-05, + "loss": 0.3799, + "step": 743500 + }, + { + "epoch": 5.0346470333477695, + "grad_norm": 0.3439270257949829, + "learning_rate": 4.9496535296665225e-05, + "loss": 0.3795, + "step": 744000 + }, + { + "epoch": 5.038030532698138, + "grad_norm": 0.3962043821811676, + "learning_rate": 4.949619694673019e-05, + "loss": 0.3793, + "step": 744500 + }, + { + "epoch": 5.041414032048506, + "grad_norm": 0.35327818989753723, + "learning_rate": 4.9495858596795156e-05, + "loss": 0.3789, + "step": 745000 + }, + { + "epoch": 5.044797531398874, + "grad_norm": 0.3789896070957184, + "learning_rate": 4.949552024686012e-05, + "loss": 0.3788, + "step": 745500 + }, + { + "epoch": 5.048181030749242, + "grad_norm": 0.35470277070999146, + "learning_rate": 4.949518189692508e-05, + "loss": 0.3783, + "step": 746000 + }, + { + "epoch": 5.05156453009961, + "grad_norm": 0.31829628348350525, + "learning_rate": 4.949484354699004e-05, + "loss": 0.3789, + "step": 746500 + }, + { + "epoch": 5.054948029449978, + "grad_norm": 0.3227141499519348, + "learning_rate": 4.9494505197055004e-05, + "loss": 0.378, + "step": 747000 + }, + { + "epoch": 5.0583315288003465, + "grad_norm": 0.35483554005622864, + "learning_rate": 4.9494166847119966e-05, + "loss": 0.3788, + "step": 747500 + }, + { + "epoch": 5.061715028150714, + "grad_norm": 0.32808616757392883, + "learning_rate": 4.949382849718493e-05, + "loss": 0.3799, + "step": 748000 + }, + { + "epoch": 5.065098527501083, + "grad_norm": 0.3302007019519806, + "learning_rate": 4.949349014724989e-05, + "loss": 0.3773, + "step": 748500 + }, + { + "epoch": 5.068482026851451, + "grad_norm": 0.336599200963974, + "learning_rate": 4.949315179731486e-05, + "loss": 0.3788, + "step": 749000 + }, + { + "epoch": 5.071865526201819, + "grad_norm": 0.32140204310417175, + "learning_rate": 4.949281344737982e-05, + "loss": 0.3792, + "step": 749500 + }, + { + "epoch": 5.075249025552187, + "grad_norm": 0.31169798970222473, + "learning_rate": 4.9492475097444784e-05, + "loss": 0.3789, + "step": 750000 + }, + { + "epoch": 5.078632524902555, + "grad_norm": 0.36335715651512146, + "learning_rate": 4.9492136747509746e-05, + "loss": 0.3786, + "step": 750500 + }, + { + "epoch": 5.0820160242529235, + "grad_norm": 0.3447102904319763, + "learning_rate": 4.9491798397574715e-05, + "loss": 0.3806, + "step": 751000 + }, + { + "epoch": 5.085399523603291, + "grad_norm": 0.3479219377040863, + "learning_rate": 4.949146004763967e-05, + "loss": 0.3796, + "step": 751500 + }, + { + "epoch": 5.08878302295366, + "grad_norm": 0.3650398850440979, + "learning_rate": 4.949112169770463e-05, + "loss": 0.3809, + "step": 752000 + }, + { + "epoch": 5.092166522304027, + "grad_norm": 0.3772086799144745, + "learning_rate": 4.94907833477696e-05, + "loss": 0.379, + "step": 752500 + }, + { + "epoch": 5.095550021654396, + "grad_norm": 0.35594046115875244, + "learning_rate": 4.949044499783456e-05, + "loss": 0.3806, + "step": 753000 + }, + { + "epoch": 5.098933521004764, + "grad_norm": 0.3071289360523224, + "learning_rate": 4.9490106647899525e-05, + "loss": 0.3801, + "step": 753500 + }, + { + "epoch": 5.102317020355132, + "grad_norm": 0.3742023706436157, + "learning_rate": 4.948976829796449e-05, + "loss": 0.379, + "step": 754000 + }, + { + "epoch": 5.1057005197055005, + "grad_norm": 0.3852168619632721, + "learning_rate": 4.9489429948029456e-05, + "loss": 0.3792, + "step": 754500 + }, + { + "epoch": 5.109084019055868, + "grad_norm": 0.3463808596134186, + "learning_rate": 4.948909159809442e-05, + "loss": 0.3785, + "step": 755000 + }, + { + "epoch": 5.112467518406237, + "grad_norm": 0.3639352321624756, + "learning_rate": 4.948875324815938e-05, + "loss": 0.3786, + "step": 755500 + }, + { + "epoch": 5.115851017756604, + "grad_norm": 0.33799809217453003, + "learning_rate": 4.948841489822434e-05, + "loss": 0.3788, + "step": 756000 + }, + { + "epoch": 5.119234517106973, + "grad_norm": 0.31555867195129395, + "learning_rate": 4.9488076548289305e-05, + "loss": 0.3795, + "step": 756500 + }, + { + "epoch": 5.1226180164573405, + "grad_norm": 0.3564879596233368, + "learning_rate": 4.948773819835427e-05, + "loss": 0.3792, + "step": 757000 + }, + { + "epoch": 5.126001515807709, + "grad_norm": 0.335040420293808, + "learning_rate": 4.948739984841923e-05, + "loss": 0.3796, + "step": 757500 + }, + { + "epoch": 5.129385015158077, + "grad_norm": 0.34456387162208557, + "learning_rate": 4.948706149848419e-05, + "loss": 0.3798, + "step": 758000 + }, + { + "epoch": 5.132768514508445, + "grad_norm": 0.36215341091156006, + "learning_rate": 4.948672314854916e-05, + "loss": 0.3796, + "step": 758500 + }, + { + "epoch": 5.136152013858814, + "grad_norm": 0.35378921031951904, + "learning_rate": 4.948638479861412e-05, + "loss": 0.3797, + "step": 759000 + }, + { + "epoch": 5.139535513209181, + "grad_norm": 0.35382765531539917, + "learning_rate": 4.9486046448679084e-05, + "loss": 0.3793, + "step": 759500 + }, + { + "epoch": 5.14291901255955, + "grad_norm": 0.3886372745037079, + "learning_rate": 4.9485708098744046e-05, + "loss": 0.379, + "step": 760000 + }, + { + "epoch": 5.1463025119099175, + "grad_norm": 0.3264980912208557, + "learning_rate": 4.9485369748809015e-05, + "loss": 0.3797, + "step": 760500 + }, + { + "epoch": 5.149686011260286, + "grad_norm": 0.3303718566894531, + "learning_rate": 4.948503139887397e-05, + "loss": 0.3791, + "step": 761000 + }, + { + "epoch": 5.153069510610654, + "grad_norm": 0.35948291420936584, + "learning_rate": 4.948469304893893e-05, + "loss": 0.379, + "step": 761500 + }, + { + "epoch": 5.156453009961022, + "grad_norm": 0.3072948157787323, + "learning_rate": 4.94843546990039e-05, + "loss": 0.3786, + "step": 762000 + }, + { + "epoch": 5.15983650931139, + "grad_norm": 0.34276217222213745, + "learning_rate": 4.9484016349068864e-05, + "loss": 0.3791, + "step": 762500 + }, + { + "epoch": 5.163220008661758, + "grad_norm": 0.3674897253513336, + "learning_rate": 4.9483677999133826e-05, + "loss": 0.3807, + "step": 763000 + }, + { + "epoch": 5.166603508012127, + "grad_norm": 0.3624192178249359, + "learning_rate": 4.948333964919879e-05, + "loss": 0.378, + "step": 763500 + }, + { + "epoch": 5.1699870073624945, + "grad_norm": 0.3511359691619873, + "learning_rate": 4.948300129926376e-05, + "loss": 0.3792, + "step": 764000 + }, + { + "epoch": 5.173370506712863, + "grad_norm": 0.33653759956359863, + "learning_rate": 4.948266294932872e-05, + "loss": 0.3788, + "step": 764500 + }, + { + "epoch": 5.176754006063231, + "grad_norm": 0.3576100766658783, + "learning_rate": 4.948232459939368e-05, + "loss": 0.3806, + "step": 765000 + }, + { + "epoch": 5.180137505413599, + "grad_norm": 0.33840882778167725, + "learning_rate": 4.948198624945864e-05, + "loss": 0.3798, + "step": 765500 + }, + { + "epoch": 5.183521004763967, + "grad_norm": 0.33222416043281555, + "learning_rate": 4.9481647899523605e-05, + "loss": 0.3788, + "step": 766000 + }, + { + "epoch": 5.186904504114335, + "grad_norm": 0.3831847310066223, + "learning_rate": 4.948130954958857e-05, + "loss": 0.3796, + "step": 766500 + }, + { + "epoch": 5.190288003464703, + "grad_norm": 0.3188963830471039, + "learning_rate": 4.948097119965353e-05, + "loss": 0.3783, + "step": 767000 + }, + { + "epoch": 5.1936715028150715, + "grad_norm": 0.3332746624946594, + "learning_rate": 4.948063284971849e-05, + "loss": 0.3797, + "step": 767500 + }, + { + "epoch": 5.19705500216544, + "grad_norm": 0.33291539549827576, + "learning_rate": 4.948029449978346e-05, + "loss": 0.3799, + "step": 768000 + }, + { + "epoch": 5.200438501515808, + "grad_norm": 0.3115937113761902, + "learning_rate": 4.947995614984842e-05, + "loss": 0.3793, + "step": 768500 + }, + { + "epoch": 5.203822000866176, + "grad_norm": 0.3307870030403137, + "learning_rate": 4.9479617799913385e-05, + "loss": 0.3787, + "step": 769000 + }, + { + "epoch": 5.207205500216544, + "grad_norm": 0.309565931558609, + "learning_rate": 4.947927944997835e-05, + "loss": 0.3808, + "step": 769500 + }, + { + "epoch": 5.210588999566912, + "grad_norm": 0.3607114255428314, + "learning_rate": 4.9478941100043316e-05, + "loss": 0.3791, + "step": 770000 + }, + { + "epoch": 5.21397249891728, + "grad_norm": 0.3276194930076599, + "learning_rate": 4.947860275010827e-05, + "loss": 0.3809, + "step": 770500 + }, + { + "epoch": 5.2173559982676485, + "grad_norm": 0.3212314546108246, + "learning_rate": 4.947826440017323e-05, + "loss": 0.3776, + "step": 771000 + }, + { + "epoch": 5.220739497618016, + "grad_norm": 0.3371840715408325, + "learning_rate": 4.94779260502382e-05, + "loss": 0.3783, + "step": 771500 + }, + { + "epoch": 5.224122996968385, + "grad_norm": 0.37716975808143616, + "learning_rate": 4.9477587700303164e-05, + "loss": 0.3798, + "step": 772000 + }, + { + "epoch": 5.227506496318752, + "grad_norm": 0.3523920476436615, + "learning_rate": 4.9477249350368126e-05, + "loss": 0.3795, + "step": 772500 + }, + { + "epoch": 5.230889995669121, + "grad_norm": 0.31858694553375244, + "learning_rate": 4.947691100043309e-05, + "loss": 0.3795, + "step": 773000 + }, + { + "epoch": 5.234273495019489, + "grad_norm": 0.3189753592014313, + "learning_rate": 4.947657265049806e-05, + "loss": 0.378, + "step": 773500 + }, + { + "epoch": 5.237656994369857, + "grad_norm": 0.33399447798728943, + "learning_rate": 4.947623430056302e-05, + "loss": 0.3788, + "step": 774000 + }, + { + "epoch": 5.2410404937202255, + "grad_norm": 0.34511929750442505, + "learning_rate": 4.947589595062798e-05, + "loss": 0.3792, + "step": 774500 + }, + { + "epoch": 5.244423993070593, + "grad_norm": 0.3606366515159607, + "learning_rate": 4.9475557600692944e-05, + "loss": 0.3822, + "step": 775000 + }, + { + "epoch": 5.247807492420962, + "grad_norm": 0.32893481850624084, + "learning_rate": 4.9475219250757906e-05, + "loss": 0.3811, + "step": 775500 + }, + { + "epoch": 5.251190991771329, + "grad_norm": 0.3491991460323334, + "learning_rate": 4.947488090082287e-05, + "loss": 0.3787, + "step": 776000 + }, + { + "epoch": 5.254574491121698, + "grad_norm": 0.3675939440727234, + "learning_rate": 4.947454255088783e-05, + "loss": 0.3794, + "step": 776500 + }, + { + "epoch": 5.2579579904720655, + "grad_norm": 0.3883334696292877, + "learning_rate": 4.947420420095279e-05, + "loss": 0.3795, + "step": 777000 + }, + { + "epoch": 5.261341489822434, + "grad_norm": 0.34572160243988037, + "learning_rate": 4.947386585101776e-05, + "loss": 0.3808, + "step": 777500 + }, + { + "epoch": 5.264724989172802, + "grad_norm": 0.3183426558971405, + "learning_rate": 4.947352750108272e-05, + "loss": 0.3792, + "step": 778000 + }, + { + "epoch": 5.26810848852317, + "grad_norm": 0.31833508610725403, + "learning_rate": 4.9473189151147686e-05, + "loss": 0.3776, + "step": 778500 + }, + { + "epoch": 5.271491987873539, + "grad_norm": 0.3360603153705597, + "learning_rate": 4.947285080121265e-05, + "loss": 0.3804, + "step": 779000 + }, + { + "epoch": 5.274875487223906, + "grad_norm": 0.3228151202201843, + "learning_rate": 4.9472512451277617e-05, + "loss": 0.3792, + "step": 779500 + }, + { + "epoch": 5.278258986574275, + "grad_norm": 0.3585045337677002, + "learning_rate": 4.947217410134258e-05, + "loss": 0.3793, + "step": 780000 + }, + { + "epoch": 5.2816424859246425, + "grad_norm": 0.3748699426651001, + "learning_rate": 4.9471835751407534e-05, + "loss": 0.3779, + "step": 780500 + }, + { + "epoch": 5.285025985275011, + "grad_norm": 0.334375262260437, + "learning_rate": 4.94714974014725e-05, + "loss": 0.3801, + "step": 781000 + }, + { + "epoch": 5.288409484625379, + "grad_norm": 0.33239230513572693, + "learning_rate": 4.9471159051537465e-05, + "loss": 0.3786, + "step": 781500 + }, + { + "epoch": 5.291792983975747, + "grad_norm": 0.37011659145355225, + "learning_rate": 4.947082070160243e-05, + "loss": 0.3789, + "step": 782000 + }, + { + "epoch": 5.295176483326115, + "grad_norm": 0.3369242250919342, + "learning_rate": 4.947048235166739e-05, + "loss": 0.38, + "step": 782500 + }, + { + "epoch": 5.298559982676483, + "grad_norm": 0.3385688364505768, + "learning_rate": 4.947014400173235e-05, + "loss": 0.3783, + "step": 783000 + }, + { + "epoch": 5.301943482026852, + "grad_norm": 0.3433874249458313, + "learning_rate": 4.946980565179732e-05, + "loss": 0.3787, + "step": 783500 + }, + { + "epoch": 5.3053269813772195, + "grad_norm": 0.3598771095275879, + "learning_rate": 4.946946730186228e-05, + "loss": 0.3788, + "step": 784000 + }, + { + "epoch": 5.308710480727588, + "grad_norm": 0.2988731861114502, + "learning_rate": 4.9469128951927245e-05, + "loss": 0.3799, + "step": 784500 + }, + { + "epoch": 5.312093980077956, + "grad_norm": 0.3294981122016907, + "learning_rate": 4.946879060199221e-05, + "loss": 0.3793, + "step": 785000 + }, + { + "epoch": 5.315477479428324, + "grad_norm": 0.32675206661224365, + "learning_rate": 4.946845225205717e-05, + "loss": 0.3786, + "step": 785500 + }, + { + "epoch": 5.318860978778692, + "grad_norm": 0.3722259998321533, + "learning_rate": 4.946811390212213e-05, + "loss": 0.3798, + "step": 786000 + }, + { + "epoch": 5.32224447812906, + "grad_norm": 0.34133970737457275, + "learning_rate": 4.946777555218709e-05, + "loss": 0.3803, + "step": 786500 + }, + { + "epoch": 5.325627977479428, + "grad_norm": 0.3698797821998596, + "learning_rate": 4.946743720225206e-05, + "loss": 0.3792, + "step": 787000 + }, + { + "epoch": 5.3290114768297965, + "grad_norm": 0.37338992953300476, + "learning_rate": 4.9467098852317024e-05, + "loss": 0.3789, + "step": 787500 + }, + { + "epoch": 5.332394976180165, + "grad_norm": 0.31401678919792175, + "learning_rate": 4.9466760502381986e-05, + "loss": 0.3787, + "step": 788000 + }, + { + "epoch": 5.335778475530533, + "grad_norm": 0.338290810585022, + "learning_rate": 4.946642215244695e-05, + "loss": 0.3793, + "step": 788500 + }, + { + "epoch": 5.339161974880901, + "grad_norm": 0.32382553815841675, + "learning_rate": 4.946608380251192e-05, + "loss": 0.3798, + "step": 789000 + }, + { + "epoch": 5.342545474231269, + "grad_norm": 0.3406978249549866, + "learning_rate": 4.946574545257688e-05, + "loss": 0.3778, + "step": 789500 + }, + { + "epoch": 5.345928973581637, + "grad_norm": 0.32253405451774597, + "learning_rate": 4.9465407102641835e-05, + "loss": 0.3788, + "step": 790000 + }, + { + "epoch": 5.349312472932005, + "grad_norm": 0.3149709701538086, + "learning_rate": 4.9465068752706804e-05, + "loss": 0.3784, + "step": 790500 + }, + { + "epoch": 5.3526959722823735, + "grad_norm": 0.32785189151763916, + "learning_rate": 4.9464730402771766e-05, + "loss": 0.3807, + "step": 791000 + }, + { + "epoch": 5.356079471632741, + "grad_norm": 0.36253809928894043, + "learning_rate": 4.946439205283673e-05, + "loss": 0.3788, + "step": 791500 + }, + { + "epoch": 5.35946297098311, + "grad_norm": 0.35410332679748535, + "learning_rate": 4.946405370290169e-05, + "loss": 0.3815, + "step": 792000 + }, + { + "epoch": 5.362846470333478, + "grad_norm": 0.3683050870895386, + "learning_rate": 4.946371535296665e-05, + "loss": 0.3799, + "step": 792500 + }, + { + "epoch": 5.366229969683846, + "grad_norm": 0.3157660961151123, + "learning_rate": 4.946337700303162e-05, + "loss": 0.3787, + "step": 793000 + }, + { + "epoch": 5.369613469034214, + "grad_norm": 0.33882051706314087, + "learning_rate": 4.946303865309658e-05, + "loss": 0.38, + "step": 793500 + }, + { + "epoch": 5.372996968384582, + "grad_norm": 0.34864357113838196, + "learning_rate": 4.9462700303161545e-05, + "loss": 0.3782, + "step": 794000 + }, + { + "epoch": 5.3763804677349505, + "grad_norm": 0.34172624349594116, + "learning_rate": 4.946236195322651e-05, + "loss": 0.38, + "step": 794500 + }, + { + "epoch": 5.379763967085318, + "grad_norm": 0.31064727902412415, + "learning_rate": 4.946202360329147e-05, + "loss": 0.3784, + "step": 795000 + }, + { + "epoch": 5.383147466435687, + "grad_norm": 0.34635475277900696, + "learning_rate": 4.946168525335643e-05, + "loss": 0.378, + "step": 795500 + }, + { + "epoch": 5.386530965786054, + "grad_norm": 0.3327074944972992, + "learning_rate": 4.9461346903421394e-05, + "loss": 0.3798, + "step": 796000 + }, + { + "epoch": 5.389914465136423, + "grad_norm": 0.359301894903183, + "learning_rate": 4.946100855348636e-05, + "loss": 0.3794, + "step": 796500 + }, + { + "epoch": 5.39329796448679, + "grad_norm": 0.3421958088874817, + "learning_rate": 4.9460670203551325e-05, + "loss": 0.3796, + "step": 797000 + }, + { + "epoch": 5.396681463837159, + "grad_norm": 0.337579607963562, + "learning_rate": 4.946033185361629e-05, + "loss": 0.3795, + "step": 797500 + }, + { + "epoch": 5.400064963187527, + "grad_norm": 0.4035895764827728, + "learning_rate": 4.945999350368125e-05, + "loss": 0.3799, + "step": 798000 + }, + { + "epoch": 5.403448462537895, + "grad_norm": 0.3159906566143036, + "learning_rate": 4.945965515374622e-05, + "loss": 0.3785, + "step": 798500 + }, + { + "epoch": 5.406831961888264, + "grad_norm": 0.32930704951286316, + "learning_rate": 4.945931680381118e-05, + "loss": 0.3795, + "step": 799000 + }, + { + "epoch": 5.410215461238631, + "grad_norm": 0.3148539066314697, + "learning_rate": 4.9458978453876135e-05, + "loss": 0.3797, + "step": 799500 + }, + { + "epoch": 5.413598960589, + "grad_norm": 0.3461363613605499, + "learning_rate": 4.94586401039411e-05, + "loss": 0.3788, + "step": 800000 + }, + { + "epoch": 5.4169824599393674, + "grad_norm": 0.3183637261390686, + "learning_rate": 4.9458301754006066e-05, + "loss": 0.3788, + "step": 800500 + }, + { + "epoch": 5.420365959289736, + "grad_norm": 0.3579789996147156, + "learning_rate": 4.945796340407103e-05, + "loss": 0.3792, + "step": 801000 + }, + { + "epoch": 5.423749458640104, + "grad_norm": 0.3455021381378174, + "learning_rate": 4.945762505413599e-05, + "loss": 0.3779, + "step": 801500 + }, + { + "epoch": 5.427132957990472, + "grad_norm": 0.36724910140037537, + "learning_rate": 4.945728670420095e-05, + "loss": 0.3782, + "step": 802000 + }, + { + "epoch": 5.43051645734084, + "grad_norm": 0.31053024530410767, + "learning_rate": 4.945694835426592e-05, + "loss": 0.3785, + "step": 802500 + }, + { + "epoch": 5.433899956691208, + "grad_norm": 0.40029093623161316, + "learning_rate": 4.9456610004330884e-05, + "loss": 0.3801, + "step": 803000 + }, + { + "epoch": 5.437283456041577, + "grad_norm": 0.3455757200717926, + "learning_rate": 4.9456271654395846e-05, + "loss": 0.3798, + "step": 803500 + }, + { + "epoch": 5.4406669553919444, + "grad_norm": 0.3255632221698761, + "learning_rate": 4.945593330446081e-05, + "loss": 0.379, + "step": 804000 + }, + { + "epoch": 5.444050454742313, + "grad_norm": 0.37028753757476807, + "learning_rate": 4.945559495452577e-05, + "loss": 0.3789, + "step": 804500 + }, + { + "epoch": 5.447433954092681, + "grad_norm": 0.3766026496887207, + "learning_rate": 4.945525660459073e-05, + "loss": 0.3797, + "step": 805000 + }, + { + "epoch": 5.450817453443049, + "grad_norm": 0.3767787218093872, + "learning_rate": 4.9454918254655694e-05, + "loss": 0.3803, + "step": 805500 + }, + { + "epoch": 5.454200952793417, + "grad_norm": 0.33477210998535156, + "learning_rate": 4.945457990472066e-05, + "loss": 0.3799, + "step": 806000 + }, + { + "epoch": 5.457584452143785, + "grad_norm": 0.5394200682640076, + "learning_rate": 4.9454241554785625e-05, + "loss": 0.3804, + "step": 806500 + }, + { + "epoch": 5.460967951494153, + "grad_norm": 0.3293921947479248, + "learning_rate": 4.945390320485059e-05, + "loss": 0.3793, + "step": 807000 + }, + { + "epoch": 5.4643514508445215, + "grad_norm": 0.35536640882492065, + "learning_rate": 4.945356485491555e-05, + "loss": 0.379, + "step": 807500 + }, + { + "epoch": 5.46773495019489, + "grad_norm": 0.3433263599872589, + "learning_rate": 4.945322650498052e-05, + "loss": 0.3798, + "step": 808000 + }, + { + "epoch": 5.471118449545258, + "grad_norm": 0.34488338232040405, + "learning_rate": 4.945288815504548e-05, + "loss": 0.3788, + "step": 808500 + }, + { + "epoch": 5.474501948895626, + "grad_norm": 0.35691148042678833, + "learning_rate": 4.9452549805110436e-05, + "loss": 0.3781, + "step": 809000 + }, + { + "epoch": 5.477885448245994, + "grad_norm": 0.3488087058067322, + "learning_rate": 4.94522114551754e-05, + "loss": 0.3795, + "step": 809500 + }, + { + "epoch": 5.481268947596362, + "grad_norm": 0.32626432180404663, + "learning_rate": 4.945187310524037e-05, + "loss": 0.3796, + "step": 810000 + }, + { + "epoch": 5.48465244694673, + "grad_norm": 0.3325187861919403, + "learning_rate": 4.945153475530533e-05, + "loss": 0.3793, + "step": 810500 + }, + { + "epoch": 5.4880359462970985, + "grad_norm": 0.3402871787548065, + "learning_rate": 4.945119640537029e-05, + "loss": 0.3785, + "step": 811000 + }, + { + "epoch": 5.491419445647466, + "grad_norm": 0.3130391240119934, + "learning_rate": 4.945085805543525e-05, + "loss": 0.3797, + "step": 811500 + }, + { + "epoch": 5.494802944997835, + "grad_norm": 0.3534984290599823, + "learning_rate": 4.945051970550022e-05, + "loss": 0.3784, + "step": 812000 + }, + { + "epoch": 5.498186444348203, + "grad_norm": 0.31189948320388794, + "learning_rate": 4.9450181355565184e-05, + "loss": 0.378, + "step": 812500 + }, + { + "epoch": 5.501569943698571, + "grad_norm": 0.34218019247055054, + "learning_rate": 4.9449843005630146e-05, + "loss": 0.3791, + "step": 813000 + }, + { + "epoch": 5.504953443048939, + "grad_norm": 0.357378751039505, + "learning_rate": 4.944950465569511e-05, + "loss": 0.3805, + "step": 813500 + }, + { + "epoch": 5.508336942399307, + "grad_norm": 0.34804004430770874, + "learning_rate": 4.944916630576007e-05, + "loss": 0.3779, + "step": 814000 + }, + { + "epoch": 5.5117204417496755, + "grad_norm": 0.36242908239364624, + "learning_rate": 4.944882795582503e-05, + "loss": 0.3794, + "step": 814500 + }, + { + "epoch": 5.515103941100043, + "grad_norm": 0.35949084162712097, + "learning_rate": 4.9448489605889995e-05, + "loss": 0.3783, + "step": 815000 + }, + { + "epoch": 5.518487440450412, + "grad_norm": 0.3674199879169464, + "learning_rate": 4.9448151255954964e-05, + "loss": 0.3783, + "step": 815500 + }, + { + "epoch": 5.521870939800779, + "grad_norm": 0.35882535576820374, + "learning_rate": 4.9447812906019926e-05, + "loss": 0.3803, + "step": 816000 + }, + { + "epoch": 5.525254439151148, + "grad_norm": 0.28911158442497253, + "learning_rate": 4.944747455608489e-05, + "loss": 0.3785, + "step": 816500 + }, + { + "epoch": 5.528637938501516, + "grad_norm": 0.3618195354938507, + "learning_rate": 4.944713620614985e-05, + "loss": 0.3804, + "step": 817000 + }, + { + "epoch": 5.532021437851884, + "grad_norm": 0.34649333357810974, + "learning_rate": 4.944679785621482e-05, + "loss": 0.3787, + "step": 817500 + }, + { + "epoch": 5.535404937202252, + "grad_norm": 0.3594622313976288, + "learning_rate": 4.944645950627978e-05, + "loss": 0.38, + "step": 818000 + }, + { + "epoch": 5.53878843655262, + "grad_norm": 0.32172903418540955, + "learning_rate": 4.9446121156344737e-05, + "loss": 0.38, + "step": 818500 + }, + { + "epoch": 5.542171935902989, + "grad_norm": 0.3536199629306793, + "learning_rate": 4.94457828064097e-05, + "loss": 0.3797, + "step": 819000 + }, + { + "epoch": 5.545555435253356, + "grad_norm": 0.3164806663990021, + "learning_rate": 4.944544445647467e-05, + "loss": 0.3805, + "step": 819500 + }, + { + "epoch": 5.548938934603725, + "grad_norm": 0.3280503451824188, + "learning_rate": 4.944510610653963e-05, + "loss": 0.3803, + "step": 820000 + }, + { + "epoch": 5.552322433954092, + "grad_norm": 0.31813400983810425, + "learning_rate": 4.944476775660459e-05, + "loss": 0.3803, + "step": 820500 + }, + { + "epoch": 5.555705933304461, + "grad_norm": 0.3344866633415222, + "learning_rate": 4.9444429406669554e-05, + "loss": 0.3786, + "step": 821000 + }, + { + "epoch": 5.559089432654829, + "grad_norm": 0.3649905323982239, + "learning_rate": 4.944409105673452e-05, + "loss": 0.3782, + "step": 821500 + }, + { + "epoch": 5.562472932005197, + "grad_norm": 0.3461797833442688, + "learning_rate": 4.9443752706799485e-05, + "loss": 0.3793, + "step": 822000 + }, + { + "epoch": 5.565856431355565, + "grad_norm": 0.35041677951812744, + "learning_rate": 4.944341435686445e-05, + "loss": 0.3789, + "step": 822500 + }, + { + "epoch": 5.569239930705933, + "grad_norm": 0.31106075644493103, + "learning_rate": 4.944307600692941e-05, + "loss": 0.3801, + "step": 823000 + }, + { + "epoch": 5.572623430056302, + "grad_norm": 0.3350198566913605, + "learning_rate": 4.944273765699437e-05, + "loss": 0.3788, + "step": 823500 + }, + { + "epoch": 5.576006929406669, + "grad_norm": 0.3699307441711426, + "learning_rate": 4.9442399307059333e-05, + "loss": 0.3793, + "step": 824000 + }, + { + "epoch": 5.579390428757038, + "grad_norm": 0.34947749972343445, + "learning_rate": 4.9442060957124296e-05, + "loss": 0.3795, + "step": 824500 + }, + { + "epoch": 5.582773928107406, + "grad_norm": 0.320469468832016, + "learning_rate": 4.9441722607189264e-05, + "loss": 0.3792, + "step": 825000 + }, + { + "epoch": 5.586157427457774, + "grad_norm": 0.37775009870529175, + "learning_rate": 4.9441384257254227e-05, + "loss": 0.38, + "step": 825500 + }, + { + "epoch": 5.589540926808142, + "grad_norm": 0.3509116470813751, + "learning_rate": 4.944104590731919e-05, + "loss": 0.3784, + "step": 826000 + }, + { + "epoch": 5.59292442615851, + "grad_norm": 0.3317195475101471, + "learning_rate": 4.944070755738415e-05, + "loss": 0.3801, + "step": 826500 + }, + { + "epoch": 5.596307925508878, + "grad_norm": 0.37722665071487427, + "learning_rate": 4.944036920744912e-05, + "loss": 0.3797, + "step": 827000 + }, + { + "epoch": 5.599691424859246, + "grad_norm": 0.34442973136901855, + "learning_rate": 4.944003085751408e-05, + "loss": 0.3789, + "step": 827500 + }, + { + "epoch": 5.603074924209615, + "grad_norm": 0.37473928928375244, + "learning_rate": 4.943969250757904e-05, + "loss": 0.3796, + "step": 828000 + }, + { + "epoch": 5.606458423559983, + "grad_norm": 0.2993427515029907, + "learning_rate": 4.9439354157644e-05, + "loss": 0.3808, + "step": 828500 + }, + { + "epoch": 5.609841922910351, + "grad_norm": 0.33846497535705566, + "learning_rate": 4.943901580770897e-05, + "loss": 0.3789, + "step": 829000 + }, + { + "epoch": 5.613225422260719, + "grad_norm": 0.34987756609916687, + "learning_rate": 4.943867745777393e-05, + "loss": 0.3798, + "step": 829500 + }, + { + "epoch": 5.616608921611087, + "grad_norm": 0.3368557393550873, + "learning_rate": 4.943833910783889e-05, + "loss": 0.3793, + "step": 830000 + }, + { + "epoch": 5.619992420961455, + "grad_norm": 0.3821764886379242, + "learning_rate": 4.9438000757903855e-05, + "loss": 0.3776, + "step": 830500 + }, + { + "epoch": 5.623375920311823, + "grad_norm": 0.3596230447292328, + "learning_rate": 4.9437662407968823e-05, + "loss": 0.3798, + "step": 831000 + }, + { + "epoch": 5.626759419662191, + "grad_norm": 0.35004204511642456, + "learning_rate": 4.9437324058033786e-05, + "loss": 0.379, + "step": 831500 + }, + { + "epoch": 5.63014291901256, + "grad_norm": 0.36982133984565735, + "learning_rate": 4.943698570809875e-05, + "loss": 0.3794, + "step": 832000 + }, + { + "epoch": 5.633526418362928, + "grad_norm": 0.3210700452327728, + "learning_rate": 4.943664735816371e-05, + "loss": 0.3781, + "step": 832500 + }, + { + "epoch": 5.636909917713296, + "grad_norm": 0.3624829351902008, + "learning_rate": 4.943630900822867e-05, + "loss": 0.3791, + "step": 833000 + }, + { + "epoch": 5.640293417063664, + "grad_norm": 0.3207123279571533, + "learning_rate": 4.9435970658293634e-05, + "loss": 0.3776, + "step": 833500 + }, + { + "epoch": 5.643676916414032, + "grad_norm": 0.33684486150741577, + "learning_rate": 4.9435632308358596e-05, + "loss": 0.3796, + "step": 834000 + }, + { + "epoch": 5.6470604157644, + "grad_norm": 0.35455456376075745, + "learning_rate": 4.9435293958423565e-05, + "loss": 0.3795, + "step": 834500 + }, + { + "epoch": 5.650443915114768, + "grad_norm": 0.33251824975013733, + "learning_rate": 4.943495560848853e-05, + "loss": 0.3785, + "step": 835000 + }, + { + "epoch": 5.653827414465137, + "grad_norm": 0.33744561672210693, + "learning_rate": 4.943461725855349e-05, + "loss": 0.3786, + "step": 835500 + }, + { + "epoch": 5.657210913815504, + "grad_norm": 0.3235434591770172, + "learning_rate": 4.943427890861845e-05, + "loss": 0.3787, + "step": 836000 + }, + { + "epoch": 5.660594413165873, + "grad_norm": 0.3207637369632721, + "learning_rate": 4.943394055868342e-05, + "loss": 0.3794, + "step": 836500 + }, + { + "epoch": 5.663977912516241, + "grad_norm": 0.3127499520778656, + "learning_rate": 4.943360220874838e-05, + "loss": 0.3792, + "step": 837000 + }, + { + "epoch": 5.667361411866609, + "grad_norm": 0.3637065589427948, + "learning_rate": 4.943326385881334e-05, + "loss": 0.3796, + "step": 837500 + }, + { + "epoch": 5.6707449112169765, + "grad_norm": 0.3559350073337555, + "learning_rate": 4.94329255088783e-05, + "loss": 0.3787, + "step": 838000 + }, + { + "epoch": 5.674128410567345, + "grad_norm": 0.41822707653045654, + "learning_rate": 4.943258715894327e-05, + "loss": 0.3793, + "step": 838500 + }, + { + "epoch": 5.677511909917714, + "grad_norm": 0.3356015980243683, + "learning_rate": 4.943224880900823e-05, + "loss": 0.3786, + "step": 839000 + }, + { + "epoch": 5.680895409268081, + "grad_norm": 0.3275659680366516, + "learning_rate": 4.943191045907319e-05, + "loss": 0.3781, + "step": 839500 + }, + { + "epoch": 5.68427890861845, + "grad_norm": 0.32490238547325134, + "learning_rate": 4.9431572109138155e-05, + "loss": 0.3792, + "step": 840000 + }, + { + "epoch": 5.687662407968817, + "grad_norm": 0.3571946322917938, + "learning_rate": 4.9431233759203124e-05, + "loss": 0.379, + "step": 840500 + }, + { + "epoch": 5.691045907319186, + "grad_norm": 0.3392782211303711, + "learning_rate": 4.9430895409268086e-05, + "loss": 0.377, + "step": 841000 + }, + { + "epoch": 5.6944294066695536, + "grad_norm": 0.3352891802787781, + "learning_rate": 4.943055705933305e-05, + "loss": 0.3813, + "step": 841500 + }, + { + "epoch": 5.697812906019922, + "grad_norm": 0.3490934371948242, + "learning_rate": 4.943021870939801e-05, + "loss": 0.3785, + "step": 842000 + }, + { + "epoch": 5.70119640537029, + "grad_norm": 0.3238186538219452, + "learning_rate": 4.942988035946297e-05, + "loss": 0.3783, + "step": 842500 + }, + { + "epoch": 5.704579904720658, + "grad_norm": 0.340578556060791, + "learning_rate": 4.9429542009527935e-05, + "loss": 0.3805, + "step": 843000 + }, + { + "epoch": 5.707963404071027, + "grad_norm": 0.32143744826316833, + "learning_rate": 4.94292036595929e-05, + "loss": 0.3774, + "step": 843500 + }, + { + "epoch": 5.711346903421394, + "grad_norm": 0.3530731201171875, + "learning_rate": 4.9428865309657866e-05, + "loss": 0.3793, + "step": 844000 + }, + { + "epoch": 5.714730402771763, + "grad_norm": 0.3481243848800659, + "learning_rate": 4.942852695972283e-05, + "loss": 0.3788, + "step": 844500 + }, + { + "epoch": 5.7181139021221306, + "grad_norm": 0.35261571407318115, + "learning_rate": 4.942818860978779e-05, + "loss": 0.379, + "step": 845000 + }, + { + "epoch": 5.721497401472499, + "grad_norm": 0.3564860224723816, + "learning_rate": 4.942785025985275e-05, + "loss": 0.3783, + "step": 845500 + }, + { + "epoch": 5.724880900822867, + "grad_norm": 0.3480117917060852, + "learning_rate": 4.9427511909917714e-05, + "loss": 0.3785, + "step": 846000 + }, + { + "epoch": 5.728264400173235, + "grad_norm": 0.3417789041996002, + "learning_rate": 4.942717355998268e-05, + "loss": 0.3795, + "step": 846500 + }, + { + "epoch": 5.731647899523603, + "grad_norm": 0.3160167932510376, + "learning_rate": 4.942683521004764e-05, + "loss": 0.3797, + "step": 847000 + }, + { + "epoch": 5.735031398873971, + "grad_norm": 0.32285985350608826, + "learning_rate": 4.94264968601126e-05, + "loss": 0.3796, + "step": 847500 + }, + { + "epoch": 5.73841489822434, + "grad_norm": 0.3177568316459656, + "learning_rate": 4.942615851017757e-05, + "loss": 0.3794, + "step": 848000 + }, + { + "epoch": 5.741798397574708, + "grad_norm": 0.3382292687892914, + "learning_rate": 4.942582016024253e-05, + "loss": 0.3774, + "step": 848500 + }, + { + "epoch": 5.745181896925076, + "grad_norm": 0.3521333634853363, + "learning_rate": 4.9425481810307494e-05, + "loss": 0.3801, + "step": 849000 + }, + { + "epoch": 5.748565396275444, + "grad_norm": 0.3552185595035553, + "learning_rate": 4.9425143460372456e-05, + "loss": 0.3782, + "step": 849500 + }, + { + "epoch": 5.751948895625812, + "grad_norm": 0.37381207942962646, + "learning_rate": 4.9424805110437425e-05, + "loss": 0.3788, + "step": 850000 + }, + { + "epoch": 5.75533239497618, + "grad_norm": 0.3287741243839264, + "learning_rate": 4.942446676050239e-05, + "loss": 0.3793, + "step": 850500 + }, + { + "epoch": 5.758715894326548, + "grad_norm": 0.3196081519126892, + "learning_rate": 4.942412841056735e-05, + "loss": 0.3785, + "step": 851000 + }, + { + "epoch": 5.762099393676916, + "grad_norm": 0.31051212549209595, + "learning_rate": 4.942379006063231e-05, + "loss": 0.379, + "step": 851500 + }, + { + "epoch": 5.765482893027285, + "grad_norm": 0.3289923667907715, + "learning_rate": 4.942345171069727e-05, + "loss": 0.3785, + "step": 852000 + }, + { + "epoch": 5.768866392377653, + "grad_norm": 0.3510921597480774, + "learning_rate": 4.9423113360762235e-05, + "loss": 0.3798, + "step": 852500 + }, + { + "epoch": 5.772249891728021, + "grad_norm": 0.32584503293037415, + "learning_rate": 4.94227750108272e-05, + "loss": 0.3781, + "step": 853000 + }, + { + "epoch": 5.775633391078389, + "grad_norm": 0.3186558187007904, + "learning_rate": 4.9422436660892166e-05, + "loss": 0.3799, + "step": 853500 + }, + { + "epoch": 5.779016890428757, + "grad_norm": 0.34689462184906006, + "learning_rate": 4.942209831095713e-05, + "loss": 0.3786, + "step": 854000 + }, + { + "epoch": 5.782400389779125, + "grad_norm": 0.3494212031364441, + "learning_rate": 4.942175996102209e-05, + "loss": 0.378, + "step": 854500 + }, + { + "epoch": 5.785783889129493, + "grad_norm": 0.3484705984592438, + "learning_rate": 4.942142161108705e-05, + "loss": 0.3801, + "step": 855000 + }, + { + "epoch": 5.789167388479862, + "grad_norm": 0.3759133815765381, + "learning_rate": 4.9421083261152015e-05, + "loss": 0.3797, + "step": 855500 + }, + { + "epoch": 5.792550887830229, + "grad_norm": 0.34640076756477356, + "learning_rate": 4.9420744911216984e-05, + "loss": 0.3809, + "step": 856000 + }, + { + "epoch": 5.795934387180598, + "grad_norm": 0.3386498689651489, + "learning_rate": 4.942040656128194e-05, + "loss": 0.3791, + "step": 856500 + }, + { + "epoch": 5.799317886530966, + "grad_norm": 0.29726603627204895, + "learning_rate": 4.94200682113469e-05, + "loss": 0.379, + "step": 857000 + }, + { + "epoch": 5.802701385881334, + "grad_norm": 0.35376840829849243, + "learning_rate": 4.941972986141187e-05, + "loss": 0.3789, + "step": 857500 + }, + { + "epoch": 5.806084885231702, + "grad_norm": 0.33722490072250366, + "learning_rate": 4.941939151147683e-05, + "loss": 0.3788, + "step": 858000 + }, + { + "epoch": 5.80946838458207, + "grad_norm": 0.36351436376571655, + "learning_rate": 4.9419053161541794e-05, + "loss": 0.3785, + "step": 858500 + }, + { + "epoch": 5.812851883932439, + "grad_norm": 0.3795541226863861, + "learning_rate": 4.9418714811606756e-05, + "loss": 0.3782, + "step": 859000 + }, + { + "epoch": 5.816235383282806, + "grad_norm": 0.3474687933921814, + "learning_rate": 4.9418376461671725e-05, + "loss": 0.3783, + "step": 859500 + }, + { + "epoch": 5.819618882633175, + "grad_norm": 0.3508880138397217, + "learning_rate": 4.941803811173669e-05, + "loss": 0.3777, + "step": 860000 + }, + { + "epoch": 5.823002381983542, + "grad_norm": 0.32037851214408875, + "learning_rate": 4.941769976180165e-05, + "loss": 0.3801, + "step": 860500 + }, + { + "epoch": 5.826385881333911, + "grad_norm": 0.35430553555488586, + "learning_rate": 4.941736141186661e-05, + "loss": 0.3777, + "step": 861000 + }, + { + "epoch": 5.8297693806842785, + "grad_norm": 0.3321610391139984, + "learning_rate": 4.9417023061931574e-05, + "loss": 0.3774, + "step": 861500 + }, + { + "epoch": 5.833152880034647, + "grad_norm": 0.31289544701576233, + "learning_rate": 4.9416684711996536e-05, + "loss": 0.379, + "step": 862000 + }, + { + "epoch": 5.836536379385015, + "grad_norm": 0.3334125876426697, + "learning_rate": 4.94163463620615e-05, + "loss": 0.3777, + "step": 862500 + }, + { + "epoch": 5.839919878735383, + "grad_norm": 0.35061657428741455, + "learning_rate": 4.941600801212646e-05, + "loss": 0.3774, + "step": 863000 + }, + { + "epoch": 5.843303378085752, + "grad_norm": 0.3509184420108795, + "learning_rate": 4.941566966219143e-05, + "loss": 0.3784, + "step": 863500 + }, + { + "epoch": 5.846686877436119, + "grad_norm": 0.3638246953487396, + "learning_rate": 4.941533131225639e-05, + "loss": 0.3775, + "step": 864000 + }, + { + "epoch": 5.850070376786488, + "grad_norm": 0.3070426881313324, + "learning_rate": 4.941499296232135e-05, + "loss": 0.3773, + "step": 864500 + }, + { + "epoch": 5.8534538761368555, + "grad_norm": 0.32031184434890747, + "learning_rate": 4.9414654612386315e-05, + "loss": 0.379, + "step": 865000 + }, + { + "epoch": 5.856837375487224, + "grad_norm": 0.32593870162963867, + "learning_rate": 4.9414316262451284e-05, + "loss": 0.3765, + "step": 865500 + }, + { + "epoch": 5.860220874837592, + "grad_norm": 0.3688930869102478, + "learning_rate": 4.941397791251624e-05, + "loss": 0.3782, + "step": 866000 + }, + { + "epoch": 5.86360437418796, + "grad_norm": 0.348285049200058, + "learning_rate": 4.94136395625812e-05, + "loss": 0.3794, + "step": 866500 + }, + { + "epoch": 5.866987873538328, + "grad_norm": 0.3707033693790436, + "learning_rate": 4.941330121264617e-05, + "loss": 0.3792, + "step": 867000 + }, + { + "epoch": 5.870371372888696, + "grad_norm": 0.38194599747657776, + "learning_rate": 4.941296286271113e-05, + "loss": 0.3775, + "step": 867500 + }, + { + "epoch": 5.873754872239065, + "grad_norm": 0.3280165493488312, + "learning_rate": 4.9412624512776095e-05, + "loss": 0.3796, + "step": 868000 + }, + { + "epoch": 5.8771383715894325, + "grad_norm": 0.34023532271385193, + "learning_rate": 4.941228616284106e-05, + "loss": 0.3801, + "step": 868500 + }, + { + "epoch": 5.880521870939801, + "grad_norm": 0.3545888364315033, + "learning_rate": 4.9411947812906026e-05, + "loss": 0.3787, + "step": 869000 + }, + { + "epoch": 5.883905370290169, + "grad_norm": 0.3295728862285614, + "learning_rate": 4.941160946297099e-05, + "loss": 0.3777, + "step": 869500 + }, + { + "epoch": 5.887288869640537, + "grad_norm": 0.32660236954689026, + "learning_rate": 4.941127111303595e-05, + "loss": 0.3797, + "step": 870000 + }, + { + "epoch": 5.890672368990905, + "grad_norm": 0.36814743280410767, + "learning_rate": 4.941093276310091e-05, + "loss": 0.38, + "step": 870500 + }, + { + "epoch": 5.894055868341273, + "grad_norm": 0.3505696952342987, + "learning_rate": 4.9410594413165874e-05, + "loss": 0.3795, + "step": 871000 + }, + { + "epoch": 5.897439367691641, + "grad_norm": 0.30137890577316284, + "learning_rate": 4.9410256063230837e-05, + "loss": 0.3773, + "step": 871500 + }, + { + "epoch": 5.9008228670420095, + "grad_norm": 0.35418471693992615, + "learning_rate": 4.94099177132958e-05, + "loss": 0.3791, + "step": 872000 + }, + { + "epoch": 5.904206366392378, + "grad_norm": 0.3241247236728668, + "learning_rate": 4.940957936336076e-05, + "loss": 0.3782, + "step": 872500 + }, + { + "epoch": 5.907589865742746, + "grad_norm": 0.3622310161590576, + "learning_rate": 4.940924101342573e-05, + "loss": 0.379, + "step": 873000 + }, + { + "epoch": 5.910973365093114, + "grad_norm": 0.3449992835521698, + "learning_rate": 4.940890266349069e-05, + "loss": 0.3789, + "step": 873500 + }, + { + "epoch": 5.914356864443482, + "grad_norm": 0.3535126745700836, + "learning_rate": 4.9408564313555654e-05, + "loss": 0.3788, + "step": 874000 + }, + { + "epoch": 5.91774036379385, + "grad_norm": 0.3176920711994171, + "learning_rate": 4.9408225963620616e-05, + "loss": 0.3779, + "step": 874500 + }, + { + "epoch": 5.921123863144218, + "grad_norm": 0.34803539514541626, + "learning_rate": 4.9407887613685585e-05, + "loss": 0.3782, + "step": 875000 + }, + { + "epoch": 5.9245073624945865, + "grad_norm": 0.31535181403160095, + "learning_rate": 4.940754926375054e-05, + "loss": 0.3796, + "step": 875500 + }, + { + "epoch": 5.927890861844954, + "grad_norm": 0.3422606587409973, + "learning_rate": 4.94072109138155e-05, + "loss": 0.3776, + "step": 876000 + }, + { + "epoch": 5.931274361195323, + "grad_norm": 0.3442267179489136, + "learning_rate": 4.940687256388047e-05, + "loss": 0.3779, + "step": 876500 + }, + { + "epoch": 5.934657860545691, + "grad_norm": 0.3561495840549469, + "learning_rate": 4.9406534213945433e-05, + "loss": 0.3785, + "step": 877000 + }, + { + "epoch": 5.938041359896059, + "grad_norm": 0.30205509066581726, + "learning_rate": 4.9406195864010396e-05, + "loss": 0.3792, + "step": 877500 + }, + { + "epoch": 5.941424859246427, + "grad_norm": 0.32767045497894287, + "learning_rate": 4.940585751407536e-05, + "loss": 0.3796, + "step": 878000 + }, + { + "epoch": 5.944808358596795, + "grad_norm": 0.3319656252861023, + "learning_rate": 4.9405519164140327e-05, + "loss": 0.3802, + "step": 878500 + }, + { + "epoch": 5.9481918579471635, + "grad_norm": 0.3690793514251709, + "learning_rate": 4.940518081420529e-05, + "loss": 0.3787, + "step": 879000 + }, + { + "epoch": 5.951575357297531, + "grad_norm": 0.32809531688690186, + "learning_rate": 4.940484246427025e-05, + "loss": 0.3801, + "step": 879500 + }, + { + "epoch": 5.9549588566479, + "grad_norm": 0.35235708951950073, + "learning_rate": 4.940450411433521e-05, + "loss": 0.3787, + "step": 880000 + }, + { + "epoch": 5.958342355998267, + "grad_norm": 0.3218901455402374, + "learning_rate": 4.9404165764400175e-05, + "loss": 0.381, + "step": 880500 + }, + { + "epoch": 5.961725855348636, + "grad_norm": 0.34744882583618164, + "learning_rate": 4.940382741446514e-05, + "loss": 0.3801, + "step": 881000 + }, + { + "epoch": 5.965109354699004, + "grad_norm": 0.35583585500717163, + "learning_rate": 4.94034890645301e-05, + "loss": 0.3782, + "step": 881500 + }, + { + "epoch": 5.968492854049372, + "grad_norm": 0.3385174870491028, + "learning_rate": 4.940315071459506e-05, + "loss": 0.3798, + "step": 882000 + }, + { + "epoch": 5.97187635339974, + "grad_norm": 0.3421972393989563, + "learning_rate": 4.940281236466003e-05, + "loss": 0.3775, + "step": 882500 + }, + { + "epoch": 5.975259852750108, + "grad_norm": 0.3534969687461853, + "learning_rate": 4.940247401472499e-05, + "loss": 0.3793, + "step": 883000 + }, + { + "epoch": 5.978643352100477, + "grad_norm": 0.3438570201396942, + "learning_rate": 4.9402135664789955e-05, + "loss": 0.3778, + "step": 883500 + }, + { + "epoch": 5.982026851450844, + "grad_norm": 0.33352580666542053, + "learning_rate": 4.940179731485492e-05, + "loss": 0.3793, + "step": 884000 + }, + { + "epoch": 5.985410350801213, + "grad_norm": 0.3402976989746094, + "learning_rate": 4.9401458964919886e-05, + "loss": 0.3785, + "step": 884500 + }, + { + "epoch": 5.9887938501515805, + "grad_norm": 0.33715447783470154, + "learning_rate": 4.940112061498484e-05, + "loss": 0.3801, + "step": 885000 + }, + { + "epoch": 5.992177349501949, + "grad_norm": 0.38319554924964905, + "learning_rate": 4.94007822650498e-05, + "loss": 0.3776, + "step": 885500 + }, + { + "epoch": 5.995560848852317, + "grad_norm": 0.3456675708293915, + "learning_rate": 4.940044391511477e-05, + "loss": 0.3768, + "step": 886000 + }, + { + "epoch": 5.998944348202685, + "grad_norm": 0.3558131754398346, + "learning_rate": 4.9400105565179734e-05, + "loss": 0.38, + "step": 886500 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.85590746471741, + "eval_loss": 0.5863214135169983, + "eval_runtime": 3371.7967, + "eval_samples_per_second": 86.228, + "eval_steps_per_second": 5.389, + "step": 886656 + }, + { + "epoch": 6.002327847553054, + "grad_norm": 0.38339632749557495, + "learning_rate": 4.9399767215244696e-05, + "loss": 0.3773, + "step": 887000 + }, + { + "epoch": 6.005711346903421, + "grad_norm": 0.3409227132797241, + "learning_rate": 4.939942886530966e-05, + "loss": 0.377, + "step": 887500 + }, + { + "epoch": 6.00909484625379, + "grad_norm": 0.35882502794265747, + "learning_rate": 4.939909051537463e-05, + "loss": 0.3754, + "step": 888000 + }, + { + "epoch": 6.0124783456041575, + "grad_norm": 0.3217174708843231, + "learning_rate": 4.939875216543959e-05, + "loss": 0.3769, + "step": 888500 + }, + { + "epoch": 6.015861844954526, + "grad_norm": 0.3731520473957062, + "learning_rate": 4.939841381550455e-05, + "loss": 0.3763, + "step": 889000 + }, + { + "epoch": 6.019245344304894, + "grad_norm": 0.34831157326698303, + "learning_rate": 4.9398075465569514e-05, + "loss": 0.3773, + "step": 889500 + }, + { + "epoch": 6.022628843655262, + "grad_norm": 0.36046484112739563, + "learning_rate": 4.9397737115634476e-05, + "loss": 0.3771, + "step": 890000 + }, + { + "epoch": 6.02601234300563, + "grad_norm": 0.3497769832611084, + "learning_rate": 4.939739876569944e-05, + "loss": 0.3776, + "step": 890500 + }, + { + "epoch": 6.029395842355998, + "grad_norm": 0.3198765814304352, + "learning_rate": 4.93970604157644e-05, + "loss": 0.3771, + "step": 891000 + }, + { + "epoch": 6.032779341706366, + "grad_norm": 0.3285377323627472, + "learning_rate": 4.939672206582936e-05, + "loss": 0.3755, + "step": 891500 + }, + { + "epoch": 6.0361628410567345, + "grad_norm": 0.3330004811286926, + "learning_rate": 4.939638371589433e-05, + "loss": 0.3773, + "step": 892000 + }, + { + "epoch": 6.039546340407103, + "grad_norm": 0.3420369029045105, + "learning_rate": 4.939604536595929e-05, + "loss": 0.3775, + "step": 892500 + }, + { + "epoch": 6.042929839757471, + "grad_norm": 0.36481571197509766, + "learning_rate": 4.9395707016024255e-05, + "loss": 0.3773, + "step": 893000 + }, + { + "epoch": 6.046313339107839, + "grad_norm": 0.34153133630752563, + "learning_rate": 4.939536866608922e-05, + "loss": 0.3747, + "step": 893500 + }, + { + "epoch": 6.049696838458207, + "grad_norm": 0.3566543161869049, + "learning_rate": 4.9395030316154186e-05, + "loss": 0.3758, + "step": 894000 + }, + { + "epoch": 6.053080337808575, + "grad_norm": 0.3604832887649536, + "learning_rate": 4.939469196621914e-05, + "loss": 0.3757, + "step": 894500 + }, + { + "epoch": 6.056463837158943, + "grad_norm": 0.35277846455574036, + "learning_rate": 4.9394353616284104e-05, + "loss": 0.3771, + "step": 895000 + }, + { + "epoch": 6.0598473365093115, + "grad_norm": 0.34369081258773804, + "learning_rate": 4.939401526634907e-05, + "loss": 0.3769, + "step": 895500 + }, + { + "epoch": 6.063230835859679, + "grad_norm": 0.31082791090011597, + "learning_rate": 4.9393676916414035e-05, + "loss": 0.3771, + "step": 896000 + }, + { + "epoch": 6.066614335210048, + "grad_norm": 0.34710177779197693, + "learning_rate": 4.9393338566479e-05, + "loss": 0.3758, + "step": 896500 + }, + { + "epoch": 6.069997834560416, + "grad_norm": 0.3685329556465149, + "learning_rate": 4.939300021654396e-05, + "loss": 0.3777, + "step": 897000 + }, + { + "epoch": 6.073381333910784, + "grad_norm": 0.30597200989723206, + "learning_rate": 4.939266186660893e-05, + "loss": 0.3778, + "step": 897500 + }, + { + "epoch": 6.076764833261152, + "grad_norm": 0.34194767475128174, + "learning_rate": 4.939232351667389e-05, + "loss": 0.3766, + "step": 898000 + }, + { + "epoch": 6.08014833261152, + "grad_norm": 0.33248472213745117, + "learning_rate": 4.939198516673885e-05, + "loss": 0.3761, + "step": 898500 + }, + { + "epoch": 6.0835318319618885, + "grad_norm": 0.32487326860427856, + "learning_rate": 4.9391646816803814e-05, + "loss": 0.377, + "step": 899000 + }, + { + "epoch": 6.086915331312256, + "grad_norm": 0.37924644351005554, + "learning_rate": 4.9391308466868776e-05, + "loss": 0.3773, + "step": 899500 + }, + { + "epoch": 6.090298830662625, + "grad_norm": 0.3005082905292511, + "learning_rate": 4.939097011693374e-05, + "loss": 0.3764, + "step": 900000 + }, + { + "epoch": 6.093682330012992, + "grad_norm": 0.3714136481285095, + "learning_rate": 4.93906317669987e-05, + "loss": 0.376, + "step": 900500 + }, + { + "epoch": 6.097065829363361, + "grad_norm": 0.35402193665504456, + "learning_rate": 4.939029341706366e-05, + "loss": 0.3782, + "step": 901000 + }, + { + "epoch": 6.1004493287137285, + "grad_norm": 0.40786847472190857, + "learning_rate": 4.938995506712863e-05, + "loss": 0.3776, + "step": 901500 + }, + { + "epoch": 6.103832828064097, + "grad_norm": 0.36258766055107117, + "learning_rate": 4.9389616717193594e-05, + "loss": 0.3774, + "step": 902000 + }, + { + "epoch": 6.1072163274144655, + "grad_norm": 0.3297172486782074, + "learning_rate": 4.9389278367258556e-05, + "loss": 0.3776, + "step": 902500 + }, + { + "epoch": 6.110599826764833, + "grad_norm": 0.34612369537353516, + "learning_rate": 4.938894001732352e-05, + "loss": 0.3785, + "step": 903000 + }, + { + "epoch": 6.113983326115202, + "grad_norm": 0.3194482624530792, + "learning_rate": 4.938860166738849e-05, + "loss": 0.3777, + "step": 903500 + }, + { + "epoch": 6.117366825465569, + "grad_norm": 0.3068385720252991, + "learning_rate": 4.938826331745345e-05, + "loss": 0.3786, + "step": 904000 + }, + { + "epoch": 6.120750324815938, + "grad_norm": 0.3310864567756653, + "learning_rate": 4.9387924967518404e-05, + "loss": 0.3769, + "step": 904500 + }, + { + "epoch": 6.1241338241663055, + "grad_norm": 0.363061785697937, + "learning_rate": 4.938758661758337e-05, + "loss": 0.3778, + "step": 905000 + }, + { + "epoch": 6.127517323516674, + "grad_norm": 0.3285543620586395, + "learning_rate": 4.9387248267648335e-05, + "loss": 0.378, + "step": 905500 + }, + { + "epoch": 6.130900822867042, + "grad_norm": 0.32650935649871826, + "learning_rate": 4.93869099177133e-05, + "loss": 0.3756, + "step": 906000 + }, + { + "epoch": 6.13428432221741, + "grad_norm": 0.36548131704330444, + "learning_rate": 4.938657156777826e-05, + "loss": 0.3773, + "step": 906500 + }, + { + "epoch": 6.137667821567779, + "grad_norm": 0.3245476186275482, + "learning_rate": 4.938623321784323e-05, + "loss": 0.3761, + "step": 907000 + }, + { + "epoch": 6.141051320918146, + "grad_norm": 0.32007771730422974, + "learning_rate": 4.938589486790819e-05, + "loss": 0.3772, + "step": 907500 + }, + { + "epoch": 6.144434820268515, + "grad_norm": 0.3352614939212799, + "learning_rate": 4.938555651797315e-05, + "loss": 0.3768, + "step": 908000 + }, + { + "epoch": 6.1478183196188825, + "grad_norm": 0.31153857707977295, + "learning_rate": 4.9385218168038115e-05, + "loss": 0.376, + "step": 908500 + }, + { + "epoch": 6.151201818969251, + "grad_norm": 0.36174407601356506, + "learning_rate": 4.938487981810308e-05, + "loss": 0.3772, + "step": 909000 + }, + { + "epoch": 6.154585318319619, + "grad_norm": 0.36423182487487793, + "learning_rate": 4.938454146816804e-05, + "loss": 0.3764, + "step": 909500 + }, + { + "epoch": 6.157968817669987, + "grad_norm": 0.302528440952301, + "learning_rate": 4.9384203118233e-05, + "loss": 0.3767, + "step": 910000 + }, + { + "epoch": 6.161352317020355, + "grad_norm": 0.3159232437610626, + "learning_rate": 4.938386476829796e-05, + "loss": 0.3763, + "step": 910500 + }, + { + "epoch": 6.164735816370723, + "grad_norm": 0.3770563304424286, + "learning_rate": 4.938352641836293e-05, + "loss": 0.3774, + "step": 911000 + }, + { + "epoch": 6.168119315721091, + "grad_norm": 0.3554304242134094, + "learning_rate": 4.9383188068427894e-05, + "loss": 0.3776, + "step": 911500 + }, + { + "epoch": 6.1715028150714595, + "grad_norm": 0.3524431884288788, + "learning_rate": 4.9382849718492857e-05, + "loss": 0.3774, + "step": 912000 + }, + { + "epoch": 6.174886314421828, + "grad_norm": 0.3047381639480591, + "learning_rate": 4.938251136855782e-05, + "loss": 0.3764, + "step": 912500 + }, + { + "epoch": 6.178269813772196, + "grad_norm": 0.3575003743171692, + "learning_rate": 4.938217301862279e-05, + "loss": 0.3767, + "step": 913000 + }, + { + "epoch": 6.181653313122564, + "grad_norm": 0.3069573938846588, + "learning_rate": 4.938183466868775e-05, + "loss": 0.3755, + "step": 913500 + }, + { + "epoch": 6.185036812472932, + "grad_norm": 0.3316364884376526, + "learning_rate": 4.9381496318752705e-05, + "loss": 0.3788, + "step": 914000 + }, + { + "epoch": 6.1884203118233, + "grad_norm": 0.414703905582428, + "learning_rate": 4.9381157968817674e-05, + "loss": 0.377, + "step": 914500 + }, + { + "epoch": 6.191803811173668, + "grad_norm": 0.2995516359806061, + "learning_rate": 4.9380819618882636e-05, + "loss": 0.3782, + "step": 915000 + }, + { + "epoch": 6.1951873105240365, + "grad_norm": 0.32982611656188965, + "learning_rate": 4.93804812689476e-05, + "loss": 0.3773, + "step": 915500 + }, + { + "epoch": 6.198570809874404, + "grad_norm": 0.3949192464351654, + "learning_rate": 4.938014291901256e-05, + "loss": 0.3782, + "step": 916000 + }, + { + "epoch": 6.201954309224773, + "grad_norm": 0.34046921133995056, + "learning_rate": 4.937980456907752e-05, + "loss": 0.3787, + "step": 916500 + }, + { + "epoch": 6.205337808575141, + "grad_norm": 0.3586592674255371, + "learning_rate": 4.937946621914249e-05, + "loss": 0.3768, + "step": 917000 + }, + { + "epoch": 6.208721307925509, + "grad_norm": 0.31867897510528564, + "learning_rate": 4.9379127869207453e-05, + "loss": 0.3786, + "step": 917500 + }, + { + "epoch": 6.212104807275877, + "grad_norm": 0.3539881408214569, + "learning_rate": 4.9378789519272416e-05, + "loss": 0.3756, + "step": 918000 + }, + { + "epoch": 6.215488306626245, + "grad_norm": 0.3635047972202301, + "learning_rate": 4.937845116933738e-05, + "loss": 0.3783, + "step": 918500 + }, + { + "epoch": 6.2188718059766135, + "grad_norm": 0.3581390082836151, + "learning_rate": 4.937811281940234e-05, + "loss": 0.3786, + "step": 919000 + }, + { + "epoch": 6.222255305326981, + "grad_norm": 0.3544600009918213, + "learning_rate": 4.93777744694673e-05, + "loss": 0.3756, + "step": 919500 + }, + { + "epoch": 6.22563880467735, + "grad_norm": 0.345814049243927, + "learning_rate": 4.9377436119532264e-05, + "loss": 0.3783, + "step": 920000 + }, + { + "epoch": 6.229022304027717, + "grad_norm": 0.3887973129749298, + "learning_rate": 4.937709776959723e-05, + "loss": 0.3786, + "step": 920500 + }, + { + "epoch": 6.232405803378086, + "grad_norm": 0.371076762676239, + "learning_rate": 4.9376759419662195e-05, + "loss": 0.3786, + "step": 921000 + }, + { + "epoch": 6.235789302728454, + "grad_norm": 0.3317933976650238, + "learning_rate": 4.937642106972716e-05, + "loss": 0.3763, + "step": 921500 + }, + { + "epoch": 6.239172802078822, + "grad_norm": 0.36534351110458374, + "learning_rate": 4.937608271979212e-05, + "loss": 0.3772, + "step": 922000 + }, + { + "epoch": 6.2425563014291905, + "grad_norm": 0.36879029870033264, + "learning_rate": 4.937574436985709e-05, + "loss": 0.3769, + "step": 922500 + }, + { + "epoch": 6.245939800779558, + "grad_norm": 0.36423808336257935, + "learning_rate": 4.937540601992205e-05, + "loss": 0.3777, + "step": 923000 + }, + { + "epoch": 6.249323300129927, + "grad_norm": 0.30669546127319336, + "learning_rate": 4.9375067669987006e-05, + "loss": 0.3779, + "step": 923500 + }, + { + "epoch": 6.252706799480294, + "grad_norm": 0.38380688428878784, + "learning_rate": 4.9374729320051975e-05, + "loss": 0.3781, + "step": 924000 + }, + { + "epoch": 6.256090298830663, + "grad_norm": 0.3103152811527252, + "learning_rate": 4.937439097011694e-05, + "loss": 0.3781, + "step": 924500 + }, + { + "epoch": 6.2594737981810304, + "grad_norm": 0.36598241329193115, + "learning_rate": 4.93740526201819e-05, + "loss": 0.377, + "step": 925000 + }, + { + "epoch": 6.262857297531399, + "grad_norm": 0.3761679530143738, + "learning_rate": 4.937371427024686e-05, + "loss": 0.3766, + "step": 925500 + }, + { + "epoch": 6.266240796881767, + "grad_norm": 0.31851160526275635, + "learning_rate": 4.937337592031182e-05, + "loss": 0.3769, + "step": 926000 + }, + { + "epoch": 6.269624296232135, + "grad_norm": 0.32151755690574646, + "learning_rate": 4.937303757037679e-05, + "loss": 0.3769, + "step": 926500 + }, + { + "epoch": 6.273007795582504, + "grad_norm": 0.3226669728755951, + "learning_rate": 4.9372699220441754e-05, + "loss": 0.3786, + "step": 927000 + }, + { + "epoch": 6.276391294932871, + "grad_norm": 0.35841917991638184, + "learning_rate": 4.9372360870506716e-05, + "loss": 0.3782, + "step": 927500 + }, + { + "epoch": 6.27977479428324, + "grad_norm": 0.35071441531181335, + "learning_rate": 4.937202252057168e-05, + "loss": 0.3771, + "step": 928000 + }, + { + "epoch": 6.2831582936336074, + "grad_norm": 0.3182418644428253, + "learning_rate": 4.937168417063664e-05, + "loss": 0.3767, + "step": 928500 + }, + { + "epoch": 6.286541792983976, + "grad_norm": 0.33331987261772156, + "learning_rate": 4.93713458207016e-05, + "loss": 0.3777, + "step": 929000 + }, + { + "epoch": 6.289925292334344, + "grad_norm": 0.3495878577232361, + "learning_rate": 4.9371007470766565e-05, + "loss": 0.3774, + "step": 929500 + }, + { + "epoch": 6.293308791684712, + "grad_norm": 0.39855191111564636, + "learning_rate": 4.9370669120831534e-05, + "loss": 0.3764, + "step": 930000 + }, + { + "epoch": 6.29669229103508, + "grad_norm": 0.33193016052246094, + "learning_rate": 4.9370330770896496e-05, + "loss": 0.3778, + "step": 930500 + }, + { + "epoch": 6.300075790385448, + "grad_norm": 0.32373687624931335, + "learning_rate": 4.936999242096146e-05, + "loss": 0.377, + "step": 931000 + }, + { + "epoch": 6.303459289735816, + "grad_norm": 0.32472047209739685, + "learning_rate": 4.936965407102642e-05, + "loss": 0.3763, + "step": 931500 + }, + { + "epoch": 6.3068427890861845, + "grad_norm": 0.37571972608566284, + "learning_rate": 4.936931572109139e-05, + "loss": 0.3768, + "step": 932000 + }, + { + "epoch": 6.310226288436553, + "grad_norm": 0.3355121910572052, + "learning_rate": 4.936897737115635e-05, + "loss": 0.3767, + "step": 932500 + }, + { + "epoch": 6.313609787786921, + "grad_norm": 0.340402752161026, + "learning_rate": 4.9368639021221306e-05, + "loss": 0.378, + "step": 933000 + }, + { + "epoch": 6.316993287137289, + "grad_norm": 0.36386340856552124, + "learning_rate": 4.936830067128627e-05, + "loss": 0.3771, + "step": 933500 + }, + { + "epoch": 6.320376786487657, + "grad_norm": 0.3460202217102051, + "learning_rate": 4.936796232135124e-05, + "loss": 0.3762, + "step": 934000 + }, + { + "epoch": 6.323760285838025, + "grad_norm": 0.3330344557762146, + "learning_rate": 4.93676239714162e-05, + "loss": 0.3775, + "step": 934500 + }, + { + "epoch": 6.327143785188393, + "grad_norm": 0.3709957003593445, + "learning_rate": 4.936728562148116e-05, + "loss": 0.377, + "step": 935000 + }, + { + "epoch": 6.3305272845387615, + "grad_norm": 0.3179211914539337, + "learning_rate": 4.9366947271546124e-05, + "loss": 0.3779, + "step": 935500 + }, + { + "epoch": 6.333910783889129, + "grad_norm": 0.35370567440986633, + "learning_rate": 4.936660892161109e-05, + "loss": 0.3781, + "step": 936000 + }, + { + "epoch": 6.337294283239498, + "grad_norm": 0.3791564404964447, + "learning_rate": 4.9366270571676055e-05, + "loss": 0.3784, + "step": 936500 + }, + { + "epoch": 6.340677782589866, + "grad_norm": 0.304982453584671, + "learning_rate": 4.936593222174102e-05, + "loss": 0.3783, + "step": 937000 + }, + { + "epoch": 6.344061281940234, + "grad_norm": 0.3278897702693939, + "learning_rate": 4.936559387180598e-05, + "loss": 0.3774, + "step": 937500 + }, + { + "epoch": 6.347444781290602, + "grad_norm": 0.333046019077301, + "learning_rate": 4.936525552187094e-05, + "loss": 0.3774, + "step": 938000 + }, + { + "epoch": 6.35082828064097, + "grad_norm": 0.34410935640335083, + "learning_rate": 4.93649171719359e-05, + "loss": 0.3776, + "step": 938500 + }, + { + "epoch": 6.3542117799913385, + "grad_norm": 0.3665798604488373, + "learning_rate": 4.9364578822000865e-05, + "loss": 0.3796, + "step": 939000 + }, + { + "epoch": 6.357595279341706, + "grad_norm": 0.31881019473075867, + "learning_rate": 4.9364240472065834e-05, + "loss": 0.3764, + "step": 939500 + }, + { + "epoch": 6.360978778692075, + "grad_norm": 0.33913347125053406, + "learning_rate": 4.9363902122130796e-05, + "loss": 0.3768, + "step": 940000 + }, + { + "epoch": 6.364362278042442, + "grad_norm": 0.3458954691886902, + "learning_rate": 4.936356377219576e-05, + "loss": 0.3763, + "step": 940500 + }, + { + "epoch": 6.367745777392811, + "grad_norm": 0.35770922899246216, + "learning_rate": 4.936322542226072e-05, + "loss": 0.3779, + "step": 941000 + }, + { + "epoch": 6.371129276743179, + "grad_norm": 0.3633727431297302, + "learning_rate": 4.936288707232569e-05, + "loss": 0.3766, + "step": 941500 + }, + { + "epoch": 6.374512776093547, + "grad_norm": 0.35198426246643066, + "learning_rate": 4.936254872239065e-05, + "loss": 0.3774, + "step": 942000 + }, + { + "epoch": 6.3778962754439155, + "grad_norm": 0.3605923652648926, + "learning_rate": 4.936221037245561e-05, + "loss": 0.3784, + "step": 942500 + }, + { + "epoch": 6.381279774794283, + "grad_norm": 0.3893333077430725, + "learning_rate": 4.936187202252057e-05, + "loss": 0.376, + "step": 943000 + }, + { + "epoch": 6.384663274144652, + "grad_norm": 0.34985458850860596, + "learning_rate": 4.936153367258554e-05, + "loss": 0.3777, + "step": 943500 + }, + { + "epoch": 6.388046773495019, + "grad_norm": 0.3702256977558136, + "learning_rate": 4.93611953226505e-05, + "loss": 0.3772, + "step": 944000 + }, + { + "epoch": 6.391430272845388, + "grad_norm": 0.337545782327652, + "learning_rate": 4.936085697271546e-05, + "loss": 0.3775, + "step": 944500 + }, + { + "epoch": 6.394813772195755, + "grad_norm": 0.3542190194129944, + "learning_rate": 4.9360518622780424e-05, + "loss": 0.3773, + "step": 945000 + }, + { + "epoch": 6.398197271546124, + "grad_norm": 0.3288930058479309, + "learning_rate": 4.936018027284539e-05, + "loss": 0.3761, + "step": 945500 + }, + { + "epoch": 6.401580770896492, + "grad_norm": 0.3794701397418976, + "learning_rate": 4.9359841922910355e-05, + "loss": 0.3774, + "step": 946000 + }, + { + "epoch": 6.40496427024686, + "grad_norm": 0.3350881338119507, + "learning_rate": 4.935950357297532e-05, + "loss": 0.3784, + "step": 946500 + }, + { + "epoch": 6.408347769597229, + "grad_norm": 0.3109876811504364, + "learning_rate": 4.935916522304028e-05, + "loss": 0.3765, + "step": 947000 + }, + { + "epoch": 6.411731268947596, + "grad_norm": 0.362585186958313, + "learning_rate": 4.935882687310524e-05, + "loss": 0.3779, + "step": 947500 + }, + { + "epoch": 6.415114768297965, + "grad_norm": 0.3090763986110687, + "learning_rate": 4.9358488523170204e-05, + "loss": 0.378, + "step": 948000 + }, + { + "epoch": 6.418498267648332, + "grad_norm": 0.3567908704280853, + "learning_rate": 4.9358150173235166e-05, + "loss": 0.3767, + "step": 948500 + }, + { + "epoch": 6.421881766998701, + "grad_norm": 0.3453218936920166, + "learning_rate": 4.9357811823300135e-05, + "loss": 0.3772, + "step": 949000 + }, + { + "epoch": 6.425265266349069, + "grad_norm": 0.365347683429718, + "learning_rate": 4.93574734733651e-05, + "loss": 0.3777, + "step": 949500 + }, + { + "epoch": 6.428648765699437, + "grad_norm": 0.3815634846687317, + "learning_rate": 4.935713512343006e-05, + "loss": 0.3764, + "step": 950000 + }, + { + "epoch": 6.432032265049805, + "grad_norm": 0.3377106785774231, + "learning_rate": 4.935679677349502e-05, + "loss": 0.3776, + "step": 950500 + }, + { + "epoch": 6.435415764400173, + "grad_norm": 0.3434532582759857, + "learning_rate": 4.935645842355999e-05, + "loss": 0.3775, + "step": 951000 + }, + { + "epoch": 6.438799263750541, + "grad_norm": 0.37483832240104675, + "learning_rate": 4.935612007362495e-05, + "loss": 0.3769, + "step": 951500 + }, + { + "epoch": 6.442182763100909, + "grad_norm": 0.34985026717185974, + "learning_rate": 4.935578172368991e-05, + "loss": 0.3781, + "step": 952000 + }, + { + "epoch": 6.445566262451278, + "grad_norm": 0.34416043758392334, + "learning_rate": 4.935544337375487e-05, + "loss": 0.3774, + "step": 952500 + }, + { + "epoch": 6.448949761801646, + "grad_norm": 0.37591052055358887, + "learning_rate": 4.935510502381984e-05, + "loss": 0.3757, + "step": 953000 + }, + { + "epoch": 6.452333261152014, + "grad_norm": 0.35309016704559326, + "learning_rate": 4.93547666738848e-05, + "loss": 0.3781, + "step": 953500 + }, + { + "epoch": 6.455716760502382, + "grad_norm": 0.3455849587917328, + "learning_rate": 4.935442832394976e-05, + "loss": 0.3775, + "step": 954000 + }, + { + "epoch": 6.45910025985275, + "grad_norm": 0.30579593777656555, + "learning_rate": 4.9354089974014725e-05, + "loss": 0.378, + "step": 954500 + }, + { + "epoch": 6.462483759203118, + "grad_norm": 0.34462663531303406, + "learning_rate": 4.9353751624079694e-05, + "loss": 0.3782, + "step": 955000 + }, + { + "epoch": 6.465867258553486, + "grad_norm": 0.35439208149909973, + "learning_rate": 4.9353413274144656e-05, + "loss": 0.3787, + "step": 955500 + }, + { + "epoch": 6.469250757903854, + "grad_norm": 0.3425807058811188, + "learning_rate": 4.935307492420962e-05, + "loss": 0.3787, + "step": 956000 + }, + { + "epoch": 6.472634257254223, + "grad_norm": 0.33159133791923523, + "learning_rate": 4.935273657427458e-05, + "loss": 0.3778, + "step": 956500 + }, + { + "epoch": 6.476017756604591, + "grad_norm": 0.3423265814781189, + "learning_rate": 4.935239822433954e-05, + "loss": 0.3787, + "step": 957000 + }, + { + "epoch": 6.479401255954959, + "grad_norm": 0.3442203998565674, + "learning_rate": 4.9352059874404504e-05, + "loss": 0.3783, + "step": 957500 + }, + { + "epoch": 6.482784755305327, + "grad_norm": 0.3310578167438507, + "learning_rate": 4.9351721524469467e-05, + "loss": 0.3781, + "step": 958000 + }, + { + "epoch": 6.486168254655695, + "grad_norm": 0.37686657905578613, + "learning_rate": 4.9351383174534435e-05, + "loss": 0.3765, + "step": 958500 + }, + { + "epoch": 6.489551754006063, + "grad_norm": 0.3256211578845978, + "learning_rate": 4.93510448245994e-05, + "loss": 0.3777, + "step": 959000 + }, + { + "epoch": 6.492935253356431, + "grad_norm": 0.33287709951400757, + "learning_rate": 4.935070647466436e-05, + "loss": 0.3756, + "step": 959500 + }, + { + "epoch": 6.4963187527068, + "grad_norm": 0.33640360832214355, + "learning_rate": 4.935036812472932e-05, + "loss": 0.3785, + "step": 960000 + }, + { + "epoch": 6.499702252057167, + "grad_norm": 0.35884687304496765, + "learning_rate": 4.935002977479429e-05, + "loss": 0.3765, + "step": 960500 + }, + { + "epoch": 6.503085751407536, + "grad_norm": 0.3432214856147766, + "learning_rate": 4.934969142485925e-05, + "loss": 0.3772, + "step": 961000 + }, + { + "epoch": 6.506469250757904, + "grad_norm": 0.34991124272346497, + "learning_rate": 4.934935307492421e-05, + "loss": 0.3781, + "step": 961500 + }, + { + "epoch": 6.509852750108272, + "grad_norm": 0.3342727720737457, + "learning_rate": 4.934901472498917e-05, + "loss": 0.3767, + "step": 962000 + }, + { + "epoch": 6.51323624945864, + "grad_norm": 0.35070064663887024, + "learning_rate": 4.934867637505414e-05, + "loss": 0.3779, + "step": 962500 + }, + { + "epoch": 6.516619748809008, + "grad_norm": 0.3440904915332794, + "learning_rate": 4.93483380251191e-05, + "loss": 0.3783, + "step": 963000 + }, + { + "epoch": 6.520003248159377, + "grad_norm": 0.31607815623283386, + "learning_rate": 4.9347999675184063e-05, + "loss": 0.3758, + "step": 963500 + }, + { + "epoch": 6.523386747509744, + "grad_norm": 0.3344024121761322, + "learning_rate": 4.9347661325249026e-05, + "loss": 0.3784, + "step": 964000 + }, + { + "epoch": 6.526770246860113, + "grad_norm": 0.33444446325302124, + "learning_rate": 4.9347322975313994e-05, + "loss": 0.376, + "step": 964500 + }, + { + "epoch": 6.53015374621048, + "grad_norm": 0.3433579206466675, + "learning_rate": 4.9346984625378957e-05, + "loss": 0.3776, + "step": 965000 + }, + { + "epoch": 6.533537245560849, + "grad_norm": 0.3734302520751953, + "learning_rate": 4.934664627544392e-05, + "loss": 0.3775, + "step": 965500 + }, + { + "epoch": 6.536920744911217, + "grad_norm": 0.3572101891040802, + "learning_rate": 4.934630792550888e-05, + "loss": 0.3776, + "step": 966000 + }, + { + "epoch": 6.540304244261585, + "grad_norm": 0.3306787610054016, + "learning_rate": 4.934596957557384e-05, + "loss": 0.3778, + "step": 966500 + }, + { + "epoch": 6.543687743611954, + "grad_norm": 0.3770751357078552, + "learning_rate": 4.9345631225638805e-05, + "loss": 0.3761, + "step": 967000 + }, + { + "epoch": 6.547071242962321, + "grad_norm": 0.3891368508338928, + "learning_rate": 4.934529287570377e-05, + "loss": 0.3779, + "step": 967500 + }, + { + "epoch": 6.55045474231269, + "grad_norm": 0.3260124921798706, + "learning_rate": 4.9344954525768736e-05, + "loss": 0.3777, + "step": 968000 + }, + { + "epoch": 6.553838241663057, + "grad_norm": 0.327788770198822, + "learning_rate": 4.93446161758337e-05, + "loss": 0.3776, + "step": 968500 + }, + { + "epoch": 6.557221741013426, + "grad_norm": 0.38705557584762573, + "learning_rate": 4.934427782589866e-05, + "loss": 0.3769, + "step": 969000 + }, + { + "epoch": 6.5606052403637936, + "grad_norm": 0.3410389721393585, + "learning_rate": 4.934393947596362e-05, + "loss": 0.3783, + "step": 969500 + }, + { + "epoch": 6.563988739714162, + "grad_norm": 0.3226074278354645, + "learning_rate": 4.934360112602859e-05, + "loss": 0.3792, + "step": 970000 + }, + { + "epoch": 6.56737223906453, + "grad_norm": 0.3195018172264099, + "learning_rate": 4.9343262776093553e-05, + "loss": 0.3763, + "step": 970500 + }, + { + "epoch": 6.570755738414898, + "grad_norm": 0.3643065392971039, + "learning_rate": 4.934292442615851e-05, + "loss": 0.3768, + "step": 971000 + }, + { + "epoch": 6.574139237765266, + "grad_norm": 0.3433854281902313, + "learning_rate": 4.934258607622347e-05, + "loss": 0.3777, + "step": 971500 + }, + { + "epoch": 6.577522737115634, + "grad_norm": 0.3245162069797516, + "learning_rate": 4.934224772628844e-05, + "loss": 0.3783, + "step": 972000 + }, + { + "epoch": 6.580906236466003, + "grad_norm": 0.32305416464805603, + "learning_rate": 4.93419093763534e-05, + "loss": 0.377, + "step": 972500 + }, + { + "epoch": 6.584289735816371, + "grad_norm": 0.3494907021522522, + "learning_rate": 4.9341571026418364e-05, + "loss": 0.3779, + "step": 973000 + }, + { + "epoch": 6.587673235166739, + "grad_norm": 0.34951290488243103, + "learning_rate": 4.9341232676483326e-05, + "loss": 0.3786, + "step": 973500 + }, + { + "epoch": 6.591056734517107, + "grad_norm": 0.33310452103614807, + "learning_rate": 4.9340894326548295e-05, + "loss": 0.3769, + "step": 974000 + }, + { + "epoch": 6.594440233867475, + "grad_norm": 0.46549180150032043, + "learning_rate": 4.934055597661326e-05, + "loss": 0.3767, + "step": 974500 + }, + { + "epoch": 6.597823733217843, + "grad_norm": 0.35990414023399353, + "learning_rate": 4.934021762667822e-05, + "loss": 0.3761, + "step": 975000 + }, + { + "epoch": 6.601207232568211, + "grad_norm": 0.3343123495578766, + "learning_rate": 4.933987927674318e-05, + "loss": 0.3782, + "step": 975500 + }, + { + "epoch": 6.604590731918579, + "grad_norm": 0.32567545771598816, + "learning_rate": 4.9339540926808144e-05, + "loss": 0.3765, + "step": 976000 + }, + { + "epoch": 6.607974231268948, + "grad_norm": 0.3510096073150635, + "learning_rate": 4.9339202576873106e-05, + "loss": 0.3768, + "step": 976500 + }, + { + "epoch": 6.611357730619316, + "grad_norm": 0.32567307353019714, + "learning_rate": 4.933886422693807e-05, + "loss": 0.378, + "step": 977000 + }, + { + "epoch": 6.614741229969684, + "grad_norm": 0.33810216188430786, + "learning_rate": 4.933852587700304e-05, + "loss": 0.3776, + "step": 977500 + }, + { + "epoch": 6.618124729320052, + "grad_norm": 0.354951947927475, + "learning_rate": 4.9338187527068e-05, + "loss": 0.378, + "step": 978000 + }, + { + "epoch": 6.62150822867042, + "grad_norm": 0.32536765933036804, + "learning_rate": 4.933784917713296e-05, + "loss": 0.3773, + "step": 978500 + }, + { + "epoch": 6.624891728020788, + "grad_norm": 0.3651494085788727, + "learning_rate": 4.933751082719792e-05, + "loss": 0.3779, + "step": 979000 + }, + { + "epoch": 6.628275227371156, + "grad_norm": 0.3602128326892853, + "learning_rate": 4.9337172477262885e-05, + "loss": 0.3778, + "step": 979500 + }, + { + "epoch": 6.631658726721525, + "grad_norm": 0.3357810080051422, + "learning_rate": 4.9336834127327854e-05, + "loss": 0.3767, + "step": 980000 + }, + { + "epoch": 6.635042226071892, + "grad_norm": 0.31764236092567444, + "learning_rate": 4.933649577739281e-05, + "loss": 0.3786, + "step": 980500 + }, + { + "epoch": 6.638425725422261, + "grad_norm": 0.38006776571273804, + "learning_rate": 4.933615742745777e-05, + "loss": 0.378, + "step": 981000 + }, + { + "epoch": 6.641809224772629, + "grad_norm": 0.28093838691711426, + "learning_rate": 4.933581907752274e-05, + "loss": 0.3785, + "step": 981500 + }, + { + "epoch": 6.645192724122997, + "grad_norm": 0.32980915904045105, + "learning_rate": 4.93354807275877e-05, + "loss": 0.3763, + "step": 982000 + }, + { + "epoch": 6.648576223473365, + "grad_norm": 0.34391888976097107, + "learning_rate": 4.9335142377652665e-05, + "loss": 0.3783, + "step": 982500 + }, + { + "epoch": 6.651959722823733, + "grad_norm": 0.361083447933197, + "learning_rate": 4.933480402771763e-05, + "loss": 0.3766, + "step": 983000 + }, + { + "epoch": 6.655343222174102, + "grad_norm": 0.36109864711761475, + "learning_rate": 4.9334465677782596e-05, + "loss": 0.3774, + "step": 983500 + }, + { + "epoch": 6.658726721524469, + "grad_norm": 0.3269791901111603, + "learning_rate": 4.933412732784756e-05, + "loss": 0.378, + "step": 984000 + }, + { + "epoch": 6.662110220874838, + "grad_norm": 0.3839172124862671, + "learning_rate": 4.933378897791252e-05, + "loss": 0.3784, + "step": 984500 + }, + { + "epoch": 6.665493720225205, + "grad_norm": 0.33352527022361755, + "learning_rate": 4.933345062797748e-05, + "loss": 0.3783, + "step": 985000 + }, + { + "epoch": 6.668877219575574, + "grad_norm": 0.36033135652542114, + "learning_rate": 4.9333112278042444e-05, + "loss": 0.3769, + "step": 985500 + }, + { + "epoch": 6.672260718925942, + "grad_norm": 0.3391295075416565, + "learning_rate": 4.9332773928107406e-05, + "loss": 0.3772, + "step": 986000 + }, + { + "epoch": 6.67564421827631, + "grad_norm": 0.3773353099822998, + "learning_rate": 4.933243557817237e-05, + "loss": 0.3771, + "step": 986500 + }, + { + "epoch": 6.679027717626679, + "grad_norm": 0.3952521085739136, + "learning_rate": 4.933209722823733e-05, + "loss": 0.3763, + "step": 987000 + }, + { + "epoch": 6.682411216977046, + "grad_norm": 0.37172383069992065, + "learning_rate": 4.93317588783023e-05, + "loss": 0.3776, + "step": 987500 + }, + { + "epoch": 6.685794716327415, + "grad_norm": 0.3839188516139984, + "learning_rate": 4.933142052836726e-05, + "loss": 0.3776, + "step": 988000 + }, + { + "epoch": 6.689178215677782, + "grad_norm": 0.3491535186767578, + "learning_rate": 4.9331082178432224e-05, + "loss": 0.378, + "step": 988500 + }, + { + "epoch": 6.692561715028151, + "grad_norm": 0.32665354013442993, + "learning_rate": 4.9330743828497186e-05, + "loss": 0.3782, + "step": 989000 + }, + { + "epoch": 6.6959452143785185, + "grad_norm": 0.3311278223991394, + "learning_rate": 4.9330405478562155e-05, + "loss": 0.378, + "step": 989500 + }, + { + "epoch": 6.699328713728887, + "grad_norm": 0.31300148367881775, + "learning_rate": 4.933006712862711e-05, + "loss": 0.3771, + "step": 990000 + }, + { + "epoch": 6.702712213079256, + "grad_norm": 0.359619140625, + "learning_rate": 4.932972877869207e-05, + "loss": 0.3774, + "step": 990500 + }, + { + "epoch": 6.706095712429623, + "grad_norm": 0.3795474171638489, + "learning_rate": 4.932939042875704e-05, + "loss": 0.3765, + "step": 991000 + }, + { + "epoch": 6.709479211779991, + "grad_norm": 0.3238963484764099, + "learning_rate": 4.9329052078822e-05, + "loss": 0.3775, + "step": 991500 + }, + { + "epoch": 6.712862711130359, + "grad_norm": 0.35236936807632446, + "learning_rate": 4.9328713728886965e-05, + "loss": 0.3779, + "step": 992000 + }, + { + "epoch": 6.716246210480728, + "grad_norm": 0.3102133274078369, + "learning_rate": 4.932837537895193e-05, + "loss": 0.378, + "step": 992500 + }, + { + "epoch": 6.7196297098310955, + "grad_norm": 0.33269041776657104, + "learning_rate": 4.9328037029016896e-05, + "loss": 0.3793, + "step": 993000 + }, + { + "epoch": 6.723013209181464, + "grad_norm": 0.39434024691581726, + "learning_rate": 4.932769867908186e-05, + "loss": 0.3766, + "step": 993500 + }, + { + "epoch": 6.726396708531832, + "grad_norm": 0.3668605387210846, + "learning_rate": 4.932736032914682e-05, + "loss": 0.3766, + "step": 994000 + }, + { + "epoch": 6.7297802078822, + "grad_norm": 0.418206125497818, + "learning_rate": 4.932702197921178e-05, + "loss": 0.3766, + "step": 994500 + }, + { + "epoch": 6.733163707232568, + "grad_norm": 0.37223225831985474, + "learning_rate": 4.9326683629276745e-05, + "loss": 0.3766, + "step": 995000 + }, + { + "epoch": 6.736547206582936, + "grad_norm": 0.36875730752944946, + "learning_rate": 4.932634527934171e-05, + "loss": 0.3777, + "step": 995500 + }, + { + "epoch": 6.739930705933304, + "grad_norm": 0.34293901920318604, + "learning_rate": 4.932600692940667e-05, + "loss": 0.3781, + "step": 996000 + }, + { + "epoch": 6.7433142052836725, + "grad_norm": 0.3703588545322418, + "learning_rate": 4.932566857947163e-05, + "loss": 0.3764, + "step": 996500 + }, + { + "epoch": 6.746697704634041, + "grad_norm": 0.33531859517097473, + "learning_rate": 4.93253302295366e-05, + "loss": 0.3776, + "step": 997000 + }, + { + "epoch": 6.750081203984409, + "grad_norm": 0.33256930112838745, + "learning_rate": 4.932499187960156e-05, + "loss": 0.3776, + "step": 997500 + }, + { + "epoch": 6.753464703334777, + "grad_norm": 0.34108084440231323, + "learning_rate": 4.9324653529666524e-05, + "loss": 0.3786, + "step": 998000 + }, + { + "epoch": 6.756848202685145, + "grad_norm": 0.32973945140838623, + "learning_rate": 4.9324315179731486e-05, + "loss": 0.3764, + "step": 998500 + }, + { + "epoch": 6.760231702035513, + "grad_norm": 0.3286641240119934, + "learning_rate": 4.9323976829796455e-05, + "loss": 0.3773, + "step": 999000 + }, + { + "epoch": 6.763615201385881, + "grad_norm": 0.3770553767681122, + "learning_rate": 4.932363847986141e-05, + "loss": 0.376, + "step": 999500 + }, + { + "epoch": 6.7669987007362495, + "grad_norm": 0.3492177426815033, + "learning_rate": 4.932330012992637e-05, + "loss": 0.3762, + "step": 1000000 + }, + { + "epoch": 6.770382200086617, + "grad_norm": 0.3836628496646881, + "learning_rate": 4.932296177999134e-05, + "loss": 0.3758, + "step": 1000500 + }, + { + "epoch": 6.773765699436986, + "grad_norm": 0.32772648334503174, + "learning_rate": 4.9322623430056304e-05, + "loss": 0.379, + "step": 1001000 + }, + { + "epoch": 6.777149198787354, + "grad_norm": 0.31784480810165405, + "learning_rate": 4.9322285080121266e-05, + "loss": 0.3769, + "step": 1001500 + }, + { + "epoch": 6.780532698137722, + "grad_norm": 0.3466293215751648, + "learning_rate": 4.932194673018623e-05, + "loss": 0.3779, + "step": 1002000 + }, + { + "epoch": 6.78391619748809, + "grad_norm": 0.3728845715522766, + "learning_rate": 4.93216083802512e-05, + "loss": 0.3774, + "step": 1002500 + }, + { + "epoch": 6.787299696838458, + "grad_norm": 0.31999626755714417, + "learning_rate": 4.932127003031616e-05, + "loss": 0.3764, + "step": 1003000 + }, + { + "epoch": 6.7906831961888265, + "grad_norm": 0.3349250555038452, + "learning_rate": 4.932093168038112e-05, + "loss": 0.3764, + "step": 1003500 + }, + { + "epoch": 6.794066695539194, + "grad_norm": 0.3492509424686432, + "learning_rate": 4.932059333044608e-05, + "loss": 0.3763, + "step": 1004000 + }, + { + "epoch": 6.797450194889563, + "grad_norm": 0.31739434599876404, + "learning_rate": 4.9320254980511045e-05, + "loss": 0.3767, + "step": 1004500 + }, + { + "epoch": 6.80083369423993, + "grad_norm": 0.3199305534362793, + "learning_rate": 4.931991663057601e-05, + "loss": 0.3768, + "step": 1005000 + }, + { + "epoch": 6.804217193590299, + "grad_norm": 0.28688299655914307, + "learning_rate": 4.931957828064097e-05, + "loss": 0.3771, + "step": 1005500 + }, + { + "epoch": 6.807600692940667, + "grad_norm": 0.30341842770576477, + "learning_rate": 4.931923993070593e-05, + "loss": 0.3771, + "step": 1006000 + }, + { + "epoch": 6.810984192291035, + "grad_norm": 0.34070146083831787, + "learning_rate": 4.93189015807709e-05, + "loss": 0.3788, + "step": 1006500 + }, + { + "epoch": 6.8143676916414035, + "grad_norm": 0.3197017014026642, + "learning_rate": 4.931856323083586e-05, + "loss": 0.378, + "step": 1007000 + }, + { + "epoch": 6.817751190991771, + "grad_norm": 0.33668169379234314, + "learning_rate": 4.9318224880900825e-05, + "loss": 0.3768, + "step": 1007500 + }, + { + "epoch": 6.82113469034214, + "grad_norm": 0.3488815724849701, + "learning_rate": 4.931788653096579e-05, + "loss": 0.3758, + "step": 1008000 + }, + { + "epoch": 6.824518189692507, + "grad_norm": 0.35777804255485535, + "learning_rate": 4.9317548181030756e-05, + "loss": 0.3777, + "step": 1008500 + }, + { + "epoch": 6.827901689042876, + "grad_norm": 0.31806090474128723, + "learning_rate": 4.931720983109571e-05, + "loss": 0.3776, + "step": 1009000 + }, + { + "epoch": 6.8312851883932435, + "grad_norm": 0.339591920375824, + "learning_rate": 4.9316871481160673e-05, + "loss": 0.3768, + "step": 1009500 + }, + { + "epoch": 6.834668687743612, + "grad_norm": 0.3289056122303009, + "learning_rate": 4.931653313122564e-05, + "loss": 0.3771, + "step": 1010000 + }, + { + "epoch": 6.8380521870939805, + "grad_norm": 0.3581726551055908, + "learning_rate": 4.9316194781290604e-05, + "loss": 0.3761, + "step": 1010500 + }, + { + "epoch": 6.841435686444348, + "grad_norm": 0.35697153210639954, + "learning_rate": 4.9315856431355567e-05, + "loss": 0.3774, + "step": 1011000 + }, + { + "epoch": 6.844819185794717, + "grad_norm": 0.34281229972839355, + "learning_rate": 4.931551808142053e-05, + "loss": 0.3766, + "step": 1011500 + }, + { + "epoch": 6.848202685145084, + "grad_norm": 0.327567994594574, + "learning_rate": 4.93151797314855e-05, + "loss": 0.3778, + "step": 1012000 + }, + { + "epoch": 6.851586184495453, + "grad_norm": 0.3587280809879303, + "learning_rate": 4.931484138155046e-05, + "loss": 0.3795, + "step": 1012500 + }, + { + "epoch": 6.8549696838458205, + "grad_norm": 0.3584112226963043, + "learning_rate": 4.931450303161542e-05, + "loss": 0.3769, + "step": 1013000 + }, + { + "epoch": 6.858353183196189, + "grad_norm": 0.33898597955703735, + "learning_rate": 4.9314164681680384e-05, + "loss": 0.3759, + "step": 1013500 + }, + { + "epoch": 6.861736682546557, + "grad_norm": 0.3355078101158142, + "learning_rate": 4.9313826331745346e-05, + "loss": 0.3786, + "step": 1014000 + }, + { + "epoch": 6.865120181896925, + "grad_norm": 0.3411511182785034, + "learning_rate": 4.931348798181031e-05, + "loss": 0.3765, + "step": 1014500 + }, + { + "epoch": 6.868503681247293, + "grad_norm": 0.3562315106391907, + "learning_rate": 4.931314963187527e-05, + "loss": 0.3778, + "step": 1015000 + }, + { + "epoch": 6.871887180597661, + "grad_norm": 0.35385093092918396, + "learning_rate": 4.931281128194023e-05, + "loss": 0.3774, + "step": 1015500 + }, + { + "epoch": 6.875270679948029, + "grad_norm": 0.3415220081806183, + "learning_rate": 4.93124729320052e-05, + "loss": 0.3754, + "step": 1016000 + }, + { + "epoch": 6.8786541792983975, + "grad_norm": 0.3337489664554596, + "learning_rate": 4.9312134582070163e-05, + "loss": 0.3768, + "step": 1016500 + }, + { + "epoch": 6.882037678648766, + "grad_norm": 0.33073464035987854, + "learning_rate": 4.9311796232135126e-05, + "loss": 0.3766, + "step": 1017000 + }, + { + "epoch": 6.885421177999134, + "grad_norm": 0.37152549624443054, + "learning_rate": 4.931145788220009e-05, + "loss": 0.3763, + "step": 1017500 + }, + { + "epoch": 6.888804677349502, + "grad_norm": 0.34675320982933044, + "learning_rate": 4.931111953226506e-05, + "loss": 0.3782, + "step": 1018000 + }, + { + "epoch": 6.89218817669987, + "grad_norm": 0.31108322739601135, + "learning_rate": 4.931078118233002e-05, + "loss": 0.3774, + "step": 1018500 + }, + { + "epoch": 6.895571676050238, + "grad_norm": 0.327083557844162, + "learning_rate": 4.9310442832394974e-05, + "loss": 0.3767, + "step": 1019000 + }, + { + "epoch": 6.898955175400606, + "grad_norm": 0.34527599811553955, + "learning_rate": 4.931010448245994e-05, + "loss": 0.3765, + "step": 1019500 + }, + { + "epoch": 6.9023386747509745, + "grad_norm": 0.33913472294807434, + "learning_rate": 4.9309766132524905e-05, + "loss": 0.3753, + "step": 1020000 + }, + { + "epoch": 6.905722174101342, + "grad_norm": 0.3427327275276184, + "learning_rate": 4.930942778258987e-05, + "loss": 0.3781, + "step": 1020500 + }, + { + "epoch": 6.909105673451711, + "grad_norm": 0.35090121626853943, + "learning_rate": 4.930908943265483e-05, + "loss": 0.3766, + "step": 1021000 + }, + { + "epoch": 6.912489172802079, + "grad_norm": 0.3135407269001007, + "learning_rate": 4.93087510827198e-05, + "loss": 0.3779, + "step": 1021500 + }, + { + "epoch": 6.915872672152447, + "grad_norm": 0.34838712215423584, + "learning_rate": 4.930841273278476e-05, + "loss": 0.3772, + "step": 1022000 + }, + { + "epoch": 6.919256171502815, + "grad_norm": 0.33714696764945984, + "learning_rate": 4.930807438284972e-05, + "loss": 0.3778, + "step": 1022500 + }, + { + "epoch": 6.922639670853183, + "grad_norm": 0.3507121801376343, + "learning_rate": 4.9307736032914685e-05, + "loss": 0.3775, + "step": 1023000 + }, + { + "epoch": 6.9260231702035515, + "grad_norm": 0.3211919069290161, + "learning_rate": 4.930739768297965e-05, + "loss": 0.3765, + "step": 1023500 + }, + { + "epoch": 6.929406669553919, + "grad_norm": 0.32550227642059326, + "learning_rate": 4.930705933304461e-05, + "loss": 0.3785, + "step": 1024000 + }, + { + "epoch": 6.932790168904288, + "grad_norm": 0.3237001597881317, + "learning_rate": 4.930672098310957e-05, + "loss": 0.3765, + "step": 1024500 + }, + { + "epoch": 6.936173668254655, + "grad_norm": 0.3335512578487396, + "learning_rate": 4.930638263317453e-05, + "loss": 0.3767, + "step": 1025000 + }, + { + "epoch": 6.939557167605024, + "grad_norm": 0.3482947051525116, + "learning_rate": 4.93060442832395e-05, + "loss": 0.3776, + "step": 1025500 + }, + { + "epoch": 6.942940666955392, + "grad_norm": 0.30258816480636597, + "learning_rate": 4.9305705933304464e-05, + "loss": 0.3773, + "step": 1026000 + }, + { + "epoch": 6.94632416630576, + "grad_norm": 0.3438469469547272, + "learning_rate": 4.9305367583369426e-05, + "loss": 0.3775, + "step": 1026500 + }, + { + "epoch": 6.9497076656561285, + "grad_norm": 0.358698308467865, + "learning_rate": 4.930502923343439e-05, + "loss": 0.3788, + "step": 1027000 + }, + { + "epoch": 6.953091165006496, + "grad_norm": 0.3043401837348938, + "learning_rate": 4.930469088349936e-05, + "loss": 0.3765, + "step": 1027500 + }, + { + "epoch": 6.956474664356865, + "grad_norm": 0.34928059577941895, + "learning_rate": 4.930435253356432e-05, + "loss": 0.3766, + "step": 1028000 + }, + { + "epoch": 6.959858163707232, + "grad_norm": 0.38060516119003296, + "learning_rate": 4.9304014183629275e-05, + "loss": 0.3764, + "step": 1028500 + }, + { + "epoch": 6.963241663057601, + "grad_norm": 0.3338804841041565, + "learning_rate": 4.9303675833694244e-05, + "loss": 0.3755, + "step": 1029000 + }, + { + "epoch": 6.9666251624079685, + "grad_norm": 0.3243337571620941, + "learning_rate": 4.9303337483759206e-05, + "loss": 0.3774, + "step": 1029500 + }, + { + "epoch": 6.970008661758337, + "grad_norm": 0.340320348739624, + "learning_rate": 4.930299913382417e-05, + "loss": 0.377, + "step": 1030000 + }, + { + "epoch": 6.9733921611087055, + "grad_norm": 0.36838603019714355, + "learning_rate": 4.930266078388913e-05, + "loss": 0.3792, + "step": 1030500 + }, + { + "epoch": 6.976775660459073, + "grad_norm": 0.3550949692726135, + "learning_rate": 4.93023224339541e-05, + "loss": 0.3776, + "step": 1031000 + }, + { + "epoch": 6.980159159809442, + "grad_norm": 0.337386816740036, + "learning_rate": 4.930198408401906e-05, + "loss": 0.3769, + "step": 1031500 + }, + { + "epoch": 6.983542659159809, + "grad_norm": 0.3563162386417389, + "learning_rate": 4.930164573408402e-05, + "loss": 0.3757, + "step": 1032000 + }, + { + "epoch": 6.986926158510178, + "grad_norm": 0.37489381432533264, + "learning_rate": 4.9301307384148985e-05, + "loss": 0.3782, + "step": 1032500 + }, + { + "epoch": 6.9903096578605455, + "grad_norm": 0.36588630080223083, + "learning_rate": 4.930096903421395e-05, + "loss": 0.3761, + "step": 1033000 + }, + { + "epoch": 6.993693157210914, + "grad_norm": 0.3377918601036072, + "learning_rate": 4.930063068427891e-05, + "loss": 0.3779, + "step": 1033500 + }, + { + "epoch": 6.997076656561282, + "grad_norm": 0.33746615052223206, + "learning_rate": 4.930029233434387e-05, + "loss": 0.3765, + "step": 1034000 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.8563648176791943, + "eval_loss": 0.5842289924621582, + "eval_runtime": 3394.0562, + "eval_samples_per_second": 85.663, + "eval_steps_per_second": 5.354, + "step": 1034432 + }, + { + "epoch": 7.00046015591165, + "grad_norm": 0.371980220079422, + "learning_rate": 4.9299953984408834e-05, + "loss": 0.3758, + "step": 1034500 + }, + { + "epoch": 7.003843655262018, + "grad_norm": 0.38439419865608215, + "learning_rate": 4.92996156344738e-05, + "loss": 0.376, + "step": 1035000 + }, + { + "epoch": 7.007227154612386, + "grad_norm": 0.38531625270843506, + "learning_rate": 4.9299277284538765e-05, + "loss": 0.3747, + "step": 1035500 + }, + { + "epoch": 7.010610653962755, + "grad_norm": 0.31885409355163574, + "learning_rate": 4.929893893460373e-05, + "loss": 0.3743, + "step": 1036000 + }, + { + "epoch": 7.0139941533131225, + "grad_norm": 0.3155359625816345, + "learning_rate": 4.929860058466869e-05, + "loss": 0.3756, + "step": 1036500 + }, + { + "epoch": 7.017377652663491, + "grad_norm": 0.31193721294403076, + "learning_rate": 4.929826223473366e-05, + "loss": 0.3759, + "step": 1037000 + }, + { + "epoch": 7.020761152013859, + "grad_norm": 0.33127009868621826, + "learning_rate": 4.929792388479862e-05, + "loss": 0.3756, + "step": 1037500 + }, + { + "epoch": 7.024144651364227, + "grad_norm": 0.36094731092453003, + "learning_rate": 4.9297585534863575e-05, + "loss": 0.3757, + "step": 1038000 + }, + { + "epoch": 7.027528150714595, + "grad_norm": 0.3782150447368622, + "learning_rate": 4.9297247184928544e-05, + "loss": 0.3761, + "step": 1038500 + }, + { + "epoch": 7.030911650064963, + "grad_norm": 0.3468509614467621, + "learning_rate": 4.9296908834993506e-05, + "loss": 0.3753, + "step": 1039000 + }, + { + "epoch": 7.034295149415331, + "grad_norm": 0.33092188835144043, + "learning_rate": 4.929657048505847e-05, + "loss": 0.3744, + "step": 1039500 + }, + { + "epoch": 7.0376786487656995, + "grad_norm": 0.3647925555706024, + "learning_rate": 4.929623213512343e-05, + "loss": 0.3746, + "step": 1040000 + }, + { + "epoch": 7.041062148116067, + "grad_norm": 0.3791462182998657, + "learning_rate": 4.92958937851884e-05, + "loss": 0.3753, + "step": 1040500 + }, + { + "epoch": 7.044445647466436, + "grad_norm": 0.3329277038574219, + "learning_rate": 4.929555543525336e-05, + "loss": 0.374, + "step": 1041000 + }, + { + "epoch": 7.047829146816804, + "grad_norm": 0.3497406840324402, + "learning_rate": 4.9295217085318324e-05, + "loss": 0.3763, + "step": 1041500 + }, + { + "epoch": 7.051212646167172, + "grad_norm": 0.33520519733428955, + "learning_rate": 4.9294878735383286e-05, + "loss": 0.3757, + "step": 1042000 + }, + { + "epoch": 7.05459614551754, + "grad_norm": 0.34984537959098816, + "learning_rate": 4.929454038544825e-05, + "loss": 0.3753, + "step": 1042500 + }, + { + "epoch": 7.057979644867908, + "grad_norm": 0.370272696018219, + "learning_rate": 4.929420203551321e-05, + "loss": 0.3751, + "step": 1043000 + }, + { + "epoch": 7.0613631442182765, + "grad_norm": 0.3326875567436218, + "learning_rate": 4.929386368557817e-05, + "loss": 0.3748, + "step": 1043500 + }, + { + "epoch": 7.064746643568644, + "grad_norm": 0.3748356103897095, + "learning_rate": 4.9293525335643134e-05, + "loss": 0.3767, + "step": 1044000 + }, + { + "epoch": 7.068130142919013, + "grad_norm": 0.6138136386871338, + "learning_rate": 4.92931869857081e-05, + "loss": 0.3744, + "step": 1044500 + }, + { + "epoch": 7.07151364226938, + "grad_norm": 0.3744707405567169, + "learning_rate": 4.9292848635773065e-05, + "loss": 0.3752, + "step": 1045000 + }, + { + "epoch": 7.074897141619749, + "grad_norm": 0.3307333290576935, + "learning_rate": 4.929251028583803e-05, + "loss": 0.3766, + "step": 1045500 + }, + { + "epoch": 7.078280640970117, + "grad_norm": 0.3341595530509949, + "learning_rate": 4.929217193590299e-05, + "loss": 0.3737, + "step": 1046000 + }, + { + "epoch": 7.081664140320485, + "grad_norm": 0.3514774739742279, + "learning_rate": 4.929183358596796e-05, + "loss": 0.3746, + "step": 1046500 + }, + { + "epoch": 7.0850476396708535, + "grad_norm": 0.3368781805038452, + "learning_rate": 4.929149523603292e-05, + "loss": 0.3743, + "step": 1047000 + }, + { + "epoch": 7.088431139021221, + "grad_norm": 0.3600742816925049, + "learning_rate": 4.9291156886097876e-05, + "loss": 0.3751, + "step": 1047500 + }, + { + "epoch": 7.09181463837159, + "grad_norm": 0.3651391267776489, + "learning_rate": 4.9290818536162845e-05, + "loss": 0.3766, + "step": 1048000 + }, + { + "epoch": 7.095198137721957, + "grad_norm": 0.4543780982494354, + "learning_rate": 4.929048018622781e-05, + "loss": 0.3732, + "step": 1048500 + }, + { + "epoch": 7.098581637072326, + "grad_norm": 0.32120242714881897, + "learning_rate": 4.929014183629277e-05, + "loss": 0.3762, + "step": 1049000 + }, + { + "epoch": 7.1019651364226934, + "grad_norm": 0.32367223501205444, + "learning_rate": 4.928980348635773e-05, + "loss": 0.3767, + "step": 1049500 + }, + { + "epoch": 7.105348635773062, + "grad_norm": 0.3515234589576721, + "learning_rate": 4.928946513642269e-05, + "loss": 0.373, + "step": 1050000 + }, + { + "epoch": 7.1087321351234305, + "grad_norm": 0.32753437757492065, + "learning_rate": 4.928912678648766e-05, + "loss": 0.3752, + "step": 1050500 + }, + { + "epoch": 7.112115634473798, + "grad_norm": 0.3608008027076721, + "learning_rate": 4.9288788436552624e-05, + "loss": 0.375, + "step": 1051000 + }, + { + "epoch": 7.115499133824167, + "grad_norm": 0.36846715211868286, + "learning_rate": 4.9288450086617587e-05, + "loss": 0.3755, + "step": 1051500 + }, + { + "epoch": 7.118882633174534, + "grad_norm": 0.36164039373397827, + "learning_rate": 4.928811173668255e-05, + "loss": 0.3766, + "step": 1052000 + }, + { + "epoch": 7.122266132524903, + "grad_norm": 0.34308990836143494, + "learning_rate": 4.928777338674751e-05, + "loss": 0.3743, + "step": 1052500 + }, + { + "epoch": 7.1256496318752705, + "grad_norm": 0.34098562598228455, + "learning_rate": 4.928743503681247e-05, + "loss": 0.3773, + "step": 1053000 + }, + { + "epoch": 7.129033131225639, + "grad_norm": 0.35562875866889954, + "learning_rate": 4.9287096686877435e-05, + "loss": 0.3748, + "step": 1053500 + }, + { + "epoch": 7.132416630576007, + "grad_norm": 0.38288894295692444, + "learning_rate": 4.9286758336942404e-05, + "loss": 0.3744, + "step": 1054000 + }, + { + "epoch": 7.135800129926375, + "grad_norm": 0.33995917439460754, + "learning_rate": 4.9286419987007366e-05, + "loss": 0.3753, + "step": 1054500 + }, + { + "epoch": 7.139183629276743, + "grad_norm": 0.3554493486881256, + "learning_rate": 4.928608163707233e-05, + "loss": 0.3744, + "step": 1055000 + }, + { + "epoch": 7.142567128627111, + "grad_norm": 0.3418475389480591, + "learning_rate": 4.928574328713729e-05, + "loss": 0.3766, + "step": 1055500 + }, + { + "epoch": 7.14595062797748, + "grad_norm": 0.3773251175880432, + "learning_rate": 4.928540493720226e-05, + "loss": 0.3765, + "step": 1056000 + }, + { + "epoch": 7.1493341273278475, + "grad_norm": 0.3789617717266083, + "learning_rate": 4.928506658726722e-05, + "loss": 0.3774, + "step": 1056500 + }, + { + "epoch": 7.152717626678216, + "grad_norm": 0.348550945520401, + "learning_rate": 4.928472823733218e-05, + "loss": 0.3749, + "step": 1057000 + }, + { + "epoch": 7.156101126028584, + "grad_norm": 0.3775283396244049, + "learning_rate": 4.9284389887397146e-05, + "loss": 0.3755, + "step": 1057500 + }, + { + "epoch": 7.159484625378952, + "grad_norm": 0.33230406045913696, + "learning_rate": 4.928405153746211e-05, + "loss": 0.3752, + "step": 1058000 + }, + { + "epoch": 7.16286812472932, + "grad_norm": 0.36388495564460754, + "learning_rate": 4.928371318752707e-05, + "loss": 0.3769, + "step": 1058500 + }, + { + "epoch": 7.166251624079688, + "grad_norm": 0.3778153359889984, + "learning_rate": 4.928337483759203e-05, + "loss": 0.3743, + "step": 1059000 + }, + { + "epoch": 7.169635123430056, + "grad_norm": 0.36843207478523254, + "learning_rate": 4.9283036487656994e-05, + "loss": 0.3751, + "step": 1059500 + }, + { + "epoch": 7.1730186227804245, + "grad_norm": 0.3462293744087219, + "learning_rate": 4.928269813772196e-05, + "loss": 0.3749, + "step": 1060000 + }, + { + "epoch": 7.176402122130792, + "grad_norm": 0.3491370975971222, + "learning_rate": 4.9282359787786925e-05, + "loss": 0.3753, + "step": 1060500 + }, + { + "epoch": 7.179785621481161, + "grad_norm": 0.35125911235809326, + "learning_rate": 4.928202143785189e-05, + "loss": 0.3755, + "step": 1061000 + }, + { + "epoch": 7.183169120831529, + "grad_norm": 0.3585151433944702, + "learning_rate": 4.928168308791685e-05, + "loss": 0.3756, + "step": 1061500 + }, + { + "epoch": 7.186552620181897, + "grad_norm": 0.3339557647705078, + "learning_rate": 4.928134473798181e-05, + "loss": 0.3772, + "step": 1062000 + }, + { + "epoch": 7.189936119532265, + "grad_norm": 0.3724111020565033, + "learning_rate": 4.9281006388046774e-05, + "loss": 0.3764, + "step": 1062500 + }, + { + "epoch": 7.193319618882633, + "grad_norm": 0.3494477868080139, + "learning_rate": 4.9280668038111736e-05, + "loss": 0.3753, + "step": 1063000 + }, + { + "epoch": 7.1967031182330015, + "grad_norm": 0.32152417302131653, + "learning_rate": 4.9280329688176705e-05, + "loss": 0.3771, + "step": 1063500 + }, + { + "epoch": 7.200086617583369, + "grad_norm": 0.3638397455215454, + "learning_rate": 4.927999133824167e-05, + "loss": 0.3761, + "step": 1064000 + }, + { + "epoch": 7.203470116933738, + "grad_norm": 0.3315892219543457, + "learning_rate": 4.927965298830663e-05, + "loss": 0.3767, + "step": 1064500 + }, + { + "epoch": 7.206853616284105, + "grad_norm": 0.3420919179916382, + "learning_rate": 4.927931463837159e-05, + "loss": 0.3767, + "step": 1065000 + }, + { + "epoch": 7.210237115634474, + "grad_norm": 0.35963183641433716, + "learning_rate": 4.927897628843656e-05, + "loss": 0.3754, + "step": 1065500 + }, + { + "epoch": 7.213620614984842, + "grad_norm": 0.3440105617046356, + "learning_rate": 4.927863793850152e-05, + "loss": 0.3753, + "step": 1066000 + }, + { + "epoch": 7.21700411433521, + "grad_norm": 0.3412320017814636, + "learning_rate": 4.927829958856648e-05, + "loss": 0.3747, + "step": 1066500 + }, + { + "epoch": 7.2203876136855785, + "grad_norm": 0.37493258714675903, + "learning_rate": 4.927796123863144e-05, + "loss": 0.3765, + "step": 1067000 + }, + { + "epoch": 7.223771113035946, + "grad_norm": 0.3456762433052063, + "learning_rate": 4.927762288869641e-05, + "loss": 0.3749, + "step": 1067500 + }, + { + "epoch": 7.227154612386315, + "grad_norm": 0.3558253049850464, + "learning_rate": 4.927728453876137e-05, + "loss": 0.375, + "step": 1068000 + }, + { + "epoch": 7.230538111736682, + "grad_norm": 0.3644653856754303, + "learning_rate": 4.927694618882633e-05, + "loss": 0.3751, + "step": 1068500 + }, + { + "epoch": 7.233921611087051, + "grad_norm": 0.4022010862827301, + "learning_rate": 4.9276607838891295e-05, + "loss": 0.3752, + "step": 1069000 + }, + { + "epoch": 7.237305110437418, + "grad_norm": 0.3526616096496582, + "learning_rate": 4.9276269488956264e-05, + "loss": 0.3765, + "step": 1069500 + }, + { + "epoch": 7.240688609787787, + "grad_norm": 0.35260966420173645, + "learning_rate": 4.9275931139021226e-05, + "loss": 0.3764, + "step": 1070000 + }, + { + "epoch": 7.2440721091381555, + "grad_norm": 0.36006537079811096, + "learning_rate": 4.927559278908619e-05, + "loss": 0.3745, + "step": 1070500 + }, + { + "epoch": 7.247455608488523, + "grad_norm": 0.34686923027038574, + "learning_rate": 4.927525443915115e-05, + "loss": 0.3748, + "step": 1071000 + }, + { + "epoch": 7.250839107838892, + "grad_norm": 0.3436018228530884, + "learning_rate": 4.927491608921611e-05, + "loss": 0.3756, + "step": 1071500 + }, + { + "epoch": 7.254222607189259, + "grad_norm": 0.3261730968952179, + "learning_rate": 4.9274577739281074e-05, + "loss": 0.3764, + "step": 1072000 + }, + { + "epoch": 7.257606106539628, + "grad_norm": 0.3459329903125763, + "learning_rate": 4.9274239389346036e-05, + "loss": 0.3761, + "step": 1072500 + }, + { + "epoch": 7.260989605889995, + "grad_norm": 0.3929137587547302, + "learning_rate": 4.9273901039411005e-05, + "loss": 0.3753, + "step": 1073000 + }, + { + "epoch": 7.264373105240364, + "grad_norm": 0.3764931559562683, + "learning_rate": 4.927356268947597e-05, + "loss": 0.3772, + "step": 1073500 + }, + { + "epoch": 7.267756604590732, + "grad_norm": 0.39603105187416077, + "learning_rate": 4.927322433954093e-05, + "loss": 0.3751, + "step": 1074000 + }, + { + "epoch": 7.2711401039411, + "grad_norm": 0.3758893311023712, + "learning_rate": 4.927288598960589e-05, + "loss": 0.3766, + "step": 1074500 + }, + { + "epoch": 7.274523603291469, + "grad_norm": 0.39224499464035034, + "learning_rate": 4.927254763967086e-05, + "loss": 0.3757, + "step": 1075000 + }, + { + "epoch": 7.277907102641836, + "grad_norm": 0.3400898575782776, + "learning_rate": 4.927220928973582e-05, + "loss": 0.3757, + "step": 1075500 + }, + { + "epoch": 7.281290601992205, + "grad_norm": 0.32646670937538147, + "learning_rate": 4.927187093980078e-05, + "loss": 0.374, + "step": 1076000 + }, + { + "epoch": 7.284674101342572, + "grad_norm": 0.344763845205307, + "learning_rate": 4.927153258986574e-05, + "loss": 0.3767, + "step": 1076500 + }, + { + "epoch": 7.288057600692941, + "grad_norm": 0.34435445070266724, + "learning_rate": 4.927119423993071e-05, + "loss": 0.3748, + "step": 1077000 + }, + { + "epoch": 7.291441100043309, + "grad_norm": 0.31645989418029785, + "learning_rate": 4.927085588999567e-05, + "loss": 0.376, + "step": 1077500 + }, + { + "epoch": 7.294824599393677, + "grad_norm": 0.334351509809494, + "learning_rate": 4.927051754006063e-05, + "loss": 0.3768, + "step": 1078000 + }, + { + "epoch": 7.298208098744045, + "grad_norm": 0.31009796261787415, + "learning_rate": 4.9270179190125595e-05, + "loss": 0.3769, + "step": 1078500 + }, + { + "epoch": 7.301591598094413, + "grad_norm": 0.3454814553260803, + "learning_rate": 4.9269840840190564e-05, + "loss": 0.3752, + "step": 1079000 + }, + { + "epoch": 7.304975097444781, + "grad_norm": 0.34109362959861755, + "learning_rate": 4.9269502490255526e-05, + "loss": 0.3771, + "step": 1079500 + }, + { + "epoch": 7.308358596795149, + "grad_norm": 0.38113102316856384, + "learning_rate": 4.926916414032049e-05, + "loss": 0.3748, + "step": 1080000 + }, + { + "epoch": 7.311742096145518, + "grad_norm": 0.3147388696670532, + "learning_rate": 4.926882579038545e-05, + "loss": 0.3769, + "step": 1080500 + }, + { + "epoch": 7.315125595495886, + "grad_norm": 0.32656314969062805, + "learning_rate": 4.926848744045041e-05, + "loss": 0.3775, + "step": 1081000 + }, + { + "epoch": 7.318509094846254, + "grad_norm": 0.34077635407447815, + "learning_rate": 4.9268149090515375e-05, + "loss": 0.3771, + "step": 1081500 + }, + { + "epoch": 7.321892594196622, + "grad_norm": 0.37306562066078186, + "learning_rate": 4.926781074058034e-05, + "loss": 0.376, + "step": 1082000 + }, + { + "epoch": 7.32527609354699, + "grad_norm": 0.3802151381969452, + "learning_rate": 4.9267472390645306e-05, + "loss": 0.3764, + "step": 1082500 + }, + { + "epoch": 7.328659592897358, + "grad_norm": 0.3427773416042328, + "learning_rate": 4.926713404071027e-05, + "loss": 0.3765, + "step": 1083000 + }, + { + "epoch": 7.332043092247726, + "grad_norm": 0.30575892329216003, + "learning_rate": 4.926679569077523e-05, + "loss": 0.3767, + "step": 1083500 + }, + { + "epoch": 7.335426591598094, + "grad_norm": 0.328872948884964, + "learning_rate": 4.926645734084019e-05, + "loss": 0.3766, + "step": 1084000 + }, + { + "epoch": 7.338810090948463, + "grad_norm": 0.38873931765556335, + "learning_rate": 4.926611899090516e-05, + "loss": 0.3755, + "step": 1084500 + }, + { + "epoch": 7.34219359029883, + "grad_norm": 0.3008991479873657, + "learning_rate": 4.926578064097012e-05, + "loss": 0.3757, + "step": 1085000 + }, + { + "epoch": 7.345577089649199, + "grad_norm": 0.32548144459724426, + "learning_rate": 4.926544229103508e-05, + "loss": 0.3758, + "step": 1085500 + }, + { + "epoch": 7.348960588999567, + "grad_norm": 0.3852235674858093, + "learning_rate": 4.926510394110004e-05, + "loss": 0.3761, + "step": 1086000 + }, + { + "epoch": 7.352344088349935, + "grad_norm": 0.3904217481613159, + "learning_rate": 4.926476559116501e-05, + "loss": 0.3754, + "step": 1086500 + }, + { + "epoch": 7.355727587700303, + "grad_norm": 0.3320090174674988, + "learning_rate": 4.926442724122997e-05, + "loss": 0.376, + "step": 1087000 + }, + { + "epoch": 7.359111087050671, + "grad_norm": 0.3738870322704315, + "learning_rate": 4.9264088891294934e-05, + "loss": 0.3759, + "step": 1087500 + }, + { + "epoch": 7.36249458640104, + "grad_norm": 0.3391508162021637, + "learning_rate": 4.9263750541359896e-05, + "loss": 0.376, + "step": 1088000 + }, + { + "epoch": 7.365878085751407, + "grad_norm": 0.3554190993309021, + "learning_rate": 4.9263412191424865e-05, + "loss": 0.3757, + "step": 1088500 + }, + { + "epoch": 7.369261585101776, + "grad_norm": 0.37407541275024414, + "learning_rate": 4.926307384148983e-05, + "loss": 0.3757, + "step": 1089000 + }, + { + "epoch": 7.372645084452143, + "grad_norm": 0.3865109384059906, + "learning_rate": 4.926273549155479e-05, + "loss": 0.3755, + "step": 1089500 + }, + { + "epoch": 7.376028583802512, + "grad_norm": 0.32073667645454407, + "learning_rate": 4.926239714161975e-05, + "loss": 0.3749, + "step": 1090000 + }, + { + "epoch": 7.37941208315288, + "grad_norm": 0.3938956558704376, + "learning_rate": 4.926205879168471e-05, + "loss": 0.3766, + "step": 1090500 + }, + { + "epoch": 7.382795582503248, + "grad_norm": 0.35945776104927063, + "learning_rate": 4.9261720441749675e-05, + "loss": 0.3756, + "step": 1091000 + }, + { + "epoch": 7.386179081853617, + "grad_norm": 0.35040605068206787, + "learning_rate": 4.926138209181464e-05, + "loss": 0.3758, + "step": 1091500 + }, + { + "epoch": 7.389562581203984, + "grad_norm": 0.3531397581100464, + "learning_rate": 4.9261043741879606e-05, + "loss": 0.3759, + "step": 1092000 + }, + { + "epoch": 7.392946080554353, + "grad_norm": 0.3244784474372864, + "learning_rate": 4.926070539194457e-05, + "loss": 0.3756, + "step": 1092500 + }, + { + "epoch": 7.39632957990472, + "grad_norm": 0.3516186773777008, + "learning_rate": 4.926036704200953e-05, + "loss": 0.3763, + "step": 1093000 + }, + { + "epoch": 7.399713079255089, + "grad_norm": 0.3564213514328003, + "learning_rate": 4.926002869207449e-05, + "loss": 0.3759, + "step": 1093500 + }, + { + "epoch": 7.403096578605457, + "grad_norm": 0.33187055587768555, + "learning_rate": 4.925969034213946e-05, + "loss": 0.3757, + "step": 1094000 + }, + { + "epoch": 7.406480077955825, + "grad_norm": 0.4159274995326996, + "learning_rate": 4.9259351992204424e-05, + "loss": 0.3758, + "step": 1094500 + }, + { + "epoch": 7.409863577306194, + "grad_norm": 0.37454167008399963, + "learning_rate": 4.925901364226938e-05, + "loss": 0.3773, + "step": 1095000 + }, + { + "epoch": 7.413247076656561, + "grad_norm": 0.32225438952445984, + "learning_rate": 4.925867529233434e-05, + "loss": 0.3766, + "step": 1095500 + }, + { + "epoch": 7.41663057600693, + "grad_norm": 0.3051474690437317, + "learning_rate": 4.925833694239931e-05, + "loss": 0.3775, + "step": 1096000 + }, + { + "epoch": 7.420014075357297, + "grad_norm": 0.32634198665618896, + "learning_rate": 4.925799859246427e-05, + "loss": 0.3755, + "step": 1096500 + }, + { + "epoch": 7.423397574707666, + "grad_norm": 0.338914692401886, + "learning_rate": 4.9257660242529234e-05, + "loss": 0.3761, + "step": 1097000 + }, + { + "epoch": 7.426781074058034, + "grad_norm": 0.3681485950946808, + "learning_rate": 4.9257321892594197e-05, + "loss": 0.3757, + "step": 1097500 + }, + { + "epoch": 7.430164573408402, + "grad_norm": 0.3372761309146881, + "learning_rate": 4.9256983542659165e-05, + "loss": 0.3764, + "step": 1098000 + }, + { + "epoch": 7.43354807275877, + "grad_norm": 0.3378821015357971, + "learning_rate": 4.925664519272413e-05, + "loss": 0.3765, + "step": 1098500 + }, + { + "epoch": 7.436931572109138, + "grad_norm": 0.3365425169467926, + "learning_rate": 4.925630684278909e-05, + "loss": 0.3763, + "step": 1099000 + }, + { + "epoch": 7.440315071459506, + "grad_norm": 0.3577186167240143, + "learning_rate": 4.925596849285405e-05, + "loss": 0.3762, + "step": 1099500 + }, + { + "epoch": 7.443698570809874, + "grad_norm": 0.3601377606391907, + "learning_rate": 4.9255630142919014e-05, + "loss": 0.377, + "step": 1100000 + }, + { + "epoch": 7.447082070160243, + "grad_norm": 0.3320385813713074, + "learning_rate": 4.9255291792983976e-05, + "loss": 0.3759, + "step": 1100500 + }, + { + "epoch": 7.450465569510611, + "grad_norm": 0.33235663175582886, + "learning_rate": 4.925495344304894e-05, + "loss": 0.376, + "step": 1101000 + }, + { + "epoch": 7.453849068860979, + "grad_norm": 0.33040741086006165, + "learning_rate": 4.925461509311391e-05, + "loss": 0.3756, + "step": 1101500 + }, + { + "epoch": 7.457232568211347, + "grad_norm": 0.37784650921821594, + "learning_rate": 4.925427674317887e-05, + "loss": 0.3748, + "step": 1102000 + }, + { + "epoch": 7.460616067561715, + "grad_norm": 0.35406967997550964, + "learning_rate": 4.925393839324383e-05, + "loss": 0.3758, + "step": 1102500 + }, + { + "epoch": 7.463999566912083, + "grad_norm": 0.35439401865005493, + "learning_rate": 4.9253600043308793e-05, + "loss": 0.376, + "step": 1103000 + }, + { + "epoch": 7.467383066262451, + "grad_norm": 0.3342632055282593, + "learning_rate": 4.925326169337376e-05, + "loss": 0.3758, + "step": 1103500 + }, + { + "epoch": 7.470766565612819, + "grad_norm": 0.32852116227149963, + "learning_rate": 4.9252923343438724e-05, + "loss": 0.3754, + "step": 1104000 + }, + { + "epoch": 7.474150064963188, + "grad_norm": 0.3155595064163208, + "learning_rate": 4.925258499350368e-05, + "loss": 0.3767, + "step": 1104500 + }, + { + "epoch": 7.477533564313555, + "grad_norm": 0.35901975631713867, + "learning_rate": 4.925224664356864e-05, + "loss": 0.3764, + "step": 1105000 + }, + { + "epoch": 7.480917063663924, + "grad_norm": 0.3231262266635895, + "learning_rate": 4.925190829363361e-05, + "loss": 0.3757, + "step": 1105500 + }, + { + "epoch": 7.484300563014292, + "grad_norm": 0.37099528312683105, + "learning_rate": 4.925156994369857e-05, + "loss": 0.3746, + "step": 1106000 + }, + { + "epoch": 7.48768406236466, + "grad_norm": 0.34101614356040955, + "learning_rate": 4.9251231593763535e-05, + "loss": 0.3756, + "step": 1106500 + }, + { + "epoch": 7.491067561715028, + "grad_norm": 0.3836347758769989, + "learning_rate": 4.92508932438285e-05, + "loss": 0.3785, + "step": 1107000 + }, + { + "epoch": 7.494451061065396, + "grad_norm": 0.326123982667923, + "learning_rate": 4.9250554893893466e-05, + "loss": 0.3762, + "step": 1107500 + }, + { + "epoch": 7.497834560415765, + "grad_norm": 0.3567655086517334, + "learning_rate": 4.925021654395843e-05, + "loss": 0.3761, + "step": 1108000 + }, + { + "epoch": 7.501218059766132, + "grad_norm": 0.3998945653438568, + "learning_rate": 4.924987819402339e-05, + "loss": 0.3764, + "step": 1108500 + }, + { + "epoch": 7.504601559116501, + "grad_norm": 0.3774033784866333, + "learning_rate": 4.924953984408835e-05, + "loss": 0.3765, + "step": 1109000 + }, + { + "epoch": 7.507985058466868, + "grad_norm": 0.3236536979675293, + "learning_rate": 4.9249201494153315e-05, + "loss": 0.376, + "step": 1109500 + }, + { + "epoch": 7.511368557817237, + "grad_norm": 0.3851451277732849, + "learning_rate": 4.924886314421828e-05, + "loss": 0.3744, + "step": 1110000 + }, + { + "epoch": 7.514752057167605, + "grad_norm": 0.34989050030708313, + "learning_rate": 4.924852479428324e-05, + "loss": 0.3753, + "step": 1110500 + }, + { + "epoch": 7.518135556517973, + "grad_norm": 0.33295780420303345, + "learning_rate": 4.924818644434821e-05, + "loss": 0.3747, + "step": 1111000 + }, + { + "epoch": 7.521519055868342, + "grad_norm": 0.36218512058258057, + "learning_rate": 4.924784809441317e-05, + "loss": 0.3744, + "step": 1111500 + }, + { + "epoch": 7.524902555218709, + "grad_norm": 0.33659127354621887, + "learning_rate": 4.924750974447813e-05, + "loss": 0.3758, + "step": 1112000 + }, + { + "epoch": 7.528286054569078, + "grad_norm": 0.3490145206451416, + "learning_rate": 4.9247171394543094e-05, + "loss": 0.3753, + "step": 1112500 + }, + { + "epoch": 7.531669553919445, + "grad_norm": 0.3658861219882965, + "learning_rate": 4.9246833044608056e-05, + "loss": 0.3766, + "step": 1113000 + }, + { + "epoch": 7.535053053269814, + "grad_norm": 0.33971354365348816, + "learning_rate": 4.9246494694673025e-05, + "loss": 0.3754, + "step": 1113500 + }, + { + "epoch": 7.5384365526201815, + "grad_norm": 0.37164369225502014, + "learning_rate": 4.924615634473798e-05, + "loss": 0.3754, + "step": 1114000 + }, + { + "epoch": 7.54182005197055, + "grad_norm": 0.3788661062717438, + "learning_rate": 4.924581799480294e-05, + "loss": 0.3755, + "step": 1114500 + }, + { + "epoch": 7.545203551320919, + "grad_norm": 0.3288120627403259, + "learning_rate": 4.924547964486791e-05, + "loss": 0.377, + "step": 1115000 + }, + { + "epoch": 7.548587050671286, + "grad_norm": 0.356442928314209, + "learning_rate": 4.9245141294932874e-05, + "loss": 0.378, + "step": 1115500 + }, + { + "epoch": 7.551970550021655, + "grad_norm": 0.35600316524505615, + "learning_rate": 4.9244802944997836e-05, + "loss": 0.3749, + "step": 1116000 + }, + { + "epoch": 7.555354049372022, + "grad_norm": 0.37920600175857544, + "learning_rate": 4.92444645950628e-05, + "loss": 0.3756, + "step": 1116500 + }, + { + "epoch": 7.558737548722391, + "grad_norm": 0.32114550471305847, + "learning_rate": 4.924412624512777e-05, + "loss": 0.3776, + "step": 1117000 + }, + { + "epoch": 7.5621210480727585, + "grad_norm": 0.33889034390449524, + "learning_rate": 4.924378789519273e-05, + "loss": 0.3773, + "step": 1117500 + }, + { + "epoch": 7.565504547423127, + "grad_norm": 0.35345473885536194, + "learning_rate": 4.924344954525769e-05, + "loss": 0.3747, + "step": 1118000 + }, + { + "epoch": 7.568888046773495, + "grad_norm": 0.32850387692451477, + "learning_rate": 4.924311119532265e-05, + "loss": 0.3756, + "step": 1118500 + }, + { + "epoch": 7.572271546123863, + "grad_norm": 0.3322440981864929, + "learning_rate": 4.9242772845387615e-05, + "loss": 0.3768, + "step": 1119000 + }, + { + "epoch": 7.575655045474232, + "grad_norm": 0.354925274848938, + "learning_rate": 4.924243449545258e-05, + "loss": 0.3751, + "step": 1119500 + }, + { + "epoch": 7.579038544824599, + "grad_norm": 0.3208562731742859, + "learning_rate": 4.924209614551754e-05, + "loss": 0.3761, + "step": 1120000 + }, + { + "epoch": 7.582422044174968, + "grad_norm": 0.3661039173603058, + "learning_rate": 4.92417577955825e-05, + "loss": 0.3749, + "step": 1120500 + }, + { + "epoch": 7.5858055435253355, + "grad_norm": 0.3295091986656189, + "learning_rate": 4.924141944564747e-05, + "loss": 0.375, + "step": 1121000 + }, + { + "epoch": 7.589189042875704, + "grad_norm": 0.3443576395511627, + "learning_rate": 4.924108109571243e-05, + "loss": 0.3756, + "step": 1121500 + }, + { + "epoch": 7.592572542226072, + "grad_norm": 0.345859557390213, + "learning_rate": 4.9240742745777395e-05, + "loss": 0.3773, + "step": 1122000 + }, + { + "epoch": 7.59595604157644, + "grad_norm": 0.35861361026763916, + "learning_rate": 4.924040439584236e-05, + "loss": 0.3753, + "step": 1122500 + }, + { + "epoch": 7.599339540926808, + "grad_norm": 0.32491734623908997, + "learning_rate": 4.9240066045907326e-05, + "loss": 0.376, + "step": 1123000 + }, + { + "epoch": 7.602723040277176, + "grad_norm": 0.3575875759124756, + "learning_rate": 4.923972769597228e-05, + "loss": 0.3759, + "step": 1123500 + }, + { + "epoch": 7.606106539627544, + "grad_norm": 0.3628663122653961, + "learning_rate": 4.923938934603724e-05, + "loss": 0.375, + "step": 1124000 + }, + { + "epoch": 7.6094900389779125, + "grad_norm": 0.3580170273780823, + "learning_rate": 4.923905099610221e-05, + "loss": 0.3766, + "step": 1124500 + }, + { + "epoch": 7.61287353832828, + "grad_norm": 0.3711313307285309, + "learning_rate": 4.9238712646167174e-05, + "loss": 0.3759, + "step": 1125000 + }, + { + "epoch": 7.616257037678649, + "grad_norm": 0.320784330368042, + "learning_rate": 4.9238374296232136e-05, + "loss": 0.3762, + "step": 1125500 + }, + { + "epoch": 7.619640537029017, + "grad_norm": 0.32204243540763855, + "learning_rate": 4.92380359462971e-05, + "loss": 0.3777, + "step": 1126000 + }, + { + "epoch": 7.623024036379385, + "grad_norm": 0.3574248254299164, + "learning_rate": 4.923769759636207e-05, + "loss": 0.3756, + "step": 1126500 + }, + { + "epoch": 7.626407535729753, + "grad_norm": 0.3387276828289032, + "learning_rate": 4.923735924642703e-05, + "loss": 0.3758, + "step": 1127000 + }, + { + "epoch": 7.629791035080121, + "grad_norm": 0.33094269037246704, + "learning_rate": 4.923702089649199e-05, + "loss": 0.3761, + "step": 1127500 + }, + { + "epoch": 7.6331745344304895, + "grad_norm": 0.33912894129753113, + "learning_rate": 4.9236682546556954e-05, + "loss": 0.3754, + "step": 1128000 + }, + { + "epoch": 7.636558033780857, + "grad_norm": 0.3694305121898651, + "learning_rate": 4.9236344196621916e-05, + "loss": 0.3754, + "step": 1128500 + }, + { + "epoch": 7.639941533131226, + "grad_norm": 0.35075610876083374, + "learning_rate": 4.923600584668688e-05, + "loss": 0.3757, + "step": 1129000 + }, + { + "epoch": 7.643325032481593, + "grad_norm": 0.3775167167186737, + "learning_rate": 4.923566749675184e-05, + "loss": 0.3766, + "step": 1129500 + }, + { + "epoch": 7.646708531831962, + "grad_norm": 0.3631054759025574, + "learning_rate": 4.92353291468168e-05, + "loss": 0.3751, + "step": 1130000 + }, + { + "epoch": 7.65009203118233, + "grad_norm": 0.3584959805011749, + "learning_rate": 4.923499079688177e-05, + "loss": 0.3763, + "step": 1130500 + }, + { + "epoch": 7.653475530532698, + "grad_norm": 0.361883282661438, + "learning_rate": 4.923465244694673e-05, + "loss": 0.3761, + "step": 1131000 + }, + { + "epoch": 7.6568590298830665, + "grad_norm": 0.3296509087085724, + "learning_rate": 4.9234314097011695e-05, + "loss": 0.3756, + "step": 1131500 + }, + { + "epoch": 7.660242529233434, + "grad_norm": 0.34193360805511475, + "learning_rate": 4.923397574707666e-05, + "loss": 0.3749, + "step": 1132000 + }, + { + "epoch": 7.663626028583803, + "grad_norm": 0.337868332862854, + "learning_rate": 4.9233637397141626e-05, + "loss": 0.3769, + "step": 1132500 + }, + { + "epoch": 7.66700952793417, + "grad_norm": 0.37247219681739807, + "learning_rate": 4.923329904720659e-05, + "loss": 0.3753, + "step": 1133000 + }, + { + "epoch": 7.670393027284539, + "grad_norm": 0.3282815217971802, + "learning_rate": 4.9232960697271544e-05, + "loss": 0.3762, + "step": 1133500 + }, + { + "epoch": 7.6737765266349065, + "grad_norm": 0.29900139570236206, + "learning_rate": 4.923262234733651e-05, + "loss": 0.3769, + "step": 1134000 + }, + { + "epoch": 7.677160025985275, + "grad_norm": 0.31819576025009155, + "learning_rate": 4.9232283997401475e-05, + "loss": 0.3759, + "step": 1134500 + }, + { + "epoch": 7.6805435253356436, + "grad_norm": 0.3621688187122345, + "learning_rate": 4.923194564746644e-05, + "loss": 0.3753, + "step": 1135000 + }, + { + "epoch": 7.683927024686011, + "grad_norm": 0.35284703969955444, + "learning_rate": 4.92316072975314e-05, + "loss": 0.3769, + "step": 1135500 + }, + { + "epoch": 7.68731052403638, + "grad_norm": 0.34793928265571594, + "learning_rate": 4.923126894759637e-05, + "loss": 0.3764, + "step": 1136000 + }, + { + "epoch": 7.690694023386747, + "grad_norm": 0.34080761671066284, + "learning_rate": 4.923093059766133e-05, + "loss": 0.376, + "step": 1136500 + }, + { + "epoch": 7.694077522737116, + "grad_norm": 0.3793800175189972, + "learning_rate": 4.923059224772629e-05, + "loss": 0.3763, + "step": 1137000 + }, + { + "epoch": 7.6974610220874835, + "grad_norm": 0.340820848941803, + "learning_rate": 4.9230253897791254e-05, + "loss": 0.3752, + "step": 1137500 + }, + { + "epoch": 7.700844521437852, + "grad_norm": 0.3901347815990448, + "learning_rate": 4.9229915547856216e-05, + "loss": 0.3761, + "step": 1138000 + }, + { + "epoch": 7.70422802078822, + "grad_norm": 0.3766717314720154, + "learning_rate": 4.922957719792118e-05, + "loss": 0.3755, + "step": 1138500 + }, + { + "epoch": 7.707611520138588, + "grad_norm": 0.33552950620651245, + "learning_rate": 4.922923884798614e-05, + "loss": 0.3758, + "step": 1139000 + }, + { + "epoch": 7.710995019488957, + "grad_norm": 0.3720012307167053, + "learning_rate": 4.92289004980511e-05, + "loss": 0.3747, + "step": 1139500 + }, + { + "epoch": 7.714378518839324, + "grad_norm": 0.33156847953796387, + "learning_rate": 4.922856214811607e-05, + "loss": 0.3772, + "step": 1140000 + }, + { + "epoch": 7.717762018189693, + "grad_norm": 0.3560585081577301, + "learning_rate": 4.9228223798181034e-05, + "loss": 0.3771, + "step": 1140500 + }, + { + "epoch": 7.7211455175400605, + "grad_norm": 0.33120888471603394, + "learning_rate": 4.9227885448245996e-05, + "loss": 0.3751, + "step": 1141000 + }, + { + "epoch": 7.724529016890429, + "grad_norm": 0.3647676706314087, + "learning_rate": 4.922754709831096e-05, + "loss": 0.374, + "step": 1141500 + }, + { + "epoch": 7.727912516240797, + "grad_norm": 0.3265502452850342, + "learning_rate": 4.922720874837593e-05, + "loss": 0.3761, + "step": 1142000 + }, + { + "epoch": 7.731296015591165, + "grad_norm": 0.3567357659339905, + "learning_rate": 4.922687039844089e-05, + "loss": 0.3751, + "step": 1142500 + }, + { + "epoch": 7.734679514941533, + "grad_norm": 0.380672425031662, + "learning_rate": 4.9226532048505844e-05, + "loss": 0.3746, + "step": 1143000 + }, + { + "epoch": 7.738063014291901, + "grad_norm": 0.3651910424232483, + "learning_rate": 4.922619369857081e-05, + "loss": 0.3771, + "step": 1143500 + }, + { + "epoch": 7.74144651364227, + "grad_norm": 0.3869389295578003, + "learning_rate": 4.9225855348635775e-05, + "loss": 0.3763, + "step": 1144000 + }, + { + "epoch": 7.7448300129926375, + "grad_norm": 0.3407086133956909, + "learning_rate": 4.922551699870074e-05, + "loss": 0.3777, + "step": 1144500 + }, + { + "epoch": 7.748213512343005, + "grad_norm": 0.33296847343444824, + "learning_rate": 4.92251786487657e-05, + "loss": 0.3764, + "step": 1145000 + }, + { + "epoch": 7.751597011693374, + "grad_norm": 0.3720654547214508, + "learning_rate": 4.922484029883067e-05, + "loss": 0.3749, + "step": 1145500 + }, + { + "epoch": 7.754980511043742, + "grad_norm": 0.33162111043930054, + "learning_rate": 4.922450194889563e-05, + "loss": 0.375, + "step": 1146000 + }, + { + "epoch": 7.75836401039411, + "grad_norm": 0.36200985312461853, + "learning_rate": 4.922416359896059e-05, + "loss": 0.3755, + "step": 1146500 + }, + { + "epoch": 7.761747509744478, + "grad_norm": 0.3476463258266449, + "learning_rate": 4.9223825249025555e-05, + "loss": 0.3751, + "step": 1147000 + }, + { + "epoch": 7.765131009094846, + "grad_norm": 0.3678581118583679, + "learning_rate": 4.922348689909052e-05, + "loss": 0.3755, + "step": 1147500 + }, + { + "epoch": 7.7685145084452145, + "grad_norm": 0.35781508684158325, + "learning_rate": 4.922314854915548e-05, + "loss": 0.3754, + "step": 1148000 + }, + { + "epoch": 7.771898007795582, + "grad_norm": 0.36711201071739197, + "learning_rate": 4.922281019922044e-05, + "loss": 0.3775, + "step": 1148500 + }, + { + "epoch": 7.775281507145951, + "grad_norm": 0.33325284719467163, + "learning_rate": 4.9222471849285403e-05, + "loss": 0.3754, + "step": 1149000 + }, + { + "epoch": 7.778665006496318, + "grad_norm": 0.33583301305770874, + "learning_rate": 4.922213349935037e-05, + "loss": 0.376, + "step": 1149500 + }, + { + "epoch": 7.782048505846687, + "grad_norm": 0.3751732110977173, + "learning_rate": 4.9221795149415334e-05, + "loss": 0.3751, + "step": 1150000 + }, + { + "epoch": 7.785432005197055, + "grad_norm": 0.33746105432510376, + "learning_rate": 4.9221456799480297e-05, + "loss": 0.3773, + "step": 1150500 + }, + { + "epoch": 7.788815504547423, + "grad_norm": 0.33039334416389465, + "learning_rate": 4.922111844954526e-05, + "loss": 0.3771, + "step": 1151000 + }, + { + "epoch": 7.7921990038977915, + "grad_norm": 0.3974260687828064, + "learning_rate": 4.922078009961023e-05, + "loss": 0.3754, + "step": 1151500 + }, + { + "epoch": 7.795582503248159, + "grad_norm": 0.35063496232032776, + "learning_rate": 4.922044174967519e-05, + "loss": 0.374, + "step": 1152000 + }, + { + "epoch": 7.798966002598528, + "grad_norm": 0.3475898504257202, + "learning_rate": 4.9220103399740145e-05, + "loss": 0.3754, + "step": 1152500 + }, + { + "epoch": 7.802349501948895, + "grad_norm": 0.36210474371910095, + "learning_rate": 4.9219765049805114e-05, + "loss": 0.3767, + "step": 1153000 + }, + { + "epoch": 7.805733001299264, + "grad_norm": 0.34278056025505066, + "learning_rate": 4.9219426699870076e-05, + "loss": 0.3763, + "step": 1153500 + }, + { + "epoch": 7.8091165006496315, + "grad_norm": 0.3511875867843628, + "learning_rate": 4.921908834993504e-05, + "loss": 0.3756, + "step": 1154000 + }, + { + "epoch": 7.8125, + "grad_norm": 0.422054260969162, + "learning_rate": 4.921875e-05, + "loss": 0.3759, + "step": 1154500 + }, + { + "epoch": 7.8158834993503685, + "grad_norm": 0.3470151126384735, + "learning_rate": 4.921841165006497e-05, + "loss": 0.3762, + "step": 1155000 + }, + { + "epoch": 7.819266998700736, + "grad_norm": 0.4256981313228607, + "learning_rate": 4.921807330012993e-05, + "loss": 0.3767, + "step": 1155500 + }, + { + "epoch": 7.822650498051105, + "grad_norm": 0.3501984477043152, + "learning_rate": 4.9217734950194893e-05, + "loss": 0.3768, + "step": 1156000 + }, + { + "epoch": 7.826033997401472, + "grad_norm": 0.3682548403739929, + "learning_rate": 4.9217396600259856e-05, + "loss": 0.3758, + "step": 1156500 + }, + { + "epoch": 7.829417496751841, + "grad_norm": 0.32662200927734375, + "learning_rate": 4.921705825032482e-05, + "loss": 0.3765, + "step": 1157000 + }, + { + "epoch": 7.8328009961022085, + "grad_norm": 0.3631592392921448, + "learning_rate": 4.921671990038978e-05, + "loss": 0.3763, + "step": 1157500 + }, + { + "epoch": 7.836184495452577, + "grad_norm": 0.3583422899246216, + "learning_rate": 4.921638155045474e-05, + "loss": 0.3744, + "step": 1158000 + }, + { + "epoch": 7.839567994802945, + "grad_norm": 0.35009801387786865, + "learning_rate": 4.9216043200519704e-05, + "loss": 0.3754, + "step": 1158500 + }, + { + "epoch": 7.842951494153313, + "grad_norm": 0.361879825592041, + "learning_rate": 4.921570485058467e-05, + "loss": 0.3749, + "step": 1159000 + }, + { + "epoch": 7.846334993503682, + "grad_norm": 0.3249756693840027, + "learning_rate": 4.9215366500649635e-05, + "loss": 0.376, + "step": 1159500 + }, + { + "epoch": 7.849718492854049, + "grad_norm": 0.33673372864723206, + "learning_rate": 4.92150281507146e-05, + "loss": 0.3763, + "step": 1160000 + }, + { + "epoch": 7.853101992204418, + "grad_norm": 0.40804365277290344, + "learning_rate": 4.921468980077956e-05, + "loss": 0.3758, + "step": 1160500 + }, + { + "epoch": 7.8564854915547855, + "grad_norm": 0.33804696798324585, + "learning_rate": 4.921435145084453e-05, + "loss": 0.3773, + "step": 1161000 + }, + { + "epoch": 7.859868990905154, + "grad_norm": 0.3609027862548828, + "learning_rate": 4.921401310090949e-05, + "loss": 0.3739, + "step": 1161500 + }, + { + "epoch": 7.863252490255522, + "grad_norm": 0.3798917829990387, + "learning_rate": 4.9213674750974446e-05, + "loss": 0.3763, + "step": 1162000 + }, + { + "epoch": 7.86663598960589, + "grad_norm": 0.39154064655303955, + "learning_rate": 4.9213336401039415e-05, + "loss": 0.3766, + "step": 1162500 + }, + { + "epoch": 7.870019488956258, + "grad_norm": 0.35580357909202576, + "learning_rate": 4.921299805110438e-05, + "loss": 0.3762, + "step": 1163000 + }, + { + "epoch": 7.873402988306626, + "grad_norm": 0.357096791267395, + "learning_rate": 4.921265970116934e-05, + "loss": 0.3761, + "step": 1163500 + }, + { + "epoch": 7.876786487656995, + "grad_norm": 0.3507744371891022, + "learning_rate": 4.92123213512343e-05, + "loss": 0.3755, + "step": 1164000 + }, + { + "epoch": 7.8801699870073625, + "grad_norm": 0.375263512134552, + "learning_rate": 4.921198300129927e-05, + "loss": 0.3759, + "step": 1164500 + }, + { + "epoch": 7.88355348635773, + "grad_norm": 0.3427620530128479, + "learning_rate": 4.921164465136423e-05, + "loss": 0.3748, + "step": 1165000 + }, + { + "epoch": 7.886936985708099, + "grad_norm": 0.32488977909088135, + "learning_rate": 4.9211306301429194e-05, + "loss": 0.3752, + "step": 1165500 + }, + { + "epoch": 7.890320485058467, + "grad_norm": 0.3429107964038849, + "learning_rate": 4.9210967951494156e-05, + "loss": 0.3767, + "step": 1166000 + }, + { + "epoch": 7.893703984408835, + "grad_norm": 0.3646707832813263, + "learning_rate": 4.921062960155912e-05, + "loss": 0.376, + "step": 1166500 + }, + { + "epoch": 7.897087483759203, + "grad_norm": 0.34920671582221985, + "learning_rate": 4.921029125162408e-05, + "loss": 0.3754, + "step": 1167000 + }, + { + "epoch": 7.900470983109571, + "grad_norm": 0.3300837576389313, + "learning_rate": 4.920995290168904e-05, + "loss": 0.3758, + "step": 1167500 + }, + { + "epoch": 7.9038544824599395, + "grad_norm": 0.38310351967811584, + "learning_rate": 4.9209614551754005e-05, + "loss": 0.3763, + "step": 1168000 + }, + { + "epoch": 7.907237981810307, + "grad_norm": 0.3786688446998596, + "learning_rate": 4.9209276201818974e-05, + "loss": 0.3748, + "step": 1168500 + }, + { + "epoch": 7.910621481160676, + "grad_norm": 0.3455840051174164, + "learning_rate": 4.9208937851883936e-05, + "loss": 0.3756, + "step": 1169000 + }, + { + "epoch": 7.914004980511043, + "grad_norm": 0.31133291125297546, + "learning_rate": 4.92085995019489e-05, + "loss": 0.3746, + "step": 1169500 + }, + { + "epoch": 7.917388479861412, + "grad_norm": 0.3466230034828186, + "learning_rate": 4.920826115201386e-05, + "loss": 0.3756, + "step": 1170000 + }, + { + "epoch": 7.92077197921178, + "grad_norm": 0.35299792885780334, + "learning_rate": 4.920792280207883e-05, + "loss": 0.3757, + "step": 1170500 + }, + { + "epoch": 7.924155478562148, + "grad_norm": 0.3923153281211853, + "learning_rate": 4.920758445214379e-05, + "loss": 0.3765, + "step": 1171000 + }, + { + "epoch": 7.9275389779125165, + "grad_norm": 0.34024280309677124, + "learning_rate": 4.9207246102208746e-05, + "loss": 0.3744, + "step": 1171500 + }, + { + "epoch": 7.930922477262884, + "grad_norm": 0.35224542021751404, + "learning_rate": 4.9206907752273715e-05, + "loss": 0.3763, + "step": 1172000 + }, + { + "epoch": 7.934305976613253, + "grad_norm": 0.4136195182800293, + "learning_rate": 4.920656940233868e-05, + "loss": 0.3747, + "step": 1172500 + }, + { + "epoch": 7.93768947596362, + "grad_norm": 0.3392421305179596, + "learning_rate": 4.920623105240364e-05, + "loss": 0.3783, + "step": 1173000 + }, + { + "epoch": 7.941072975313989, + "grad_norm": 0.3497343063354492, + "learning_rate": 4.92058927024686e-05, + "loss": 0.3752, + "step": 1173500 + }, + { + "epoch": 7.9444564746643564, + "grad_norm": 0.34304261207580566, + "learning_rate": 4.920555435253357e-05, + "loss": 0.3759, + "step": 1174000 + }, + { + "epoch": 7.947839974014725, + "grad_norm": 0.31310826539993286, + "learning_rate": 4.920521600259853e-05, + "loss": 0.3775, + "step": 1174500 + }, + { + "epoch": 7.9512234733650935, + "grad_norm": 0.33917033672332764, + "learning_rate": 4.9204877652663495e-05, + "loss": 0.3753, + "step": 1175000 + }, + { + "epoch": 7.954606972715461, + "grad_norm": 0.28793883323669434, + "learning_rate": 4.920453930272846e-05, + "loss": 0.3757, + "step": 1175500 + }, + { + "epoch": 7.95799047206583, + "grad_norm": 0.3498222231864929, + "learning_rate": 4.920420095279342e-05, + "loss": 0.376, + "step": 1176000 + }, + { + "epoch": 7.961373971416197, + "grad_norm": 0.3331172466278076, + "learning_rate": 4.920386260285838e-05, + "loss": 0.3764, + "step": 1176500 + }, + { + "epoch": 7.964757470766566, + "grad_norm": 0.3206407129764557, + "learning_rate": 4.920352425292334e-05, + "loss": 0.3747, + "step": 1177000 + }, + { + "epoch": 7.9681409701169335, + "grad_norm": 0.35807299613952637, + "learning_rate": 4.9203185902988305e-05, + "loss": 0.3746, + "step": 1177500 + }, + { + "epoch": 7.971524469467302, + "grad_norm": 0.3473316431045532, + "learning_rate": 4.9202847553053274e-05, + "loss": 0.3765, + "step": 1178000 + }, + { + "epoch": 7.97490796881767, + "grad_norm": 0.3588818907737732, + "learning_rate": 4.9202509203118236e-05, + "loss": 0.3775, + "step": 1178500 + }, + { + "epoch": 7.978291468168038, + "grad_norm": 0.39430102705955505, + "learning_rate": 4.92021708531832e-05, + "loss": 0.3761, + "step": 1179000 + }, + { + "epoch": 7.981674967518407, + "grad_norm": 0.34274429082870483, + "learning_rate": 4.920183250324816e-05, + "loss": 0.3783, + "step": 1179500 + }, + { + "epoch": 7.985058466868774, + "grad_norm": 0.38946691155433655, + "learning_rate": 4.920149415331313e-05, + "loss": 0.374, + "step": 1180000 + }, + { + "epoch": 7.988441966219143, + "grad_norm": 0.35403940081596375, + "learning_rate": 4.920115580337809e-05, + "loss": 0.3746, + "step": 1180500 + }, + { + "epoch": 7.9918254655695105, + "grad_norm": 0.36802688241004944, + "learning_rate": 4.920081745344305e-05, + "loss": 0.3757, + "step": 1181000 + }, + { + "epoch": 7.995208964919879, + "grad_norm": 0.38939595222473145, + "learning_rate": 4.9200479103508016e-05, + "loss": 0.3754, + "step": 1181500 + }, + { + "epoch": 7.998592464270247, + "grad_norm": 0.35823124647140503, + "learning_rate": 4.920014075357298e-05, + "loss": 0.3733, + "step": 1182000 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.8572758723371212, + "eval_loss": 0.5793046951293945, + "eval_runtime": 3396.5385, + "eval_samples_per_second": 85.6, + "eval_steps_per_second": 5.35, + "step": 1182208 + }, + { + "epoch": 8.001975963620614, + "grad_norm": 0.3599242568016052, + "learning_rate": 4.919980240363794e-05, + "loss": 0.374, + "step": 1182500 + }, + { + "epoch": 8.005359462970983, + "grad_norm": 0.3641102612018585, + "learning_rate": 4.91994640537029e-05, + "loss": 0.3737, + "step": 1183000 + }, + { + "epoch": 8.008742962321351, + "grad_norm": 0.30603310465812683, + "learning_rate": 4.9199125703767864e-05, + "loss": 0.3746, + "step": 1183500 + }, + { + "epoch": 8.01212646167172, + "grad_norm": 0.3400936722755432, + "learning_rate": 4.919878735383283e-05, + "loss": 0.3744, + "step": 1184000 + }, + { + "epoch": 8.015509961022088, + "grad_norm": 0.3894224464893341, + "learning_rate": 4.9198449003897795e-05, + "loss": 0.3736, + "step": 1184500 + }, + { + "epoch": 8.018893460372455, + "grad_norm": 0.3433375060558319, + "learning_rate": 4.919811065396276e-05, + "loss": 0.3734, + "step": 1185000 + }, + { + "epoch": 8.022276959722824, + "grad_norm": 0.36428365111351013, + "learning_rate": 4.919777230402772e-05, + "loss": 0.3733, + "step": 1185500 + }, + { + "epoch": 8.025660459073192, + "grad_norm": 0.3363524377346039, + "learning_rate": 4.919743395409268e-05, + "loss": 0.373, + "step": 1186000 + }, + { + "epoch": 8.02904395842356, + "grad_norm": 0.33948156237602234, + "learning_rate": 4.9197095604157644e-05, + "loss": 0.3736, + "step": 1186500 + }, + { + "epoch": 8.032427457773927, + "grad_norm": 0.32381463050842285, + "learning_rate": 4.9196757254222606e-05, + "loss": 0.3733, + "step": 1187000 + }, + { + "epoch": 8.035810957124296, + "grad_norm": 0.3439604938030243, + "learning_rate": 4.9196418904287575e-05, + "loss": 0.3735, + "step": 1187500 + }, + { + "epoch": 8.039194456474664, + "grad_norm": 0.3471587002277374, + "learning_rate": 4.919608055435254e-05, + "loss": 0.3733, + "step": 1188000 + }, + { + "epoch": 8.042577955825033, + "grad_norm": 0.344894140958786, + "learning_rate": 4.91957422044175e-05, + "loss": 0.3726, + "step": 1188500 + }, + { + "epoch": 8.0459614551754, + "grad_norm": 0.3696286082267761, + "learning_rate": 4.919540385448246e-05, + "loss": 0.3735, + "step": 1189000 + }, + { + "epoch": 8.049344954525768, + "grad_norm": 0.39218756556510925, + "learning_rate": 4.919506550454743e-05, + "loss": 0.375, + "step": 1189500 + }, + { + "epoch": 8.052728453876137, + "grad_norm": 0.3539004325866699, + "learning_rate": 4.919472715461239e-05, + "loss": 0.3737, + "step": 1190000 + }, + { + "epoch": 8.056111953226505, + "grad_norm": 0.3537065088748932, + "learning_rate": 4.919438880467735e-05, + "loss": 0.373, + "step": 1190500 + }, + { + "epoch": 8.059495452576874, + "grad_norm": 0.4091944694519043, + "learning_rate": 4.919405045474231e-05, + "loss": 0.3738, + "step": 1191000 + }, + { + "epoch": 8.06287895192724, + "grad_norm": 0.2999604642391205, + "learning_rate": 4.919371210480728e-05, + "loss": 0.3728, + "step": 1191500 + }, + { + "epoch": 8.066262451277609, + "grad_norm": 0.3378213942050934, + "learning_rate": 4.919337375487224e-05, + "loss": 0.373, + "step": 1192000 + }, + { + "epoch": 8.069645950627978, + "grad_norm": 0.38694649934768677, + "learning_rate": 4.91930354049372e-05, + "loss": 0.3737, + "step": 1192500 + }, + { + "epoch": 8.073029449978346, + "grad_norm": 0.3591739535331726, + "learning_rate": 4.9192697055002165e-05, + "loss": 0.3744, + "step": 1193000 + }, + { + "epoch": 8.076412949328713, + "grad_norm": 0.34628209471702576, + "learning_rate": 4.9192358705067134e-05, + "loss": 0.3741, + "step": 1193500 + }, + { + "epoch": 8.079796448679081, + "grad_norm": 0.3418349325656891, + "learning_rate": 4.9192020355132096e-05, + "loss": 0.3737, + "step": 1194000 + }, + { + "epoch": 8.08317994802945, + "grad_norm": 0.33706989884376526, + "learning_rate": 4.919168200519706e-05, + "loss": 0.3731, + "step": 1194500 + }, + { + "epoch": 8.086563447379818, + "grad_norm": 0.3412095904350281, + "learning_rate": 4.919134365526202e-05, + "loss": 0.3758, + "step": 1195000 + }, + { + "epoch": 8.089946946730187, + "grad_norm": 0.3373109698295593, + "learning_rate": 4.919100530532698e-05, + "loss": 0.3731, + "step": 1195500 + }, + { + "epoch": 8.093330446080554, + "grad_norm": 0.33834993839263916, + "learning_rate": 4.9190666955391945e-05, + "loss": 0.3765, + "step": 1196000 + }, + { + "epoch": 8.096713945430922, + "grad_norm": 0.3101872205734253, + "learning_rate": 4.919032860545691e-05, + "loss": 0.3758, + "step": 1196500 + }, + { + "epoch": 8.10009744478129, + "grad_norm": 0.31594258546829224, + "learning_rate": 4.9189990255521876e-05, + "loss": 0.3751, + "step": 1197000 + }, + { + "epoch": 8.10348094413166, + "grad_norm": 0.39463159441947937, + "learning_rate": 4.918965190558684e-05, + "loss": 0.3752, + "step": 1197500 + }, + { + "epoch": 8.106864443482026, + "grad_norm": 0.36349910497665405, + "learning_rate": 4.91893135556518e-05, + "loss": 0.374, + "step": 1198000 + }, + { + "epoch": 8.110247942832395, + "grad_norm": 0.35538309812545776, + "learning_rate": 4.918897520571676e-05, + "loss": 0.3725, + "step": 1198500 + }, + { + "epoch": 8.113631442182763, + "grad_norm": 0.3548257350921631, + "learning_rate": 4.918863685578173e-05, + "loss": 0.3736, + "step": 1199000 + }, + { + "epoch": 8.117014941533132, + "grad_norm": 0.36630895733833313, + "learning_rate": 4.918829850584669e-05, + "loss": 0.3733, + "step": 1199500 + }, + { + "epoch": 8.1203984408835, + "grad_norm": 0.34236612915992737, + "learning_rate": 4.918796015591165e-05, + "loss": 0.3734, + "step": 1200000 + }, + { + "epoch": 8.123781940233867, + "grad_norm": 0.37828969955444336, + "learning_rate": 4.918762180597661e-05, + "loss": 0.3747, + "step": 1200500 + }, + { + "epoch": 8.127165439584235, + "grad_norm": 0.3627392053604126, + "learning_rate": 4.918728345604158e-05, + "loss": 0.3736, + "step": 1201000 + }, + { + "epoch": 8.130548938934604, + "grad_norm": 0.3256118893623352, + "learning_rate": 4.918694510610654e-05, + "loss": 0.3745, + "step": 1201500 + }, + { + "epoch": 8.133932438284972, + "grad_norm": 0.3644091486930847, + "learning_rate": 4.9186606756171504e-05, + "loss": 0.3748, + "step": 1202000 + }, + { + "epoch": 8.13731593763534, + "grad_norm": 0.34699633717536926, + "learning_rate": 4.9186268406236466e-05, + "loss": 0.3756, + "step": 1202500 + }, + { + "epoch": 8.140699436985708, + "grad_norm": 0.4031618535518646, + "learning_rate": 4.9185930056301435e-05, + "loss": 0.3748, + "step": 1203000 + }, + { + "epoch": 8.144082936336076, + "grad_norm": 0.4030303955078125, + "learning_rate": 4.91855917063664e-05, + "loss": 0.3751, + "step": 1203500 + }, + { + "epoch": 8.147466435686445, + "grad_norm": 0.39308834075927734, + "learning_rate": 4.918525335643136e-05, + "loss": 0.3763, + "step": 1204000 + }, + { + "epoch": 8.150849935036813, + "grad_norm": 0.36041557788848877, + "learning_rate": 4.918491500649632e-05, + "loss": 0.3734, + "step": 1204500 + }, + { + "epoch": 8.15423343438718, + "grad_norm": 0.3809669017791748, + "learning_rate": 4.918457665656128e-05, + "loss": 0.3755, + "step": 1205000 + }, + { + "epoch": 8.157616933737549, + "grad_norm": 0.3428496718406677, + "learning_rate": 4.9184238306626245e-05, + "loss": 0.3752, + "step": 1205500 + }, + { + "epoch": 8.161000433087917, + "grad_norm": 0.3226144313812256, + "learning_rate": 4.918389995669121e-05, + "loss": 0.3734, + "step": 1206000 + }, + { + "epoch": 8.164383932438286, + "grad_norm": 0.3690769672393799, + "learning_rate": 4.9183561606756176e-05, + "loss": 0.3749, + "step": 1206500 + }, + { + "epoch": 8.167767431788652, + "grad_norm": 0.371756911277771, + "learning_rate": 4.918322325682114e-05, + "loss": 0.3752, + "step": 1207000 + }, + { + "epoch": 8.171150931139021, + "grad_norm": 0.37059494853019714, + "learning_rate": 4.91828849068861e-05, + "loss": 0.3751, + "step": 1207500 + }, + { + "epoch": 8.17453443048939, + "grad_norm": 0.3456904888153076, + "learning_rate": 4.918254655695106e-05, + "loss": 0.375, + "step": 1208000 + }, + { + "epoch": 8.177917929839758, + "grad_norm": 0.3586566746234894, + "learning_rate": 4.918220820701603e-05, + "loss": 0.3748, + "step": 1208500 + }, + { + "epoch": 8.181301429190125, + "grad_norm": 0.33923983573913574, + "learning_rate": 4.9181869857080994e-05, + "loss": 0.3739, + "step": 1209000 + }, + { + "epoch": 8.184684928540493, + "grad_norm": 0.36180174350738525, + "learning_rate": 4.918153150714595e-05, + "loss": 0.375, + "step": 1209500 + }, + { + "epoch": 8.188068427890862, + "grad_norm": 0.44262030720710754, + "learning_rate": 4.918119315721091e-05, + "loss": 0.3736, + "step": 1210000 + }, + { + "epoch": 8.19145192724123, + "grad_norm": 0.3610953688621521, + "learning_rate": 4.918085480727588e-05, + "loss": 0.3733, + "step": 1210500 + }, + { + "epoch": 8.194835426591599, + "grad_norm": 0.34002241492271423, + "learning_rate": 4.918051645734084e-05, + "loss": 0.3745, + "step": 1211000 + }, + { + "epoch": 8.198218925941966, + "grad_norm": 0.3462762236595154, + "learning_rate": 4.9180178107405804e-05, + "loss": 0.3751, + "step": 1211500 + }, + { + "epoch": 8.201602425292334, + "grad_norm": 0.39004191756248474, + "learning_rate": 4.9179839757470766e-05, + "loss": 0.3763, + "step": 1212000 + }, + { + "epoch": 8.204985924642703, + "grad_norm": 0.3487882912158966, + "learning_rate": 4.9179501407535735e-05, + "loss": 0.3745, + "step": 1212500 + }, + { + "epoch": 8.208369423993071, + "grad_norm": 0.37315833568573, + "learning_rate": 4.91791630576007e-05, + "loss": 0.3739, + "step": 1213000 + }, + { + "epoch": 8.211752923343438, + "grad_norm": 0.3424568176269531, + "learning_rate": 4.917882470766566e-05, + "loss": 0.375, + "step": 1213500 + }, + { + "epoch": 8.215136422693806, + "grad_norm": 0.38436761498451233, + "learning_rate": 4.917848635773062e-05, + "loss": 0.3742, + "step": 1214000 + }, + { + "epoch": 8.218519922044175, + "grad_norm": 0.36517414450645447, + "learning_rate": 4.9178148007795584e-05, + "loss": 0.3719, + "step": 1214500 + }, + { + "epoch": 8.221903421394543, + "grad_norm": 0.329480916261673, + "learning_rate": 4.9177809657860546e-05, + "loss": 0.374, + "step": 1215000 + }, + { + "epoch": 8.225286920744912, + "grad_norm": 0.3483608663082123, + "learning_rate": 4.917747130792551e-05, + "loss": 0.3741, + "step": 1215500 + }, + { + "epoch": 8.228670420095279, + "grad_norm": 0.32875847816467285, + "learning_rate": 4.917713295799048e-05, + "loss": 0.3748, + "step": 1216000 + }, + { + "epoch": 8.232053919445647, + "grad_norm": 0.340464323759079, + "learning_rate": 4.917679460805544e-05, + "loss": 0.3747, + "step": 1216500 + }, + { + "epoch": 8.235437418796016, + "grad_norm": 0.3463500738143921, + "learning_rate": 4.91764562581204e-05, + "loss": 0.3741, + "step": 1217000 + }, + { + "epoch": 8.238820918146384, + "grad_norm": 0.3442372679710388, + "learning_rate": 4.917611790818536e-05, + "loss": 0.3757, + "step": 1217500 + }, + { + "epoch": 8.242204417496751, + "grad_norm": 0.35514867305755615, + "learning_rate": 4.917577955825033e-05, + "loss": 0.3738, + "step": 1218000 + }, + { + "epoch": 8.24558791684712, + "grad_norm": 0.3340912163257599, + "learning_rate": 4.9175441208315294e-05, + "loss": 0.3738, + "step": 1218500 + }, + { + "epoch": 8.248971416197488, + "grad_norm": 0.3596610426902771, + "learning_rate": 4.917510285838025e-05, + "loss": 0.3751, + "step": 1219000 + }, + { + "epoch": 8.252354915547857, + "grad_norm": 0.33829501271247864, + "learning_rate": 4.917476450844521e-05, + "loss": 0.3736, + "step": 1219500 + }, + { + "epoch": 8.255738414898225, + "grad_norm": 0.3386409282684326, + "learning_rate": 4.917442615851018e-05, + "loss": 0.3742, + "step": 1220000 + }, + { + "epoch": 8.259121914248592, + "grad_norm": 0.3590584397315979, + "learning_rate": 4.917408780857514e-05, + "loss": 0.3721, + "step": 1220500 + }, + { + "epoch": 8.26250541359896, + "grad_norm": 0.341302752494812, + "learning_rate": 4.9173749458640105e-05, + "loss": 0.3739, + "step": 1221000 + }, + { + "epoch": 8.265888912949329, + "grad_norm": 0.36562225222587585, + "learning_rate": 4.917341110870507e-05, + "loss": 0.3732, + "step": 1221500 + }, + { + "epoch": 8.269272412299697, + "grad_norm": 0.34959301352500916, + "learning_rate": 4.9173072758770036e-05, + "loss": 0.3751, + "step": 1222000 + }, + { + "epoch": 8.272655911650064, + "grad_norm": 0.3470049500465393, + "learning_rate": 4.9172734408835e-05, + "loss": 0.3732, + "step": 1222500 + }, + { + "epoch": 8.276039411000433, + "grad_norm": 0.36257070302963257, + "learning_rate": 4.917239605889996e-05, + "loss": 0.3749, + "step": 1223000 + }, + { + "epoch": 8.279422910350801, + "grad_norm": 0.40908801555633545, + "learning_rate": 4.917205770896492e-05, + "loss": 0.3744, + "step": 1223500 + }, + { + "epoch": 8.28280640970117, + "grad_norm": 0.3480644226074219, + "learning_rate": 4.9171719359029884e-05, + "loss": 0.3753, + "step": 1224000 + }, + { + "epoch": 8.286189909051538, + "grad_norm": 0.33613091707229614, + "learning_rate": 4.9171381009094846e-05, + "loss": 0.3765, + "step": 1224500 + }, + { + "epoch": 8.289573408401905, + "grad_norm": 0.36790555715560913, + "learning_rate": 4.917104265915981e-05, + "loss": 0.3754, + "step": 1225000 + }, + { + "epoch": 8.292956907752274, + "grad_norm": 0.37085211277008057, + "learning_rate": 4.917070430922478e-05, + "loss": 0.3743, + "step": 1225500 + }, + { + "epoch": 8.296340407102642, + "grad_norm": 0.35964706540107727, + "learning_rate": 4.917036595928974e-05, + "loss": 0.3725, + "step": 1226000 + }, + { + "epoch": 8.29972390645301, + "grad_norm": 0.357497900724411, + "learning_rate": 4.91700276093547e-05, + "loss": 0.3737, + "step": 1226500 + }, + { + "epoch": 8.303107405803377, + "grad_norm": 0.398779034614563, + "learning_rate": 4.9169689259419664e-05, + "loss": 0.3747, + "step": 1227000 + }, + { + "epoch": 8.306490905153746, + "grad_norm": 0.3341631293296814, + "learning_rate": 4.916935090948463e-05, + "loss": 0.3741, + "step": 1227500 + }, + { + "epoch": 8.309874404504114, + "grad_norm": 0.3211187720298767, + "learning_rate": 4.9169012559549595e-05, + "loss": 0.374, + "step": 1228000 + }, + { + "epoch": 8.313257903854483, + "grad_norm": 0.350558876991272, + "learning_rate": 4.916867420961455e-05, + "loss": 0.3745, + "step": 1228500 + }, + { + "epoch": 8.316641403204851, + "grad_norm": 0.35707876086235046, + "learning_rate": 4.916833585967951e-05, + "loss": 0.375, + "step": 1229000 + }, + { + "epoch": 8.320024902555218, + "grad_norm": 0.36113375425338745, + "learning_rate": 4.916799750974448e-05, + "loss": 0.3755, + "step": 1229500 + }, + { + "epoch": 8.323408401905587, + "grad_norm": 0.3182533383369446, + "learning_rate": 4.916765915980944e-05, + "loss": 0.3748, + "step": 1230000 + }, + { + "epoch": 8.326791901255955, + "grad_norm": 0.3398992121219635, + "learning_rate": 4.9167320809874405e-05, + "loss": 0.373, + "step": 1230500 + }, + { + "epoch": 8.330175400606324, + "grad_norm": 0.34391719102859497, + "learning_rate": 4.916698245993937e-05, + "loss": 0.3745, + "step": 1231000 + }, + { + "epoch": 8.33355889995669, + "grad_norm": 0.3682703375816345, + "learning_rate": 4.9166644110004336e-05, + "loss": 0.3747, + "step": 1231500 + }, + { + "epoch": 8.336942399307059, + "grad_norm": 0.3944721817970276, + "learning_rate": 4.91663057600693e-05, + "loss": 0.374, + "step": 1232000 + }, + { + "epoch": 8.340325898657428, + "grad_norm": 0.34578758478164673, + "learning_rate": 4.916596741013426e-05, + "loss": 0.3744, + "step": 1232500 + }, + { + "epoch": 8.343709398007796, + "grad_norm": 0.3377029597759247, + "learning_rate": 4.916562906019922e-05, + "loss": 0.3749, + "step": 1233000 + }, + { + "epoch": 8.347092897358163, + "grad_norm": 0.3452177047729492, + "learning_rate": 4.9165290710264185e-05, + "loss": 0.3727, + "step": 1233500 + }, + { + "epoch": 8.350476396708531, + "grad_norm": 0.3850466012954712, + "learning_rate": 4.916495236032915e-05, + "loss": 0.3751, + "step": 1234000 + }, + { + "epoch": 8.3538598960589, + "grad_norm": 0.3559221029281616, + "learning_rate": 4.916461401039411e-05, + "loss": 0.3748, + "step": 1234500 + }, + { + "epoch": 8.357243395409268, + "grad_norm": 0.34174761176109314, + "learning_rate": 4.916427566045908e-05, + "loss": 0.374, + "step": 1235000 + }, + { + "epoch": 8.360626894759637, + "grad_norm": 0.35429567098617554, + "learning_rate": 4.916393731052404e-05, + "loss": 0.3742, + "step": 1235500 + }, + { + "epoch": 8.364010394110004, + "grad_norm": 0.3283572196960449, + "learning_rate": 4.9163598960589e-05, + "loss": 0.3743, + "step": 1236000 + }, + { + "epoch": 8.367393893460372, + "grad_norm": 0.36495864391326904, + "learning_rate": 4.9163260610653964e-05, + "loss": 0.3754, + "step": 1236500 + }, + { + "epoch": 8.37077739281074, + "grad_norm": 0.34595993161201477, + "learning_rate": 4.9162922260718927e-05, + "loss": 0.3741, + "step": 1237000 + }, + { + "epoch": 8.37416089216111, + "grad_norm": 0.3537779152393341, + "learning_rate": 4.9162583910783895e-05, + "loss": 0.3753, + "step": 1237500 + }, + { + "epoch": 8.377544391511476, + "grad_norm": 0.3216964602470398, + "learning_rate": 4.916224556084885e-05, + "loss": 0.3762, + "step": 1238000 + }, + { + "epoch": 8.380927890861845, + "grad_norm": 0.3425578773021698, + "learning_rate": 4.916190721091381e-05, + "loss": 0.3751, + "step": 1238500 + }, + { + "epoch": 8.384311390212213, + "grad_norm": 0.36288127303123474, + "learning_rate": 4.916156886097878e-05, + "loss": 0.3753, + "step": 1239000 + }, + { + "epoch": 8.387694889562582, + "grad_norm": 0.3431650400161743, + "learning_rate": 4.9161230511043744e-05, + "loss": 0.376, + "step": 1239500 + }, + { + "epoch": 8.39107838891295, + "grad_norm": 0.34114548563957214, + "learning_rate": 4.9160892161108706e-05, + "loss": 0.3733, + "step": 1240000 + }, + { + "epoch": 8.394461888263317, + "grad_norm": 0.3551206588745117, + "learning_rate": 4.916055381117367e-05, + "loss": 0.374, + "step": 1240500 + }, + { + "epoch": 8.397845387613685, + "grad_norm": 0.3942086100578308, + "learning_rate": 4.916021546123864e-05, + "loss": 0.3734, + "step": 1241000 + }, + { + "epoch": 8.401228886964054, + "grad_norm": 0.3584712743759155, + "learning_rate": 4.91598771113036e-05, + "loss": 0.3739, + "step": 1241500 + }, + { + "epoch": 8.404612386314422, + "grad_norm": 0.3915914297103882, + "learning_rate": 4.915953876136856e-05, + "loss": 0.3739, + "step": 1242000 + }, + { + "epoch": 8.40799588566479, + "grad_norm": 0.34406983852386475, + "learning_rate": 4.9159200411433523e-05, + "loss": 0.3759, + "step": 1242500 + }, + { + "epoch": 8.411379385015158, + "grad_norm": 0.3534091114997864, + "learning_rate": 4.9158862061498486e-05, + "loss": 0.3734, + "step": 1243000 + }, + { + "epoch": 8.414762884365526, + "grad_norm": 0.32999998331069946, + "learning_rate": 4.915852371156345e-05, + "loss": 0.3736, + "step": 1243500 + }, + { + "epoch": 8.418146383715895, + "grad_norm": 0.3992055356502533, + "learning_rate": 4.915818536162841e-05, + "loss": 0.3755, + "step": 1244000 + }, + { + "epoch": 8.421529883066263, + "grad_norm": 0.3797067403793335, + "learning_rate": 4.915784701169338e-05, + "loss": 0.374, + "step": 1244500 + }, + { + "epoch": 8.42491338241663, + "grad_norm": 0.35519033670425415, + "learning_rate": 4.915750866175834e-05, + "loss": 0.3751, + "step": 1245000 + }, + { + "epoch": 8.428296881766999, + "grad_norm": 0.37754616141319275, + "learning_rate": 4.91571703118233e-05, + "loss": 0.3757, + "step": 1245500 + }, + { + "epoch": 8.431680381117367, + "grad_norm": 0.3400075435638428, + "learning_rate": 4.9156831961888265e-05, + "loss": 0.3744, + "step": 1246000 + }, + { + "epoch": 8.435063880467736, + "grad_norm": 0.36160823702812195, + "learning_rate": 4.915649361195323e-05, + "loss": 0.3737, + "step": 1246500 + }, + { + "epoch": 8.438447379818102, + "grad_norm": 0.4066164791584015, + "learning_rate": 4.9156155262018196e-05, + "loss": 0.374, + "step": 1247000 + }, + { + "epoch": 8.44183087916847, + "grad_norm": 0.3818412125110626, + "learning_rate": 4.915581691208316e-05, + "loss": 0.373, + "step": 1247500 + }, + { + "epoch": 8.44521437851884, + "grad_norm": 0.33103445172309875, + "learning_rate": 4.9155478562148114e-05, + "loss": 0.3739, + "step": 1248000 + }, + { + "epoch": 8.448597877869208, + "grad_norm": 0.3441535532474518, + "learning_rate": 4.915514021221308e-05, + "loss": 0.3754, + "step": 1248500 + }, + { + "epoch": 8.451981377219576, + "grad_norm": 0.33515992760658264, + "learning_rate": 4.9154801862278045e-05, + "loss": 0.3752, + "step": 1249000 + }, + { + "epoch": 8.455364876569943, + "grad_norm": 0.3669910728931427, + "learning_rate": 4.915446351234301e-05, + "loss": 0.3738, + "step": 1249500 + }, + { + "epoch": 8.458748375920312, + "grad_norm": 0.38384097814559937, + "learning_rate": 4.915412516240797e-05, + "loss": 0.3766, + "step": 1250000 + }, + { + "epoch": 8.46213187527068, + "grad_norm": 0.3466211259365082, + "learning_rate": 4.915378681247294e-05, + "loss": 0.3743, + "step": 1250500 + }, + { + "epoch": 8.465515374621049, + "grad_norm": 0.3566823899745941, + "learning_rate": 4.91534484625379e-05, + "loss": 0.374, + "step": 1251000 + }, + { + "epoch": 8.468898873971415, + "grad_norm": 0.36961984634399414, + "learning_rate": 4.915311011260286e-05, + "loss": 0.3768, + "step": 1251500 + }, + { + "epoch": 8.472282373321784, + "grad_norm": 0.37993258237838745, + "learning_rate": 4.9152771762667824e-05, + "loss": 0.3744, + "step": 1252000 + }, + { + "epoch": 8.475665872672153, + "grad_norm": 0.3526294231414795, + "learning_rate": 4.9152433412732786e-05, + "loss": 0.3754, + "step": 1252500 + }, + { + "epoch": 8.479049372022521, + "grad_norm": 0.37616288661956787, + "learning_rate": 4.915209506279775e-05, + "loss": 0.3761, + "step": 1253000 + }, + { + "epoch": 8.48243287137289, + "grad_norm": 0.34510985016822815, + "learning_rate": 4.915175671286271e-05, + "loss": 0.3757, + "step": 1253500 + }, + { + "epoch": 8.485816370723256, + "grad_norm": 0.3667396605014801, + "learning_rate": 4.915141836292767e-05, + "loss": 0.3758, + "step": 1254000 + }, + { + "epoch": 8.489199870073625, + "grad_norm": 0.33316004276275635, + "learning_rate": 4.915108001299264e-05, + "loss": 0.3734, + "step": 1254500 + }, + { + "epoch": 8.492583369423993, + "grad_norm": 0.3826241195201874, + "learning_rate": 4.9150741663057604e-05, + "loss": 0.3758, + "step": 1255000 + }, + { + "epoch": 8.495966868774362, + "grad_norm": 0.3249056339263916, + "learning_rate": 4.9150403313122566e-05, + "loss": 0.3743, + "step": 1255500 + }, + { + "epoch": 8.499350368124729, + "grad_norm": 0.4200061857700348, + "learning_rate": 4.915006496318753e-05, + "loss": 0.3757, + "step": 1256000 + }, + { + "epoch": 8.502733867475097, + "grad_norm": 0.3448467552661896, + "learning_rate": 4.91497266132525e-05, + "loss": 0.3731, + "step": 1256500 + }, + { + "epoch": 8.506117366825466, + "grad_norm": 0.3419877588748932, + "learning_rate": 4.914938826331746e-05, + "loss": 0.3744, + "step": 1257000 + }, + { + "epoch": 8.509500866175834, + "grad_norm": 0.3729589283466339, + "learning_rate": 4.9149049913382414e-05, + "loss": 0.3739, + "step": 1257500 + }, + { + "epoch": 8.512884365526201, + "grad_norm": 0.382418155670166, + "learning_rate": 4.914871156344738e-05, + "loss": 0.3741, + "step": 1258000 + }, + { + "epoch": 8.51626786487657, + "grad_norm": 0.33111730217933655, + "learning_rate": 4.9148373213512345e-05, + "loss": 0.3741, + "step": 1258500 + }, + { + "epoch": 8.519651364226938, + "grad_norm": 0.33443519473075867, + "learning_rate": 4.914803486357731e-05, + "loss": 0.374, + "step": 1259000 + }, + { + "epoch": 8.523034863577307, + "grad_norm": 0.30831006169319153, + "learning_rate": 4.914769651364227e-05, + "loss": 0.3748, + "step": 1259500 + }, + { + "epoch": 8.526418362927675, + "grad_norm": 0.3246803879737854, + "learning_rate": 4.914735816370724e-05, + "loss": 0.3752, + "step": 1260000 + }, + { + "epoch": 8.529801862278042, + "grad_norm": 0.32946979999542236, + "learning_rate": 4.91470198137722e-05, + "loss": 0.3741, + "step": 1260500 + }, + { + "epoch": 8.53318536162841, + "grad_norm": 0.33466988801956177, + "learning_rate": 4.914668146383716e-05, + "loss": 0.3733, + "step": 1261000 + }, + { + "epoch": 8.536568860978779, + "grad_norm": 0.3437216281890869, + "learning_rate": 4.9146343113902125e-05, + "loss": 0.376, + "step": 1261500 + }, + { + "epoch": 8.539952360329147, + "grad_norm": 0.3692063093185425, + "learning_rate": 4.914600476396709e-05, + "loss": 0.3747, + "step": 1262000 + }, + { + "epoch": 8.543335859679514, + "grad_norm": 0.33591610193252563, + "learning_rate": 4.914566641403205e-05, + "loss": 0.3758, + "step": 1262500 + }, + { + "epoch": 8.546719359029883, + "grad_norm": 0.3632853627204895, + "learning_rate": 4.914532806409701e-05, + "loss": 0.3751, + "step": 1263000 + }, + { + "epoch": 8.550102858380251, + "grad_norm": 0.3478579819202423, + "learning_rate": 4.914498971416197e-05, + "loss": 0.3725, + "step": 1263500 + }, + { + "epoch": 8.55348635773062, + "grad_norm": 0.35932010412216187, + "learning_rate": 4.914465136422694e-05, + "loss": 0.3754, + "step": 1264000 + }, + { + "epoch": 8.556869857080988, + "grad_norm": 0.32846808433532715, + "learning_rate": 4.9144313014291904e-05, + "loss": 0.374, + "step": 1264500 + }, + { + "epoch": 8.560253356431355, + "grad_norm": 0.3679615557193756, + "learning_rate": 4.9143974664356866e-05, + "loss": 0.3764, + "step": 1265000 + }, + { + "epoch": 8.563636855781724, + "grad_norm": 0.3527098000049591, + "learning_rate": 4.914363631442183e-05, + "loss": 0.3757, + "step": 1265500 + }, + { + "epoch": 8.567020355132092, + "grad_norm": 0.32888293266296387, + "learning_rate": 4.91432979644868e-05, + "loss": 0.3742, + "step": 1266000 + }, + { + "epoch": 8.57040385448246, + "grad_norm": 0.38354864716529846, + "learning_rate": 4.914295961455176e-05, + "loss": 0.3743, + "step": 1266500 + }, + { + "epoch": 8.573787353832827, + "grad_norm": 0.3360845148563385, + "learning_rate": 4.9142621264616715e-05, + "loss": 0.3744, + "step": 1267000 + }, + { + "epoch": 8.577170853183196, + "grad_norm": 0.367890864610672, + "learning_rate": 4.9142282914681684e-05, + "loss": 0.3751, + "step": 1267500 + }, + { + "epoch": 8.580554352533564, + "grad_norm": 0.3678390383720398, + "learning_rate": 4.9141944564746646e-05, + "loss": 0.3746, + "step": 1268000 + }, + { + "epoch": 8.583937851883933, + "grad_norm": 0.36338627338409424, + "learning_rate": 4.914160621481161e-05, + "loss": 0.3758, + "step": 1268500 + }, + { + "epoch": 8.587321351234301, + "grad_norm": 0.3547922670841217, + "learning_rate": 4.914126786487657e-05, + "loss": 0.3739, + "step": 1269000 + }, + { + "epoch": 8.590704850584668, + "grad_norm": 0.3636038303375244, + "learning_rate": 4.914092951494154e-05, + "loss": 0.3751, + "step": 1269500 + }, + { + "epoch": 8.594088349935037, + "grad_norm": 0.35374486446380615, + "learning_rate": 4.91405911650065e-05, + "loss": 0.3756, + "step": 1270000 + }, + { + "epoch": 8.597471849285405, + "grad_norm": 0.373153418302536, + "learning_rate": 4.914025281507146e-05, + "loss": 0.3741, + "step": 1270500 + }, + { + "epoch": 8.600855348635774, + "grad_norm": 0.35778605937957764, + "learning_rate": 4.9139914465136425e-05, + "loss": 0.3754, + "step": 1271000 + }, + { + "epoch": 8.60423884798614, + "grad_norm": 0.39764803647994995, + "learning_rate": 4.913957611520139e-05, + "loss": 0.3741, + "step": 1271500 + }, + { + "epoch": 8.607622347336509, + "grad_norm": 0.32927289605140686, + "learning_rate": 4.913923776526635e-05, + "loss": 0.3747, + "step": 1272000 + }, + { + "epoch": 8.611005846686878, + "grad_norm": 0.3471881151199341, + "learning_rate": 4.913889941533131e-05, + "loss": 0.3766, + "step": 1272500 + }, + { + "epoch": 8.614389346037246, + "grad_norm": 0.3120143711566925, + "learning_rate": 4.9138561065396274e-05, + "loss": 0.3751, + "step": 1273000 + }, + { + "epoch": 8.617772845387613, + "grad_norm": 0.34172213077545166, + "learning_rate": 4.913822271546124e-05, + "loss": 0.3752, + "step": 1273500 + }, + { + "epoch": 8.621156344737981, + "grad_norm": 0.37015801668167114, + "learning_rate": 4.9137884365526205e-05, + "loss": 0.3751, + "step": 1274000 + }, + { + "epoch": 8.62453984408835, + "grad_norm": 0.34678173065185547, + "learning_rate": 4.913754601559117e-05, + "loss": 0.3748, + "step": 1274500 + }, + { + "epoch": 8.627923343438718, + "grad_norm": 0.369243323802948, + "learning_rate": 4.913720766565613e-05, + "loss": 0.3756, + "step": 1275000 + }, + { + "epoch": 8.631306842789087, + "grad_norm": 0.3433549106121063, + "learning_rate": 4.91368693157211e-05, + "loss": 0.376, + "step": 1275500 + }, + { + "epoch": 8.634690342139454, + "grad_norm": 0.3244531452655792, + "learning_rate": 4.913653096578606e-05, + "loss": 0.3736, + "step": 1276000 + }, + { + "epoch": 8.638073841489822, + "grad_norm": 0.3575780689716339, + "learning_rate": 4.9136192615851015e-05, + "loss": 0.3739, + "step": 1276500 + }, + { + "epoch": 8.64145734084019, + "grad_norm": 0.3672553598880768, + "learning_rate": 4.9135854265915984e-05, + "loss": 0.3746, + "step": 1277000 + }, + { + "epoch": 8.64484084019056, + "grad_norm": 0.34356722235679626, + "learning_rate": 4.9135515915980946e-05, + "loss": 0.3751, + "step": 1277500 + }, + { + "epoch": 8.648224339540928, + "grad_norm": 0.3406316041946411, + "learning_rate": 4.913517756604591e-05, + "loss": 0.3733, + "step": 1278000 + }, + { + "epoch": 8.651607838891294, + "grad_norm": 0.33928433060646057, + "learning_rate": 4.913483921611087e-05, + "loss": 0.3746, + "step": 1278500 + }, + { + "epoch": 8.654991338241663, + "grad_norm": 0.3306182026863098, + "learning_rate": 4.913450086617584e-05, + "loss": 0.3745, + "step": 1279000 + }, + { + "epoch": 8.658374837592032, + "grad_norm": 0.37345966696739197, + "learning_rate": 4.91341625162408e-05, + "loss": 0.3748, + "step": 1279500 + }, + { + "epoch": 8.6617583369424, + "grad_norm": 0.32681286334991455, + "learning_rate": 4.9133824166305764e-05, + "loss": 0.3737, + "step": 1280000 + }, + { + "epoch": 8.665141836292767, + "grad_norm": 0.3620678186416626, + "learning_rate": 4.9133485816370726e-05, + "loss": 0.3725, + "step": 1280500 + }, + { + "epoch": 8.668525335643135, + "grad_norm": 0.3080102503299713, + "learning_rate": 4.913314746643569e-05, + "loss": 0.374, + "step": 1281000 + }, + { + "epoch": 8.671908834993504, + "grad_norm": 0.3523677885532379, + "learning_rate": 4.913280911650065e-05, + "loss": 0.3755, + "step": 1281500 + }, + { + "epoch": 8.675292334343872, + "grad_norm": 0.34980931878089905, + "learning_rate": 4.913247076656561e-05, + "loss": 0.3745, + "step": 1282000 + }, + { + "epoch": 8.67867583369424, + "grad_norm": 0.3787962794303894, + "learning_rate": 4.9132132416630574e-05, + "loss": 0.3745, + "step": 1282500 + }, + { + "epoch": 8.682059333044608, + "grad_norm": 0.3643645942211151, + "learning_rate": 4.913179406669554e-05, + "loss": 0.376, + "step": 1283000 + }, + { + "epoch": 8.685442832394976, + "grad_norm": 0.32300788164138794, + "learning_rate": 4.9131455716760505e-05, + "loss": 0.3748, + "step": 1283500 + }, + { + "epoch": 8.688826331745345, + "grad_norm": 0.38785240054130554, + "learning_rate": 4.913111736682547e-05, + "loss": 0.3758, + "step": 1284000 + }, + { + "epoch": 8.692209831095713, + "grad_norm": 0.3650936186313629, + "learning_rate": 4.913077901689043e-05, + "loss": 0.3759, + "step": 1284500 + }, + { + "epoch": 8.69559333044608, + "grad_norm": 0.3315945863723755, + "learning_rate": 4.91304406669554e-05, + "loss": 0.3732, + "step": 1285000 + }, + { + "epoch": 8.698976829796448, + "grad_norm": 0.3755415380001068, + "learning_rate": 4.913010231702036e-05, + "loss": 0.3751, + "step": 1285500 + }, + { + "epoch": 8.702360329146817, + "grad_norm": 0.3727501332759857, + "learning_rate": 4.9129763967085316e-05, + "loss": 0.3759, + "step": 1286000 + }, + { + "epoch": 8.705743828497186, + "grad_norm": 0.3709351420402527, + "learning_rate": 4.9129425617150285e-05, + "loss": 0.3747, + "step": 1286500 + }, + { + "epoch": 8.709127327847552, + "grad_norm": 0.37241730093955994, + "learning_rate": 4.912908726721525e-05, + "loss": 0.3751, + "step": 1287000 + }, + { + "epoch": 8.71251082719792, + "grad_norm": 0.36001521348953247, + "learning_rate": 4.912874891728021e-05, + "loss": 0.3753, + "step": 1287500 + }, + { + "epoch": 8.71589432654829, + "grad_norm": 0.35647907853126526, + "learning_rate": 4.912841056734517e-05, + "loss": 0.3756, + "step": 1288000 + }, + { + "epoch": 8.719277825898658, + "grad_norm": 0.45838338136672974, + "learning_rate": 4.912807221741014e-05, + "loss": 0.3744, + "step": 1288500 + }, + { + "epoch": 8.722661325249026, + "grad_norm": 0.33726125955581665, + "learning_rate": 4.91277338674751e-05, + "loss": 0.3747, + "step": 1289000 + }, + { + "epoch": 8.726044824599393, + "grad_norm": 0.348104327917099, + "learning_rate": 4.9127395517540064e-05, + "loss": 0.3741, + "step": 1289500 + }, + { + "epoch": 8.729428323949762, + "grad_norm": 0.34803929924964905, + "learning_rate": 4.9127057167605027e-05, + "loss": 0.3734, + "step": 1290000 + }, + { + "epoch": 8.73281182330013, + "grad_norm": 0.37727972865104675, + "learning_rate": 4.912671881766999e-05, + "loss": 0.376, + "step": 1290500 + }, + { + "epoch": 8.736195322650499, + "grad_norm": 0.3559610843658447, + "learning_rate": 4.912638046773495e-05, + "loss": 0.3742, + "step": 1291000 + }, + { + "epoch": 8.739578822000865, + "grad_norm": 0.357925683259964, + "learning_rate": 4.912604211779991e-05, + "loss": 0.3748, + "step": 1291500 + }, + { + "epoch": 8.742962321351234, + "grad_norm": 0.3771865665912628, + "learning_rate": 4.9125703767864875e-05, + "loss": 0.3758, + "step": 1292000 + }, + { + "epoch": 8.746345820701602, + "grad_norm": 0.38133761286735535, + "learning_rate": 4.9125365417929844e-05, + "loss": 0.3769, + "step": 1292500 + }, + { + "epoch": 8.749729320051971, + "grad_norm": 0.3215438723564148, + "learning_rate": 4.9125027067994806e-05, + "loss": 0.3741, + "step": 1293000 + }, + { + "epoch": 8.75311281940234, + "grad_norm": 0.3579214811325073, + "learning_rate": 4.912468871805977e-05, + "loss": 0.3754, + "step": 1293500 + }, + { + "epoch": 8.756496318752706, + "grad_norm": 0.38451337814331055, + "learning_rate": 4.912435036812473e-05, + "loss": 0.3757, + "step": 1294000 + }, + { + "epoch": 8.759879818103075, + "grad_norm": 0.32298362255096436, + "learning_rate": 4.91240120181897e-05, + "loss": 0.3743, + "step": 1294500 + }, + { + "epoch": 8.763263317453443, + "grad_norm": 0.3099938631057739, + "learning_rate": 4.912367366825466e-05, + "loss": 0.3731, + "step": 1295000 + }, + { + "epoch": 8.766646816803812, + "grad_norm": 0.3347644507884979, + "learning_rate": 4.912333531831962e-05, + "loss": 0.3732, + "step": 1295500 + }, + { + "epoch": 8.770030316154179, + "grad_norm": 0.32155701518058777, + "learning_rate": 4.9122996968384586e-05, + "loss": 0.3743, + "step": 1296000 + }, + { + "epoch": 8.773413815504547, + "grad_norm": 0.3333846926689148, + "learning_rate": 4.912265861844955e-05, + "loss": 0.3753, + "step": 1296500 + }, + { + "epoch": 8.776797314854916, + "grad_norm": 0.36494186520576477, + "learning_rate": 4.912232026851451e-05, + "loss": 0.3733, + "step": 1297000 + }, + { + "epoch": 8.780180814205284, + "grad_norm": 0.33402982354164124, + "learning_rate": 4.912198191857947e-05, + "loss": 0.3759, + "step": 1297500 + }, + { + "epoch": 8.783564313555651, + "grad_norm": 0.34107717871665955, + "learning_rate": 4.912164356864444e-05, + "loss": 0.3743, + "step": 1298000 + }, + { + "epoch": 8.78694781290602, + "grad_norm": 0.3402135968208313, + "learning_rate": 4.91213052187094e-05, + "loss": 0.3771, + "step": 1298500 + }, + { + "epoch": 8.790331312256388, + "grad_norm": 0.3298054337501526, + "learning_rate": 4.9120966868774365e-05, + "loss": 0.3748, + "step": 1299000 + }, + { + "epoch": 8.793714811606756, + "grad_norm": 0.3509495258331299, + "learning_rate": 4.912062851883933e-05, + "loss": 0.3735, + "step": 1299500 + }, + { + "epoch": 8.797098310957125, + "grad_norm": 0.3410671055316925, + "learning_rate": 4.912029016890429e-05, + "loss": 0.3735, + "step": 1300000 + }, + { + "epoch": 8.800481810307492, + "grad_norm": 0.39786043763160706, + "learning_rate": 4.911995181896925e-05, + "loss": 0.3741, + "step": 1300500 + }, + { + "epoch": 8.80386530965786, + "grad_norm": 0.35167184472084045, + "learning_rate": 4.9119613469034214e-05, + "loss": 0.3747, + "step": 1301000 + }, + { + "epoch": 8.807248809008229, + "grad_norm": 0.30722448229789734, + "learning_rate": 4.9119275119099176e-05, + "loss": 0.3738, + "step": 1301500 + }, + { + "epoch": 8.810632308358597, + "grad_norm": 0.3152434825897217, + "learning_rate": 4.9118936769164145e-05, + "loss": 0.3757, + "step": 1302000 + }, + { + "epoch": 8.814015807708964, + "grad_norm": 0.3755607306957245, + "learning_rate": 4.911859841922911e-05, + "loss": 0.3734, + "step": 1302500 + }, + { + "epoch": 8.817399307059333, + "grad_norm": 0.34447774291038513, + "learning_rate": 4.911826006929407e-05, + "loss": 0.3742, + "step": 1303000 + }, + { + "epoch": 8.820782806409701, + "grad_norm": 0.3727530837059021, + "learning_rate": 4.911792171935903e-05, + "loss": 0.3749, + "step": 1303500 + }, + { + "epoch": 8.82416630576007, + "grad_norm": 0.37038761377334595, + "learning_rate": 4.9117583369424e-05, + "loss": 0.3745, + "step": 1304000 + }, + { + "epoch": 8.827549805110438, + "grad_norm": 0.3163911998271942, + "learning_rate": 4.911724501948896e-05, + "loss": 0.3756, + "step": 1304500 + }, + { + "epoch": 8.830933304460805, + "grad_norm": 0.3718319237232208, + "learning_rate": 4.911690666955392e-05, + "loss": 0.3749, + "step": 1305000 + }, + { + "epoch": 8.834316803811173, + "grad_norm": 0.36646541953086853, + "learning_rate": 4.9116568319618886e-05, + "loss": 0.3754, + "step": 1305500 + }, + { + "epoch": 8.837700303161542, + "grad_norm": 0.33847516775131226, + "learning_rate": 4.911622996968385e-05, + "loss": 0.3748, + "step": 1306000 + }, + { + "epoch": 8.84108380251191, + "grad_norm": 0.35955485701560974, + "learning_rate": 4.911589161974881e-05, + "loss": 0.3728, + "step": 1306500 + }, + { + "epoch": 8.844467301862277, + "grad_norm": 0.33273687958717346, + "learning_rate": 4.911555326981377e-05, + "loss": 0.3749, + "step": 1307000 + }, + { + "epoch": 8.847850801212646, + "grad_norm": 0.35360097885131836, + "learning_rate": 4.911521491987874e-05, + "loss": 0.3745, + "step": 1307500 + }, + { + "epoch": 8.851234300563014, + "grad_norm": 0.3549317419528961, + "learning_rate": 4.9114876569943704e-05, + "loss": 0.3745, + "step": 1308000 + }, + { + "epoch": 8.854617799913383, + "grad_norm": 0.36261728405952454, + "learning_rate": 4.9114538220008666e-05, + "loss": 0.3739, + "step": 1308500 + }, + { + "epoch": 8.858001299263751, + "grad_norm": 0.36524686217308044, + "learning_rate": 4.911419987007363e-05, + "loss": 0.3738, + "step": 1309000 + }, + { + "epoch": 8.861384798614118, + "grad_norm": 0.3448386788368225, + "learning_rate": 4.911386152013859e-05, + "loss": 0.3761, + "step": 1309500 + }, + { + "epoch": 8.864768297964487, + "grad_norm": 0.3430918753147125, + "learning_rate": 4.911352317020355e-05, + "loss": 0.3738, + "step": 1310000 + }, + { + "epoch": 8.868151797314855, + "grad_norm": 0.3501821756362915, + "learning_rate": 4.9113184820268514e-05, + "loss": 0.3745, + "step": 1310500 + }, + { + "epoch": 8.871535296665224, + "grad_norm": 0.3411780893802643, + "learning_rate": 4.9112846470333476e-05, + "loss": 0.375, + "step": 1311000 + }, + { + "epoch": 8.87491879601559, + "grad_norm": 0.3733043968677521, + "learning_rate": 4.9112508120398445e-05, + "loss": 0.375, + "step": 1311500 + }, + { + "epoch": 8.878302295365959, + "grad_norm": 0.3024614751338959, + "learning_rate": 4.911216977046341e-05, + "loss": 0.3739, + "step": 1312000 + }, + { + "epoch": 8.881685794716327, + "grad_norm": 0.32841992378234863, + "learning_rate": 4.911183142052837e-05, + "loss": 0.3744, + "step": 1312500 + }, + { + "epoch": 8.885069294066696, + "grad_norm": 0.35591748356819153, + "learning_rate": 4.911149307059333e-05, + "loss": 0.3743, + "step": 1313000 + }, + { + "epoch": 8.888452793417063, + "grad_norm": 0.31989797949790955, + "learning_rate": 4.91111547206583e-05, + "loss": 0.3748, + "step": 1313500 + }, + { + "epoch": 8.891836292767431, + "grad_norm": 0.31331291794776917, + "learning_rate": 4.911081637072326e-05, + "loss": 0.3735, + "step": 1314000 + }, + { + "epoch": 8.8952197921178, + "grad_norm": 0.36485719680786133, + "learning_rate": 4.911047802078822e-05, + "loss": 0.3738, + "step": 1314500 + }, + { + "epoch": 8.898603291468168, + "grad_norm": 0.3131905496120453, + "learning_rate": 4.911013967085319e-05, + "loss": 0.373, + "step": 1315000 + }, + { + "epoch": 8.901986790818537, + "grad_norm": 0.35740169882774353, + "learning_rate": 4.910980132091815e-05, + "loss": 0.3744, + "step": 1315500 + }, + { + "epoch": 8.905370290168904, + "grad_norm": 0.37337788939476013, + "learning_rate": 4.910946297098311e-05, + "loss": 0.3754, + "step": 1316000 + }, + { + "epoch": 8.908753789519272, + "grad_norm": 0.3863716721534729, + "learning_rate": 4.910912462104807e-05, + "loss": 0.3742, + "step": 1316500 + }, + { + "epoch": 8.91213728886964, + "grad_norm": 0.35913145542144775, + "learning_rate": 4.9108786271113035e-05, + "loss": 0.3746, + "step": 1317000 + }, + { + "epoch": 8.91552078822001, + "grad_norm": 0.3646251857280731, + "learning_rate": 4.9108447921178004e-05, + "loss": 0.3732, + "step": 1317500 + }, + { + "epoch": 8.918904287570378, + "grad_norm": 0.3502126634120941, + "learning_rate": 4.9108109571242966e-05, + "loss": 0.3738, + "step": 1318000 + }, + { + "epoch": 8.922287786920744, + "grad_norm": 0.3665471076965332, + "learning_rate": 4.910777122130793e-05, + "loss": 0.3743, + "step": 1318500 + }, + { + "epoch": 8.925671286271113, + "grad_norm": 0.3789934813976288, + "learning_rate": 4.910743287137289e-05, + "loss": 0.3757, + "step": 1319000 + }, + { + "epoch": 8.929054785621481, + "grad_norm": 0.34552836418151855, + "learning_rate": 4.910709452143785e-05, + "loss": 0.3752, + "step": 1319500 + }, + { + "epoch": 8.93243828497185, + "grad_norm": 0.3529386818408966, + "learning_rate": 4.9106756171502815e-05, + "loss": 0.3752, + "step": 1320000 + }, + { + "epoch": 8.935821784322217, + "grad_norm": 0.32900169491767883, + "learning_rate": 4.910641782156778e-05, + "loss": 0.3746, + "step": 1320500 + }, + { + "epoch": 8.939205283672585, + "grad_norm": 0.3948231041431427, + "learning_rate": 4.9106079471632746e-05, + "loss": 0.3748, + "step": 1321000 + }, + { + "epoch": 8.942588783022954, + "grad_norm": 0.3746308386325836, + "learning_rate": 4.910574112169771e-05, + "loss": 0.3732, + "step": 1321500 + }, + { + "epoch": 8.945972282373322, + "grad_norm": 0.3531734347343445, + "learning_rate": 4.910540277176267e-05, + "loss": 0.3742, + "step": 1322000 + }, + { + "epoch": 8.949355781723689, + "grad_norm": 0.3803749084472656, + "learning_rate": 4.910506442182763e-05, + "loss": 0.3735, + "step": 1322500 + }, + { + "epoch": 8.952739281074058, + "grad_norm": 0.3477669358253479, + "learning_rate": 4.91047260718926e-05, + "loss": 0.375, + "step": 1323000 + }, + { + "epoch": 8.956122780424426, + "grad_norm": 0.3473570942878723, + "learning_rate": 4.910438772195756e-05, + "loss": 0.3755, + "step": 1323500 + }, + { + "epoch": 8.959506279774795, + "grad_norm": 0.3485695719718933, + "learning_rate": 4.910404937202252e-05, + "loss": 0.3769, + "step": 1324000 + }, + { + "epoch": 8.962889779125163, + "grad_norm": 0.2922978401184082, + "learning_rate": 4.910371102208748e-05, + "loss": 0.3735, + "step": 1324500 + }, + { + "epoch": 8.96627327847553, + "grad_norm": 0.35643815994262695, + "learning_rate": 4.910337267215245e-05, + "loss": 0.3742, + "step": 1325000 + }, + { + "epoch": 8.969656777825898, + "grad_norm": 0.340582013130188, + "learning_rate": 4.910303432221741e-05, + "loss": 0.3754, + "step": 1325500 + }, + { + "epoch": 8.973040277176267, + "grad_norm": 0.32579490542411804, + "learning_rate": 4.9102695972282374e-05, + "loss": 0.3752, + "step": 1326000 + }, + { + "epoch": 8.976423776526635, + "grad_norm": 0.36068543791770935, + "learning_rate": 4.9102357622347336e-05, + "loss": 0.3737, + "step": 1326500 + }, + { + "epoch": 8.979807275877002, + "grad_norm": 0.3534857928752899, + "learning_rate": 4.9102019272412305e-05, + "loss": 0.373, + "step": 1327000 + }, + { + "epoch": 8.98319077522737, + "grad_norm": 0.34306958317756653, + "learning_rate": 4.910168092247727e-05, + "loss": 0.3747, + "step": 1327500 + }, + { + "epoch": 8.98657427457774, + "grad_norm": 0.33797481656074524, + "learning_rate": 4.910134257254223e-05, + "loss": 0.3735, + "step": 1328000 + }, + { + "epoch": 8.989957773928108, + "grad_norm": 0.4253041446208954, + "learning_rate": 4.910100422260719e-05, + "loss": 0.3741, + "step": 1328500 + }, + { + "epoch": 8.993341273278476, + "grad_norm": 0.36487436294555664, + "learning_rate": 4.910066587267215e-05, + "loss": 0.3747, + "step": 1329000 + }, + { + "epoch": 8.996724772628843, + "grad_norm": 0.35783451795578003, + "learning_rate": 4.9100327522737116e-05, + "loss": 0.3745, + "step": 1329500 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.8575272885373202, + "eval_loss": 0.5787888169288635, + "eval_runtime": 3378.0515, + "eval_samples_per_second": 86.069, + "eval_steps_per_second": 5.379, + "step": 1329984 + }, + { + "epoch": 9.000108271979212, + "grad_norm": 0.3485052287578583, + "learning_rate": 4.909998917280208e-05, + "loss": 0.3746, + "step": 1330000 + }, + { + "epoch": 9.00349177132958, + "grad_norm": 0.338402658700943, + "learning_rate": 4.9099650822867047e-05, + "loss": 0.3709, + "step": 1330500 + }, + { + "epoch": 9.006875270679949, + "grad_norm": 0.3309011161327362, + "learning_rate": 4.909931247293201e-05, + "loss": 0.3711, + "step": 1331000 + }, + { + "epoch": 9.010258770030315, + "grad_norm": 0.3409023880958557, + "learning_rate": 4.909897412299697e-05, + "loss": 0.3724, + "step": 1331500 + }, + { + "epoch": 9.013642269380684, + "grad_norm": 0.354640930891037, + "learning_rate": 4.909863577306193e-05, + "loss": 0.3721, + "step": 1332000 + }, + { + "epoch": 9.017025768731052, + "grad_norm": 0.3582461178302765, + "learning_rate": 4.90982974231269e-05, + "loss": 0.3718, + "step": 1332500 + }, + { + "epoch": 9.020409268081421, + "grad_norm": 0.3886023163795471, + "learning_rate": 4.9097959073191864e-05, + "loss": 0.372, + "step": 1333000 + }, + { + "epoch": 9.02379276743179, + "grad_norm": 0.344675213098526, + "learning_rate": 4.909762072325682e-05, + "loss": 0.371, + "step": 1333500 + }, + { + "epoch": 9.027176266782156, + "grad_norm": 0.34601104259490967, + "learning_rate": 4.909728237332178e-05, + "loss": 0.3737, + "step": 1334000 + }, + { + "epoch": 9.030559766132525, + "grad_norm": 0.3386719226837158, + "learning_rate": 4.909694402338675e-05, + "loss": 0.3728, + "step": 1334500 + }, + { + "epoch": 9.033943265482893, + "grad_norm": 0.33993321657180786, + "learning_rate": 4.909660567345171e-05, + "loss": 0.3732, + "step": 1335000 + }, + { + "epoch": 9.037326764833262, + "grad_norm": 0.3747544586658478, + "learning_rate": 4.9096267323516675e-05, + "loss": 0.3732, + "step": 1335500 + }, + { + "epoch": 9.040710264183629, + "grad_norm": 0.317266583442688, + "learning_rate": 4.909592897358164e-05, + "loss": 0.3726, + "step": 1336000 + }, + { + "epoch": 9.044093763533997, + "grad_norm": 0.44216030836105347, + "learning_rate": 4.9095590623646606e-05, + "loss": 0.3721, + "step": 1336500 + }, + { + "epoch": 9.047477262884366, + "grad_norm": 0.3385017216205597, + "learning_rate": 4.909525227371157e-05, + "loss": 0.3738, + "step": 1337000 + }, + { + "epoch": 9.050860762234734, + "grad_norm": 0.3921028673648834, + "learning_rate": 4.909491392377653e-05, + "loss": 0.3716, + "step": 1337500 + }, + { + "epoch": 9.054244261585103, + "grad_norm": 0.3511488139629364, + "learning_rate": 4.909457557384149e-05, + "loss": 0.3724, + "step": 1338000 + }, + { + "epoch": 9.05762776093547, + "grad_norm": 0.3934440612792969, + "learning_rate": 4.9094237223906454e-05, + "loss": 0.3741, + "step": 1338500 + }, + { + "epoch": 9.061011260285838, + "grad_norm": 0.32151028513908386, + "learning_rate": 4.9093898873971416e-05, + "loss": 0.3731, + "step": 1339000 + }, + { + "epoch": 9.064394759636206, + "grad_norm": 0.33140772581100464, + "learning_rate": 4.909356052403638e-05, + "loss": 0.3739, + "step": 1339500 + }, + { + "epoch": 9.067778258986575, + "grad_norm": 0.3499145805835724, + "learning_rate": 4.909322217410135e-05, + "loss": 0.3738, + "step": 1340000 + }, + { + "epoch": 9.071161758336942, + "grad_norm": 0.33269986510276794, + "learning_rate": 4.909288382416631e-05, + "loss": 0.3713, + "step": 1340500 + }, + { + "epoch": 9.07454525768731, + "grad_norm": 0.3687961995601654, + "learning_rate": 4.909254547423127e-05, + "loss": 0.373, + "step": 1341000 + }, + { + "epoch": 9.077928757037679, + "grad_norm": 0.3527079224586487, + "learning_rate": 4.9092207124296234e-05, + "loss": 0.3733, + "step": 1341500 + }, + { + "epoch": 9.081312256388047, + "grad_norm": 0.3850629925727844, + "learning_rate": 4.90918687743612e-05, + "loss": 0.3729, + "step": 1342000 + }, + { + "epoch": 9.084695755738414, + "grad_norm": 0.3427541255950928, + "learning_rate": 4.9091530424426165e-05, + "loss": 0.3722, + "step": 1342500 + }, + { + "epoch": 9.088079255088783, + "grad_norm": 0.3640572428703308, + "learning_rate": 4.909119207449112e-05, + "loss": 0.3725, + "step": 1343000 + }, + { + "epoch": 9.091462754439151, + "grad_norm": 0.3413487672805786, + "learning_rate": 4.909085372455608e-05, + "loss": 0.3728, + "step": 1343500 + }, + { + "epoch": 9.09484625378952, + "grad_norm": 0.36850154399871826, + "learning_rate": 4.909051537462105e-05, + "loss": 0.373, + "step": 1344000 + }, + { + "epoch": 9.098229753139888, + "grad_norm": 0.3488371968269348, + "learning_rate": 4.909017702468601e-05, + "loss": 0.3747, + "step": 1344500 + }, + { + "epoch": 9.101613252490255, + "grad_norm": 0.32600492238998413, + "learning_rate": 4.9089838674750975e-05, + "loss": 0.3725, + "step": 1345000 + }, + { + "epoch": 9.104996751840623, + "grad_norm": 0.3362772762775421, + "learning_rate": 4.908950032481594e-05, + "loss": 0.3725, + "step": 1345500 + }, + { + "epoch": 9.108380251190992, + "grad_norm": 0.3490723967552185, + "learning_rate": 4.9089161974880906e-05, + "loss": 0.3732, + "step": 1346000 + }, + { + "epoch": 9.11176375054136, + "grad_norm": 0.39872556924819946, + "learning_rate": 4.908882362494587e-05, + "loss": 0.3731, + "step": 1346500 + }, + { + "epoch": 9.115147249891727, + "grad_norm": 0.3392133414745331, + "learning_rate": 4.908848527501083e-05, + "loss": 0.3722, + "step": 1347000 + }, + { + "epoch": 9.118530749242096, + "grad_norm": 0.34616315364837646, + "learning_rate": 4.908814692507579e-05, + "loss": 0.3732, + "step": 1347500 + }, + { + "epoch": 9.121914248592464, + "grad_norm": 0.34856176376342773, + "learning_rate": 4.9087808575140755e-05, + "loss": 0.3736, + "step": 1348000 + }, + { + "epoch": 9.125297747942833, + "grad_norm": 0.3597109317779541, + "learning_rate": 4.908747022520572e-05, + "loss": 0.3736, + "step": 1348500 + }, + { + "epoch": 9.128681247293201, + "grad_norm": 0.3455621898174286, + "learning_rate": 4.908713187527068e-05, + "loss": 0.3724, + "step": 1349000 + }, + { + "epoch": 9.132064746643568, + "grad_norm": 0.3394932448863983, + "learning_rate": 4.908679352533565e-05, + "loss": 0.3737, + "step": 1349500 + }, + { + "epoch": 9.135448245993937, + "grad_norm": 0.3652005195617676, + "learning_rate": 4.908645517540061e-05, + "loss": 0.3735, + "step": 1350000 + }, + { + "epoch": 9.138831745344305, + "grad_norm": 0.3541617691516876, + "learning_rate": 4.908611682546557e-05, + "loss": 0.3733, + "step": 1350500 + }, + { + "epoch": 9.142215244694674, + "grad_norm": 0.32934844493865967, + "learning_rate": 4.9085778475530534e-05, + "loss": 0.3722, + "step": 1351000 + }, + { + "epoch": 9.14559874404504, + "grad_norm": 0.3870023488998413, + "learning_rate": 4.90854401255955e-05, + "loss": 0.3718, + "step": 1351500 + }, + { + "epoch": 9.148982243395409, + "grad_norm": 0.36997953057289124, + "learning_rate": 4.9085101775660465e-05, + "loss": 0.3723, + "step": 1352000 + }, + { + "epoch": 9.152365742745777, + "grad_norm": 0.34613293409347534, + "learning_rate": 4.908476342572542e-05, + "loss": 0.3735, + "step": 1352500 + }, + { + "epoch": 9.155749242096146, + "grad_norm": 0.3669275939464569, + "learning_rate": 4.908442507579038e-05, + "loss": 0.3735, + "step": 1353000 + }, + { + "epoch": 9.159132741446514, + "grad_norm": 0.333718478679657, + "learning_rate": 4.908408672585535e-05, + "loss": 0.3731, + "step": 1353500 + }, + { + "epoch": 9.162516240796881, + "grad_norm": 0.3390810191631317, + "learning_rate": 4.9083748375920314e-05, + "loss": 0.3736, + "step": 1354000 + }, + { + "epoch": 9.16589974014725, + "grad_norm": 0.3680596351623535, + "learning_rate": 4.9083410025985276e-05, + "loss": 0.3739, + "step": 1354500 + }, + { + "epoch": 9.169283239497618, + "grad_norm": 0.3484257459640503, + "learning_rate": 4.908307167605024e-05, + "loss": 0.3715, + "step": 1355000 + }, + { + "epoch": 9.172666738847987, + "grad_norm": 0.3617357313632965, + "learning_rate": 4.908273332611521e-05, + "loss": 0.3721, + "step": 1355500 + }, + { + "epoch": 9.176050238198354, + "grad_norm": 0.3549576997756958, + "learning_rate": 4.908239497618017e-05, + "loss": 0.3746, + "step": 1356000 + }, + { + "epoch": 9.179433737548722, + "grad_norm": 0.35443395376205444, + "learning_rate": 4.908205662624513e-05, + "loss": 0.3726, + "step": 1356500 + }, + { + "epoch": 9.18281723689909, + "grad_norm": 0.34137991070747375, + "learning_rate": 4.908171827631009e-05, + "loss": 0.3728, + "step": 1357000 + }, + { + "epoch": 9.186200736249459, + "grad_norm": 0.40165209770202637, + "learning_rate": 4.9081379926375055e-05, + "loss": 0.3734, + "step": 1357500 + }, + { + "epoch": 9.189584235599828, + "grad_norm": 0.35499700903892517, + "learning_rate": 4.908104157644002e-05, + "loss": 0.372, + "step": 1358000 + }, + { + "epoch": 9.192967734950194, + "grad_norm": 0.37622034549713135, + "learning_rate": 4.908070322650498e-05, + "loss": 0.3731, + "step": 1358500 + }, + { + "epoch": 9.196351234300563, + "grad_norm": 0.37281620502471924, + "learning_rate": 4.908036487656995e-05, + "loss": 0.3732, + "step": 1359000 + }, + { + "epoch": 9.199734733650931, + "grad_norm": 0.3736304044723511, + "learning_rate": 4.908002652663491e-05, + "loss": 0.3724, + "step": 1359500 + }, + { + "epoch": 9.2031182330013, + "grad_norm": 0.3365495204925537, + "learning_rate": 4.907968817669987e-05, + "loss": 0.3742, + "step": 1360000 + }, + { + "epoch": 9.206501732351667, + "grad_norm": 0.3519720435142517, + "learning_rate": 4.9079349826764835e-05, + "loss": 0.3725, + "step": 1360500 + }, + { + "epoch": 9.209885231702035, + "grad_norm": 0.37363943457603455, + "learning_rate": 4.9079011476829804e-05, + "loss": 0.3749, + "step": 1361000 + }, + { + "epoch": 9.213268731052404, + "grad_norm": 0.37272366881370544, + "learning_rate": 4.9078673126894766e-05, + "loss": 0.3743, + "step": 1361500 + }, + { + "epoch": 9.216652230402772, + "grad_norm": 0.39422059059143066, + "learning_rate": 4.907833477695973e-05, + "loss": 0.3725, + "step": 1362000 + }, + { + "epoch": 9.220035729753139, + "grad_norm": 0.3420394957065582, + "learning_rate": 4.907799642702468e-05, + "loss": 0.375, + "step": 1362500 + }, + { + "epoch": 9.223419229103508, + "grad_norm": 0.34846001863479614, + "learning_rate": 4.907765807708965e-05, + "loss": 0.3731, + "step": 1363000 + }, + { + "epoch": 9.226802728453876, + "grad_norm": 0.3821485638618469, + "learning_rate": 4.9077319727154614e-05, + "loss": 0.3735, + "step": 1363500 + }, + { + "epoch": 9.230186227804245, + "grad_norm": 0.3564333915710449, + "learning_rate": 4.9076981377219576e-05, + "loss": 0.3742, + "step": 1364000 + }, + { + "epoch": 9.233569727154613, + "grad_norm": 0.35378578305244446, + "learning_rate": 4.907664302728454e-05, + "loss": 0.3734, + "step": 1364500 + }, + { + "epoch": 9.23695322650498, + "grad_norm": 0.34390419721603394, + "learning_rate": 4.907630467734951e-05, + "loss": 0.3731, + "step": 1365000 + }, + { + "epoch": 9.240336725855348, + "grad_norm": 0.35762277245521545, + "learning_rate": 4.907596632741447e-05, + "loss": 0.3747, + "step": 1365500 + }, + { + "epoch": 9.243720225205717, + "grad_norm": 0.3782312273979187, + "learning_rate": 4.907562797747943e-05, + "loss": 0.3738, + "step": 1366000 + }, + { + "epoch": 9.247103724556085, + "grad_norm": 0.36128950119018555, + "learning_rate": 4.9075289627544394e-05, + "loss": 0.3739, + "step": 1366500 + }, + { + "epoch": 9.250487223906452, + "grad_norm": 0.3614073693752289, + "learning_rate": 4.9074951277609356e-05, + "loss": 0.3733, + "step": 1367000 + }, + { + "epoch": 9.25387072325682, + "grad_norm": 0.39572808146476746, + "learning_rate": 4.907461292767432e-05, + "loss": 0.3753, + "step": 1367500 + }, + { + "epoch": 9.25725422260719, + "grad_norm": 0.3680035173892975, + "learning_rate": 4.907427457773928e-05, + "loss": 0.3724, + "step": 1368000 + }, + { + "epoch": 9.260637721957558, + "grad_norm": 0.37925729155540466, + "learning_rate": 4.907393622780425e-05, + "loss": 0.3724, + "step": 1368500 + }, + { + "epoch": 9.264021221307926, + "grad_norm": 0.34867802262306213, + "learning_rate": 4.907359787786921e-05, + "loss": 0.3746, + "step": 1369000 + }, + { + "epoch": 9.267404720658293, + "grad_norm": 0.36166927218437195, + "learning_rate": 4.907325952793417e-05, + "loss": 0.3742, + "step": 1369500 + }, + { + "epoch": 9.270788220008662, + "grad_norm": 0.3401140868663788, + "learning_rate": 4.9072921177999135e-05, + "loss": 0.3724, + "step": 1370000 + }, + { + "epoch": 9.27417171935903, + "grad_norm": 0.34102433919906616, + "learning_rate": 4.90725828280641e-05, + "loss": 0.374, + "step": 1370500 + }, + { + "epoch": 9.277555218709399, + "grad_norm": 0.3460339605808258, + "learning_rate": 4.9072244478129066e-05, + "loss": 0.3737, + "step": 1371000 + }, + { + "epoch": 9.280938718059765, + "grad_norm": 0.40066081285476685, + "learning_rate": 4.907190612819403e-05, + "loss": 0.373, + "step": 1371500 + }, + { + "epoch": 9.284322217410134, + "grad_norm": 0.3612765967845917, + "learning_rate": 4.9071567778258984e-05, + "loss": 0.3735, + "step": 1372000 + }, + { + "epoch": 9.287705716760502, + "grad_norm": 0.35781562328338623, + "learning_rate": 4.907122942832395e-05, + "loss": 0.3731, + "step": 1372500 + }, + { + "epoch": 9.291089216110871, + "grad_norm": 0.3313261568546295, + "learning_rate": 4.9070891078388915e-05, + "loss": 0.3751, + "step": 1373000 + }, + { + "epoch": 9.29447271546124, + "grad_norm": 0.39906901121139526, + "learning_rate": 4.907055272845388e-05, + "loss": 0.3729, + "step": 1373500 + }, + { + "epoch": 9.297856214811606, + "grad_norm": 0.3768948018550873, + "learning_rate": 4.907021437851884e-05, + "loss": 0.3721, + "step": 1374000 + }, + { + "epoch": 9.301239714161975, + "grad_norm": 0.3924950659275055, + "learning_rate": 4.906987602858381e-05, + "loss": 0.3723, + "step": 1374500 + }, + { + "epoch": 9.304623213512343, + "grad_norm": 0.3677727282047272, + "learning_rate": 4.906953767864877e-05, + "loss": 0.3718, + "step": 1375000 + }, + { + "epoch": 9.308006712862712, + "grad_norm": 0.36460989713668823, + "learning_rate": 4.906919932871373e-05, + "loss": 0.373, + "step": 1375500 + }, + { + "epoch": 9.311390212213078, + "grad_norm": 0.31929346919059753, + "learning_rate": 4.9068860978778694e-05, + "loss": 0.3734, + "step": 1376000 + }, + { + "epoch": 9.314773711563447, + "grad_norm": 0.3578610122203827, + "learning_rate": 4.9068522628843657e-05, + "loss": 0.3717, + "step": 1376500 + }, + { + "epoch": 9.318157210913816, + "grad_norm": 0.32925495505332947, + "learning_rate": 4.906818427890862e-05, + "loss": 0.374, + "step": 1377000 + }, + { + "epoch": 9.321540710264184, + "grad_norm": 0.36400106549263, + "learning_rate": 4.906784592897358e-05, + "loss": 0.3747, + "step": 1377500 + }, + { + "epoch": 9.324924209614553, + "grad_norm": 0.34628424048423767, + "learning_rate": 4.906750757903855e-05, + "loss": 0.3734, + "step": 1378000 + }, + { + "epoch": 9.32830770896492, + "grad_norm": 0.34225183725357056, + "learning_rate": 4.906716922910351e-05, + "loss": 0.3731, + "step": 1378500 + }, + { + "epoch": 9.331691208315288, + "grad_norm": 0.3421752452850342, + "learning_rate": 4.9066830879168474e-05, + "loss": 0.3725, + "step": 1379000 + }, + { + "epoch": 9.335074707665656, + "grad_norm": 0.37214502692222595, + "learning_rate": 4.9066492529233436e-05, + "loss": 0.3738, + "step": 1379500 + }, + { + "epoch": 9.338458207016025, + "grad_norm": 0.3321809470653534, + "learning_rate": 4.90661541792984e-05, + "loss": 0.3737, + "step": 1380000 + }, + { + "epoch": 9.341841706366392, + "grad_norm": 0.3678796887397766, + "learning_rate": 4.906581582936337e-05, + "loss": 0.3724, + "step": 1380500 + }, + { + "epoch": 9.34522520571676, + "grad_norm": 0.31558719277381897, + "learning_rate": 4.906547747942833e-05, + "loss": 0.3759, + "step": 1381000 + }, + { + "epoch": 9.348608705067129, + "grad_norm": 0.3410813510417938, + "learning_rate": 4.9065139129493285e-05, + "loss": 0.3719, + "step": 1381500 + }, + { + "epoch": 9.351992204417497, + "grad_norm": 0.3446415662765503, + "learning_rate": 4.9064800779558253e-05, + "loss": 0.3738, + "step": 1382000 + }, + { + "epoch": 9.355375703767866, + "grad_norm": 0.37881118059158325, + "learning_rate": 4.9064462429623216e-05, + "loss": 0.372, + "step": 1382500 + }, + { + "epoch": 9.358759203118233, + "grad_norm": 0.378154993057251, + "learning_rate": 4.906412407968818e-05, + "loss": 0.375, + "step": 1383000 + }, + { + "epoch": 9.362142702468601, + "grad_norm": 0.3477650284767151, + "learning_rate": 4.906378572975314e-05, + "loss": 0.3746, + "step": 1383500 + }, + { + "epoch": 9.36552620181897, + "grad_norm": 0.35453009605407715, + "learning_rate": 4.906344737981811e-05, + "loss": 0.373, + "step": 1384000 + }, + { + "epoch": 9.368909701169338, + "grad_norm": 0.33059656620025635, + "learning_rate": 4.906310902988307e-05, + "loss": 0.3755, + "step": 1384500 + }, + { + "epoch": 9.372293200519705, + "grad_norm": 0.3068912625312805, + "learning_rate": 4.906277067994803e-05, + "loss": 0.3726, + "step": 1385000 + }, + { + "epoch": 9.375676699870073, + "grad_norm": 0.3638466000556946, + "learning_rate": 4.9062432330012995e-05, + "loss": 0.3746, + "step": 1385500 + }, + { + "epoch": 9.379060199220442, + "grad_norm": 0.35415658354759216, + "learning_rate": 4.906209398007796e-05, + "loss": 0.3719, + "step": 1386000 + }, + { + "epoch": 9.38244369857081, + "grad_norm": 0.3766058385372162, + "learning_rate": 4.906175563014292e-05, + "loss": 0.374, + "step": 1386500 + }, + { + "epoch": 9.385827197921177, + "grad_norm": 0.3652975857257843, + "learning_rate": 4.906141728020788e-05, + "loss": 0.3736, + "step": 1387000 + }, + { + "epoch": 9.389210697271546, + "grad_norm": 0.342379629611969, + "learning_rate": 4.9061078930272844e-05, + "loss": 0.374, + "step": 1387500 + }, + { + "epoch": 9.392594196621914, + "grad_norm": 0.3303496241569519, + "learning_rate": 4.906074058033781e-05, + "loss": 0.3736, + "step": 1388000 + }, + { + "epoch": 9.395977695972283, + "grad_norm": 0.3351159989833832, + "learning_rate": 4.9060402230402775e-05, + "loss": 0.3732, + "step": 1388500 + }, + { + "epoch": 9.399361195322651, + "grad_norm": 0.362427681684494, + "learning_rate": 4.906006388046774e-05, + "loss": 0.373, + "step": 1389000 + }, + { + "epoch": 9.402744694673018, + "grad_norm": 0.33641988039016724, + "learning_rate": 4.90597255305327e-05, + "loss": 0.373, + "step": 1389500 + }, + { + "epoch": 9.406128194023387, + "grad_norm": 0.3344801068305969, + "learning_rate": 4.905938718059767e-05, + "loss": 0.3726, + "step": 1390000 + }, + { + "epoch": 9.409511693373755, + "grad_norm": 0.4045099914073944, + "learning_rate": 4.905904883066263e-05, + "loss": 0.373, + "step": 1390500 + }, + { + "epoch": 9.412895192724124, + "grad_norm": 0.3387194573879242, + "learning_rate": 4.9058710480727585e-05, + "loss": 0.3724, + "step": 1391000 + }, + { + "epoch": 9.41627869207449, + "grad_norm": 0.33229130506515503, + "learning_rate": 4.9058372130792554e-05, + "loss": 0.3746, + "step": 1391500 + }, + { + "epoch": 9.419662191424859, + "grad_norm": 0.4265800714492798, + "learning_rate": 4.9058033780857516e-05, + "loss": 0.3731, + "step": 1392000 + }, + { + "epoch": 9.423045690775227, + "grad_norm": 0.38675758242607117, + "learning_rate": 4.905769543092248e-05, + "loss": 0.3743, + "step": 1392500 + }, + { + "epoch": 9.426429190125596, + "grad_norm": 0.34786656498908997, + "learning_rate": 4.905735708098744e-05, + "loss": 0.3748, + "step": 1393000 + }, + { + "epoch": 9.429812689475964, + "grad_norm": 0.32004114985466003, + "learning_rate": 4.905701873105241e-05, + "loss": 0.3746, + "step": 1393500 + }, + { + "epoch": 9.433196188826331, + "grad_norm": 0.35318368673324585, + "learning_rate": 4.905668038111737e-05, + "loss": 0.3736, + "step": 1394000 + }, + { + "epoch": 9.4365796881767, + "grad_norm": 0.38677674531936646, + "learning_rate": 4.9056342031182334e-05, + "loss": 0.3743, + "step": 1394500 + }, + { + "epoch": 9.439963187527068, + "grad_norm": 0.35570451617240906, + "learning_rate": 4.9056003681247296e-05, + "loss": 0.3741, + "step": 1395000 + }, + { + "epoch": 9.443346686877437, + "grad_norm": 0.3725792467594147, + "learning_rate": 4.905566533131226e-05, + "loss": 0.3741, + "step": 1395500 + }, + { + "epoch": 9.446730186227803, + "grad_norm": 0.3793668746948242, + "learning_rate": 4.905532698137722e-05, + "loss": 0.3728, + "step": 1396000 + }, + { + "epoch": 9.450113685578172, + "grad_norm": 0.3633134961128235, + "learning_rate": 4.905498863144218e-05, + "loss": 0.3733, + "step": 1396500 + }, + { + "epoch": 9.45349718492854, + "grad_norm": 0.3719507157802582, + "learning_rate": 4.9054650281507144e-05, + "loss": 0.3733, + "step": 1397000 + }, + { + "epoch": 9.456880684278909, + "grad_norm": 0.3579012453556061, + "learning_rate": 4.905431193157211e-05, + "loss": 0.375, + "step": 1397500 + }, + { + "epoch": 9.460264183629278, + "grad_norm": 0.40617984533309937, + "learning_rate": 4.9053973581637075e-05, + "loss": 0.3745, + "step": 1398000 + }, + { + "epoch": 9.463647682979644, + "grad_norm": 0.33194923400878906, + "learning_rate": 4.905363523170204e-05, + "loss": 0.3736, + "step": 1398500 + }, + { + "epoch": 9.467031182330013, + "grad_norm": 0.35460004210472107, + "learning_rate": 4.9053296881767e-05, + "loss": 0.3746, + "step": 1399000 + }, + { + "epoch": 9.470414681680381, + "grad_norm": 0.35819530487060547, + "learning_rate": 4.905295853183197e-05, + "loss": 0.3744, + "step": 1399500 + }, + { + "epoch": 9.47379818103075, + "grad_norm": 0.35111624002456665, + "learning_rate": 4.905262018189693e-05, + "loss": 0.374, + "step": 1400000 + }, + { + "epoch": 9.477181680381117, + "grad_norm": 0.3364504873752594, + "learning_rate": 4.9052281831961886e-05, + "loss": 0.3748, + "step": 1400500 + }, + { + "epoch": 9.480565179731485, + "grad_norm": 0.3461628258228302, + "learning_rate": 4.9051943482026855e-05, + "loss": 0.3731, + "step": 1401000 + }, + { + "epoch": 9.483948679081854, + "grad_norm": 0.35349273681640625, + "learning_rate": 4.905160513209182e-05, + "loss": 0.3742, + "step": 1401500 + }, + { + "epoch": 9.487332178432222, + "grad_norm": 0.35184773802757263, + "learning_rate": 4.905126678215678e-05, + "loss": 0.3726, + "step": 1402000 + }, + { + "epoch": 9.490715677782589, + "grad_norm": 0.34152382612228394, + "learning_rate": 4.905092843222174e-05, + "loss": 0.3724, + "step": 1402500 + }, + { + "epoch": 9.494099177132957, + "grad_norm": 0.3228163421154022, + "learning_rate": 4.905059008228671e-05, + "loss": 0.3737, + "step": 1403000 + }, + { + "epoch": 9.497482676483326, + "grad_norm": 0.3243357837200165, + "learning_rate": 4.905025173235167e-05, + "loss": 0.3726, + "step": 1403500 + }, + { + "epoch": 9.500866175833695, + "grad_norm": 0.3643701374530792, + "learning_rate": 4.9049913382416634e-05, + "loss": 0.3744, + "step": 1404000 + }, + { + "epoch": 9.504249675184063, + "grad_norm": 0.3463166058063507, + "learning_rate": 4.9049575032481596e-05, + "loss": 0.3729, + "step": 1404500 + }, + { + "epoch": 9.50763317453443, + "grad_norm": 0.36135637760162354, + "learning_rate": 4.904923668254656e-05, + "loss": 0.3739, + "step": 1405000 + }, + { + "epoch": 9.511016673884798, + "grad_norm": 0.3563440144062042, + "learning_rate": 4.904889833261152e-05, + "loss": 0.3731, + "step": 1405500 + }, + { + "epoch": 9.514400173235167, + "grad_norm": 0.3402830958366394, + "learning_rate": 4.904855998267648e-05, + "loss": 0.3746, + "step": 1406000 + }, + { + "epoch": 9.517783672585535, + "grad_norm": 0.3417011797428131, + "learning_rate": 4.9048221632741445e-05, + "loss": 0.3746, + "step": 1406500 + }, + { + "epoch": 9.521167171935904, + "grad_norm": 0.321426659822464, + "learning_rate": 4.9047883282806414e-05, + "loss": 0.3736, + "step": 1407000 + }, + { + "epoch": 9.52455067128627, + "grad_norm": 0.36277860403060913, + "learning_rate": 4.9047544932871376e-05, + "loss": 0.3743, + "step": 1407500 + }, + { + "epoch": 9.52793417063664, + "grad_norm": 0.3817081153392792, + "learning_rate": 4.904720658293634e-05, + "loss": 0.3754, + "step": 1408000 + }, + { + "epoch": 9.531317669987008, + "grad_norm": 0.3738994598388672, + "learning_rate": 4.90468682330013e-05, + "loss": 0.3738, + "step": 1408500 + }, + { + "epoch": 9.534701169337376, + "grad_norm": 0.3876747786998749, + "learning_rate": 4.904652988306627e-05, + "loss": 0.3736, + "step": 1409000 + }, + { + "epoch": 9.538084668687743, + "grad_norm": 0.36051487922668457, + "learning_rate": 4.904619153313123e-05, + "loss": 0.3726, + "step": 1409500 + }, + { + "epoch": 9.541468168038111, + "grad_norm": 0.3415667414665222, + "learning_rate": 4.9045853183196186e-05, + "loss": 0.3726, + "step": 1410000 + }, + { + "epoch": 9.54485166738848, + "grad_norm": 0.37187904119491577, + "learning_rate": 4.9045514833261155e-05, + "loss": 0.3741, + "step": 1410500 + }, + { + "epoch": 9.548235166738849, + "grad_norm": 0.34227636456489563, + "learning_rate": 4.904517648332612e-05, + "loss": 0.374, + "step": 1411000 + }, + { + "epoch": 9.551618666089215, + "grad_norm": 0.3580974042415619, + "learning_rate": 4.904483813339108e-05, + "loss": 0.3731, + "step": 1411500 + }, + { + "epoch": 9.555002165439584, + "grad_norm": 0.355074942111969, + "learning_rate": 4.904449978345604e-05, + "loss": 0.3733, + "step": 1412000 + }, + { + "epoch": 9.558385664789952, + "grad_norm": 0.314767986536026, + "learning_rate": 4.904416143352101e-05, + "loss": 0.3724, + "step": 1412500 + }, + { + "epoch": 9.56176916414032, + "grad_norm": 0.36553481221199036, + "learning_rate": 4.904382308358597e-05, + "loss": 0.3729, + "step": 1413000 + }, + { + "epoch": 9.56515266349069, + "grad_norm": 0.33571797609329224, + "learning_rate": 4.9043484733650935e-05, + "loss": 0.3744, + "step": 1413500 + }, + { + "epoch": 9.568536162841056, + "grad_norm": 0.33554765582084656, + "learning_rate": 4.90431463837159e-05, + "loss": 0.3731, + "step": 1414000 + }, + { + "epoch": 9.571919662191425, + "grad_norm": 0.37651869654655457, + "learning_rate": 4.9042808033780866e-05, + "loss": 0.3735, + "step": 1414500 + }, + { + "epoch": 9.575303161541793, + "grad_norm": 0.35934606194496155, + "learning_rate": 4.904246968384582e-05, + "loss": 0.3727, + "step": 1415000 + }, + { + "epoch": 9.578686660892162, + "grad_norm": 0.3884565830230713, + "learning_rate": 4.904213133391078e-05, + "loss": 0.3736, + "step": 1415500 + }, + { + "epoch": 9.582070160242528, + "grad_norm": 0.3344481289386749, + "learning_rate": 4.9041792983975745e-05, + "loss": 0.3743, + "step": 1416000 + }, + { + "epoch": 9.585453659592897, + "grad_norm": 0.37787380814552307, + "learning_rate": 4.9041454634040714e-05, + "loss": 0.3713, + "step": 1416500 + }, + { + "epoch": 9.588837158943265, + "grad_norm": 0.3896807134151459, + "learning_rate": 4.9041116284105676e-05, + "loss": 0.3737, + "step": 1417000 + }, + { + "epoch": 9.592220658293634, + "grad_norm": 0.37299489974975586, + "learning_rate": 4.904077793417064e-05, + "loss": 0.3714, + "step": 1417500 + }, + { + "epoch": 9.595604157644003, + "grad_norm": 0.3850718140602112, + "learning_rate": 4.90404395842356e-05, + "loss": 0.3727, + "step": 1418000 + }, + { + "epoch": 9.59898765699437, + "grad_norm": 0.3553083837032318, + "learning_rate": 4.904010123430057e-05, + "loss": 0.3734, + "step": 1418500 + }, + { + "epoch": 9.602371156344738, + "grad_norm": 0.3951491415500641, + "learning_rate": 4.903976288436553e-05, + "loss": 0.3723, + "step": 1419000 + }, + { + "epoch": 9.605754655695106, + "grad_norm": 0.4291026294231415, + "learning_rate": 4.903942453443049e-05, + "loss": 0.3738, + "step": 1419500 + }, + { + "epoch": 9.609138155045475, + "grad_norm": 0.36827054619789124, + "learning_rate": 4.9039086184495456e-05, + "loss": 0.3727, + "step": 1420000 + }, + { + "epoch": 9.612521654395842, + "grad_norm": 0.35370057821273804, + "learning_rate": 4.903874783456042e-05, + "loss": 0.3721, + "step": 1420500 + }, + { + "epoch": 9.61590515374621, + "grad_norm": 0.3819619417190552, + "learning_rate": 4.903840948462538e-05, + "loss": 0.3728, + "step": 1421000 + }, + { + "epoch": 9.619288653096579, + "grad_norm": 0.335520476102829, + "learning_rate": 4.903807113469034e-05, + "loss": 0.3746, + "step": 1421500 + }, + { + "epoch": 9.622672152446947, + "grad_norm": 0.3815540373325348, + "learning_rate": 4.903773278475531e-05, + "loss": 0.3714, + "step": 1422000 + }, + { + "epoch": 9.626055651797316, + "grad_norm": 0.3790264427661896, + "learning_rate": 4.903739443482027e-05, + "loss": 0.3744, + "step": 1422500 + }, + { + "epoch": 9.629439151147682, + "grad_norm": 0.3546484410762787, + "learning_rate": 4.9037056084885235e-05, + "loss": 0.3728, + "step": 1423000 + }, + { + "epoch": 9.632822650498051, + "grad_norm": 0.3164576292037964, + "learning_rate": 4.90367177349502e-05, + "loss": 0.3723, + "step": 1423500 + }, + { + "epoch": 9.63620614984842, + "grad_norm": 0.34413549304008484, + "learning_rate": 4.9036379385015167e-05, + "loss": 0.3717, + "step": 1424000 + }, + { + "epoch": 9.639589649198788, + "grad_norm": 0.3173050284385681, + "learning_rate": 4.903604103508012e-05, + "loss": 0.3737, + "step": 1424500 + }, + { + "epoch": 9.642973148549155, + "grad_norm": 0.3323732614517212, + "learning_rate": 4.9035702685145084e-05, + "loss": 0.3719, + "step": 1425000 + }, + { + "epoch": 9.646356647899523, + "grad_norm": 0.3265518546104431, + "learning_rate": 4.9035364335210046e-05, + "loss": 0.374, + "step": 1425500 + }, + { + "epoch": 9.649740147249892, + "grad_norm": 0.34177255630493164, + "learning_rate": 4.9035025985275015e-05, + "loss": 0.3746, + "step": 1426000 + }, + { + "epoch": 9.65312364660026, + "grad_norm": 0.34831613302230835, + "learning_rate": 4.903468763533998e-05, + "loss": 0.3724, + "step": 1426500 + }, + { + "epoch": 9.656507145950627, + "grad_norm": 0.3265458047389984, + "learning_rate": 4.903434928540494e-05, + "loss": 0.3746, + "step": 1427000 + }, + { + "epoch": 9.659890645300996, + "grad_norm": 0.3510293662548065, + "learning_rate": 4.90340109354699e-05, + "loss": 0.3721, + "step": 1427500 + }, + { + "epoch": 9.663274144651364, + "grad_norm": 0.328610897064209, + "learning_rate": 4.903367258553487e-05, + "loss": 0.374, + "step": 1428000 + }, + { + "epoch": 9.666657644001733, + "grad_norm": 0.34514859318733215, + "learning_rate": 4.903333423559983e-05, + "loss": 0.3733, + "step": 1428500 + }, + { + "epoch": 9.670041143352101, + "grad_norm": 0.3269490897655487, + "learning_rate": 4.903299588566479e-05, + "loss": 0.374, + "step": 1429000 + }, + { + "epoch": 9.673424642702468, + "grad_norm": 0.3199786841869354, + "learning_rate": 4.903265753572976e-05, + "loss": 0.373, + "step": 1429500 + }, + { + "epoch": 9.676808142052836, + "grad_norm": 0.31224554777145386, + "learning_rate": 4.903231918579472e-05, + "loss": 0.3753, + "step": 1430000 + }, + { + "epoch": 9.680191641403205, + "grad_norm": 0.34504151344299316, + "learning_rate": 4.903198083585968e-05, + "loss": 0.3738, + "step": 1430500 + }, + { + "epoch": 9.683575140753574, + "grad_norm": 0.3295120894908905, + "learning_rate": 4.903164248592464e-05, + "loss": 0.3736, + "step": 1431000 + }, + { + "epoch": 9.686958640103942, + "grad_norm": 0.34418854117393494, + "learning_rate": 4.903130413598961e-05, + "loss": 0.3738, + "step": 1431500 + }, + { + "epoch": 9.690342139454309, + "grad_norm": 0.34600433707237244, + "learning_rate": 4.9030965786054574e-05, + "loss": 0.3754, + "step": 1432000 + }, + { + "epoch": 9.693725638804677, + "grad_norm": 0.3856705129146576, + "learning_rate": 4.9030627436119536e-05, + "loss": 0.3743, + "step": 1432500 + }, + { + "epoch": 9.697109138155046, + "grad_norm": 0.34834805130958557, + "learning_rate": 4.90302890861845e-05, + "loss": 0.3739, + "step": 1433000 + }, + { + "epoch": 9.700492637505414, + "grad_norm": 0.33331626653671265, + "learning_rate": 4.902995073624946e-05, + "loss": 0.3736, + "step": 1433500 + }, + { + "epoch": 9.703876136855781, + "grad_norm": 0.3352530300617218, + "learning_rate": 4.902961238631442e-05, + "loss": 0.3735, + "step": 1434000 + }, + { + "epoch": 9.70725963620615, + "grad_norm": 0.34129124879837036, + "learning_rate": 4.9029274036379385e-05, + "loss": 0.3742, + "step": 1434500 + }, + { + "epoch": 9.710643135556518, + "grad_norm": 0.31555068492889404, + "learning_rate": 4.902893568644435e-05, + "loss": 0.3719, + "step": 1435000 + }, + { + "epoch": 9.714026634906887, + "grad_norm": 0.37110719084739685, + "learning_rate": 4.9028597336509316e-05, + "loss": 0.3743, + "step": 1435500 + }, + { + "epoch": 9.717410134257253, + "grad_norm": 0.33461055159568787, + "learning_rate": 4.902825898657428e-05, + "loss": 0.375, + "step": 1436000 + }, + { + "epoch": 9.720793633607622, + "grad_norm": 0.32992124557495117, + "learning_rate": 4.902792063663924e-05, + "loss": 0.3737, + "step": 1436500 + }, + { + "epoch": 9.72417713295799, + "grad_norm": 0.3794945180416107, + "learning_rate": 4.90275822867042e-05, + "loss": 0.3728, + "step": 1437000 + }, + { + "epoch": 9.727560632308359, + "grad_norm": 0.341802716255188, + "learning_rate": 4.902724393676917e-05, + "loss": 0.3728, + "step": 1437500 + }, + { + "epoch": 9.730944131658728, + "grad_norm": 0.327571302652359, + "learning_rate": 4.902690558683413e-05, + "loss": 0.3743, + "step": 1438000 + }, + { + "epoch": 9.734327631009094, + "grad_norm": 0.39465704560279846, + "learning_rate": 4.902656723689909e-05, + "loss": 0.3733, + "step": 1438500 + }, + { + "epoch": 9.737711130359463, + "grad_norm": 0.36543476581573486, + "learning_rate": 4.902622888696406e-05, + "loss": 0.3741, + "step": 1439000 + }, + { + "epoch": 9.741094629709831, + "grad_norm": 0.35676220059394836, + "learning_rate": 4.902589053702902e-05, + "loss": 0.3714, + "step": 1439500 + }, + { + "epoch": 9.7444781290602, + "grad_norm": 0.3575495779514313, + "learning_rate": 4.902555218709398e-05, + "loss": 0.3737, + "step": 1440000 + }, + { + "epoch": 9.747861628410567, + "grad_norm": 0.36440545320510864, + "learning_rate": 4.9025213837158944e-05, + "loss": 0.3738, + "step": 1440500 + }, + { + "epoch": 9.751245127760935, + "grad_norm": 0.37288621068000793, + "learning_rate": 4.9024875487223906e-05, + "loss": 0.3727, + "step": 1441000 + }, + { + "epoch": 9.754628627111304, + "grad_norm": 0.36296847462654114, + "learning_rate": 4.9024537137288875e-05, + "loss": 0.3728, + "step": 1441500 + }, + { + "epoch": 9.758012126461672, + "grad_norm": 0.39852529764175415, + "learning_rate": 4.902419878735384e-05, + "loss": 0.3745, + "step": 1442000 + }, + { + "epoch": 9.761395625812039, + "grad_norm": 0.3430839776992798, + "learning_rate": 4.90238604374188e-05, + "loss": 0.3749, + "step": 1442500 + }, + { + "epoch": 9.764779125162407, + "grad_norm": 0.39759382605552673, + "learning_rate": 4.902352208748376e-05, + "loss": 0.3743, + "step": 1443000 + }, + { + "epoch": 9.768162624512776, + "grad_norm": 0.3515682816505432, + "learning_rate": 4.902318373754872e-05, + "loss": 0.3726, + "step": 1443500 + }, + { + "epoch": 9.771546123863144, + "grad_norm": 0.3821214437484741, + "learning_rate": 4.9022845387613685e-05, + "loss": 0.3732, + "step": 1444000 + }, + { + "epoch": 9.774929623213513, + "grad_norm": 0.36167728900909424, + "learning_rate": 4.902250703767865e-05, + "loss": 0.3744, + "step": 1444500 + }, + { + "epoch": 9.77831312256388, + "grad_norm": 0.35349175333976746, + "learning_rate": 4.9022168687743616e-05, + "loss": 0.3725, + "step": 1445000 + }, + { + "epoch": 9.781696621914248, + "grad_norm": 0.3603718876838684, + "learning_rate": 4.902183033780858e-05, + "loss": 0.3728, + "step": 1445500 + }, + { + "epoch": 9.785080121264617, + "grad_norm": 0.3149004876613617, + "learning_rate": 4.902149198787354e-05, + "loss": 0.3732, + "step": 1446000 + }, + { + "epoch": 9.788463620614985, + "grad_norm": 0.35508009791374207, + "learning_rate": 4.90211536379385e-05, + "loss": 0.3731, + "step": 1446500 + }, + { + "epoch": 9.791847119965354, + "grad_norm": 0.36445415019989014, + "learning_rate": 4.902081528800347e-05, + "loss": 0.3738, + "step": 1447000 + }, + { + "epoch": 9.79523061931572, + "grad_norm": 0.33561834692955017, + "learning_rate": 4.9020476938068434e-05, + "loss": 0.3733, + "step": 1447500 + }, + { + "epoch": 9.79861411866609, + "grad_norm": 0.35104507207870483, + "learning_rate": 4.902013858813339e-05, + "loss": 0.3735, + "step": 1448000 + }, + { + "epoch": 9.801997618016458, + "grad_norm": 0.3738578259944916, + "learning_rate": 4.901980023819836e-05, + "loss": 0.3738, + "step": 1448500 + }, + { + "epoch": 9.805381117366826, + "grad_norm": 0.36810868978500366, + "learning_rate": 4.901946188826332e-05, + "loss": 0.3731, + "step": 1449000 + }, + { + "epoch": 9.808764616717193, + "grad_norm": 0.32860609889030457, + "learning_rate": 4.901912353832828e-05, + "loss": 0.373, + "step": 1449500 + }, + { + "epoch": 9.812148116067561, + "grad_norm": 0.3360423743724823, + "learning_rate": 4.9018785188393244e-05, + "loss": 0.3736, + "step": 1450000 + }, + { + "epoch": 9.81553161541793, + "grad_norm": 0.3495067358016968, + "learning_rate": 4.9018446838458206e-05, + "loss": 0.3741, + "step": 1450500 + }, + { + "epoch": 9.818915114768298, + "grad_norm": 0.36783358454704285, + "learning_rate": 4.9018108488523175e-05, + "loss": 0.3748, + "step": 1451000 + }, + { + "epoch": 9.822298614118665, + "grad_norm": 0.36060866713523865, + "learning_rate": 4.901777013858814e-05, + "loss": 0.3733, + "step": 1451500 + }, + { + "epoch": 9.825682113469034, + "grad_norm": 0.32261085510253906, + "learning_rate": 4.90174317886531e-05, + "loss": 0.3729, + "step": 1452000 + }, + { + "epoch": 9.829065612819402, + "grad_norm": 0.3506331145763397, + "learning_rate": 4.901709343871806e-05, + "loss": 0.3744, + "step": 1452500 + }, + { + "epoch": 9.83244911216977, + "grad_norm": 0.3443905711174011, + "learning_rate": 4.9016755088783024e-05, + "loss": 0.3726, + "step": 1453000 + }, + { + "epoch": 9.83583261152014, + "grad_norm": 0.3584153950214386, + "learning_rate": 4.9016416738847986e-05, + "loss": 0.3725, + "step": 1453500 + }, + { + "epoch": 9.839216110870506, + "grad_norm": 0.36786600947380066, + "learning_rate": 4.901607838891295e-05, + "loss": 0.3724, + "step": 1454000 + }, + { + "epoch": 9.842599610220875, + "grad_norm": 0.3549017906188965, + "learning_rate": 4.901574003897792e-05, + "loss": 0.3735, + "step": 1454500 + }, + { + "epoch": 9.845983109571243, + "grad_norm": 0.39618372917175293, + "learning_rate": 4.901540168904288e-05, + "loss": 0.3738, + "step": 1455000 + }, + { + "epoch": 9.849366608921612, + "grad_norm": 0.3646342158317566, + "learning_rate": 4.901506333910784e-05, + "loss": 0.374, + "step": 1455500 + }, + { + "epoch": 9.852750108271978, + "grad_norm": 0.3795194923877716, + "learning_rate": 4.90147249891728e-05, + "loss": 0.3737, + "step": 1456000 + }, + { + "epoch": 9.856133607622347, + "grad_norm": 0.35680365562438965, + "learning_rate": 4.901438663923777e-05, + "loss": 0.3729, + "step": 1456500 + }, + { + "epoch": 9.859517106972715, + "grad_norm": 0.3975049555301666, + "learning_rate": 4.9014048289302734e-05, + "loss": 0.3737, + "step": 1457000 + }, + { + "epoch": 9.862900606323084, + "grad_norm": 0.3477362394332886, + "learning_rate": 4.901370993936769e-05, + "loss": 0.3731, + "step": 1457500 + }, + { + "epoch": 9.866284105673452, + "grad_norm": 0.3453601896762848, + "learning_rate": 4.901337158943265e-05, + "loss": 0.3743, + "step": 1458000 + }, + { + "epoch": 9.86966760502382, + "grad_norm": 0.3613618016242981, + "learning_rate": 4.901303323949762e-05, + "loss": 0.372, + "step": 1458500 + }, + { + "epoch": 9.873051104374188, + "grad_norm": 0.34904763102531433, + "learning_rate": 4.901269488956258e-05, + "loss": 0.3726, + "step": 1459000 + }, + { + "epoch": 9.876434603724556, + "grad_norm": 0.342383474111557, + "learning_rate": 4.9012356539627545e-05, + "loss": 0.3739, + "step": 1459500 + }, + { + "epoch": 9.879818103074925, + "grad_norm": 0.3554176390171051, + "learning_rate": 4.901201818969251e-05, + "loss": 0.374, + "step": 1460000 + }, + { + "epoch": 9.883201602425292, + "grad_norm": 0.3256734609603882, + "learning_rate": 4.9011679839757476e-05, + "loss": 0.3753, + "step": 1460500 + }, + { + "epoch": 9.88658510177566, + "grad_norm": 0.3433820605278015, + "learning_rate": 4.901134148982244e-05, + "loss": 0.375, + "step": 1461000 + }, + { + "epoch": 9.889968601126029, + "grad_norm": 0.36203470826148987, + "learning_rate": 4.90110031398874e-05, + "loss": 0.3727, + "step": 1461500 + }, + { + "epoch": 9.893352100476397, + "grad_norm": 0.36855342984199524, + "learning_rate": 4.901066478995236e-05, + "loss": 0.3738, + "step": 1462000 + }, + { + "epoch": 9.896735599826766, + "grad_norm": 0.3460084795951843, + "learning_rate": 4.9010326440017324e-05, + "loss": 0.3734, + "step": 1462500 + }, + { + "epoch": 9.900119099177132, + "grad_norm": 0.337659627199173, + "learning_rate": 4.9009988090082287e-05, + "loss": 0.3741, + "step": 1463000 + }, + { + "epoch": 9.903502598527501, + "grad_norm": 0.3099478781223297, + "learning_rate": 4.900964974014725e-05, + "loss": 0.3741, + "step": 1463500 + }, + { + "epoch": 9.90688609787787, + "grad_norm": 0.38407188653945923, + "learning_rate": 4.900931139021222e-05, + "loss": 0.3738, + "step": 1464000 + }, + { + "epoch": 9.910269597228238, + "grad_norm": 0.3641258180141449, + "learning_rate": 4.900897304027718e-05, + "loss": 0.3738, + "step": 1464500 + }, + { + "epoch": 9.913653096578605, + "grad_norm": 0.33939868211746216, + "learning_rate": 4.900863469034214e-05, + "loss": 0.3742, + "step": 1465000 + }, + { + "epoch": 9.917036595928973, + "grad_norm": 0.3442780077457428, + "learning_rate": 4.9008296340407104e-05, + "loss": 0.3725, + "step": 1465500 + }, + { + "epoch": 9.920420095279342, + "grad_norm": 0.3463834524154663, + "learning_rate": 4.900795799047207e-05, + "loss": 0.3733, + "step": 1466000 + }, + { + "epoch": 9.92380359462971, + "grad_norm": 0.31947022676467896, + "learning_rate": 4.9007619640537035e-05, + "loss": 0.3724, + "step": 1466500 + }, + { + "epoch": 9.927187093980077, + "grad_norm": 0.3326902389526367, + "learning_rate": 4.900728129060199e-05, + "loss": 0.3716, + "step": 1467000 + }, + { + "epoch": 9.930570593330446, + "grad_norm": 0.3712017238140106, + "learning_rate": 4.900694294066695e-05, + "loss": 0.3733, + "step": 1467500 + }, + { + "epoch": 9.933954092680814, + "grad_norm": 0.34310412406921387, + "learning_rate": 4.900660459073192e-05, + "loss": 0.373, + "step": 1468000 + }, + { + "epoch": 9.937337592031183, + "grad_norm": 0.36225417256355286, + "learning_rate": 4.9006266240796883e-05, + "loss": 0.3743, + "step": 1468500 + }, + { + "epoch": 9.940721091381551, + "grad_norm": 0.34073570370674133, + "learning_rate": 4.9005927890861846e-05, + "loss": 0.3749, + "step": 1469000 + }, + { + "epoch": 9.944104590731918, + "grad_norm": 0.38553059101104736, + "learning_rate": 4.900558954092681e-05, + "loss": 0.3736, + "step": 1469500 + }, + { + "epoch": 9.947488090082286, + "grad_norm": 0.379245400428772, + "learning_rate": 4.9005251190991777e-05, + "loss": 0.372, + "step": 1470000 + }, + { + "epoch": 9.950871589432655, + "grad_norm": 0.3675907850265503, + "learning_rate": 4.900491284105674e-05, + "loss": 0.3734, + "step": 1470500 + }, + { + "epoch": 9.954255088783023, + "grad_norm": 0.3889124095439911, + "learning_rate": 4.90045744911217e-05, + "loss": 0.3748, + "step": 1471000 + }, + { + "epoch": 9.957638588133392, + "grad_norm": 0.34726428985595703, + "learning_rate": 4.900423614118666e-05, + "loss": 0.375, + "step": 1471500 + }, + { + "epoch": 9.961022087483759, + "grad_norm": 0.41922616958618164, + "learning_rate": 4.9003897791251625e-05, + "loss": 0.3734, + "step": 1472000 + }, + { + "epoch": 9.964405586834127, + "grad_norm": 0.34421753883361816, + "learning_rate": 4.900355944131659e-05, + "loss": 0.3732, + "step": 1472500 + }, + { + "epoch": 9.967789086184496, + "grad_norm": 0.3449467420578003, + "learning_rate": 4.900322109138155e-05, + "loss": 0.3737, + "step": 1473000 + }, + { + "epoch": 9.971172585534864, + "grad_norm": 0.37184974551200867, + "learning_rate": 4.900288274144652e-05, + "loss": 0.373, + "step": 1473500 + }, + { + "epoch": 9.974556084885231, + "grad_norm": 0.34117862582206726, + "learning_rate": 4.900254439151148e-05, + "loss": 0.3717, + "step": 1474000 + }, + { + "epoch": 9.9779395842356, + "grad_norm": 0.35933008790016174, + "learning_rate": 4.900220604157644e-05, + "loss": 0.373, + "step": 1474500 + }, + { + "epoch": 9.981323083585968, + "grad_norm": 0.34943458437919617, + "learning_rate": 4.9001867691641405e-05, + "loss": 0.373, + "step": 1475000 + }, + { + "epoch": 9.984706582936337, + "grad_norm": 0.3573608100414276, + "learning_rate": 4.9001529341706373e-05, + "loss": 0.3728, + "step": 1475500 + }, + { + "epoch": 9.988090082286703, + "grad_norm": 0.3658691346645355, + "learning_rate": 4.9001190991771336e-05, + "loss": 0.3723, + "step": 1476000 + }, + { + "epoch": 9.991473581637072, + "grad_norm": 0.35315924882888794, + "learning_rate": 4.90008526418363e-05, + "loss": 0.3732, + "step": 1476500 + }, + { + "epoch": 9.99485708098744, + "grad_norm": 0.32430362701416016, + "learning_rate": 4.900051429190125e-05, + "loss": 0.3725, + "step": 1477000 + }, + { + "epoch": 9.998240580337809, + "grad_norm": 0.434662789106369, + "learning_rate": 4.900017594196622e-05, + "loss": 0.374, + "step": 1477500 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.8576633603817478, + "eval_loss": 0.5780515074729919, + "eval_runtime": 3398.1128, + "eval_samples_per_second": 85.56, + "eval_steps_per_second": 5.348, + "step": 1477760 + }, + { + "epoch": 10.001624079688177, + "grad_norm": 0.35455480217933655, + "learning_rate": 4.8999837592031184e-05, + "loss": 0.3714, + "step": 1478000 + }, + { + "epoch": 10.005007579038544, + "grad_norm": 0.3450685739517212, + "learning_rate": 4.8999499242096146e-05, + "loss": 0.3697, + "step": 1478500 + }, + { + "epoch": 10.008391078388913, + "grad_norm": 0.34946346282958984, + "learning_rate": 4.899916089216111e-05, + "loss": 0.3701, + "step": 1479000 + }, + { + "epoch": 10.011774577739281, + "grad_norm": 0.35094282031059265, + "learning_rate": 4.899882254222608e-05, + "loss": 0.3721, + "step": 1479500 + }, + { + "epoch": 10.01515807708965, + "grad_norm": 0.37638944387435913, + "learning_rate": 4.899848419229104e-05, + "loss": 0.3706, + "step": 1480000 + }, + { + "epoch": 10.018541576440017, + "grad_norm": 0.3403615951538086, + "learning_rate": 4.8998145842356e-05, + "loss": 0.3706, + "step": 1480500 + }, + { + "epoch": 10.021925075790385, + "grad_norm": 0.3257076144218445, + "learning_rate": 4.8997807492420964e-05, + "loss": 0.3714, + "step": 1481000 + }, + { + "epoch": 10.025308575140754, + "grad_norm": 0.3220478296279907, + "learning_rate": 4.8997469142485926e-05, + "loss": 0.372, + "step": 1481500 + }, + { + "epoch": 10.028692074491122, + "grad_norm": 0.36535710096359253, + "learning_rate": 4.899713079255089e-05, + "loss": 0.37, + "step": 1482000 + }, + { + "epoch": 10.03207557384149, + "grad_norm": 0.341636061668396, + "learning_rate": 4.899679244261585e-05, + "loss": 0.3711, + "step": 1482500 + }, + { + "epoch": 10.035459073191857, + "grad_norm": 0.3620785176753998, + "learning_rate": 4.899645409268082e-05, + "loss": 0.3733, + "step": 1483000 + }, + { + "epoch": 10.038842572542226, + "grad_norm": 0.3525392711162567, + "learning_rate": 4.899611574274578e-05, + "loss": 0.3713, + "step": 1483500 + }, + { + "epoch": 10.042226071892594, + "grad_norm": 0.3929446339607239, + "learning_rate": 4.899577739281074e-05, + "loss": 0.3717, + "step": 1484000 + }, + { + "epoch": 10.045609571242963, + "grad_norm": 0.3982110023498535, + "learning_rate": 4.8995439042875705e-05, + "loss": 0.3735, + "step": 1484500 + }, + { + "epoch": 10.04899307059333, + "grad_norm": 0.3973587453365326, + "learning_rate": 4.8995100692940674e-05, + "loss": 0.3714, + "step": 1485000 + }, + { + "epoch": 10.052376569943698, + "grad_norm": 0.33388495445251465, + "learning_rate": 4.8994762343005636e-05, + "loss": 0.3711, + "step": 1485500 + }, + { + "epoch": 10.055760069294067, + "grad_norm": 0.3741908073425293, + "learning_rate": 4.89944239930706e-05, + "loss": 0.373, + "step": 1486000 + }, + { + "epoch": 10.059143568644435, + "grad_norm": 0.37797847390174866, + "learning_rate": 4.8994085643135554e-05, + "loss": 0.3721, + "step": 1486500 + }, + { + "epoch": 10.062527067994804, + "grad_norm": 0.36359724402427673, + "learning_rate": 4.899374729320052e-05, + "loss": 0.3714, + "step": 1487000 + }, + { + "epoch": 10.06591056734517, + "grad_norm": 0.36529961228370667, + "learning_rate": 4.8993408943265485e-05, + "loss": 0.3724, + "step": 1487500 + }, + { + "epoch": 10.069294066695539, + "grad_norm": 0.439048171043396, + "learning_rate": 4.899307059333045e-05, + "loss": 0.3737, + "step": 1488000 + }, + { + "epoch": 10.072677566045908, + "grad_norm": 0.33382806181907654, + "learning_rate": 4.899273224339541e-05, + "loss": 0.3718, + "step": 1488500 + }, + { + "epoch": 10.076061065396276, + "grad_norm": 0.3356092870235443, + "learning_rate": 4.899239389346038e-05, + "loss": 0.3724, + "step": 1489000 + }, + { + "epoch": 10.079444564746643, + "grad_norm": 0.3422519266605377, + "learning_rate": 4.899205554352534e-05, + "loss": 0.3728, + "step": 1489500 + }, + { + "epoch": 10.082828064097011, + "grad_norm": 0.4010469317436218, + "learning_rate": 4.89917171935903e-05, + "loss": 0.3717, + "step": 1490000 + }, + { + "epoch": 10.08621156344738, + "grad_norm": 0.37595677375793457, + "learning_rate": 4.8991378843655264e-05, + "loss": 0.3719, + "step": 1490500 + }, + { + "epoch": 10.089595062797748, + "grad_norm": 0.3703933656215668, + "learning_rate": 4.8991040493720226e-05, + "loss": 0.3711, + "step": 1491000 + }, + { + "epoch": 10.092978562148115, + "grad_norm": 0.3610004782676697, + "learning_rate": 4.899070214378519e-05, + "loss": 0.3724, + "step": 1491500 + }, + { + "epoch": 10.096362061498484, + "grad_norm": 0.36778146028518677, + "learning_rate": 4.899036379385015e-05, + "loss": 0.3726, + "step": 1492000 + }, + { + "epoch": 10.099745560848852, + "grad_norm": 0.36750367283821106, + "learning_rate": 4.899002544391512e-05, + "loss": 0.3713, + "step": 1492500 + }, + { + "epoch": 10.10312906019922, + "grad_norm": 0.3422515094280243, + "learning_rate": 4.898968709398008e-05, + "loss": 0.3715, + "step": 1493000 + }, + { + "epoch": 10.10651255954959, + "grad_norm": 0.3872862756252289, + "learning_rate": 4.8989348744045044e-05, + "loss": 0.3714, + "step": 1493500 + }, + { + "epoch": 10.109896058899956, + "grad_norm": 0.3537133038043976, + "learning_rate": 4.8989010394110006e-05, + "loss": 0.3717, + "step": 1494000 + }, + { + "epoch": 10.113279558250325, + "grad_norm": 0.3469699025154114, + "learning_rate": 4.8988672044174975e-05, + "loss": 0.3742, + "step": 1494500 + }, + { + "epoch": 10.116663057600693, + "grad_norm": 0.3984968960285187, + "learning_rate": 4.898833369423994e-05, + "loss": 0.3712, + "step": 1495000 + }, + { + "epoch": 10.120046556951062, + "grad_norm": 0.36833515763282776, + "learning_rate": 4.89879953443049e-05, + "loss": 0.3714, + "step": 1495500 + }, + { + "epoch": 10.123430056301428, + "grad_norm": 0.44496744871139526, + "learning_rate": 4.8987656994369854e-05, + "loss": 0.3715, + "step": 1496000 + }, + { + "epoch": 10.126813555651797, + "grad_norm": 0.366485059261322, + "learning_rate": 4.898731864443482e-05, + "loss": 0.3706, + "step": 1496500 + }, + { + "epoch": 10.130197055002165, + "grad_norm": 0.3111550807952881, + "learning_rate": 4.8986980294499785e-05, + "loss": 0.3718, + "step": 1497000 + }, + { + "epoch": 10.133580554352534, + "grad_norm": 0.35600733757019043, + "learning_rate": 4.898664194456475e-05, + "loss": 0.3741, + "step": 1497500 + }, + { + "epoch": 10.136964053702902, + "grad_norm": 0.33908215165138245, + "learning_rate": 4.898630359462971e-05, + "loss": 0.3713, + "step": 1498000 + }, + { + "epoch": 10.14034755305327, + "grad_norm": 0.3269176185131073, + "learning_rate": 4.898596524469468e-05, + "loss": 0.3716, + "step": 1498500 + }, + { + "epoch": 10.143731052403638, + "grad_norm": 0.31056517362594604, + "learning_rate": 4.898562689475964e-05, + "loss": 0.3717, + "step": 1499000 + }, + { + "epoch": 10.147114551754006, + "grad_norm": 0.3457142114639282, + "learning_rate": 4.89852885448246e-05, + "loss": 0.3724, + "step": 1499500 + }, + { + "epoch": 10.150498051104375, + "grad_norm": 0.36891162395477295, + "learning_rate": 4.8984950194889565e-05, + "loss": 0.3725, + "step": 1500000 + }, + { + "epoch": 10.153881550454742, + "grad_norm": 0.33141809701919556, + "learning_rate": 4.898461184495453e-05, + "loss": 0.3742, + "step": 1500500 + }, + { + "epoch": 10.15726504980511, + "grad_norm": 0.35837483406066895, + "learning_rate": 4.898427349501949e-05, + "loss": 0.3704, + "step": 1501000 + }, + { + "epoch": 10.160648549155479, + "grad_norm": 0.37250304222106934, + "learning_rate": 4.898393514508445e-05, + "loss": 0.3726, + "step": 1501500 + }, + { + "epoch": 10.164032048505847, + "grad_norm": 0.32248085737228394, + "learning_rate": 4.898359679514942e-05, + "loss": 0.3704, + "step": 1502000 + }, + { + "epoch": 10.167415547856216, + "grad_norm": 0.44182881712913513, + "learning_rate": 4.898325844521438e-05, + "loss": 0.3708, + "step": 1502500 + }, + { + "epoch": 10.170799047206582, + "grad_norm": 0.3475426137447357, + "learning_rate": 4.8982920095279344e-05, + "loss": 0.3714, + "step": 1503000 + }, + { + "epoch": 10.17418254655695, + "grad_norm": 0.3327220678329468, + "learning_rate": 4.8982581745344306e-05, + "loss": 0.373, + "step": 1503500 + }, + { + "epoch": 10.17756604590732, + "grad_norm": 0.3623294532299042, + "learning_rate": 4.898224339540927e-05, + "loss": 0.3721, + "step": 1504000 + }, + { + "epoch": 10.180949545257688, + "grad_norm": 0.33989912271499634, + "learning_rate": 4.898190504547424e-05, + "loss": 0.3713, + "step": 1504500 + }, + { + "epoch": 10.184333044608055, + "grad_norm": 0.32133978605270386, + "learning_rate": 4.89815666955392e-05, + "loss": 0.3719, + "step": 1505000 + }, + { + "epoch": 10.187716543958423, + "grad_norm": 0.38162854313850403, + "learning_rate": 4.8981228345604155e-05, + "loss": 0.3727, + "step": 1505500 + }, + { + "epoch": 10.191100043308792, + "grad_norm": 0.37406229972839355, + "learning_rate": 4.8980889995669124e-05, + "loss": 0.3718, + "step": 1506000 + }, + { + "epoch": 10.19448354265916, + "grad_norm": 0.3452966511249542, + "learning_rate": 4.8980551645734086e-05, + "loss": 0.3714, + "step": 1506500 + }, + { + "epoch": 10.197867042009529, + "grad_norm": 0.3505021631717682, + "learning_rate": 4.898021329579905e-05, + "loss": 0.3707, + "step": 1507000 + }, + { + "epoch": 10.201250541359896, + "grad_norm": 0.3570825159549713, + "learning_rate": 4.897987494586401e-05, + "loss": 0.3703, + "step": 1507500 + }, + { + "epoch": 10.204634040710264, + "grad_norm": 0.3546437919139862, + "learning_rate": 4.897953659592898e-05, + "loss": 0.371, + "step": 1508000 + }, + { + "epoch": 10.208017540060633, + "grad_norm": 0.41467759013175964, + "learning_rate": 4.897919824599394e-05, + "loss": 0.3722, + "step": 1508500 + }, + { + "epoch": 10.211401039411001, + "grad_norm": 0.39821138978004456, + "learning_rate": 4.89788598960589e-05, + "loss": 0.371, + "step": 1509000 + }, + { + "epoch": 10.214784538761368, + "grad_norm": 0.3705928325653076, + "learning_rate": 4.8978521546123865e-05, + "loss": 0.3718, + "step": 1509500 + }, + { + "epoch": 10.218168038111736, + "grad_norm": 0.3757786750793457, + "learning_rate": 4.897818319618883e-05, + "loss": 0.3716, + "step": 1510000 + }, + { + "epoch": 10.221551537462105, + "grad_norm": 0.3502088785171509, + "learning_rate": 4.897784484625379e-05, + "loss": 0.3726, + "step": 1510500 + }, + { + "epoch": 10.224935036812473, + "grad_norm": 0.3413272202014923, + "learning_rate": 4.897750649631875e-05, + "loss": 0.3707, + "step": 1511000 + }, + { + "epoch": 10.228318536162842, + "grad_norm": 0.35884177684783936, + "learning_rate": 4.8977168146383714e-05, + "loss": 0.3721, + "step": 1511500 + }, + { + "epoch": 10.231702035513209, + "grad_norm": 0.33523499965667725, + "learning_rate": 4.897682979644868e-05, + "loss": 0.3705, + "step": 1512000 + }, + { + "epoch": 10.235085534863577, + "grad_norm": 0.40591904520988464, + "learning_rate": 4.8976491446513645e-05, + "loss": 0.3725, + "step": 1512500 + }, + { + "epoch": 10.238469034213946, + "grad_norm": 0.3495722711086273, + "learning_rate": 4.897615309657861e-05, + "loss": 0.3723, + "step": 1513000 + }, + { + "epoch": 10.241852533564314, + "grad_norm": 0.3645005226135254, + "learning_rate": 4.897581474664357e-05, + "loss": 0.3725, + "step": 1513500 + }, + { + "epoch": 10.245236032914681, + "grad_norm": 0.3237664997577667, + "learning_rate": 4.897547639670854e-05, + "loss": 0.3728, + "step": 1514000 + }, + { + "epoch": 10.24861953226505, + "grad_norm": 0.3544117510318756, + "learning_rate": 4.89751380467735e-05, + "loss": 0.3725, + "step": 1514500 + }, + { + "epoch": 10.252003031615418, + "grad_norm": 0.3753994107246399, + "learning_rate": 4.8974799696838456e-05, + "loss": 0.374, + "step": 1515000 + }, + { + "epoch": 10.255386530965787, + "grad_norm": 0.34924355149269104, + "learning_rate": 4.8974461346903424e-05, + "loss": 0.3733, + "step": 1515500 + }, + { + "epoch": 10.258770030316153, + "grad_norm": 0.41519322991371155, + "learning_rate": 4.8974122996968387e-05, + "loss": 0.3718, + "step": 1516000 + }, + { + "epoch": 10.262153529666522, + "grad_norm": 0.33920472860336304, + "learning_rate": 4.897378464703335e-05, + "loss": 0.373, + "step": 1516500 + }, + { + "epoch": 10.26553702901689, + "grad_norm": 0.3762575387954712, + "learning_rate": 4.897344629709831e-05, + "loss": 0.3723, + "step": 1517000 + }, + { + "epoch": 10.268920528367259, + "grad_norm": 0.33512699604034424, + "learning_rate": 4.897310794716328e-05, + "loss": 0.371, + "step": 1517500 + }, + { + "epoch": 10.272304027717627, + "grad_norm": 0.36007460951805115, + "learning_rate": 4.897276959722824e-05, + "loss": 0.3722, + "step": 1518000 + }, + { + "epoch": 10.275687527067994, + "grad_norm": 0.34781354665756226, + "learning_rate": 4.8972431247293204e-05, + "loss": 0.3732, + "step": 1518500 + }, + { + "epoch": 10.279071026418363, + "grad_norm": 0.3836442828178406, + "learning_rate": 4.8972092897358166e-05, + "loss": 0.3719, + "step": 1519000 + }, + { + "epoch": 10.282454525768731, + "grad_norm": 0.3336435556411743, + "learning_rate": 4.897175454742313e-05, + "loss": 0.3729, + "step": 1519500 + }, + { + "epoch": 10.2858380251191, + "grad_norm": 0.3318532705307007, + "learning_rate": 4.897141619748809e-05, + "loss": 0.3735, + "step": 1520000 + }, + { + "epoch": 10.289221524469466, + "grad_norm": 0.3705673813819885, + "learning_rate": 4.897107784755305e-05, + "loss": 0.3723, + "step": 1520500 + }, + { + "epoch": 10.292605023819835, + "grad_norm": 0.3291831910610199, + "learning_rate": 4.8970739497618015e-05, + "loss": 0.3729, + "step": 1521000 + }, + { + "epoch": 10.295988523170204, + "grad_norm": 0.3454623818397522, + "learning_rate": 4.8970401147682983e-05, + "loss": 0.3723, + "step": 1521500 + }, + { + "epoch": 10.299372022520572, + "grad_norm": 0.34960824251174927, + "learning_rate": 4.8970062797747946e-05, + "loss": 0.3724, + "step": 1522000 + }, + { + "epoch": 10.30275552187094, + "grad_norm": 0.3701937198638916, + "learning_rate": 4.896972444781291e-05, + "loss": 0.373, + "step": 1522500 + }, + { + "epoch": 10.306139021221307, + "grad_norm": 0.3673439621925354, + "learning_rate": 4.896938609787787e-05, + "loss": 0.3737, + "step": 1523000 + }, + { + "epoch": 10.309522520571676, + "grad_norm": 0.3876641094684601, + "learning_rate": 4.896904774794284e-05, + "loss": 0.3723, + "step": 1523500 + }, + { + "epoch": 10.312906019922044, + "grad_norm": 0.3613537847995758, + "learning_rate": 4.89687093980078e-05, + "loss": 0.371, + "step": 1524000 + }, + { + "epoch": 10.316289519272413, + "grad_norm": 0.3192806839942932, + "learning_rate": 4.8968371048072756e-05, + "loss": 0.371, + "step": 1524500 + }, + { + "epoch": 10.31967301862278, + "grad_norm": 0.35730502009391785, + "learning_rate": 4.8968032698137725e-05, + "loss": 0.3736, + "step": 1525000 + }, + { + "epoch": 10.323056517973148, + "grad_norm": 0.35998380184173584, + "learning_rate": 4.896769434820269e-05, + "loss": 0.3726, + "step": 1525500 + }, + { + "epoch": 10.326440017323517, + "grad_norm": 0.3771452307701111, + "learning_rate": 4.896735599826765e-05, + "loss": 0.3705, + "step": 1526000 + }, + { + "epoch": 10.329823516673885, + "grad_norm": 0.39200085401535034, + "learning_rate": 4.896701764833261e-05, + "loss": 0.3711, + "step": 1526500 + }, + { + "epoch": 10.333207016024254, + "grad_norm": 0.3914739191532135, + "learning_rate": 4.896667929839758e-05, + "loss": 0.3725, + "step": 1527000 + }, + { + "epoch": 10.33659051537462, + "grad_norm": 0.39595144987106323, + "learning_rate": 4.896634094846254e-05, + "loss": 0.3732, + "step": 1527500 + }, + { + "epoch": 10.339974014724989, + "grad_norm": 0.35875624418258667, + "learning_rate": 4.8966002598527505e-05, + "loss": 0.3728, + "step": 1528000 + }, + { + "epoch": 10.343357514075358, + "grad_norm": 0.3740348219871521, + "learning_rate": 4.896566424859247e-05, + "loss": 0.3738, + "step": 1528500 + }, + { + "epoch": 10.346741013425726, + "grad_norm": 0.3435947597026825, + "learning_rate": 4.896532589865743e-05, + "loss": 0.3715, + "step": 1529000 + }, + { + "epoch": 10.350124512776093, + "grad_norm": 0.34385281801223755, + "learning_rate": 4.896498754872239e-05, + "loss": 0.3719, + "step": 1529500 + }, + { + "epoch": 10.353508012126461, + "grad_norm": 0.3487713932991028, + "learning_rate": 4.896464919878735e-05, + "loss": 0.3726, + "step": 1530000 + }, + { + "epoch": 10.35689151147683, + "grad_norm": 0.36372271180152893, + "learning_rate": 4.8964310848852315e-05, + "loss": 0.3715, + "step": 1530500 + }, + { + "epoch": 10.360275010827198, + "grad_norm": 0.377616286277771, + "learning_rate": 4.8963972498917284e-05, + "loss": 0.3735, + "step": 1531000 + }, + { + "epoch": 10.363658510177567, + "grad_norm": 0.33630087971687317, + "learning_rate": 4.8963634148982246e-05, + "loss": 0.3726, + "step": 1531500 + }, + { + "epoch": 10.367042009527934, + "grad_norm": 0.35733091831207275, + "learning_rate": 4.896329579904721e-05, + "loss": 0.3725, + "step": 1532000 + }, + { + "epoch": 10.370425508878302, + "grad_norm": 0.3580639064311981, + "learning_rate": 4.896295744911217e-05, + "loss": 0.372, + "step": 1532500 + }, + { + "epoch": 10.37380900822867, + "grad_norm": 0.3302856683731079, + "learning_rate": 4.896261909917714e-05, + "loss": 0.3712, + "step": 1533000 + }, + { + "epoch": 10.37719250757904, + "grad_norm": 0.33400455117225647, + "learning_rate": 4.89622807492421e-05, + "loss": 0.3739, + "step": 1533500 + }, + { + "epoch": 10.380576006929406, + "grad_norm": 0.38881915807724, + "learning_rate": 4.896194239930706e-05, + "loss": 0.3717, + "step": 1534000 + }, + { + "epoch": 10.383959506279774, + "grad_norm": 0.32272690534591675, + "learning_rate": 4.8961604049372026e-05, + "loss": 0.3717, + "step": 1534500 + }, + { + "epoch": 10.387343005630143, + "grad_norm": 0.371811181306839, + "learning_rate": 4.896126569943699e-05, + "loss": 0.3717, + "step": 1535000 + }, + { + "epoch": 10.390726504980512, + "grad_norm": 0.357614129781723, + "learning_rate": 4.896092734950195e-05, + "loss": 0.3723, + "step": 1535500 + }, + { + "epoch": 10.39411000433088, + "grad_norm": 0.3578258454799652, + "learning_rate": 4.896058899956691e-05, + "loss": 0.3721, + "step": 1536000 + }, + { + "epoch": 10.397493503681247, + "grad_norm": 0.33020392060279846, + "learning_rate": 4.896025064963188e-05, + "loss": 0.3725, + "step": 1536500 + }, + { + "epoch": 10.400877003031615, + "grad_norm": 0.3697195053100586, + "learning_rate": 4.895991229969684e-05, + "loss": 0.3733, + "step": 1537000 + }, + { + "epoch": 10.404260502381984, + "grad_norm": 0.3831406533718109, + "learning_rate": 4.8959573949761805e-05, + "loss": 0.373, + "step": 1537500 + }, + { + "epoch": 10.407644001732352, + "grad_norm": 0.3610920011997223, + "learning_rate": 4.895923559982677e-05, + "loss": 0.3715, + "step": 1538000 + }, + { + "epoch": 10.41102750108272, + "grad_norm": 0.33148255944252014, + "learning_rate": 4.8958897249891736e-05, + "loss": 0.372, + "step": 1538500 + }, + { + "epoch": 10.414411000433088, + "grad_norm": 0.3550785183906555, + "learning_rate": 4.895855889995669e-05, + "loss": 0.3744, + "step": 1539000 + }, + { + "epoch": 10.417794499783456, + "grad_norm": 0.3774226903915405, + "learning_rate": 4.8958220550021654e-05, + "loss": 0.372, + "step": 1539500 + }, + { + "epoch": 10.421177999133825, + "grad_norm": 0.3333495259284973, + "learning_rate": 4.8957882200086616e-05, + "loss": 0.374, + "step": 1540000 + }, + { + "epoch": 10.424561498484191, + "grad_norm": 0.3145923316478729, + "learning_rate": 4.8957543850151585e-05, + "loss": 0.3744, + "step": 1540500 + }, + { + "epoch": 10.42794499783456, + "grad_norm": 0.37305423617362976, + "learning_rate": 4.895720550021655e-05, + "loss": 0.3723, + "step": 1541000 + }, + { + "epoch": 10.431328497184928, + "grad_norm": 0.358360230922699, + "learning_rate": 4.895686715028151e-05, + "loss": 0.3726, + "step": 1541500 + }, + { + "epoch": 10.434711996535297, + "grad_norm": 0.3629431426525116, + "learning_rate": 4.895652880034647e-05, + "loss": 0.3726, + "step": 1542000 + }, + { + "epoch": 10.438095495885666, + "grad_norm": 0.4073221981525421, + "learning_rate": 4.895619045041144e-05, + "loss": 0.373, + "step": 1542500 + }, + { + "epoch": 10.441478995236032, + "grad_norm": 0.39308783411979675, + "learning_rate": 4.89558521004764e-05, + "loss": 0.3718, + "step": 1543000 + }, + { + "epoch": 10.4448624945864, + "grad_norm": 0.3807017505168915, + "learning_rate": 4.895551375054136e-05, + "loss": 0.3739, + "step": 1543500 + }, + { + "epoch": 10.44824599393677, + "grad_norm": 0.3773409426212311, + "learning_rate": 4.8955175400606326e-05, + "loss": 0.3721, + "step": 1544000 + }, + { + "epoch": 10.451629493287138, + "grad_norm": 0.36095383763313293, + "learning_rate": 4.895483705067129e-05, + "loss": 0.3727, + "step": 1544500 + }, + { + "epoch": 10.455012992637505, + "grad_norm": 0.3516400456428528, + "learning_rate": 4.895449870073625e-05, + "loss": 0.371, + "step": 1545000 + }, + { + "epoch": 10.458396491987873, + "grad_norm": 0.36414170265197754, + "learning_rate": 4.895416035080121e-05, + "loss": 0.3734, + "step": 1545500 + }, + { + "epoch": 10.461779991338242, + "grad_norm": 0.3702701926231384, + "learning_rate": 4.895382200086618e-05, + "loss": 0.3733, + "step": 1546000 + }, + { + "epoch": 10.46516349068861, + "grad_norm": 0.36798205971717834, + "learning_rate": 4.8953483650931144e-05, + "loss": 0.3717, + "step": 1546500 + }, + { + "epoch": 10.468546990038979, + "grad_norm": 0.3716042935848236, + "learning_rate": 4.8953145300996106e-05, + "loss": 0.3733, + "step": 1547000 + }, + { + "epoch": 10.471930489389345, + "grad_norm": 0.331015020608902, + "learning_rate": 4.895280695106107e-05, + "loss": 0.3733, + "step": 1547500 + }, + { + "epoch": 10.475313988739714, + "grad_norm": 0.3597949147224426, + "learning_rate": 4.895246860112604e-05, + "loss": 0.3697, + "step": 1548000 + }, + { + "epoch": 10.478697488090083, + "grad_norm": 0.3553822636604309, + "learning_rate": 4.895213025119099e-05, + "loss": 0.3741, + "step": 1548500 + }, + { + "epoch": 10.482080987440451, + "grad_norm": 0.3873803913593292, + "learning_rate": 4.8951791901255954e-05, + "loss": 0.3729, + "step": 1549000 + }, + { + "epoch": 10.485464486790818, + "grad_norm": 0.3339466452598572, + "learning_rate": 4.8951453551320916e-05, + "loss": 0.3728, + "step": 1549500 + }, + { + "epoch": 10.488847986141186, + "grad_norm": 0.335771769285202, + "learning_rate": 4.8951115201385885e-05, + "loss": 0.3721, + "step": 1550000 + }, + { + "epoch": 10.492231485491555, + "grad_norm": 0.37400147318840027, + "learning_rate": 4.895077685145085e-05, + "loss": 0.3733, + "step": 1550500 + }, + { + "epoch": 10.495614984841923, + "grad_norm": 0.37101054191589355, + "learning_rate": 4.895043850151581e-05, + "loss": 0.3729, + "step": 1551000 + }, + { + "epoch": 10.498998484192292, + "grad_norm": 0.36181971430778503, + "learning_rate": 4.895010015158077e-05, + "loss": 0.371, + "step": 1551500 + }, + { + "epoch": 10.502381983542659, + "grad_norm": 0.3649595379829407, + "learning_rate": 4.894976180164574e-05, + "loss": 0.371, + "step": 1552000 + }, + { + "epoch": 10.505765482893027, + "grad_norm": 0.3265508711338043, + "learning_rate": 4.89494234517107e-05, + "loss": 0.3719, + "step": 1552500 + }, + { + "epoch": 10.509148982243396, + "grad_norm": 0.3649456799030304, + "learning_rate": 4.894908510177566e-05, + "loss": 0.3709, + "step": 1553000 + }, + { + "epoch": 10.512532481593764, + "grad_norm": 0.33293768763542175, + "learning_rate": 4.894874675184063e-05, + "loss": 0.3732, + "step": 1553500 + }, + { + "epoch": 10.515915980944131, + "grad_norm": 0.34916120767593384, + "learning_rate": 4.894840840190559e-05, + "loss": 0.3714, + "step": 1554000 + }, + { + "epoch": 10.5192994802945, + "grad_norm": 0.3865835666656494, + "learning_rate": 4.894807005197055e-05, + "loss": 0.3726, + "step": 1554500 + }, + { + "epoch": 10.522682979644868, + "grad_norm": 0.3607036769390106, + "learning_rate": 4.894773170203551e-05, + "loss": 0.3711, + "step": 1555000 + }, + { + "epoch": 10.526066478995237, + "grad_norm": 0.36521416902542114, + "learning_rate": 4.894739335210048e-05, + "loss": 0.3734, + "step": 1555500 + }, + { + "epoch": 10.529449978345603, + "grad_norm": 0.3339827060699463, + "learning_rate": 4.8947055002165444e-05, + "loss": 0.3731, + "step": 1556000 + }, + { + "epoch": 10.532833477695972, + "grad_norm": 0.3377821743488312, + "learning_rate": 4.8946716652230406e-05, + "loss": 0.3729, + "step": 1556500 + }, + { + "epoch": 10.53621697704634, + "grad_norm": 0.4511741101741791, + "learning_rate": 4.894637830229537e-05, + "loss": 0.3713, + "step": 1557000 + }, + { + "epoch": 10.539600476396709, + "grad_norm": 0.34616196155548096, + "learning_rate": 4.894603995236034e-05, + "loss": 0.3747, + "step": 1557500 + }, + { + "epoch": 10.542983975747077, + "grad_norm": 0.3160056471824646, + "learning_rate": 4.894570160242529e-05, + "loss": 0.3725, + "step": 1558000 + }, + { + "epoch": 10.546367475097444, + "grad_norm": 0.3685974180698395, + "learning_rate": 4.8945363252490255e-05, + "loss": 0.372, + "step": 1558500 + }, + { + "epoch": 10.549750974447813, + "grad_norm": 0.3372802734375, + "learning_rate": 4.894502490255522e-05, + "loss": 0.3718, + "step": 1559000 + }, + { + "epoch": 10.553134473798181, + "grad_norm": 0.3723401129245758, + "learning_rate": 4.8944686552620186e-05, + "loss": 0.3736, + "step": 1559500 + }, + { + "epoch": 10.55651797314855, + "grad_norm": 0.36342746019363403, + "learning_rate": 4.894434820268515e-05, + "loss": 0.374, + "step": 1560000 + }, + { + "epoch": 10.559901472498918, + "grad_norm": 0.34721601009368896, + "learning_rate": 4.894400985275011e-05, + "loss": 0.374, + "step": 1560500 + }, + { + "epoch": 10.563284971849285, + "grad_norm": 0.38332539796829224, + "learning_rate": 4.894367150281507e-05, + "loss": 0.3719, + "step": 1561000 + }, + { + "epoch": 10.566668471199653, + "grad_norm": 0.35107436776161194, + "learning_rate": 4.894333315288004e-05, + "loss": 0.3731, + "step": 1561500 + }, + { + "epoch": 10.570051970550022, + "grad_norm": 0.3253043591976166, + "learning_rate": 4.8942994802945e-05, + "loss": 0.3727, + "step": 1562000 + }, + { + "epoch": 10.57343546990039, + "grad_norm": 0.36623555421829224, + "learning_rate": 4.894265645300996e-05, + "loss": 0.3733, + "step": 1562500 + }, + { + "epoch": 10.576818969250757, + "grad_norm": 0.35085707902908325, + "learning_rate": 4.894231810307493e-05, + "loss": 0.3714, + "step": 1563000 + }, + { + "epoch": 10.580202468601126, + "grad_norm": 0.3474346101284027, + "learning_rate": 4.894197975313989e-05, + "loss": 0.3736, + "step": 1563500 + }, + { + "epoch": 10.583585967951494, + "grad_norm": 0.3433658182621002, + "learning_rate": 4.894164140320485e-05, + "loss": 0.3708, + "step": 1564000 + }, + { + "epoch": 10.586969467301863, + "grad_norm": 0.38147303462028503, + "learning_rate": 4.8941303053269814e-05, + "loss": 0.3716, + "step": 1564500 + }, + { + "epoch": 10.59035296665223, + "grad_norm": 0.33993029594421387, + "learning_rate": 4.894096470333478e-05, + "loss": 0.3715, + "step": 1565000 + }, + { + "epoch": 10.593736466002598, + "grad_norm": 0.3724004924297333, + "learning_rate": 4.8940626353399745e-05, + "loss": 0.3728, + "step": 1565500 + }, + { + "epoch": 10.597119965352967, + "grad_norm": 0.34733206033706665, + "learning_rate": 4.894028800346471e-05, + "loss": 0.3709, + "step": 1566000 + }, + { + "epoch": 10.600503464703335, + "grad_norm": 0.3865257203578949, + "learning_rate": 4.893994965352967e-05, + "loss": 0.3732, + "step": 1566500 + }, + { + "epoch": 10.603886964053704, + "grad_norm": 0.33898982405662537, + "learning_rate": 4.893961130359463e-05, + "loss": 0.3724, + "step": 1567000 + }, + { + "epoch": 10.60727046340407, + "grad_norm": 0.32820501923561096, + "learning_rate": 4.8939272953659593e-05, + "loss": 0.3739, + "step": 1567500 + }, + { + "epoch": 10.610653962754439, + "grad_norm": 0.34335583448410034, + "learning_rate": 4.8938934603724556e-05, + "loss": 0.3727, + "step": 1568000 + }, + { + "epoch": 10.614037462104807, + "grad_norm": 0.3512874245643616, + "learning_rate": 4.893859625378952e-05, + "loss": 0.372, + "step": 1568500 + }, + { + "epoch": 10.617420961455176, + "grad_norm": 0.3625296652317047, + "learning_rate": 4.893825790385449e-05, + "loss": 0.3731, + "step": 1569000 + }, + { + "epoch": 10.620804460805543, + "grad_norm": 0.344817578792572, + "learning_rate": 4.893791955391945e-05, + "loss": 0.3733, + "step": 1569500 + }, + { + "epoch": 10.624187960155911, + "grad_norm": 0.398043155670166, + "learning_rate": 4.893758120398441e-05, + "loss": 0.3726, + "step": 1570000 + }, + { + "epoch": 10.62757145950628, + "grad_norm": 0.3540467619895935, + "learning_rate": 4.893724285404937e-05, + "loss": 0.372, + "step": 1570500 + }, + { + "epoch": 10.630954958856648, + "grad_norm": 0.32507434487342834, + "learning_rate": 4.893690450411434e-05, + "loss": 0.3727, + "step": 1571000 + }, + { + "epoch": 10.634338458207017, + "grad_norm": 0.36480745673179626, + "learning_rate": 4.8936566154179304e-05, + "loss": 0.3732, + "step": 1571500 + }, + { + "epoch": 10.637721957557384, + "grad_norm": 0.38430601358413696, + "learning_rate": 4.893622780424426e-05, + "loss": 0.3753, + "step": 1572000 + }, + { + "epoch": 10.641105456907752, + "grad_norm": 0.327197790145874, + "learning_rate": 4.893588945430923e-05, + "loss": 0.3727, + "step": 1572500 + }, + { + "epoch": 10.64448895625812, + "grad_norm": 0.3469175696372986, + "learning_rate": 4.893555110437419e-05, + "loss": 0.3719, + "step": 1573000 + }, + { + "epoch": 10.64787245560849, + "grad_norm": 0.30438894033432007, + "learning_rate": 4.893521275443915e-05, + "loss": 0.3723, + "step": 1573500 + }, + { + "epoch": 10.651255954958856, + "grad_norm": 0.39085620641708374, + "learning_rate": 4.8934874404504115e-05, + "loss": 0.3733, + "step": 1574000 + }, + { + "epoch": 10.654639454309224, + "grad_norm": 0.35249242186546326, + "learning_rate": 4.893453605456908e-05, + "loss": 0.3744, + "step": 1574500 + }, + { + "epoch": 10.658022953659593, + "grad_norm": 0.35302841663360596, + "learning_rate": 4.8934197704634046e-05, + "loss": 0.373, + "step": 1575000 + }, + { + "epoch": 10.661406453009961, + "grad_norm": 0.3354608118534088, + "learning_rate": 4.893385935469901e-05, + "loss": 0.3734, + "step": 1575500 + }, + { + "epoch": 10.66478995236033, + "grad_norm": 0.37679925560951233, + "learning_rate": 4.893352100476397e-05, + "loss": 0.3724, + "step": 1576000 + }, + { + "epoch": 10.668173451710697, + "grad_norm": 0.35046711564064026, + "learning_rate": 4.893318265482893e-05, + "loss": 0.3725, + "step": 1576500 + }, + { + "epoch": 10.671556951061065, + "grad_norm": 0.34201011061668396, + "learning_rate": 4.8932844304893894e-05, + "loss": 0.3734, + "step": 1577000 + }, + { + "epoch": 10.674940450411434, + "grad_norm": 0.3327251970767975, + "learning_rate": 4.8932505954958856e-05, + "loss": 0.3714, + "step": 1577500 + }, + { + "epoch": 10.678323949761802, + "grad_norm": 0.31196796894073486, + "learning_rate": 4.893216760502382e-05, + "loss": 0.3732, + "step": 1578000 + }, + { + "epoch": 10.681707449112169, + "grad_norm": 0.32061767578125, + "learning_rate": 4.893182925508879e-05, + "loss": 0.3713, + "step": 1578500 + }, + { + "epoch": 10.685090948462538, + "grad_norm": 0.33071762323379517, + "learning_rate": 4.893149090515375e-05, + "loss": 0.3728, + "step": 1579000 + }, + { + "epoch": 10.688474447812906, + "grad_norm": 0.4266686737537384, + "learning_rate": 4.893115255521871e-05, + "loss": 0.3721, + "step": 1579500 + }, + { + "epoch": 10.691857947163275, + "grad_norm": 0.3743648827075958, + "learning_rate": 4.8930814205283674e-05, + "loss": 0.3735, + "step": 1580000 + }, + { + "epoch": 10.695241446513641, + "grad_norm": 0.35196825861930847, + "learning_rate": 4.893047585534864e-05, + "loss": 0.371, + "step": 1580500 + }, + { + "epoch": 10.69862494586401, + "grad_norm": 0.34265002608299255, + "learning_rate": 4.8930137505413605e-05, + "loss": 0.3722, + "step": 1581000 + }, + { + "epoch": 10.702008445214378, + "grad_norm": 0.3608235716819763, + "learning_rate": 4.892979915547856e-05, + "loss": 0.3725, + "step": 1581500 + }, + { + "epoch": 10.705391944564747, + "grad_norm": 0.3871472477912903, + "learning_rate": 4.892946080554353e-05, + "loss": 0.3724, + "step": 1582000 + }, + { + "epoch": 10.708775443915115, + "grad_norm": 0.5456562638282776, + "learning_rate": 4.892912245560849e-05, + "loss": 0.3731, + "step": 1582500 + }, + { + "epoch": 10.712158943265482, + "grad_norm": 0.304875910282135, + "learning_rate": 4.892878410567345e-05, + "loss": 0.3735, + "step": 1583000 + }, + { + "epoch": 10.71554244261585, + "grad_norm": 0.3602370023727417, + "learning_rate": 4.8928445755738415e-05, + "loss": 0.3726, + "step": 1583500 + }, + { + "epoch": 10.71892594196622, + "grad_norm": 0.36348941922187805, + "learning_rate": 4.892810740580338e-05, + "loss": 0.3724, + "step": 1584000 + }, + { + "epoch": 10.722309441316588, + "grad_norm": 0.3065342903137207, + "learning_rate": 4.8927769055868346e-05, + "loss": 0.3732, + "step": 1584500 + }, + { + "epoch": 10.725692940666956, + "grad_norm": 0.3403933644294739, + "learning_rate": 4.892743070593331e-05, + "loss": 0.3733, + "step": 1585000 + }, + { + "epoch": 10.729076440017323, + "grad_norm": 0.36604762077331543, + "learning_rate": 4.892709235599827e-05, + "loss": 0.3722, + "step": 1585500 + }, + { + "epoch": 10.732459939367692, + "grad_norm": 0.36708346009254456, + "learning_rate": 4.892675400606323e-05, + "loss": 0.3731, + "step": 1586000 + }, + { + "epoch": 10.73584343871806, + "grad_norm": 0.3771677613258362, + "learning_rate": 4.8926415656128195e-05, + "loss": 0.3736, + "step": 1586500 + }, + { + "epoch": 10.739226938068429, + "grad_norm": 0.3557665944099426, + "learning_rate": 4.892607730619316e-05, + "loss": 0.3724, + "step": 1587000 + }, + { + "epoch": 10.742610437418795, + "grad_norm": 0.4154599905014038, + "learning_rate": 4.892573895625812e-05, + "loss": 0.3738, + "step": 1587500 + }, + { + "epoch": 10.745993936769164, + "grad_norm": 0.34683358669281006, + "learning_rate": 4.892540060632309e-05, + "loss": 0.3727, + "step": 1588000 + }, + { + "epoch": 10.749377436119532, + "grad_norm": 0.3479159474372864, + "learning_rate": 4.892506225638805e-05, + "loss": 0.3741, + "step": 1588500 + }, + { + "epoch": 10.752760935469901, + "grad_norm": 0.3663256764411926, + "learning_rate": 4.892472390645301e-05, + "loss": 0.3725, + "step": 1589000 + }, + { + "epoch": 10.756144434820268, + "grad_norm": 0.3802635669708252, + "learning_rate": 4.8924385556517974e-05, + "loss": 0.3717, + "step": 1589500 + }, + { + "epoch": 10.759527934170636, + "grad_norm": 0.35679909586906433, + "learning_rate": 4.892404720658294e-05, + "loss": 0.3742, + "step": 1590000 + }, + { + "epoch": 10.762911433521005, + "grad_norm": 0.33733245730400085, + "learning_rate": 4.8923708856647905e-05, + "loss": 0.3725, + "step": 1590500 + }, + { + "epoch": 10.766294932871373, + "grad_norm": 0.5832557082176208, + "learning_rate": 4.892337050671287e-05, + "loss": 0.3715, + "step": 1591000 + }, + { + "epoch": 10.769678432221742, + "grad_norm": 0.37262704968452454, + "learning_rate": 4.892303215677782e-05, + "loss": 0.3733, + "step": 1591500 + }, + { + "epoch": 10.773061931572109, + "grad_norm": 0.3342667818069458, + "learning_rate": 4.892269380684279e-05, + "loss": 0.3711, + "step": 1592000 + }, + { + "epoch": 10.776445430922477, + "grad_norm": 0.3682439625263214, + "learning_rate": 4.8922355456907754e-05, + "loss": 0.3719, + "step": 1592500 + }, + { + "epoch": 10.779828930272846, + "grad_norm": 0.33057907223701477, + "learning_rate": 4.8922017106972716e-05, + "loss": 0.3719, + "step": 1593000 + }, + { + "epoch": 10.783212429623214, + "grad_norm": 0.39824581146240234, + "learning_rate": 4.892167875703768e-05, + "loss": 0.3719, + "step": 1593500 + }, + { + "epoch": 10.78659592897358, + "grad_norm": 0.35091519355773926, + "learning_rate": 4.892134040710265e-05, + "loss": 0.3733, + "step": 1594000 + }, + { + "epoch": 10.78997942832395, + "grad_norm": 0.37601837515830994, + "learning_rate": 4.892100205716761e-05, + "loss": 0.3728, + "step": 1594500 + }, + { + "epoch": 10.793362927674318, + "grad_norm": 0.3370593190193176, + "learning_rate": 4.892066370723257e-05, + "loss": 0.3723, + "step": 1595000 + }, + { + "epoch": 10.796746427024686, + "grad_norm": 0.37330934405326843, + "learning_rate": 4.892032535729753e-05, + "loss": 0.3728, + "step": 1595500 + }, + { + "epoch": 10.800129926375053, + "grad_norm": 0.3830586075782776, + "learning_rate": 4.8919987007362495e-05, + "loss": 0.3739, + "step": 1596000 + }, + { + "epoch": 10.803513425725422, + "grad_norm": 0.3950082063674927, + "learning_rate": 4.891964865742746e-05, + "loss": 0.3732, + "step": 1596500 + }, + { + "epoch": 10.80689692507579, + "grad_norm": 0.3372601568698883, + "learning_rate": 4.891931030749242e-05, + "loss": 0.3723, + "step": 1597000 + }, + { + "epoch": 10.810280424426159, + "grad_norm": 0.35912057757377625, + "learning_rate": 4.891897195755739e-05, + "loss": 0.373, + "step": 1597500 + }, + { + "epoch": 10.813663923776527, + "grad_norm": 0.35594770312309265, + "learning_rate": 4.891863360762235e-05, + "loss": 0.3732, + "step": 1598000 + }, + { + "epoch": 10.817047423126894, + "grad_norm": 0.3459462523460388, + "learning_rate": 4.891829525768731e-05, + "loss": 0.3742, + "step": 1598500 + }, + { + "epoch": 10.820430922477263, + "grad_norm": 0.33997267484664917, + "learning_rate": 4.8917956907752275e-05, + "loss": 0.3723, + "step": 1599000 + }, + { + "epoch": 10.823814421827631, + "grad_norm": 0.34426257014274597, + "learning_rate": 4.8917618557817244e-05, + "loss": 0.3748, + "step": 1599500 + }, + { + "epoch": 10.827197921178, + "grad_norm": 0.39141958951950073, + "learning_rate": 4.8917280207882206e-05, + "loss": 0.3717, + "step": 1600000 + }, + { + "epoch": 10.830581420528368, + "grad_norm": 0.3831254243850708, + "learning_rate": 4.891694185794717e-05, + "loss": 0.3728, + "step": 1600500 + }, + { + "epoch": 10.833964919878735, + "grad_norm": 0.33389633893966675, + "learning_rate": 4.891660350801212e-05, + "loss": 0.3723, + "step": 1601000 + }, + { + "epoch": 10.837348419229103, + "grad_norm": 0.3628421425819397, + "learning_rate": 4.891626515807709e-05, + "loss": 0.3736, + "step": 1601500 + }, + { + "epoch": 10.840731918579472, + "grad_norm": 0.3639175295829773, + "learning_rate": 4.8915926808142054e-05, + "loss": 0.3715, + "step": 1602000 + }, + { + "epoch": 10.84411541792984, + "grad_norm": 0.35927465558052063, + "learning_rate": 4.8915588458207017e-05, + "loss": 0.3733, + "step": 1602500 + }, + { + "epoch": 10.847498917280207, + "grad_norm": 0.3583664000034332, + "learning_rate": 4.891525010827198e-05, + "loss": 0.3731, + "step": 1603000 + }, + { + "epoch": 10.850882416630576, + "grad_norm": 0.3696195185184479, + "learning_rate": 4.891491175833695e-05, + "loss": 0.3749, + "step": 1603500 + }, + { + "epoch": 10.854265915980944, + "grad_norm": 0.367540180683136, + "learning_rate": 4.891457340840191e-05, + "loss": 0.3734, + "step": 1604000 + }, + { + "epoch": 10.857649415331313, + "grad_norm": 0.38197025656700134, + "learning_rate": 4.891423505846687e-05, + "loss": 0.3717, + "step": 1604500 + }, + { + "epoch": 10.86103291468168, + "grad_norm": 0.3927936255931854, + "learning_rate": 4.8913896708531834e-05, + "loss": 0.372, + "step": 1605000 + }, + { + "epoch": 10.864416414032048, + "grad_norm": 0.3550572693347931, + "learning_rate": 4.8913558358596796e-05, + "loss": 0.372, + "step": 1605500 + }, + { + "epoch": 10.867799913382417, + "grad_norm": 0.366964727640152, + "learning_rate": 4.891322000866176e-05, + "loss": 0.3726, + "step": 1606000 + }, + { + "epoch": 10.871183412732785, + "grad_norm": 0.3368151783943176, + "learning_rate": 4.891288165872672e-05, + "loss": 0.3723, + "step": 1606500 + }, + { + "epoch": 10.874566912083154, + "grad_norm": 0.34027695655822754, + "learning_rate": 4.891254330879169e-05, + "loss": 0.3715, + "step": 1607000 + }, + { + "epoch": 10.87795041143352, + "grad_norm": 0.3800116181373596, + "learning_rate": 4.891220495885665e-05, + "loss": 0.3718, + "step": 1607500 + }, + { + "epoch": 10.881333910783889, + "grad_norm": 0.35723376274108887, + "learning_rate": 4.8911866608921613e-05, + "loss": 0.3728, + "step": 1608000 + }, + { + "epoch": 10.884717410134257, + "grad_norm": 0.3366886377334595, + "learning_rate": 4.8911528258986576e-05, + "loss": 0.3733, + "step": 1608500 + }, + { + "epoch": 10.888100909484626, + "grad_norm": 0.38351333141326904, + "learning_rate": 4.8911189909051544e-05, + "loss": 0.3725, + "step": 1609000 + }, + { + "epoch": 10.891484408834993, + "grad_norm": 0.36331382393836975, + "learning_rate": 4.8910851559116507e-05, + "loss": 0.373, + "step": 1609500 + }, + { + "epoch": 10.894867908185361, + "grad_norm": 0.3579769432544708, + "learning_rate": 4.891051320918147e-05, + "loss": 0.3724, + "step": 1610000 + }, + { + "epoch": 10.89825140753573, + "grad_norm": 0.3592638671398163, + "learning_rate": 4.8910174859246424e-05, + "loss": 0.3718, + "step": 1610500 + }, + { + "epoch": 10.901634906886098, + "grad_norm": 0.37395668029785156, + "learning_rate": 4.890983650931139e-05, + "loss": 0.3714, + "step": 1611000 + }, + { + "epoch": 10.905018406236467, + "grad_norm": 0.31781119108200073, + "learning_rate": 4.8909498159376355e-05, + "loss": 0.3735, + "step": 1611500 + }, + { + "epoch": 10.908401905586834, + "grad_norm": 0.3627088665962219, + "learning_rate": 4.890915980944132e-05, + "loss": 0.3733, + "step": 1612000 + }, + { + "epoch": 10.911785404937202, + "grad_norm": 0.3828943073749542, + "learning_rate": 4.890882145950628e-05, + "loss": 0.3733, + "step": 1612500 + }, + { + "epoch": 10.91516890428757, + "grad_norm": 0.3835192322731018, + "learning_rate": 4.890848310957125e-05, + "loss": 0.3721, + "step": 1613000 + }, + { + "epoch": 10.918552403637939, + "grad_norm": 0.400499165058136, + "learning_rate": 4.890814475963621e-05, + "loss": 0.3715, + "step": 1613500 + }, + { + "epoch": 10.921935902988306, + "grad_norm": 0.348901629447937, + "learning_rate": 4.890780640970117e-05, + "loss": 0.3721, + "step": 1614000 + }, + { + "epoch": 10.925319402338674, + "grad_norm": 0.39376094937324524, + "learning_rate": 4.8907468059766135e-05, + "loss": 0.3718, + "step": 1614500 + }, + { + "epoch": 10.928702901689043, + "grad_norm": 0.36082133650779724, + "learning_rate": 4.89071297098311e-05, + "loss": 0.3737, + "step": 1615000 + }, + { + "epoch": 10.932086401039411, + "grad_norm": 0.35252830386161804, + "learning_rate": 4.890679135989606e-05, + "loss": 0.3723, + "step": 1615500 + }, + { + "epoch": 10.93546990038978, + "grad_norm": 0.3130725026130676, + "learning_rate": 4.890645300996102e-05, + "loss": 0.3712, + "step": 1616000 + }, + { + "epoch": 10.938853399740147, + "grad_norm": 0.3593806326389313, + "learning_rate": 4.890611466002599e-05, + "loss": 0.3724, + "step": 1616500 + }, + { + "epoch": 10.942236899090515, + "grad_norm": 0.3564632833003998, + "learning_rate": 4.890577631009095e-05, + "loss": 0.3736, + "step": 1617000 + }, + { + "epoch": 10.945620398440884, + "grad_norm": 0.3367440402507782, + "learning_rate": 4.8905437960155914e-05, + "loss": 0.3722, + "step": 1617500 + }, + { + "epoch": 10.949003897791252, + "grad_norm": 0.35689491033554077, + "learning_rate": 4.8905099610220876e-05, + "loss": 0.3726, + "step": 1618000 + }, + { + "epoch": 10.952387397141619, + "grad_norm": 0.36531731486320496, + "learning_rate": 4.8904761260285845e-05, + "loss": 0.3716, + "step": 1618500 + }, + { + "epoch": 10.955770896491988, + "grad_norm": 0.3991343677043915, + "learning_rate": 4.890442291035081e-05, + "loss": 0.3728, + "step": 1619000 + }, + { + "epoch": 10.959154395842356, + "grad_norm": 0.3427871763706207, + "learning_rate": 4.890408456041577e-05, + "loss": 0.3722, + "step": 1619500 + }, + { + "epoch": 10.962537895192725, + "grad_norm": 0.3547114133834839, + "learning_rate": 4.8903746210480725e-05, + "loss": 0.3725, + "step": 1620000 + }, + { + "epoch": 10.965921394543091, + "grad_norm": 0.4285212457180023, + "learning_rate": 4.8903407860545694e-05, + "loss": 0.3727, + "step": 1620500 + }, + { + "epoch": 10.96930489389346, + "grad_norm": 0.37522661685943604, + "learning_rate": 4.8903069510610656e-05, + "loss": 0.3704, + "step": 1621000 + }, + { + "epoch": 10.972688393243828, + "grad_norm": 0.36373889446258545, + "learning_rate": 4.890273116067562e-05, + "loss": 0.3722, + "step": 1621500 + }, + { + "epoch": 10.976071892594197, + "grad_norm": 0.3494463562965393, + "learning_rate": 4.890239281074058e-05, + "loss": 0.3731, + "step": 1622000 + }, + { + "epoch": 10.979455391944565, + "grad_norm": 0.3906720280647278, + "learning_rate": 4.890205446080555e-05, + "loss": 0.3727, + "step": 1622500 + }, + { + "epoch": 10.982838891294932, + "grad_norm": 0.3903609812259674, + "learning_rate": 4.890171611087051e-05, + "loss": 0.3704, + "step": 1623000 + }, + { + "epoch": 10.9862223906453, + "grad_norm": 0.3798389434814453, + "learning_rate": 4.890137776093547e-05, + "loss": 0.3722, + "step": 1623500 + }, + { + "epoch": 10.98960588999567, + "grad_norm": 0.37869197130203247, + "learning_rate": 4.8901039411000435e-05, + "loss": 0.3725, + "step": 1624000 + }, + { + "epoch": 10.992989389346038, + "grad_norm": 0.3519867956638336, + "learning_rate": 4.89007010610654e-05, + "loss": 0.3725, + "step": 1624500 + }, + { + "epoch": 10.996372888696406, + "grad_norm": 0.34879013895988464, + "learning_rate": 4.890036271113036e-05, + "loss": 0.3722, + "step": 1625000 + }, + { + "epoch": 10.999756388046773, + "grad_norm": 0.3999994695186615, + "learning_rate": 4.890002436119532e-05, + "loss": 0.3723, + "step": 1625500 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.858302066261313, + "eval_loss": 0.5760162472724915, + "eval_runtime": 3383.38, + "eval_samples_per_second": 85.933, + "eval_steps_per_second": 5.371, + "step": 1625536 + }, + { + "epoch": 11.003139887397142, + "grad_norm": 0.36553969979286194, + "learning_rate": 4.889968601126029e-05, + "loss": 0.3697, + "step": 1626000 + }, + { + "epoch": 11.00652338674751, + "grad_norm": 0.34825971722602844, + "learning_rate": 4.889934766132525e-05, + "loss": 0.3716, + "step": 1626500 + }, + { + "epoch": 11.009906886097879, + "grad_norm": 0.37449222803115845, + "learning_rate": 4.8899009311390215e-05, + "loss": 0.3699, + "step": 1627000 + }, + { + "epoch": 11.013290385448245, + "grad_norm": 0.311702698469162, + "learning_rate": 4.889867096145518e-05, + "loss": 0.3691, + "step": 1627500 + }, + { + "epoch": 11.016673884798614, + "grad_norm": 0.38622474670410156, + "learning_rate": 4.8898332611520146e-05, + "loss": 0.3706, + "step": 1628000 + }, + { + "epoch": 11.020057384148982, + "grad_norm": 0.35131123661994934, + "learning_rate": 4.889799426158511e-05, + "loss": 0.37, + "step": 1628500 + }, + { + "epoch": 11.023440883499351, + "grad_norm": 0.3682219684123993, + "learning_rate": 4.889765591165007e-05, + "loss": 0.3696, + "step": 1629000 + }, + { + "epoch": 11.026824382849718, + "grad_norm": 0.3607659339904785, + "learning_rate": 4.8897317561715025e-05, + "loss": 0.3695, + "step": 1629500 + }, + { + "epoch": 11.030207882200086, + "grad_norm": 0.34448304772377014, + "learning_rate": 4.8896979211779994e-05, + "loss": 0.3715, + "step": 1630000 + }, + { + "epoch": 11.033591381550455, + "grad_norm": 0.37748998403549194, + "learning_rate": 4.8896640861844956e-05, + "loss": 0.3695, + "step": 1630500 + }, + { + "epoch": 11.036974880900823, + "grad_norm": 0.3491244316101074, + "learning_rate": 4.889630251190992e-05, + "loss": 0.3704, + "step": 1631000 + }, + { + "epoch": 11.040358380251192, + "grad_norm": 0.3739766478538513, + "learning_rate": 4.889596416197488e-05, + "loss": 0.3706, + "step": 1631500 + }, + { + "epoch": 11.043741879601559, + "grad_norm": 0.3629169762134552, + "learning_rate": 4.889562581203985e-05, + "loss": 0.3715, + "step": 1632000 + }, + { + "epoch": 11.047125378951927, + "grad_norm": 0.32953304052352905, + "learning_rate": 4.889528746210481e-05, + "loss": 0.3695, + "step": 1632500 + }, + { + "epoch": 11.050508878302296, + "grad_norm": 0.37410807609558105, + "learning_rate": 4.8894949112169774e-05, + "loss": 0.3715, + "step": 1633000 + }, + { + "epoch": 11.053892377652664, + "grad_norm": 0.33794400095939636, + "learning_rate": 4.8894610762234736e-05, + "loss": 0.3699, + "step": 1633500 + }, + { + "epoch": 11.05727587700303, + "grad_norm": 0.3270277678966522, + "learning_rate": 4.88942724122997e-05, + "loss": 0.3697, + "step": 1634000 + }, + { + "epoch": 11.0606593763534, + "grad_norm": 0.34896576404571533, + "learning_rate": 4.889393406236466e-05, + "loss": 0.3708, + "step": 1634500 + }, + { + "epoch": 11.064042875703768, + "grad_norm": 0.38799527287483215, + "learning_rate": 4.889359571242962e-05, + "loss": 0.3724, + "step": 1635000 + }, + { + "epoch": 11.067426375054136, + "grad_norm": 0.39045727252960205, + "learning_rate": 4.889325736249459e-05, + "loss": 0.3715, + "step": 1635500 + }, + { + "epoch": 11.070809874404505, + "grad_norm": 0.40085986256599426, + "learning_rate": 4.889291901255955e-05, + "loss": 0.3709, + "step": 1636000 + }, + { + "epoch": 11.074193373754872, + "grad_norm": 0.4043148159980774, + "learning_rate": 4.8892580662624515e-05, + "loss": 0.3709, + "step": 1636500 + }, + { + "epoch": 11.07757687310524, + "grad_norm": 0.3465675413608551, + "learning_rate": 4.889224231268948e-05, + "loss": 0.3705, + "step": 1637000 + }, + { + "epoch": 11.080960372455609, + "grad_norm": 0.3407261371612549, + "learning_rate": 4.889190396275444e-05, + "loss": 0.3705, + "step": 1637500 + }, + { + "epoch": 11.084343871805977, + "grad_norm": 0.3841465711593628, + "learning_rate": 4.889156561281941e-05, + "loss": 0.371, + "step": 1638000 + }, + { + "epoch": 11.087727371156344, + "grad_norm": 0.3267723619937897, + "learning_rate": 4.889122726288437e-05, + "loss": 0.3722, + "step": 1638500 + }, + { + "epoch": 11.091110870506713, + "grad_norm": 0.32410913705825806, + "learning_rate": 4.8890888912949326e-05, + "loss": 0.3707, + "step": 1639000 + }, + { + "epoch": 11.094494369857081, + "grad_norm": 0.3783022463321686, + "learning_rate": 4.8890550563014295e-05, + "loss": 0.3705, + "step": 1639500 + }, + { + "epoch": 11.09787786920745, + "grad_norm": 0.39223384857177734, + "learning_rate": 4.889021221307926e-05, + "loss": 0.3699, + "step": 1640000 + }, + { + "epoch": 11.101261368557818, + "grad_norm": 0.3522324562072754, + "learning_rate": 4.888987386314422e-05, + "loss": 0.3707, + "step": 1640500 + }, + { + "epoch": 11.104644867908185, + "grad_norm": 0.33530494570732117, + "learning_rate": 4.888953551320918e-05, + "loss": 0.3698, + "step": 1641000 + }, + { + "epoch": 11.108028367258553, + "grad_norm": 0.34049347043037415, + "learning_rate": 4.888919716327415e-05, + "loss": 0.3709, + "step": 1641500 + }, + { + "epoch": 11.111411866608922, + "grad_norm": 0.34313058853149414, + "learning_rate": 4.888885881333911e-05, + "loss": 0.3716, + "step": 1642000 + }, + { + "epoch": 11.11479536595929, + "grad_norm": 0.35914409160614014, + "learning_rate": 4.8888520463404074e-05, + "loss": 0.3692, + "step": 1642500 + }, + { + "epoch": 11.118178865309657, + "grad_norm": 0.3764127194881439, + "learning_rate": 4.8888182113469036e-05, + "loss": 0.37, + "step": 1643000 + }, + { + "epoch": 11.121562364660026, + "grad_norm": 0.3786813020706177, + "learning_rate": 4.8887843763534e-05, + "loss": 0.3704, + "step": 1643500 + }, + { + "epoch": 11.124945864010394, + "grad_norm": 0.3405832052230835, + "learning_rate": 4.888750541359896e-05, + "loss": 0.3702, + "step": 1644000 + }, + { + "epoch": 11.128329363360763, + "grad_norm": 0.36560699343681335, + "learning_rate": 4.888716706366392e-05, + "loss": 0.3723, + "step": 1644500 + }, + { + "epoch": 11.13171286271113, + "grad_norm": 0.33541402220726013, + "learning_rate": 4.8886828713728885e-05, + "loss": 0.371, + "step": 1645000 + }, + { + "epoch": 11.135096362061498, + "grad_norm": 0.3588182330131531, + "learning_rate": 4.8886490363793854e-05, + "loss": 0.3705, + "step": 1645500 + }, + { + "epoch": 11.138479861411867, + "grad_norm": 0.36514315009117126, + "learning_rate": 4.8886152013858816e-05, + "loss": 0.3698, + "step": 1646000 + }, + { + "epoch": 11.141863360762235, + "grad_norm": 0.35488948225975037, + "learning_rate": 4.888581366392378e-05, + "loss": 0.371, + "step": 1646500 + }, + { + "epoch": 11.145246860112604, + "grad_norm": 0.3414875566959381, + "learning_rate": 4.888547531398874e-05, + "loss": 0.3706, + "step": 1647000 + }, + { + "epoch": 11.14863035946297, + "grad_norm": 0.3390885889530182, + "learning_rate": 4.888513696405371e-05, + "loss": 0.3717, + "step": 1647500 + }, + { + "epoch": 11.152013858813339, + "grad_norm": 0.38793033361434937, + "learning_rate": 4.888479861411867e-05, + "loss": 0.3706, + "step": 1648000 + }, + { + "epoch": 11.155397358163707, + "grad_norm": 0.3770737946033478, + "learning_rate": 4.8884460264183627e-05, + "loss": 0.3713, + "step": 1648500 + }, + { + "epoch": 11.158780857514076, + "grad_norm": 0.37044626474380493, + "learning_rate": 4.8884121914248595e-05, + "loss": 0.3705, + "step": 1649000 + }, + { + "epoch": 11.162164356864443, + "grad_norm": 0.40374863147735596, + "learning_rate": 4.888378356431356e-05, + "loss": 0.3726, + "step": 1649500 + }, + { + "epoch": 11.165547856214811, + "grad_norm": 0.348796546459198, + "learning_rate": 4.888344521437852e-05, + "loss": 0.3715, + "step": 1650000 + }, + { + "epoch": 11.16893135556518, + "grad_norm": 0.4167865812778473, + "learning_rate": 4.888310686444348e-05, + "loss": 0.3737, + "step": 1650500 + }, + { + "epoch": 11.172314854915548, + "grad_norm": 0.3556322753429413, + "learning_rate": 4.888276851450845e-05, + "loss": 0.3728, + "step": 1651000 + }, + { + "epoch": 11.175698354265917, + "grad_norm": 0.3516223728656769, + "learning_rate": 4.888243016457341e-05, + "loss": 0.3697, + "step": 1651500 + }, + { + "epoch": 11.179081853616283, + "grad_norm": 0.3844006657600403, + "learning_rate": 4.8882091814638375e-05, + "loss": 0.3704, + "step": 1652000 + }, + { + "epoch": 11.182465352966652, + "grad_norm": 0.36370837688446045, + "learning_rate": 4.888175346470334e-05, + "loss": 0.3704, + "step": 1652500 + }, + { + "epoch": 11.18584885231702, + "grad_norm": 0.37239259481430054, + "learning_rate": 4.8881415114768306e-05, + "loss": 0.3712, + "step": 1653000 + }, + { + "epoch": 11.189232351667389, + "grad_norm": 0.365557998418808, + "learning_rate": 4.888107676483326e-05, + "loss": 0.3735, + "step": 1653500 + }, + { + "epoch": 11.192615851017756, + "grad_norm": 0.38343822956085205, + "learning_rate": 4.8880738414898223e-05, + "loss": 0.3718, + "step": 1654000 + }, + { + "epoch": 11.195999350368124, + "grad_norm": 0.33387985825538635, + "learning_rate": 4.8880400064963186e-05, + "loss": 0.37, + "step": 1654500 + }, + { + "epoch": 11.199382849718493, + "grad_norm": 0.3623245656490326, + "learning_rate": 4.8880061715028154e-05, + "loss": 0.3706, + "step": 1655000 + }, + { + "epoch": 11.202766349068861, + "grad_norm": 0.38692569732666016, + "learning_rate": 4.8879723365093117e-05, + "loss": 0.3708, + "step": 1655500 + }, + { + "epoch": 11.20614984841923, + "grad_norm": 0.33788901567459106, + "learning_rate": 4.887938501515808e-05, + "loss": 0.3717, + "step": 1656000 + }, + { + "epoch": 11.209533347769597, + "grad_norm": 0.3440932035446167, + "learning_rate": 4.887904666522304e-05, + "loss": 0.371, + "step": 1656500 + }, + { + "epoch": 11.212916847119965, + "grad_norm": 0.3497539758682251, + "learning_rate": 4.887870831528801e-05, + "loss": 0.3715, + "step": 1657000 + }, + { + "epoch": 11.216300346470334, + "grad_norm": 0.34109485149383545, + "learning_rate": 4.887836996535297e-05, + "loss": 0.3731, + "step": 1657500 + }, + { + "epoch": 11.219683845820702, + "grad_norm": 0.36244407296180725, + "learning_rate": 4.887803161541793e-05, + "loss": 0.3711, + "step": 1658000 + }, + { + "epoch": 11.223067345171069, + "grad_norm": 0.3530657887458801, + "learning_rate": 4.8877693265482896e-05, + "loss": 0.372, + "step": 1658500 + }, + { + "epoch": 11.226450844521437, + "grad_norm": 0.3541402816772461, + "learning_rate": 4.887735491554786e-05, + "loss": 0.3708, + "step": 1659000 + }, + { + "epoch": 11.229834343871806, + "grad_norm": 0.3628634512424469, + "learning_rate": 4.887701656561282e-05, + "loss": 0.3715, + "step": 1659500 + }, + { + "epoch": 11.233217843222175, + "grad_norm": 0.3461693823337555, + "learning_rate": 4.887667821567778e-05, + "loss": 0.3711, + "step": 1660000 + }, + { + "epoch": 11.236601342572543, + "grad_norm": 0.3520946204662323, + "learning_rate": 4.887633986574275e-05, + "loss": 0.3725, + "step": 1660500 + }, + { + "epoch": 11.23998484192291, + "grad_norm": 0.3735743761062622, + "learning_rate": 4.8876001515807713e-05, + "loss": 0.3723, + "step": 1661000 + }, + { + "epoch": 11.243368341273278, + "grad_norm": 0.37564146518707275, + "learning_rate": 4.8875663165872676e-05, + "loss": 0.3722, + "step": 1661500 + }, + { + "epoch": 11.246751840623647, + "grad_norm": 0.38179755210876465, + "learning_rate": 4.887532481593764e-05, + "loss": 0.3722, + "step": 1662000 + }, + { + "epoch": 11.250135339974015, + "grad_norm": 0.36789751052856445, + "learning_rate": 4.8874986466002607e-05, + "loss": 0.3715, + "step": 1662500 + }, + { + "epoch": 11.253518839324382, + "grad_norm": 0.3178900182247162, + "learning_rate": 4.887464811606756e-05, + "loss": 0.3706, + "step": 1663000 + }, + { + "epoch": 11.25690233867475, + "grad_norm": 0.37493330240249634, + "learning_rate": 4.8874309766132524e-05, + "loss": 0.3714, + "step": 1663500 + }, + { + "epoch": 11.26028583802512, + "grad_norm": 0.33544522523880005, + "learning_rate": 4.8873971416197486e-05, + "loss": 0.3708, + "step": 1664000 + }, + { + "epoch": 11.263669337375488, + "grad_norm": 0.34790122509002686, + "learning_rate": 4.8873633066262455e-05, + "loss": 0.3708, + "step": 1664500 + }, + { + "epoch": 11.267052836725856, + "grad_norm": 0.3806302845478058, + "learning_rate": 4.887329471632742e-05, + "loss": 0.3706, + "step": 1665000 + }, + { + "epoch": 11.270436336076223, + "grad_norm": 0.3643375039100647, + "learning_rate": 4.887295636639238e-05, + "loss": 0.3704, + "step": 1665500 + }, + { + "epoch": 11.273819835426591, + "grad_norm": 0.41378292441368103, + "learning_rate": 4.887261801645734e-05, + "loss": 0.3714, + "step": 1666000 + }, + { + "epoch": 11.27720333477696, + "grad_norm": 0.3954533338546753, + "learning_rate": 4.887227966652231e-05, + "loss": 0.3712, + "step": 1666500 + }, + { + "epoch": 11.280586834127329, + "grad_norm": 0.41279372572898865, + "learning_rate": 4.887194131658727e-05, + "loss": 0.3711, + "step": 1667000 + }, + { + "epoch": 11.283970333477695, + "grad_norm": 0.32453882694244385, + "learning_rate": 4.887160296665223e-05, + "loss": 0.3712, + "step": 1667500 + }, + { + "epoch": 11.287353832828064, + "grad_norm": 0.35816478729248047, + "learning_rate": 4.88712646167172e-05, + "loss": 0.371, + "step": 1668000 + }, + { + "epoch": 11.290737332178432, + "grad_norm": 0.3419073820114136, + "learning_rate": 4.887092626678216e-05, + "loss": 0.3713, + "step": 1668500 + }, + { + "epoch": 11.2941208315288, + "grad_norm": 0.41529932618141174, + "learning_rate": 4.887058791684712e-05, + "loss": 0.3701, + "step": 1669000 + }, + { + "epoch": 11.297504330879168, + "grad_norm": 0.34597066044807434, + "learning_rate": 4.887024956691208e-05, + "loss": 0.3734, + "step": 1669500 + }, + { + "epoch": 11.300887830229536, + "grad_norm": 0.3705573081970215, + "learning_rate": 4.886991121697705e-05, + "loss": 0.3718, + "step": 1670000 + }, + { + "epoch": 11.304271329579905, + "grad_norm": 0.38146474957466125, + "learning_rate": 4.8869572867042014e-05, + "loss": 0.3713, + "step": 1670500 + }, + { + "epoch": 11.307654828930273, + "grad_norm": 0.3561438024044037, + "learning_rate": 4.8869234517106976e-05, + "loss": 0.3715, + "step": 1671000 + }, + { + "epoch": 11.311038328280642, + "grad_norm": 0.39705178141593933, + "learning_rate": 4.886889616717194e-05, + "loss": 0.3707, + "step": 1671500 + }, + { + "epoch": 11.314421827631008, + "grad_norm": 0.35090193152427673, + "learning_rate": 4.886855781723691e-05, + "loss": 0.3716, + "step": 1672000 + }, + { + "epoch": 11.317805326981377, + "grad_norm": 0.3633898198604584, + "learning_rate": 4.886821946730186e-05, + "loss": 0.3712, + "step": 1672500 + }, + { + "epoch": 11.321188826331746, + "grad_norm": 0.3485035300254822, + "learning_rate": 4.8867881117366825e-05, + "loss": 0.3711, + "step": 1673000 + }, + { + "epoch": 11.324572325682114, + "grad_norm": 0.38166773319244385, + "learning_rate": 4.886754276743179e-05, + "loss": 0.3707, + "step": 1673500 + }, + { + "epoch": 11.32795582503248, + "grad_norm": 0.37450724840164185, + "learning_rate": 4.8867204417496756e-05, + "loss": 0.3699, + "step": 1674000 + }, + { + "epoch": 11.33133932438285, + "grad_norm": 0.3536973297595978, + "learning_rate": 4.886686606756172e-05, + "loss": 0.3733, + "step": 1674500 + }, + { + "epoch": 11.334722823733218, + "grad_norm": 0.3306363821029663, + "learning_rate": 4.886652771762668e-05, + "loss": 0.3714, + "step": 1675000 + }, + { + "epoch": 11.338106323083586, + "grad_norm": 0.3833313286304474, + "learning_rate": 4.886618936769164e-05, + "loss": 0.3705, + "step": 1675500 + }, + { + "epoch": 11.341489822433955, + "grad_norm": 0.3853326439857483, + "learning_rate": 4.886585101775661e-05, + "loss": 0.371, + "step": 1676000 + }, + { + "epoch": 11.344873321784322, + "grad_norm": 0.34179025888442993, + "learning_rate": 4.886551266782157e-05, + "loss": 0.3721, + "step": 1676500 + }, + { + "epoch": 11.34825682113469, + "grad_norm": 0.36327168345451355, + "learning_rate": 4.886517431788653e-05, + "loss": 0.372, + "step": 1677000 + }, + { + "epoch": 11.351640320485059, + "grad_norm": 0.364208847284317, + "learning_rate": 4.88648359679515e-05, + "loss": 0.3731, + "step": 1677500 + }, + { + "epoch": 11.355023819835427, + "grad_norm": 0.3499656319618225, + "learning_rate": 4.886449761801646e-05, + "loss": 0.3723, + "step": 1678000 + }, + { + "epoch": 11.358407319185794, + "grad_norm": 0.3366363048553467, + "learning_rate": 4.886415926808142e-05, + "loss": 0.372, + "step": 1678500 + }, + { + "epoch": 11.361790818536162, + "grad_norm": 0.4245286285877228, + "learning_rate": 4.8863820918146384e-05, + "loss": 0.3716, + "step": 1679000 + }, + { + "epoch": 11.365174317886531, + "grad_norm": 0.36462846398353577, + "learning_rate": 4.886348256821135e-05, + "loss": 0.3728, + "step": 1679500 + }, + { + "epoch": 11.3685578172369, + "grad_norm": 0.36596938967704773, + "learning_rate": 4.8863144218276315e-05, + "loss": 0.3712, + "step": 1680000 + }, + { + "epoch": 11.371941316587268, + "grad_norm": 0.37903404235839844, + "learning_rate": 4.886280586834128e-05, + "loss": 0.3726, + "step": 1680500 + }, + { + "epoch": 11.375324815937635, + "grad_norm": 0.3716667890548706, + "learning_rate": 4.886246751840624e-05, + "loss": 0.3714, + "step": 1681000 + }, + { + "epoch": 11.378708315288003, + "grad_norm": 0.3672736585140228, + "learning_rate": 4.886212916847121e-05, + "loss": 0.3708, + "step": 1681500 + }, + { + "epoch": 11.382091814638372, + "grad_norm": 0.35480719804763794, + "learning_rate": 4.886179081853616e-05, + "loss": 0.3703, + "step": 1682000 + }, + { + "epoch": 11.38547531398874, + "grad_norm": 0.33838438987731934, + "learning_rate": 4.8861452468601125e-05, + "loss": 0.372, + "step": 1682500 + }, + { + "epoch": 11.388858813339107, + "grad_norm": 0.38470542430877686, + "learning_rate": 4.886111411866609e-05, + "loss": 0.3714, + "step": 1683000 + }, + { + "epoch": 11.392242312689476, + "grad_norm": 0.3572859466075897, + "learning_rate": 4.8860775768731056e-05, + "loss": 0.3713, + "step": 1683500 + }, + { + "epoch": 11.395625812039844, + "grad_norm": 0.32588905096054077, + "learning_rate": 4.886043741879602e-05, + "loss": 0.3712, + "step": 1684000 + }, + { + "epoch": 11.399009311390213, + "grad_norm": 0.3401735723018646, + "learning_rate": 4.886009906886098e-05, + "loss": 0.3704, + "step": 1684500 + }, + { + "epoch": 11.402392810740581, + "grad_norm": 0.3463999330997467, + "learning_rate": 4.885976071892594e-05, + "loss": 0.3734, + "step": 1685000 + }, + { + "epoch": 11.405776310090948, + "grad_norm": 0.36956986784935, + "learning_rate": 4.885942236899091e-05, + "loss": 0.3727, + "step": 1685500 + }, + { + "epoch": 11.409159809441316, + "grad_norm": 0.3834473788738251, + "learning_rate": 4.8859084019055874e-05, + "loss": 0.3705, + "step": 1686000 + }, + { + "epoch": 11.412543308791685, + "grad_norm": 0.3418712615966797, + "learning_rate": 4.885874566912083e-05, + "loss": 0.3707, + "step": 1686500 + }, + { + "epoch": 11.415926808142054, + "grad_norm": 0.3438604474067688, + "learning_rate": 4.88584073191858e-05, + "loss": 0.3725, + "step": 1687000 + }, + { + "epoch": 11.41931030749242, + "grad_norm": 0.3532719314098358, + "learning_rate": 4.885806896925076e-05, + "loss": 0.3735, + "step": 1687500 + }, + { + "epoch": 11.422693806842789, + "grad_norm": 0.357460618019104, + "learning_rate": 4.885773061931572e-05, + "loss": 0.371, + "step": 1688000 + }, + { + "epoch": 11.426077306193157, + "grad_norm": 0.33926403522491455, + "learning_rate": 4.8857392269380684e-05, + "loss": 0.3711, + "step": 1688500 + }, + { + "epoch": 11.429460805543526, + "grad_norm": 0.3649449944496155, + "learning_rate": 4.885705391944565e-05, + "loss": 0.372, + "step": 1689000 + }, + { + "epoch": 11.432844304893894, + "grad_norm": 0.38523754477500916, + "learning_rate": 4.8856715569510615e-05, + "loss": 0.3724, + "step": 1689500 + }, + { + "epoch": 11.436227804244261, + "grad_norm": 0.37194523215293884, + "learning_rate": 4.885637721957558e-05, + "loss": 0.3707, + "step": 1690000 + }, + { + "epoch": 11.43961130359463, + "grad_norm": 0.32036304473876953, + "learning_rate": 4.885603886964054e-05, + "loss": 0.3721, + "step": 1690500 + }, + { + "epoch": 11.442994802944998, + "grad_norm": 0.3410488963127136, + "learning_rate": 4.88557005197055e-05, + "loss": 0.3706, + "step": 1691000 + }, + { + "epoch": 11.446378302295367, + "grad_norm": 0.3249223530292511, + "learning_rate": 4.8855362169770464e-05, + "loss": 0.3708, + "step": 1691500 + }, + { + "epoch": 11.449761801645733, + "grad_norm": 0.3791833817958832, + "learning_rate": 4.8855023819835426e-05, + "loss": 0.3709, + "step": 1692000 + }, + { + "epoch": 11.453145300996102, + "grad_norm": 0.3641836643218994, + "learning_rate": 4.885468546990039e-05, + "loss": 0.3703, + "step": 1692500 + }, + { + "epoch": 11.45652880034647, + "grad_norm": 0.40123745799064636, + "learning_rate": 4.885434711996536e-05, + "loss": 0.3725, + "step": 1693000 + }, + { + "epoch": 11.459912299696839, + "grad_norm": 0.3453229069709778, + "learning_rate": 4.885400877003032e-05, + "loss": 0.3713, + "step": 1693500 + }, + { + "epoch": 11.463295799047206, + "grad_norm": 0.3665013313293457, + "learning_rate": 4.885367042009528e-05, + "loss": 0.3718, + "step": 1694000 + }, + { + "epoch": 11.466679298397574, + "grad_norm": 0.3658900856971741, + "learning_rate": 4.885333207016024e-05, + "loss": 0.373, + "step": 1694500 + }, + { + "epoch": 11.470062797747943, + "grad_norm": 0.3546628952026367, + "learning_rate": 4.885299372022521e-05, + "loss": 0.3725, + "step": 1695000 + }, + { + "epoch": 11.473446297098311, + "grad_norm": 0.3913738429546356, + "learning_rate": 4.8852655370290174e-05, + "loss": 0.3723, + "step": 1695500 + }, + { + "epoch": 11.47682979644868, + "grad_norm": 0.36838194727897644, + "learning_rate": 4.885231702035513e-05, + "loss": 0.3711, + "step": 1696000 + }, + { + "epoch": 11.480213295799047, + "grad_norm": 0.36029133200645447, + "learning_rate": 4.88519786704201e-05, + "loss": 0.3716, + "step": 1696500 + }, + { + "epoch": 11.483596795149415, + "grad_norm": 0.39833101630210876, + "learning_rate": 4.885164032048506e-05, + "loss": 0.3706, + "step": 1697000 + }, + { + "epoch": 11.486980294499784, + "grad_norm": 0.3916868567466736, + "learning_rate": 4.885130197055002e-05, + "loss": 0.3719, + "step": 1697500 + }, + { + "epoch": 11.490363793850152, + "grad_norm": 0.36675065755844116, + "learning_rate": 4.8850963620614985e-05, + "loss": 0.3718, + "step": 1698000 + }, + { + "epoch": 11.493747293200519, + "grad_norm": 0.3780879080295563, + "learning_rate": 4.8850625270679954e-05, + "loss": 0.3713, + "step": 1698500 + }, + { + "epoch": 11.497130792550887, + "grad_norm": 0.3343255817890167, + "learning_rate": 4.8850286920744916e-05, + "loss": 0.3729, + "step": 1699000 + }, + { + "epoch": 11.500514291901256, + "grad_norm": 0.35851815342903137, + "learning_rate": 4.884994857080988e-05, + "loss": 0.3722, + "step": 1699500 + }, + { + "epoch": 11.503897791251624, + "grad_norm": 0.3488004505634308, + "learning_rate": 4.884961022087484e-05, + "loss": 0.3721, + "step": 1700000 + }, + { + "epoch": 11.507281290601993, + "grad_norm": 0.3857555091381073, + "learning_rate": 4.88492718709398e-05, + "loss": 0.3707, + "step": 1700500 + }, + { + "epoch": 11.51066478995236, + "grad_norm": 0.36618277430534363, + "learning_rate": 4.8848933521004764e-05, + "loss": 0.371, + "step": 1701000 + }, + { + "epoch": 11.514048289302728, + "grad_norm": 0.36315736174583435, + "learning_rate": 4.8848595171069727e-05, + "loss": 0.3732, + "step": 1701500 + }, + { + "epoch": 11.517431788653097, + "grad_norm": 0.3206866681575775, + "learning_rate": 4.884825682113469e-05, + "loss": 0.3705, + "step": 1702000 + }, + { + "epoch": 11.520815288003465, + "grad_norm": 0.3441249430179596, + "learning_rate": 4.884791847119966e-05, + "loss": 0.3719, + "step": 1702500 + }, + { + "epoch": 11.524198787353832, + "grad_norm": 0.32257530093193054, + "learning_rate": 4.884758012126462e-05, + "loss": 0.3704, + "step": 1703000 + }, + { + "epoch": 11.5275822867042, + "grad_norm": 0.33246949315071106, + "learning_rate": 4.884724177132958e-05, + "loss": 0.3702, + "step": 1703500 + }, + { + "epoch": 11.53096578605457, + "grad_norm": 0.3414853811264038, + "learning_rate": 4.8846903421394544e-05, + "loss": 0.372, + "step": 1704000 + }, + { + "epoch": 11.534349285404938, + "grad_norm": 0.3186386525630951, + "learning_rate": 4.884656507145951e-05, + "loss": 0.3718, + "step": 1704500 + }, + { + "epoch": 11.537732784755306, + "grad_norm": 0.3396872580051422, + "learning_rate": 4.8846226721524475e-05, + "loss": 0.3716, + "step": 1705000 + }, + { + "epoch": 11.541116284105673, + "grad_norm": 0.3399006128311157, + "learning_rate": 4.884588837158944e-05, + "loss": 0.3719, + "step": 1705500 + }, + { + "epoch": 11.544499783456041, + "grad_norm": 0.3655692934989929, + "learning_rate": 4.88455500216544e-05, + "loss": 0.372, + "step": 1706000 + }, + { + "epoch": 11.54788328280641, + "grad_norm": 0.3565117418766022, + "learning_rate": 4.884521167171936e-05, + "loss": 0.3713, + "step": 1706500 + }, + { + "epoch": 11.551266782156778, + "grad_norm": 0.34369155764579773, + "learning_rate": 4.8844873321784323e-05, + "loss": 0.3721, + "step": 1707000 + }, + { + "epoch": 11.554650281507145, + "grad_norm": 0.41790443658828735, + "learning_rate": 4.8844534971849286e-05, + "loss": 0.3726, + "step": 1707500 + }, + { + "epoch": 11.558033780857514, + "grad_norm": 0.4061517119407654, + "learning_rate": 4.884419662191425e-05, + "loss": 0.3713, + "step": 1708000 + }, + { + "epoch": 11.561417280207882, + "grad_norm": 0.3356378376483917, + "learning_rate": 4.884385827197922e-05, + "loss": 0.3708, + "step": 1708500 + }, + { + "epoch": 11.56480077955825, + "grad_norm": 0.3446858525276184, + "learning_rate": 4.884351992204418e-05, + "loss": 0.3715, + "step": 1709000 + }, + { + "epoch": 11.568184278908618, + "grad_norm": 0.3577991724014282, + "learning_rate": 4.884318157210914e-05, + "loss": 0.3716, + "step": 1709500 + }, + { + "epoch": 11.571567778258986, + "grad_norm": 0.3579697608947754, + "learning_rate": 4.88428432221741e-05, + "loss": 0.3712, + "step": 1710000 + }, + { + "epoch": 11.574951277609355, + "grad_norm": 0.3684149980545044, + "learning_rate": 4.8842504872239065e-05, + "loss": 0.3732, + "step": 1710500 + }, + { + "epoch": 11.578334776959723, + "grad_norm": 0.4148857593536377, + "learning_rate": 4.884216652230403e-05, + "loss": 0.3725, + "step": 1711000 + }, + { + "epoch": 11.581718276310092, + "grad_norm": 0.4042187035083771, + "learning_rate": 4.884182817236899e-05, + "loss": 0.3724, + "step": 1711500 + }, + { + "epoch": 11.585101775660458, + "grad_norm": 0.3650857210159302, + "learning_rate": 4.884148982243396e-05, + "loss": 0.373, + "step": 1712000 + }, + { + "epoch": 11.588485275010827, + "grad_norm": 0.35698583722114563, + "learning_rate": 4.884115147249892e-05, + "loss": 0.3733, + "step": 1712500 + }, + { + "epoch": 11.591868774361195, + "grad_norm": 0.3720245957374573, + "learning_rate": 4.884081312256388e-05, + "loss": 0.37, + "step": 1713000 + }, + { + "epoch": 11.595252273711564, + "grad_norm": 0.37715089321136475, + "learning_rate": 4.8840474772628845e-05, + "loss": 0.3719, + "step": 1713500 + }, + { + "epoch": 11.598635773061932, + "grad_norm": 0.3547378182411194, + "learning_rate": 4.8840136422693814e-05, + "loss": 0.3718, + "step": 1714000 + }, + { + "epoch": 11.6020192724123, + "grad_norm": 0.3440019488334656, + "learning_rate": 4.8839798072758776e-05, + "loss": 0.3722, + "step": 1714500 + }, + { + "epoch": 11.605402771762668, + "grad_norm": 0.3711560368537903, + "learning_rate": 4.883945972282374e-05, + "loss": 0.3722, + "step": 1715000 + }, + { + "epoch": 11.608786271113036, + "grad_norm": 0.35923251509666443, + "learning_rate": 4.883912137288869e-05, + "loss": 0.3708, + "step": 1715500 + }, + { + "epoch": 11.612169770463405, + "grad_norm": 0.35365545749664307, + "learning_rate": 4.883878302295366e-05, + "loss": 0.3718, + "step": 1716000 + }, + { + "epoch": 11.615553269813772, + "grad_norm": 0.3428800404071808, + "learning_rate": 4.8838444673018624e-05, + "loss": 0.3735, + "step": 1716500 + }, + { + "epoch": 11.61893676916414, + "grad_norm": 0.315690279006958, + "learning_rate": 4.8838106323083586e-05, + "loss": 0.3708, + "step": 1717000 + }, + { + "epoch": 11.622320268514509, + "grad_norm": 0.38900619745254517, + "learning_rate": 4.883776797314855e-05, + "loss": 0.3715, + "step": 1717500 + }, + { + "epoch": 11.625703767864877, + "grad_norm": 0.3546205759048462, + "learning_rate": 4.883742962321352e-05, + "loss": 0.3719, + "step": 1718000 + }, + { + "epoch": 11.629087267215244, + "grad_norm": 0.30673447251319885, + "learning_rate": 4.883709127327848e-05, + "loss": 0.3718, + "step": 1718500 + }, + { + "epoch": 11.632470766565612, + "grad_norm": 0.35959509015083313, + "learning_rate": 4.883675292334344e-05, + "loss": 0.3722, + "step": 1719000 + }, + { + "epoch": 11.635854265915981, + "grad_norm": 0.4214160144329071, + "learning_rate": 4.8836414573408404e-05, + "loss": 0.3709, + "step": 1719500 + }, + { + "epoch": 11.63923776526635, + "grad_norm": 0.35455048084259033, + "learning_rate": 4.8836076223473366e-05, + "loss": 0.3714, + "step": 1720000 + }, + { + "epoch": 11.642621264616718, + "grad_norm": 0.3674977719783783, + "learning_rate": 4.883573787353833e-05, + "loss": 0.3718, + "step": 1720500 + }, + { + "epoch": 11.646004763967085, + "grad_norm": 0.37586429715156555, + "learning_rate": 4.883539952360329e-05, + "loss": 0.3712, + "step": 1721000 + }, + { + "epoch": 11.649388263317453, + "grad_norm": 0.3308994174003601, + "learning_rate": 4.883506117366826e-05, + "loss": 0.3734, + "step": 1721500 + }, + { + "epoch": 11.652771762667822, + "grad_norm": 0.3521401584148407, + "learning_rate": 4.883472282373322e-05, + "loss": 0.3717, + "step": 1722000 + }, + { + "epoch": 11.65615526201819, + "grad_norm": 0.3826388418674469, + "learning_rate": 4.883438447379818e-05, + "loss": 0.3708, + "step": 1722500 + }, + { + "epoch": 11.659538761368557, + "grad_norm": 0.39649340510368347, + "learning_rate": 4.8834046123863145e-05, + "loss": 0.3728, + "step": 1723000 + }, + { + "epoch": 11.662922260718926, + "grad_norm": 0.37863731384277344, + "learning_rate": 4.8833707773928114e-05, + "loss": 0.371, + "step": 1723500 + }, + { + "epoch": 11.666305760069294, + "grad_norm": 0.3028692603111267, + "learning_rate": 4.8833369423993076e-05, + "loss": 0.3713, + "step": 1724000 + }, + { + "epoch": 11.669689259419663, + "grad_norm": 0.34876948595046997, + "learning_rate": 4.883303107405804e-05, + "loss": 0.3729, + "step": 1724500 + }, + { + "epoch": 11.673072758770031, + "grad_norm": 0.3666656017303467, + "learning_rate": 4.8832692724122994e-05, + "loss": 0.3699, + "step": 1725000 + }, + { + "epoch": 11.676456258120398, + "grad_norm": 0.344465047121048, + "learning_rate": 4.883235437418796e-05, + "loss": 0.372, + "step": 1725500 + }, + { + "epoch": 11.679839757470766, + "grad_norm": 0.3424505591392517, + "learning_rate": 4.8832016024252925e-05, + "loss": 0.3698, + "step": 1726000 + }, + { + "epoch": 11.683223256821135, + "grad_norm": 0.4021340608596802, + "learning_rate": 4.883167767431789e-05, + "loss": 0.371, + "step": 1726500 + }, + { + "epoch": 11.686606756171503, + "grad_norm": 0.3761267364025116, + "learning_rate": 4.883133932438285e-05, + "loss": 0.3728, + "step": 1727000 + }, + { + "epoch": 11.68999025552187, + "grad_norm": 0.3646292984485626, + "learning_rate": 4.883100097444782e-05, + "loss": 0.3714, + "step": 1727500 + }, + { + "epoch": 11.693373754872239, + "grad_norm": 0.352817565202713, + "learning_rate": 4.883066262451278e-05, + "loss": 0.372, + "step": 1728000 + }, + { + "epoch": 11.696757254222607, + "grad_norm": 0.35847803950309753, + "learning_rate": 4.883032427457774e-05, + "loss": 0.3699, + "step": 1728500 + }, + { + "epoch": 11.700140753572976, + "grad_norm": 0.3726541996002197, + "learning_rate": 4.8829985924642704e-05, + "loss": 0.3708, + "step": 1729000 + }, + { + "epoch": 11.703524252923344, + "grad_norm": 0.346699595451355, + "learning_rate": 4.8829647574707666e-05, + "loss": 0.3725, + "step": 1729500 + }, + { + "epoch": 11.706907752273711, + "grad_norm": 0.359237939119339, + "learning_rate": 4.882930922477263e-05, + "loss": 0.3721, + "step": 1730000 + }, + { + "epoch": 11.71029125162408, + "grad_norm": 0.393034964799881, + "learning_rate": 4.882897087483759e-05, + "loss": 0.3709, + "step": 1730500 + }, + { + "epoch": 11.713674750974448, + "grad_norm": 0.3590317964553833, + "learning_rate": 4.882863252490256e-05, + "loss": 0.3721, + "step": 1731000 + }, + { + "epoch": 11.717058250324817, + "grad_norm": 0.3669711947441101, + "learning_rate": 4.882829417496752e-05, + "loss": 0.3733, + "step": 1731500 + }, + { + "epoch": 11.720441749675183, + "grad_norm": 0.36638450622558594, + "learning_rate": 4.8827955825032484e-05, + "loss": 0.3724, + "step": 1732000 + }, + { + "epoch": 11.723825249025552, + "grad_norm": 0.3468673527240753, + "learning_rate": 4.8827617475097446e-05, + "loss": 0.3707, + "step": 1732500 + }, + { + "epoch": 11.72720874837592, + "grad_norm": 0.3468047082424164, + "learning_rate": 4.8827279125162415e-05, + "loss": 0.3715, + "step": 1733000 + }, + { + "epoch": 11.730592247726289, + "grad_norm": 0.3381323516368866, + "learning_rate": 4.882694077522738e-05, + "loss": 0.373, + "step": 1733500 + }, + { + "epoch": 11.733975747076656, + "grad_norm": 0.33758747577667236, + "learning_rate": 4.882660242529234e-05, + "loss": 0.3721, + "step": 1734000 + }, + { + "epoch": 11.737359246427024, + "grad_norm": 0.33760327100753784, + "learning_rate": 4.8826264075357294e-05, + "loss": 0.3705, + "step": 1734500 + }, + { + "epoch": 11.740742745777393, + "grad_norm": 0.36903616786003113, + "learning_rate": 4.882592572542226e-05, + "loss": 0.3721, + "step": 1735000 + }, + { + "epoch": 11.744126245127761, + "grad_norm": 0.37238264083862305, + "learning_rate": 4.8825587375487225e-05, + "loss": 0.3708, + "step": 1735500 + }, + { + "epoch": 11.74750974447813, + "grad_norm": 0.31929102540016174, + "learning_rate": 4.882524902555219e-05, + "loss": 0.3725, + "step": 1736000 + }, + { + "epoch": 11.750893243828497, + "grad_norm": 0.38301795721054077, + "learning_rate": 4.882491067561715e-05, + "loss": 0.3721, + "step": 1736500 + }, + { + "epoch": 11.754276743178865, + "grad_norm": 0.3501420319080353, + "learning_rate": 4.882457232568212e-05, + "loss": 0.3718, + "step": 1737000 + }, + { + "epoch": 11.757660242529234, + "grad_norm": 0.35756149888038635, + "learning_rate": 4.882423397574708e-05, + "loss": 0.372, + "step": 1737500 + }, + { + "epoch": 11.761043741879602, + "grad_norm": 0.3879341185092926, + "learning_rate": 4.882389562581204e-05, + "loss": 0.3718, + "step": 1738000 + }, + { + "epoch": 11.764427241229969, + "grad_norm": 0.3338926434516907, + "learning_rate": 4.8823557275877005e-05, + "loss": 0.3717, + "step": 1738500 + }, + { + "epoch": 11.767810740580337, + "grad_norm": 0.35573339462280273, + "learning_rate": 4.882321892594197e-05, + "loss": 0.372, + "step": 1739000 + }, + { + "epoch": 11.771194239930706, + "grad_norm": 0.3486931025981903, + "learning_rate": 4.882288057600693e-05, + "loss": 0.3709, + "step": 1739500 + }, + { + "epoch": 11.774577739281074, + "grad_norm": 0.37727949023246765, + "learning_rate": 4.882254222607189e-05, + "loss": 0.3716, + "step": 1740000 + }, + { + "epoch": 11.777961238631443, + "grad_norm": 0.35868674516677856, + "learning_rate": 4.882220387613686e-05, + "loss": 0.3731, + "step": 1740500 + }, + { + "epoch": 11.78134473798181, + "grad_norm": 0.3462320566177368, + "learning_rate": 4.882186552620182e-05, + "loss": 0.3716, + "step": 1741000 + }, + { + "epoch": 11.784728237332178, + "grad_norm": 0.39170634746551514, + "learning_rate": 4.8821527176266784e-05, + "loss": 0.3712, + "step": 1741500 + }, + { + "epoch": 11.788111736682547, + "grad_norm": 0.3877205550670624, + "learning_rate": 4.8821188826331747e-05, + "loss": 0.3694, + "step": 1742000 + }, + { + "epoch": 11.791495236032915, + "grad_norm": 0.3896919786930084, + "learning_rate": 4.8820850476396715e-05, + "loss": 0.3738, + "step": 1742500 + }, + { + "epoch": 11.794878735383282, + "grad_norm": 0.34968894720077515, + "learning_rate": 4.882051212646168e-05, + "loss": 0.3705, + "step": 1743000 + }, + { + "epoch": 11.79826223473365, + "grad_norm": 0.3168765604496002, + "learning_rate": 4.882017377652664e-05, + "loss": 0.3731, + "step": 1743500 + }, + { + "epoch": 11.801645734084019, + "grad_norm": 0.39077892899513245, + "learning_rate": 4.8819835426591595e-05, + "loss": 0.3723, + "step": 1744000 + }, + { + "epoch": 11.805029233434388, + "grad_norm": 0.3569541871547699, + "learning_rate": 4.8819497076656564e-05, + "loss": 0.3717, + "step": 1744500 + }, + { + "epoch": 11.808412732784756, + "grad_norm": 0.3633823096752167, + "learning_rate": 4.8819158726721526e-05, + "loss": 0.3718, + "step": 1745000 + }, + { + "epoch": 11.811796232135123, + "grad_norm": 0.365784615278244, + "learning_rate": 4.881882037678649e-05, + "loss": 0.3716, + "step": 1745500 + }, + { + "epoch": 11.815179731485491, + "grad_norm": 0.35726240277290344, + "learning_rate": 4.881848202685145e-05, + "loss": 0.373, + "step": 1746000 + }, + { + "epoch": 11.81856323083586, + "grad_norm": 0.3863852322101593, + "learning_rate": 4.881814367691642e-05, + "loss": 0.3708, + "step": 1746500 + }, + { + "epoch": 11.821946730186228, + "grad_norm": 0.3824382722377777, + "learning_rate": 4.881780532698138e-05, + "loss": 0.3714, + "step": 1747000 + }, + { + "epoch": 11.825330229536595, + "grad_norm": 0.37908515334129333, + "learning_rate": 4.8817466977046343e-05, + "loss": 0.3708, + "step": 1747500 + }, + { + "epoch": 11.828713728886964, + "grad_norm": 0.3562980890274048, + "learning_rate": 4.8817128627111306e-05, + "loss": 0.3717, + "step": 1748000 + }, + { + "epoch": 11.832097228237332, + "grad_norm": 0.3647066652774811, + "learning_rate": 4.881679027717627e-05, + "loss": 0.3721, + "step": 1748500 + }, + { + "epoch": 11.8354807275877, + "grad_norm": 0.36721131205558777, + "learning_rate": 4.881645192724123e-05, + "loss": 0.3717, + "step": 1749000 + }, + { + "epoch": 11.838864226938068, + "grad_norm": 0.36937791109085083, + "learning_rate": 4.881611357730619e-05, + "loss": 0.3704, + "step": 1749500 + }, + { + "epoch": 11.842247726288436, + "grad_norm": 0.3358730673789978, + "learning_rate": 4.881577522737116e-05, + "loss": 0.3716, + "step": 1750000 + }, + { + "epoch": 11.845631225638805, + "grad_norm": 0.38302385807037354, + "learning_rate": 4.881543687743612e-05, + "loss": 0.3718, + "step": 1750500 + }, + { + "epoch": 11.849014724989173, + "grad_norm": 0.3572796881198883, + "learning_rate": 4.8815098527501085e-05, + "loss": 0.3709, + "step": 1751000 + }, + { + "epoch": 11.852398224339542, + "grad_norm": 0.34055548906326294, + "learning_rate": 4.881476017756605e-05, + "loss": 0.3715, + "step": 1751500 + }, + { + "epoch": 11.855781723689908, + "grad_norm": 0.3258049190044403, + "learning_rate": 4.8814421827631016e-05, + "loss": 0.3711, + "step": 1752000 + }, + { + "epoch": 11.859165223040277, + "grad_norm": 0.35258248448371887, + "learning_rate": 4.881408347769598e-05, + "loss": 0.3707, + "step": 1752500 + }, + { + "epoch": 11.862548722390645, + "grad_norm": 0.35839393734931946, + "learning_rate": 4.881374512776094e-05, + "loss": 0.3718, + "step": 1753000 + }, + { + "epoch": 11.865932221741014, + "grad_norm": 0.3616470992565155, + "learning_rate": 4.8813406777825896e-05, + "loss": 0.3721, + "step": 1753500 + }, + { + "epoch": 11.869315721091382, + "grad_norm": 0.36866047978401184, + "learning_rate": 4.8813068427890865e-05, + "loss": 0.373, + "step": 1754000 + }, + { + "epoch": 11.87269922044175, + "grad_norm": 0.3095003068447113, + "learning_rate": 4.881273007795583e-05, + "loss": 0.3714, + "step": 1754500 + }, + { + "epoch": 11.876082719792118, + "grad_norm": 0.3345845341682434, + "learning_rate": 4.881239172802079e-05, + "loss": 0.3733, + "step": 1755000 + }, + { + "epoch": 11.879466219142486, + "grad_norm": 0.33528879284858704, + "learning_rate": 4.881205337808575e-05, + "loss": 0.3718, + "step": 1755500 + }, + { + "epoch": 11.882849718492855, + "grad_norm": 0.3436717092990875, + "learning_rate": 4.881171502815072e-05, + "loss": 0.3724, + "step": 1756000 + }, + { + "epoch": 11.886233217843222, + "grad_norm": 0.36002013087272644, + "learning_rate": 4.881137667821568e-05, + "loss": 0.3713, + "step": 1756500 + }, + { + "epoch": 11.88961671719359, + "grad_norm": 0.37779897451400757, + "learning_rate": 4.8811038328280644e-05, + "loss": 0.3703, + "step": 1757000 + }, + { + "epoch": 11.893000216543959, + "grad_norm": 0.34598466753959656, + "learning_rate": 4.8810699978345606e-05, + "loss": 0.3713, + "step": 1757500 + }, + { + "epoch": 11.896383715894327, + "grad_norm": 0.34416958689689636, + "learning_rate": 4.881036162841057e-05, + "loss": 0.3693, + "step": 1758000 + }, + { + "epoch": 11.899767215244694, + "grad_norm": 0.3440912067890167, + "learning_rate": 4.881002327847553e-05, + "loss": 0.3707, + "step": 1758500 + }, + { + "epoch": 11.903150714595062, + "grad_norm": 0.34557539224624634, + "learning_rate": 4.880968492854049e-05, + "loss": 0.3724, + "step": 1759000 + }, + { + "epoch": 11.90653421394543, + "grad_norm": 0.3842761516571045, + "learning_rate": 4.880934657860546e-05, + "loss": 0.3726, + "step": 1759500 + }, + { + "epoch": 11.9099177132958, + "grad_norm": 0.37416863441467285, + "learning_rate": 4.8809008228670424e-05, + "loss": 0.3715, + "step": 1760000 + }, + { + "epoch": 11.913301212646168, + "grad_norm": 0.3516014516353607, + "learning_rate": 4.8808669878735386e-05, + "loss": 0.3728, + "step": 1760500 + }, + { + "epoch": 11.916684711996535, + "grad_norm": 0.36235132813453674, + "learning_rate": 4.880833152880035e-05, + "loss": 0.3712, + "step": 1761000 + }, + { + "epoch": 11.920068211346903, + "grad_norm": 0.34364044666290283, + "learning_rate": 4.880799317886532e-05, + "loss": 0.3719, + "step": 1761500 + }, + { + "epoch": 11.923451710697272, + "grad_norm": 0.380653977394104, + "learning_rate": 4.880765482893028e-05, + "loss": 0.3713, + "step": 1762000 + }, + { + "epoch": 11.92683521004764, + "grad_norm": 0.40505051612854004, + "learning_rate": 4.880731647899524e-05, + "loss": 0.3727, + "step": 1762500 + }, + { + "epoch": 11.930218709398007, + "grad_norm": 0.3548690676689148, + "learning_rate": 4.8806978129060196e-05, + "loss": 0.3725, + "step": 1763000 + }, + { + "epoch": 11.933602208748376, + "grad_norm": 0.34525832533836365, + "learning_rate": 4.8806639779125165e-05, + "loss": 0.3727, + "step": 1763500 + }, + { + "epoch": 11.936985708098744, + "grad_norm": 0.3858516216278076, + "learning_rate": 4.880630142919013e-05, + "loss": 0.3726, + "step": 1764000 + }, + { + "epoch": 11.940369207449113, + "grad_norm": 0.3259793817996979, + "learning_rate": 4.880596307925509e-05, + "loss": 0.3715, + "step": 1764500 + }, + { + "epoch": 11.943752706799481, + "grad_norm": 0.3813071846961975, + "learning_rate": 4.880562472932005e-05, + "loss": 0.374, + "step": 1765000 + }, + { + "epoch": 11.947136206149848, + "grad_norm": 0.379879355430603, + "learning_rate": 4.880528637938502e-05, + "loss": 0.3727, + "step": 1765500 + }, + { + "epoch": 11.950519705500216, + "grad_norm": 0.35107338428497314, + "learning_rate": 4.880494802944998e-05, + "loss": 0.3722, + "step": 1766000 + }, + { + "epoch": 11.953903204850585, + "grad_norm": 0.33819907903671265, + "learning_rate": 4.8804609679514945e-05, + "loss": 0.3712, + "step": 1766500 + }, + { + "epoch": 11.957286704200953, + "grad_norm": 0.37626633048057556, + "learning_rate": 4.880427132957991e-05, + "loss": 0.3712, + "step": 1767000 + }, + { + "epoch": 11.96067020355132, + "grad_norm": 0.3538345694541931, + "learning_rate": 4.8803932979644876e-05, + "loss": 0.3704, + "step": 1767500 + }, + { + "epoch": 11.964053702901689, + "grad_norm": 0.32092374563217163, + "learning_rate": 4.880359462970983e-05, + "loss": 0.3711, + "step": 1768000 + }, + { + "epoch": 11.967437202252057, + "grad_norm": 0.3457582890987396, + "learning_rate": 4.880325627977479e-05, + "loss": 0.3706, + "step": 1768500 + }, + { + "epoch": 11.970820701602426, + "grad_norm": 0.350088894367218, + "learning_rate": 4.880291792983976e-05, + "loss": 0.3712, + "step": 1769000 + }, + { + "epoch": 11.974204200952794, + "grad_norm": 0.3614156246185303, + "learning_rate": 4.8802579579904724e-05, + "loss": 0.373, + "step": 1769500 + }, + { + "epoch": 11.977587700303161, + "grad_norm": 0.36072155833244324, + "learning_rate": 4.8802241229969686e-05, + "loss": 0.3721, + "step": 1770000 + }, + { + "epoch": 11.98097119965353, + "grad_norm": 0.38477975130081177, + "learning_rate": 4.880190288003465e-05, + "loss": 0.3713, + "step": 1770500 + }, + { + "epoch": 11.984354699003898, + "grad_norm": 0.3696984052658081, + "learning_rate": 4.880156453009961e-05, + "loss": 0.3723, + "step": 1771000 + }, + { + "epoch": 11.987738198354267, + "grad_norm": 0.3442613184452057, + "learning_rate": 4.880122618016458e-05, + "loss": 0.3723, + "step": 1771500 + }, + { + "epoch": 11.991121697704633, + "grad_norm": 0.34321239590644836, + "learning_rate": 4.880088783022954e-05, + "loss": 0.3742, + "step": 1772000 + }, + { + "epoch": 11.994505197055002, + "grad_norm": 0.3402617871761322, + "learning_rate": 4.88005494802945e-05, + "loss": 0.3712, + "step": 1772500 + }, + { + "epoch": 11.99788869640537, + "grad_norm": 0.34689807891845703, + "learning_rate": 4.8800211130359466e-05, + "loss": 0.3707, + "step": 1773000 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.8585814519632364, + "eval_loss": 0.5731452703475952, + "eval_runtime": 3384.9935, + "eval_samples_per_second": 85.892, + "eval_steps_per_second": 5.368, + "step": 1773312 + }, + { + "epoch": 12.001272195755739, + "grad_norm": 0.37968823313713074, + "learning_rate": 4.879987278042443e-05, + "loss": 0.3703, + "step": 1773500 + }, + { + "epoch": 12.004655695106107, + "grad_norm": 0.3329985737800598, + "learning_rate": 4.879953443048939e-05, + "loss": 0.3683, + "step": 1774000 + }, + { + "epoch": 12.008039194456474, + "grad_norm": 0.3562377393245697, + "learning_rate": 4.879919608055435e-05, + "loss": 0.3696, + "step": 1774500 + }, + { + "epoch": 12.011422693806843, + "grad_norm": 0.37858709692955017, + "learning_rate": 4.879885773061932e-05, + "loss": 0.37, + "step": 1775000 + }, + { + "epoch": 12.014806193157211, + "grad_norm": 0.33354243636131287, + "learning_rate": 4.879851938068428e-05, + "loss": 0.3695, + "step": 1775500 + }, + { + "epoch": 12.01818969250758, + "grad_norm": 0.36619076132774353, + "learning_rate": 4.8798181030749245e-05, + "loss": 0.3693, + "step": 1776000 + }, + { + "epoch": 12.021573191857946, + "grad_norm": 0.332576185464859, + "learning_rate": 4.879784268081421e-05, + "loss": 0.3706, + "step": 1776500 + }, + { + "epoch": 12.024956691208315, + "grad_norm": 0.36819761991500854, + "learning_rate": 4.8797504330879176e-05, + "loss": 0.3701, + "step": 1777000 + }, + { + "epoch": 12.028340190558684, + "grad_norm": 0.3572423458099365, + "learning_rate": 4.879716598094413e-05, + "loss": 0.3709, + "step": 1777500 + }, + { + "epoch": 12.031723689909052, + "grad_norm": 0.3288554847240448, + "learning_rate": 4.8796827631009094e-05, + "loss": 0.369, + "step": 1778000 + }, + { + "epoch": 12.035107189259419, + "grad_norm": 0.37519699335098267, + "learning_rate": 4.8796489281074056e-05, + "loss": 0.3687, + "step": 1778500 + }, + { + "epoch": 12.038490688609787, + "grad_norm": 0.3955352008342743, + "learning_rate": 4.8796150931139025e-05, + "loss": 0.3702, + "step": 1779000 + }, + { + "epoch": 12.041874187960156, + "grad_norm": 0.39312025904655457, + "learning_rate": 4.879581258120399e-05, + "loss": 0.3703, + "step": 1779500 + }, + { + "epoch": 12.045257687310524, + "grad_norm": 0.36839836835861206, + "learning_rate": 4.879547423126895e-05, + "loss": 0.3703, + "step": 1780000 + }, + { + "epoch": 12.048641186660893, + "grad_norm": 0.36080053448677063, + "learning_rate": 4.879513588133391e-05, + "loss": 0.3708, + "step": 1780500 + }, + { + "epoch": 12.05202468601126, + "grad_norm": 0.3614180088043213, + "learning_rate": 4.879479753139888e-05, + "loss": 0.3695, + "step": 1781000 + }, + { + "epoch": 12.055408185361628, + "grad_norm": 0.36564359068870544, + "learning_rate": 4.879445918146384e-05, + "loss": 0.3713, + "step": 1781500 + }, + { + "epoch": 12.058791684711997, + "grad_norm": 0.3622540831565857, + "learning_rate": 4.87941208315288e-05, + "loss": 0.37, + "step": 1782000 + }, + { + "epoch": 12.062175184062365, + "grad_norm": 0.36027461290359497, + "learning_rate": 4.8793782481593766e-05, + "loss": 0.37, + "step": 1782500 + }, + { + "epoch": 12.065558683412732, + "grad_norm": 0.34156128764152527, + "learning_rate": 4.879344413165873e-05, + "loss": 0.3679, + "step": 1783000 + }, + { + "epoch": 12.0689421827631, + "grad_norm": 0.3399154543876648, + "learning_rate": 4.879310578172369e-05, + "loss": 0.371, + "step": 1783500 + }, + { + "epoch": 12.072325682113469, + "grad_norm": 0.34678414463996887, + "learning_rate": 4.879276743178865e-05, + "loss": 0.3698, + "step": 1784000 + }, + { + "epoch": 12.075709181463838, + "grad_norm": 0.3499497175216675, + "learning_rate": 4.879242908185362e-05, + "loss": 0.3715, + "step": 1784500 + }, + { + "epoch": 12.079092680814206, + "grad_norm": 0.35000520944595337, + "learning_rate": 4.8792090731918584e-05, + "loss": 0.3699, + "step": 1785000 + }, + { + "epoch": 12.082476180164573, + "grad_norm": 0.4421665668487549, + "learning_rate": 4.8791752381983546e-05, + "loss": 0.3698, + "step": 1785500 + }, + { + "epoch": 12.085859679514941, + "grad_norm": 0.368918776512146, + "learning_rate": 4.879141403204851e-05, + "loss": 0.3693, + "step": 1786000 + }, + { + "epoch": 12.08924317886531, + "grad_norm": 0.3288615643978119, + "learning_rate": 4.879107568211348e-05, + "loss": 0.37, + "step": 1786500 + }, + { + "epoch": 12.092626678215678, + "grad_norm": 0.35252460837364197, + "learning_rate": 4.879073733217843e-05, + "loss": 0.3693, + "step": 1787000 + }, + { + "epoch": 12.096010177566045, + "grad_norm": 0.39786890149116516, + "learning_rate": 4.8790398982243394e-05, + "loss": 0.3714, + "step": 1787500 + }, + { + "epoch": 12.099393676916414, + "grad_norm": 0.35984885692596436, + "learning_rate": 4.8790060632308357e-05, + "loss": 0.3693, + "step": 1788000 + }, + { + "epoch": 12.102777176266782, + "grad_norm": 0.367716908454895, + "learning_rate": 4.8789722282373325e-05, + "loss": 0.3693, + "step": 1788500 + }, + { + "epoch": 12.10616067561715, + "grad_norm": 0.3643549680709839, + "learning_rate": 4.878938393243829e-05, + "loss": 0.3705, + "step": 1789000 + }, + { + "epoch": 12.10954417496752, + "grad_norm": 0.3920140862464905, + "learning_rate": 4.878904558250325e-05, + "loss": 0.3675, + "step": 1789500 + }, + { + "epoch": 12.112927674317886, + "grad_norm": 0.36466044187545776, + "learning_rate": 4.878870723256821e-05, + "loss": 0.3684, + "step": 1790000 + }, + { + "epoch": 12.116311173668254, + "grad_norm": 0.38179928064346313, + "learning_rate": 4.878836888263318e-05, + "loss": 0.3713, + "step": 1790500 + }, + { + "epoch": 12.119694673018623, + "grad_norm": 0.35445892810821533, + "learning_rate": 4.878803053269814e-05, + "loss": 0.3703, + "step": 1791000 + }, + { + "epoch": 12.123078172368992, + "grad_norm": 0.388711541891098, + "learning_rate": 4.87876921827631e-05, + "loss": 0.3699, + "step": 1791500 + }, + { + "epoch": 12.126461671719358, + "grad_norm": 0.3741181790828705, + "learning_rate": 4.878735383282807e-05, + "loss": 0.3698, + "step": 1792000 + }, + { + "epoch": 12.129845171069727, + "grad_norm": 0.3510143756866455, + "learning_rate": 4.878701548289303e-05, + "loss": 0.3714, + "step": 1792500 + }, + { + "epoch": 12.133228670420095, + "grad_norm": 0.49974462389945984, + "learning_rate": 4.878667713295799e-05, + "loss": 0.3694, + "step": 1793000 + }, + { + "epoch": 12.136612169770464, + "grad_norm": 0.3473338782787323, + "learning_rate": 4.8786338783022953e-05, + "loss": 0.37, + "step": 1793500 + }, + { + "epoch": 12.139995669120832, + "grad_norm": 0.36775150895118713, + "learning_rate": 4.878600043308792e-05, + "loss": 0.3713, + "step": 1794000 + }, + { + "epoch": 12.1433791684712, + "grad_norm": 0.38027772307395935, + "learning_rate": 4.8785662083152884e-05, + "loss": 0.3685, + "step": 1794500 + }, + { + "epoch": 12.146762667821568, + "grad_norm": 0.3760400414466858, + "learning_rate": 4.8785323733217847e-05, + "loss": 0.3713, + "step": 1795000 + }, + { + "epoch": 12.150146167171936, + "grad_norm": 0.3490118384361267, + "learning_rate": 4.878498538328281e-05, + "loss": 0.3692, + "step": 1795500 + }, + { + "epoch": 12.153529666522305, + "grad_norm": 0.3494408130645752, + "learning_rate": 4.878464703334778e-05, + "loss": 0.371, + "step": 1796000 + }, + { + "epoch": 12.156913165872671, + "grad_norm": 0.4272332787513733, + "learning_rate": 4.878430868341273e-05, + "loss": 0.3702, + "step": 1796500 + }, + { + "epoch": 12.16029666522304, + "grad_norm": 0.35877782106399536, + "learning_rate": 4.8783970333477695e-05, + "loss": 0.37, + "step": 1797000 + }, + { + "epoch": 12.163680164573409, + "grad_norm": 0.3769785761833191, + "learning_rate": 4.878363198354266e-05, + "loss": 0.3702, + "step": 1797500 + }, + { + "epoch": 12.167063663923777, + "grad_norm": 0.35167497396469116, + "learning_rate": 4.8783293633607626e-05, + "loss": 0.3726, + "step": 1798000 + }, + { + "epoch": 12.170447163274144, + "grad_norm": 0.35780346393585205, + "learning_rate": 4.878295528367259e-05, + "loss": 0.3706, + "step": 1798500 + }, + { + "epoch": 12.173830662624512, + "grad_norm": 0.33106979727745056, + "learning_rate": 4.878261693373755e-05, + "loss": 0.3696, + "step": 1799000 + }, + { + "epoch": 12.17721416197488, + "grad_norm": 0.3905043303966522, + "learning_rate": 4.878227858380251e-05, + "loss": 0.37, + "step": 1799500 + }, + { + "epoch": 12.18059766132525, + "grad_norm": 0.367580771446228, + "learning_rate": 4.878194023386748e-05, + "loss": 0.3702, + "step": 1800000 + }, + { + "epoch": 12.183981160675618, + "grad_norm": 0.3430144786834717, + "learning_rate": 4.8781601883932443e-05, + "loss": 0.3712, + "step": 1800500 + }, + { + "epoch": 12.187364660025985, + "grad_norm": 0.37266072630882263, + "learning_rate": 4.87812635339974e-05, + "loss": 0.3711, + "step": 1801000 + }, + { + "epoch": 12.190748159376353, + "grad_norm": 0.36405715346336365, + "learning_rate": 4.878092518406237e-05, + "loss": 0.3711, + "step": 1801500 + }, + { + "epoch": 12.194131658726722, + "grad_norm": 0.3532100319862366, + "learning_rate": 4.878058683412733e-05, + "loss": 0.3703, + "step": 1802000 + }, + { + "epoch": 12.19751515807709, + "grad_norm": 0.36643290519714355, + "learning_rate": 4.878024848419229e-05, + "loss": 0.371, + "step": 1802500 + }, + { + "epoch": 12.200898657427457, + "grad_norm": 0.3668473958969116, + "learning_rate": 4.8779910134257254e-05, + "loss": 0.3703, + "step": 1803000 + }, + { + "epoch": 12.204282156777825, + "grad_norm": 0.35221970081329346, + "learning_rate": 4.877957178432222e-05, + "loss": 0.3706, + "step": 1803500 + }, + { + "epoch": 12.207665656128194, + "grad_norm": 0.30639296770095825, + "learning_rate": 4.8779233434387185e-05, + "loss": 0.3721, + "step": 1804000 + }, + { + "epoch": 12.211049155478563, + "grad_norm": 0.3235289454460144, + "learning_rate": 4.877889508445215e-05, + "loss": 0.3707, + "step": 1804500 + }, + { + "epoch": 12.214432654828931, + "grad_norm": 0.3486664593219757, + "learning_rate": 4.877855673451711e-05, + "loss": 0.3713, + "step": 1805000 + }, + { + "epoch": 12.217816154179298, + "grad_norm": 0.33962762355804443, + "learning_rate": 4.877821838458208e-05, + "loss": 0.3695, + "step": 1805500 + }, + { + "epoch": 12.221199653529666, + "grad_norm": 0.3924519419670105, + "learning_rate": 4.8777880034647034e-05, + "loss": 0.3721, + "step": 1806000 + }, + { + "epoch": 12.224583152880035, + "grad_norm": 0.3697432577610016, + "learning_rate": 4.8777541684711996e-05, + "loss": 0.3702, + "step": 1806500 + }, + { + "epoch": 12.227966652230403, + "grad_norm": 0.4088262617588043, + "learning_rate": 4.877720333477696e-05, + "loss": 0.369, + "step": 1807000 + }, + { + "epoch": 12.23135015158077, + "grad_norm": 0.3665180504322052, + "learning_rate": 4.877686498484193e-05, + "loss": 0.3706, + "step": 1807500 + }, + { + "epoch": 12.234733650931139, + "grad_norm": 0.35706061124801636, + "learning_rate": 4.877652663490689e-05, + "loss": 0.3693, + "step": 1808000 + }, + { + "epoch": 12.238117150281507, + "grad_norm": 0.3554570972919464, + "learning_rate": 4.877618828497185e-05, + "loss": 0.3704, + "step": 1808500 + }, + { + "epoch": 12.241500649631876, + "grad_norm": 0.35334378480911255, + "learning_rate": 4.877584993503681e-05, + "loss": 0.371, + "step": 1809000 + }, + { + "epoch": 12.244884148982244, + "grad_norm": 0.41907766461372375, + "learning_rate": 4.877551158510178e-05, + "loss": 0.3705, + "step": 1809500 + }, + { + "epoch": 12.248267648332611, + "grad_norm": 0.3961111009120941, + "learning_rate": 4.8775173235166744e-05, + "loss": 0.3706, + "step": 1810000 + }, + { + "epoch": 12.25165114768298, + "grad_norm": 0.3752531409263611, + "learning_rate": 4.87748348852317e-05, + "loss": 0.3714, + "step": 1810500 + }, + { + "epoch": 12.255034647033348, + "grad_norm": 0.37807732820510864, + "learning_rate": 4.877449653529667e-05, + "loss": 0.3712, + "step": 1811000 + }, + { + "epoch": 12.258418146383717, + "grad_norm": 0.370071679353714, + "learning_rate": 4.877415818536163e-05, + "loss": 0.3699, + "step": 1811500 + }, + { + "epoch": 12.261801645734083, + "grad_norm": 0.3707692325115204, + "learning_rate": 4.877381983542659e-05, + "loss": 0.3711, + "step": 1812000 + }, + { + "epoch": 12.265185145084452, + "grad_norm": 0.35802507400512695, + "learning_rate": 4.8773481485491555e-05, + "loss": 0.3695, + "step": 1812500 + }, + { + "epoch": 12.26856864443482, + "grad_norm": 0.39189577102661133, + "learning_rate": 4.8773143135556524e-05, + "loss": 0.371, + "step": 1813000 + }, + { + "epoch": 12.271952143785189, + "grad_norm": 0.34621888399124146, + "learning_rate": 4.8772804785621486e-05, + "loss": 0.3706, + "step": 1813500 + }, + { + "epoch": 12.275335643135557, + "grad_norm": 0.40879249572753906, + "learning_rate": 4.877246643568645e-05, + "loss": 0.3698, + "step": 1814000 + }, + { + "epoch": 12.278719142485924, + "grad_norm": 0.3463546633720398, + "learning_rate": 4.877212808575141e-05, + "loss": 0.3713, + "step": 1814500 + }, + { + "epoch": 12.282102641836293, + "grad_norm": 0.35812705755233765, + "learning_rate": 4.877178973581638e-05, + "loss": 0.3712, + "step": 1815000 + }, + { + "epoch": 12.285486141186661, + "grad_norm": 0.3232555687427521, + "learning_rate": 4.8771451385881334e-05, + "loss": 0.3711, + "step": 1815500 + }, + { + "epoch": 12.28886964053703, + "grad_norm": 0.3548388183116913, + "learning_rate": 4.8771113035946296e-05, + "loss": 0.3695, + "step": 1816000 + }, + { + "epoch": 12.292253139887396, + "grad_norm": 0.3918019235134125, + "learning_rate": 4.877077468601126e-05, + "loss": 0.3708, + "step": 1816500 + }, + { + "epoch": 12.295636639237765, + "grad_norm": 0.36075350642204285, + "learning_rate": 4.877043633607623e-05, + "loss": 0.3696, + "step": 1817000 + }, + { + "epoch": 12.299020138588133, + "grad_norm": 0.364998459815979, + "learning_rate": 4.877009798614119e-05, + "loss": 0.3703, + "step": 1817500 + }, + { + "epoch": 12.302403637938502, + "grad_norm": 0.34095925092697144, + "learning_rate": 4.876975963620615e-05, + "loss": 0.3699, + "step": 1818000 + }, + { + "epoch": 12.30578713728887, + "grad_norm": 0.33013880252838135, + "learning_rate": 4.8769421286271114e-05, + "loss": 0.3708, + "step": 1818500 + }, + { + "epoch": 12.309170636639237, + "grad_norm": 0.3411044180393219, + "learning_rate": 4.876908293633608e-05, + "loss": 0.3706, + "step": 1819000 + }, + { + "epoch": 12.312554135989606, + "grad_norm": 0.3632298707962036, + "learning_rate": 4.8768744586401045e-05, + "loss": 0.3705, + "step": 1819500 + }, + { + "epoch": 12.315937635339974, + "grad_norm": 0.33505117893218994, + "learning_rate": 4.876840623646601e-05, + "loss": 0.3713, + "step": 1820000 + }, + { + "epoch": 12.319321134690343, + "grad_norm": 0.35253992676734924, + "learning_rate": 4.876806788653097e-05, + "loss": 0.3726, + "step": 1820500 + }, + { + "epoch": 12.32270463404071, + "grad_norm": 0.32772931456565857, + "learning_rate": 4.876772953659593e-05, + "loss": 0.3702, + "step": 1821000 + }, + { + "epoch": 12.326088133391078, + "grad_norm": 0.36878344416618347, + "learning_rate": 4.876739118666089e-05, + "loss": 0.37, + "step": 1821500 + }, + { + "epoch": 12.329471632741447, + "grad_norm": 0.4037615656852722, + "learning_rate": 4.8767052836725855e-05, + "loss": 0.3716, + "step": 1822000 + }, + { + "epoch": 12.332855132091815, + "grad_norm": 0.42346951365470886, + "learning_rate": 4.8766714486790824e-05, + "loss": 0.3706, + "step": 1822500 + }, + { + "epoch": 12.336238631442182, + "grad_norm": 0.36725038290023804, + "learning_rate": 4.8766376136855786e-05, + "loss": 0.3711, + "step": 1823000 + }, + { + "epoch": 12.33962213079255, + "grad_norm": 0.3480580449104309, + "learning_rate": 4.876603778692075e-05, + "loss": 0.3702, + "step": 1823500 + }, + { + "epoch": 12.343005630142919, + "grad_norm": 0.3734580874443054, + "learning_rate": 4.876569943698571e-05, + "loss": 0.3706, + "step": 1824000 + }, + { + "epoch": 12.346389129493287, + "grad_norm": 0.3935319781303406, + "learning_rate": 4.876536108705067e-05, + "loss": 0.3711, + "step": 1824500 + }, + { + "epoch": 12.349772628843656, + "grad_norm": 0.3282531797885895, + "learning_rate": 4.8765022737115635e-05, + "loss": 0.3707, + "step": 1825000 + }, + { + "epoch": 12.353156128194023, + "grad_norm": 0.4004055857658386, + "learning_rate": 4.87646843871806e-05, + "loss": 0.3713, + "step": 1825500 + }, + { + "epoch": 12.356539627544391, + "grad_norm": 0.3112265169620514, + "learning_rate": 4.876434603724556e-05, + "loss": 0.372, + "step": 1826000 + }, + { + "epoch": 12.35992312689476, + "grad_norm": 0.3350541889667511, + "learning_rate": 4.876400768731053e-05, + "loss": 0.3702, + "step": 1826500 + }, + { + "epoch": 12.363306626245128, + "grad_norm": 0.3925771713256836, + "learning_rate": 4.876366933737549e-05, + "loss": 0.3708, + "step": 1827000 + }, + { + "epoch": 12.366690125595495, + "grad_norm": 0.40490618348121643, + "learning_rate": 4.876333098744045e-05, + "loss": 0.37, + "step": 1827500 + }, + { + "epoch": 12.370073624945864, + "grad_norm": 0.3550584316253662, + "learning_rate": 4.8762992637505414e-05, + "loss": 0.3725, + "step": 1828000 + }, + { + "epoch": 12.373457124296232, + "grad_norm": 0.37563541531562805, + "learning_rate": 4.876265428757038e-05, + "loss": 0.3702, + "step": 1828500 + }, + { + "epoch": 12.3768406236466, + "grad_norm": 0.33646509051322937, + "learning_rate": 4.8762315937635345e-05, + "loss": 0.3712, + "step": 1829000 + }, + { + "epoch": 12.38022412299697, + "grad_norm": 0.32552415132522583, + "learning_rate": 4.876197758770031e-05, + "loss": 0.3722, + "step": 1829500 + }, + { + "epoch": 12.383607622347336, + "grad_norm": 0.36574462056159973, + "learning_rate": 4.876163923776527e-05, + "loss": 0.3693, + "step": 1830000 + }, + { + "epoch": 12.386991121697704, + "grad_norm": 0.37173759937286377, + "learning_rate": 4.876130088783023e-05, + "loss": 0.3693, + "step": 1830500 + }, + { + "epoch": 12.390374621048073, + "grad_norm": 0.38284623622894287, + "learning_rate": 4.8760962537895194e-05, + "loss": 0.37, + "step": 1831000 + }, + { + "epoch": 12.393758120398441, + "grad_norm": 0.3950818181037903, + "learning_rate": 4.8760624187960156e-05, + "loss": 0.3706, + "step": 1831500 + }, + { + "epoch": 12.397141619748808, + "grad_norm": 0.42820975184440613, + "learning_rate": 4.8760285838025125e-05, + "loss": 0.3709, + "step": 1832000 + }, + { + "epoch": 12.400525119099177, + "grad_norm": 0.4042028486728668, + "learning_rate": 4.875994748809009e-05, + "loss": 0.371, + "step": 1832500 + }, + { + "epoch": 12.403908618449545, + "grad_norm": 0.3339955508708954, + "learning_rate": 4.875960913815505e-05, + "loss": 0.3708, + "step": 1833000 + }, + { + "epoch": 12.407292117799914, + "grad_norm": 0.34510526061058044, + "learning_rate": 4.875927078822001e-05, + "loss": 0.3696, + "step": 1833500 + }, + { + "epoch": 12.410675617150282, + "grad_norm": 0.36796268820762634, + "learning_rate": 4.875893243828497e-05, + "loss": 0.372, + "step": 1834000 + }, + { + "epoch": 12.414059116500649, + "grad_norm": 0.38430359959602356, + "learning_rate": 4.8758594088349935e-05, + "loss": 0.3706, + "step": 1834500 + }, + { + "epoch": 12.417442615851018, + "grad_norm": 0.39185529947280884, + "learning_rate": 4.87582557384149e-05, + "loss": 0.3708, + "step": 1835000 + }, + { + "epoch": 12.420826115201386, + "grad_norm": 0.3595646023750305, + "learning_rate": 4.875791738847986e-05, + "loss": 0.3729, + "step": 1835500 + }, + { + "epoch": 12.424209614551755, + "grad_norm": 0.3638467490673065, + "learning_rate": 4.875757903854483e-05, + "loss": 0.3716, + "step": 1836000 + }, + { + "epoch": 12.427593113902121, + "grad_norm": 0.37463465332984924, + "learning_rate": 4.875724068860979e-05, + "loss": 0.371, + "step": 1836500 + }, + { + "epoch": 12.43097661325249, + "grad_norm": 0.32411718368530273, + "learning_rate": 4.875690233867475e-05, + "loss": 0.371, + "step": 1837000 + }, + { + "epoch": 12.434360112602858, + "grad_norm": 0.33750879764556885, + "learning_rate": 4.8756563988739715e-05, + "loss": 0.3708, + "step": 1837500 + }, + { + "epoch": 12.437743611953227, + "grad_norm": 0.33264005184173584, + "learning_rate": 4.8756225638804684e-05, + "loss": 0.3696, + "step": 1838000 + }, + { + "epoch": 12.441127111303594, + "grad_norm": 0.3644348382949829, + "learning_rate": 4.8755887288869646e-05, + "loss": 0.3701, + "step": 1838500 + }, + { + "epoch": 12.444510610653962, + "grad_norm": 0.35615313053131104, + "learning_rate": 4.875554893893461e-05, + "loss": 0.3707, + "step": 1839000 + }, + { + "epoch": 12.44789411000433, + "grad_norm": 0.37479841709136963, + "learning_rate": 4.875521058899957e-05, + "loss": 0.3715, + "step": 1839500 + }, + { + "epoch": 12.4512776093547, + "grad_norm": 0.3680063784122467, + "learning_rate": 4.875487223906453e-05, + "loss": 0.372, + "step": 1840000 + }, + { + "epoch": 12.454661108705068, + "grad_norm": 0.3814094662666321, + "learning_rate": 4.8754533889129494e-05, + "loss": 0.3724, + "step": 1840500 + }, + { + "epoch": 12.458044608055435, + "grad_norm": 0.3767112195491791, + "learning_rate": 4.875419553919446e-05, + "loss": 0.3696, + "step": 1841000 + }, + { + "epoch": 12.461428107405803, + "grad_norm": 0.34589385986328125, + "learning_rate": 4.875385718925942e-05, + "loss": 0.3715, + "step": 1841500 + }, + { + "epoch": 12.464811606756172, + "grad_norm": 0.3726743757724762, + "learning_rate": 4.875351883932439e-05, + "loss": 0.3696, + "step": 1842000 + }, + { + "epoch": 12.46819510610654, + "grad_norm": 0.34632807970046997, + "learning_rate": 4.875318048938935e-05, + "loss": 0.3723, + "step": 1842500 + }, + { + "epoch": 12.471578605456909, + "grad_norm": 0.3409026861190796, + "learning_rate": 4.875284213945431e-05, + "loss": 0.3708, + "step": 1843000 + }, + { + "epoch": 12.474962104807275, + "grad_norm": 0.3455606698989868, + "learning_rate": 4.8752503789519274e-05, + "loss": 0.3713, + "step": 1843500 + }, + { + "epoch": 12.478345604157644, + "grad_norm": 0.3865852653980255, + "learning_rate": 4.8752165439584236e-05, + "loss": 0.3692, + "step": 1844000 + }, + { + "epoch": 12.481729103508012, + "grad_norm": 0.375196635723114, + "learning_rate": 4.87518270896492e-05, + "loss": 0.3717, + "step": 1844500 + }, + { + "epoch": 12.485112602858381, + "grad_norm": 0.3795843720436096, + "learning_rate": 4.875148873971416e-05, + "loss": 0.3705, + "step": 1845000 + }, + { + "epoch": 12.488496102208748, + "grad_norm": 0.4108113944530487, + "learning_rate": 4.875115038977913e-05, + "loss": 0.3706, + "step": 1845500 + }, + { + "epoch": 12.491879601559116, + "grad_norm": 0.3474077880382538, + "learning_rate": 4.875081203984409e-05, + "loss": 0.3708, + "step": 1846000 + }, + { + "epoch": 12.495263100909485, + "grad_norm": 0.3858991861343384, + "learning_rate": 4.8750473689909054e-05, + "loss": 0.3698, + "step": 1846500 + }, + { + "epoch": 12.498646600259853, + "grad_norm": 0.3526313006877899, + "learning_rate": 4.8750135339974016e-05, + "loss": 0.3713, + "step": 1847000 + }, + { + "epoch": 12.50203009961022, + "grad_norm": 0.38953834772109985, + "learning_rate": 4.8749796990038985e-05, + "loss": 0.3718, + "step": 1847500 + }, + { + "epoch": 12.505413598960589, + "grad_norm": 0.4159233868122101, + "learning_rate": 4.874945864010395e-05, + "loss": 0.371, + "step": 1848000 + }, + { + "epoch": 12.508797098310957, + "grad_norm": 0.3940291404724121, + "learning_rate": 4.874912029016891e-05, + "loss": 0.3686, + "step": 1848500 + }, + { + "epoch": 12.512180597661326, + "grad_norm": 0.35747018456459045, + "learning_rate": 4.8748781940233864e-05, + "loss": 0.3699, + "step": 1849000 + }, + { + "epoch": 12.515564097011694, + "grad_norm": 0.35041186213493347, + "learning_rate": 4.874844359029883e-05, + "loss": 0.3717, + "step": 1849500 + }, + { + "epoch": 12.518947596362061, + "grad_norm": 0.37993401288986206, + "learning_rate": 4.8748105240363795e-05, + "loss": 0.3705, + "step": 1850000 + }, + { + "epoch": 12.52233109571243, + "grad_norm": 0.35090896487236023, + "learning_rate": 4.874776689042876e-05, + "loss": 0.37, + "step": 1850500 + }, + { + "epoch": 12.525714595062798, + "grad_norm": 0.3232038915157318, + "learning_rate": 4.874742854049372e-05, + "loss": 0.3695, + "step": 1851000 + }, + { + "epoch": 12.529098094413166, + "grad_norm": 0.3876701891422272, + "learning_rate": 4.874709019055869e-05, + "loss": 0.37, + "step": 1851500 + }, + { + "epoch": 12.532481593763533, + "grad_norm": 0.3513367474079132, + "learning_rate": 4.874675184062365e-05, + "loss": 0.3707, + "step": 1852000 + }, + { + "epoch": 12.535865093113902, + "grad_norm": 0.3564150333404541, + "learning_rate": 4.874641349068861e-05, + "loss": 0.3717, + "step": 1852500 + }, + { + "epoch": 12.53924859246427, + "grad_norm": 0.3708456754684448, + "learning_rate": 4.8746075140753575e-05, + "loss": 0.3713, + "step": 1853000 + }, + { + "epoch": 12.542632091814639, + "grad_norm": 0.3454448878765106, + "learning_rate": 4.874573679081854e-05, + "loss": 0.3691, + "step": 1853500 + }, + { + "epoch": 12.546015591165007, + "grad_norm": 0.34445688128471375, + "learning_rate": 4.87453984408835e-05, + "loss": 0.3701, + "step": 1854000 + }, + { + "epoch": 12.549399090515374, + "grad_norm": 0.35106971859931946, + "learning_rate": 4.874506009094846e-05, + "loss": 0.3706, + "step": 1854500 + }, + { + "epoch": 12.552782589865743, + "grad_norm": 0.3719109892845154, + "learning_rate": 4.874472174101343e-05, + "loss": 0.3718, + "step": 1855000 + }, + { + "epoch": 12.556166089216111, + "grad_norm": 0.3739451766014099, + "learning_rate": 4.874438339107839e-05, + "loss": 0.3707, + "step": 1855500 + }, + { + "epoch": 12.55954958856648, + "grad_norm": 0.3855455815792084, + "learning_rate": 4.8744045041143354e-05, + "loss": 0.3712, + "step": 1856000 + }, + { + "epoch": 12.562933087916846, + "grad_norm": 0.3706997334957123, + "learning_rate": 4.8743706691208316e-05, + "loss": 0.372, + "step": 1856500 + }, + { + "epoch": 12.566316587267215, + "grad_norm": 0.34375709295272827, + "learning_rate": 4.8743368341273285e-05, + "loss": 0.3708, + "step": 1857000 + }, + { + "epoch": 12.569700086617583, + "grad_norm": 0.3563523292541504, + "learning_rate": 4.874302999133825e-05, + "loss": 0.3722, + "step": 1857500 + }, + { + "epoch": 12.573083585967952, + "grad_norm": 0.35885095596313477, + "learning_rate": 4.874269164140321e-05, + "loss": 0.3705, + "step": 1858000 + }, + { + "epoch": 12.57646708531832, + "grad_norm": 0.38173002004623413, + "learning_rate": 4.8742353291468165e-05, + "loss": 0.3709, + "step": 1858500 + }, + { + "epoch": 12.579850584668687, + "grad_norm": 0.35054492950439453, + "learning_rate": 4.8742014941533134e-05, + "loss": 0.3701, + "step": 1859000 + }, + { + "epoch": 12.583234084019056, + "grad_norm": 0.34447476267814636, + "learning_rate": 4.8741676591598096e-05, + "loss": 0.3708, + "step": 1859500 + }, + { + "epoch": 12.586617583369424, + "grad_norm": 0.3595876395702362, + "learning_rate": 4.874133824166306e-05, + "loss": 0.371, + "step": 1860000 + }, + { + "epoch": 12.590001082719793, + "grad_norm": 0.37683749198913574, + "learning_rate": 4.874099989172802e-05, + "loss": 0.3721, + "step": 1860500 + }, + { + "epoch": 12.59338458207016, + "grad_norm": 0.3663787841796875, + "learning_rate": 4.874066154179299e-05, + "loss": 0.3711, + "step": 1861000 + }, + { + "epoch": 12.596768081420528, + "grad_norm": 0.37803715467453003, + "learning_rate": 4.874032319185795e-05, + "loss": 0.3712, + "step": 1861500 + }, + { + "epoch": 12.600151580770897, + "grad_norm": 0.4103628695011139, + "learning_rate": 4.873998484192291e-05, + "loss": 0.3715, + "step": 1862000 + }, + { + "epoch": 12.603535080121265, + "grad_norm": 0.3537883162498474, + "learning_rate": 4.8739646491987875e-05, + "loss": 0.3711, + "step": 1862500 + }, + { + "epoch": 12.606918579471632, + "grad_norm": 0.3754059374332428, + "learning_rate": 4.873930814205284e-05, + "loss": 0.3702, + "step": 1863000 + }, + { + "epoch": 12.610302078822, + "grad_norm": 0.35886409878730774, + "learning_rate": 4.87389697921178e-05, + "loss": 0.3713, + "step": 1863500 + }, + { + "epoch": 12.613685578172369, + "grad_norm": 0.35821646451950073, + "learning_rate": 4.873863144218276e-05, + "loss": 0.3712, + "step": 1864000 + }, + { + "epoch": 12.617069077522737, + "grad_norm": 0.36803847551345825, + "learning_rate": 4.873829309224773e-05, + "loss": 0.3706, + "step": 1864500 + }, + { + "epoch": 12.620452576873106, + "grad_norm": 0.3754260838031769, + "learning_rate": 4.873795474231269e-05, + "loss": 0.3711, + "step": 1865000 + }, + { + "epoch": 12.623836076223473, + "grad_norm": 0.35917600989341736, + "learning_rate": 4.8737616392377655e-05, + "loss": 0.372, + "step": 1865500 + }, + { + "epoch": 12.627219575573841, + "grad_norm": 0.32796338200569153, + "learning_rate": 4.873727804244262e-05, + "loss": 0.3708, + "step": 1866000 + }, + { + "epoch": 12.63060307492421, + "grad_norm": 0.45512181520462036, + "learning_rate": 4.8736939692507586e-05, + "loss": 0.3707, + "step": 1866500 + }, + { + "epoch": 12.633986574274578, + "grad_norm": 0.36449873447418213, + "learning_rate": 4.873660134257255e-05, + "loss": 0.3713, + "step": 1867000 + }, + { + "epoch": 12.637370073624947, + "grad_norm": 0.3368343412876129, + "learning_rate": 4.873626299263751e-05, + "loss": 0.3704, + "step": 1867500 + }, + { + "epoch": 12.640753572975314, + "grad_norm": 0.37133774161338806, + "learning_rate": 4.8735924642702465e-05, + "loss": 0.3706, + "step": 1868000 + }, + { + "epoch": 12.644137072325682, + "grad_norm": 0.33808013796806335, + "learning_rate": 4.8735586292767434e-05, + "loss": 0.3692, + "step": 1868500 + }, + { + "epoch": 12.64752057167605, + "grad_norm": 0.3964240252971649, + "learning_rate": 4.8735247942832396e-05, + "loss": 0.3728, + "step": 1869000 + }, + { + "epoch": 12.65090407102642, + "grad_norm": 0.32918646931648254, + "learning_rate": 4.873490959289736e-05, + "loss": 0.3713, + "step": 1869500 + }, + { + "epoch": 12.654287570376786, + "grad_norm": 0.38005614280700684, + "learning_rate": 4.873457124296232e-05, + "loss": 0.3707, + "step": 1870000 + }, + { + "epoch": 12.657671069727154, + "grad_norm": 0.3748287856578827, + "learning_rate": 4.873423289302729e-05, + "loss": 0.3711, + "step": 1870500 + }, + { + "epoch": 12.661054569077523, + "grad_norm": 0.3982534408569336, + "learning_rate": 4.873389454309225e-05, + "loss": 0.3724, + "step": 1871000 + }, + { + "epoch": 12.664438068427891, + "grad_norm": 0.3427683115005493, + "learning_rate": 4.8733556193157214e-05, + "loss": 0.3695, + "step": 1871500 + }, + { + "epoch": 12.667821567778258, + "grad_norm": 0.3884345293045044, + "learning_rate": 4.8733217843222176e-05, + "loss": 0.3711, + "step": 1872000 + }, + { + "epoch": 12.671205067128627, + "grad_norm": 0.3692188858985901, + "learning_rate": 4.873287949328714e-05, + "loss": 0.3703, + "step": 1872500 + }, + { + "epoch": 12.674588566478995, + "grad_norm": 0.47177180647850037, + "learning_rate": 4.87325411433521e-05, + "loss": 0.3716, + "step": 1873000 + }, + { + "epoch": 12.677972065829364, + "grad_norm": 0.34280335903167725, + "learning_rate": 4.873220279341706e-05, + "loss": 0.37, + "step": 1873500 + }, + { + "epoch": 12.681355565179732, + "grad_norm": 0.39825934171676636, + "learning_rate": 4.873186444348203e-05, + "loss": 0.3704, + "step": 1874000 + }, + { + "epoch": 12.684739064530099, + "grad_norm": 0.336199551820755, + "learning_rate": 4.873152609354699e-05, + "loss": 0.3709, + "step": 1874500 + }, + { + "epoch": 12.688122563880468, + "grad_norm": 0.3393048644065857, + "learning_rate": 4.8731187743611955e-05, + "loss": 0.3704, + "step": 1875000 + }, + { + "epoch": 12.691506063230836, + "grad_norm": 0.36187732219696045, + "learning_rate": 4.873084939367692e-05, + "loss": 0.37, + "step": 1875500 + }, + { + "epoch": 12.694889562581205, + "grad_norm": 0.5153160095214844, + "learning_rate": 4.8730511043741886e-05, + "loss": 0.3698, + "step": 1876000 + }, + { + "epoch": 12.698273061931571, + "grad_norm": 0.3579491376876831, + "learning_rate": 4.873017269380685e-05, + "loss": 0.3706, + "step": 1876500 + }, + { + "epoch": 12.70165656128194, + "grad_norm": 0.3689671456813812, + "learning_rate": 4.872983434387181e-05, + "loss": 0.3711, + "step": 1877000 + }, + { + "epoch": 12.705040060632308, + "grad_norm": 0.3544732630252838, + "learning_rate": 4.8729495993936766e-05, + "loss": 0.3707, + "step": 1877500 + }, + { + "epoch": 12.708423559982677, + "grad_norm": 0.3601408898830414, + "learning_rate": 4.8729157644001735e-05, + "loss": 0.3708, + "step": 1878000 + }, + { + "epoch": 12.711807059333044, + "grad_norm": 0.37086576223373413, + "learning_rate": 4.87288192940667e-05, + "loss": 0.3712, + "step": 1878500 + }, + { + "epoch": 12.715190558683412, + "grad_norm": 0.378149151802063, + "learning_rate": 4.872848094413166e-05, + "loss": 0.3716, + "step": 1879000 + }, + { + "epoch": 12.71857405803378, + "grad_norm": 0.35186734795570374, + "learning_rate": 4.872814259419662e-05, + "loss": 0.3715, + "step": 1879500 + }, + { + "epoch": 12.72195755738415, + "grad_norm": 0.3598947525024414, + "learning_rate": 4.872780424426159e-05, + "loss": 0.3717, + "step": 1880000 + }, + { + "epoch": 12.725341056734518, + "grad_norm": 0.34927839040756226, + "learning_rate": 4.872746589432655e-05, + "loss": 0.3696, + "step": 1880500 + }, + { + "epoch": 12.728724556084885, + "grad_norm": 0.3616214096546173, + "learning_rate": 4.8727127544391514e-05, + "loss": 0.37, + "step": 1881000 + }, + { + "epoch": 12.732108055435253, + "grad_norm": 0.3940015435218811, + "learning_rate": 4.8726789194456477e-05, + "loss": 0.3675, + "step": 1881500 + }, + { + "epoch": 12.735491554785622, + "grad_norm": 0.35310205817222595, + "learning_rate": 4.8726450844521445e-05, + "loss": 0.3711, + "step": 1882000 + }, + { + "epoch": 12.73887505413599, + "grad_norm": 0.3785247802734375, + "learning_rate": 4.87261124945864e-05, + "loss": 0.3707, + "step": 1882500 + }, + { + "epoch": 12.742258553486359, + "grad_norm": 0.39114999771118164, + "learning_rate": 4.872577414465136e-05, + "loss": 0.3717, + "step": 1883000 + }, + { + "epoch": 12.745642052836725, + "grad_norm": 0.36994487047195435, + "learning_rate": 4.872543579471633e-05, + "loss": 0.3701, + "step": 1883500 + }, + { + "epoch": 12.749025552187094, + "grad_norm": 0.3476729989051819, + "learning_rate": 4.8725097444781294e-05, + "loss": 0.3711, + "step": 1884000 + }, + { + "epoch": 12.752409051537462, + "grad_norm": 0.38086920976638794, + "learning_rate": 4.8724759094846256e-05, + "loss": 0.3695, + "step": 1884500 + }, + { + "epoch": 12.755792550887831, + "grad_norm": 0.4031669497489929, + "learning_rate": 4.872442074491122e-05, + "loss": 0.3704, + "step": 1885000 + }, + { + "epoch": 12.759176050238198, + "grad_norm": 0.36528006196022034, + "learning_rate": 4.872408239497619e-05, + "loss": 0.3696, + "step": 1885500 + }, + { + "epoch": 12.762559549588566, + "grad_norm": 0.38576963543891907, + "learning_rate": 4.872374404504115e-05, + "loss": 0.3713, + "step": 1886000 + }, + { + "epoch": 12.765943048938935, + "grad_norm": 0.3542245626449585, + "learning_rate": 4.872340569510611e-05, + "loss": 0.3706, + "step": 1886500 + }, + { + "epoch": 12.769326548289303, + "grad_norm": 0.3419119119644165, + "learning_rate": 4.872306734517107e-05, + "loss": 0.3704, + "step": 1887000 + }, + { + "epoch": 12.77271004763967, + "grad_norm": 0.32595545053482056, + "learning_rate": 4.8722728995236036e-05, + "loss": 0.371, + "step": 1887500 + }, + { + "epoch": 12.776093546990039, + "grad_norm": 0.3801577687263489, + "learning_rate": 4.8722390645301e-05, + "loss": 0.3713, + "step": 1888000 + }, + { + "epoch": 12.779477046340407, + "grad_norm": 0.3445073068141937, + "learning_rate": 4.872205229536596e-05, + "loss": 0.3713, + "step": 1888500 + }, + { + "epoch": 12.782860545690776, + "grad_norm": 0.35645556449890137, + "learning_rate": 4.872171394543092e-05, + "loss": 0.3717, + "step": 1889000 + }, + { + "epoch": 12.786244045041144, + "grad_norm": 0.35171839594841003, + "learning_rate": 4.872137559549589e-05, + "loss": 0.3717, + "step": 1889500 + }, + { + "epoch": 12.78962754439151, + "grad_norm": 0.3259086310863495, + "learning_rate": 4.872103724556085e-05, + "loss": 0.3713, + "step": 1890000 + }, + { + "epoch": 12.79301104374188, + "grad_norm": 0.38064926862716675, + "learning_rate": 4.8720698895625815e-05, + "loss": 0.3721, + "step": 1890500 + }, + { + "epoch": 12.796394543092248, + "grad_norm": 0.34762662649154663, + "learning_rate": 4.872036054569078e-05, + "loss": 0.3707, + "step": 1891000 + }, + { + "epoch": 12.799778042442616, + "grad_norm": 0.3977963328361511, + "learning_rate": 4.8720022195755746e-05, + "loss": 0.3705, + "step": 1891500 + }, + { + "epoch": 12.803161541792983, + "grad_norm": 0.3365938067436218, + "learning_rate": 4.87196838458207e-05, + "loss": 0.3694, + "step": 1892000 + }, + { + "epoch": 12.806545041143352, + "grad_norm": 0.36340808868408203, + "learning_rate": 4.8719345495885664e-05, + "loss": 0.3724, + "step": 1892500 + }, + { + "epoch": 12.80992854049372, + "grad_norm": 0.35641616582870483, + "learning_rate": 4.871900714595063e-05, + "loss": 0.3727, + "step": 1893000 + }, + { + "epoch": 12.813312039844089, + "grad_norm": 0.4009360074996948, + "learning_rate": 4.8718668796015595e-05, + "loss": 0.3692, + "step": 1893500 + }, + { + "epoch": 12.816695539194457, + "grad_norm": 0.3709418475627899, + "learning_rate": 4.871833044608056e-05, + "loss": 0.3702, + "step": 1894000 + }, + { + "epoch": 12.820079038544824, + "grad_norm": 0.36211636662483215, + "learning_rate": 4.871799209614552e-05, + "loss": 0.3692, + "step": 1894500 + }, + { + "epoch": 12.823462537895193, + "grad_norm": 0.35264286398887634, + "learning_rate": 4.871765374621048e-05, + "loss": 0.3713, + "step": 1895000 + }, + { + "epoch": 12.826846037245561, + "grad_norm": 0.36805665493011475, + "learning_rate": 4.871731539627545e-05, + "loss": 0.3717, + "step": 1895500 + }, + { + "epoch": 12.83022953659593, + "grad_norm": 0.35103264451026917, + "learning_rate": 4.871697704634041e-05, + "loss": 0.3714, + "step": 1896000 + }, + { + "epoch": 12.833613035946296, + "grad_norm": 0.3711964190006256, + "learning_rate": 4.871663869640537e-05, + "loss": 0.3718, + "step": 1896500 + }, + { + "epoch": 12.836996535296665, + "grad_norm": 0.38640648126602173, + "learning_rate": 4.8716300346470336e-05, + "loss": 0.3704, + "step": 1897000 + }, + { + "epoch": 12.840380034647033, + "grad_norm": 0.3128819465637207, + "learning_rate": 4.87159619965353e-05, + "loss": 0.3715, + "step": 1897500 + }, + { + "epoch": 12.843763533997402, + "grad_norm": 0.3670842945575714, + "learning_rate": 4.871562364660026e-05, + "loss": 0.3722, + "step": 1898000 + }, + { + "epoch": 12.84714703334777, + "grad_norm": 0.3582267761230469, + "learning_rate": 4.871528529666522e-05, + "loss": 0.3711, + "step": 1898500 + }, + { + "epoch": 12.850530532698137, + "grad_norm": 0.32701876759529114, + "learning_rate": 4.871494694673019e-05, + "loss": 0.3709, + "step": 1899000 + }, + { + "epoch": 12.853914032048506, + "grad_norm": 0.35891956090927124, + "learning_rate": 4.8714608596795154e-05, + "loss": 0.3701, + "step": 1899500 + }, + { + "epoch": 12.857297531398874, + "grad_norm": 0.3812888562679291, + "learning_rate": 4.8714270246860116e-05, + "loss": 0.3696, + "step": 1900000 + }, + { + "epoch": 12.860681030749243, + "grad_norm": 0.3577359914779663, + "learning_rate": 4.871393189692508e-05, + "loss": 0.3724, + "step": 1900500 + }, + { + "epoch": 12.86406453009961, + "grad_norm": 0.36038023233413696, + "learning_rate": 4.871359354699005e-05, + "loss": 0.3695, + "step": 1901000 + }, + { + "epoch": 12.867448029449978, + "grad_norm": 0.3661651909351349, + "learning_rate": 4.8713255197055e-05, + "loss": 0.3707, + "step": 1901500 + }, + { + "epoch": 12.870831528800347, + "grad_norm": 0.3396393954753876, + "learning_rate": 4.8712916847119964e-05, + "loss": 0.3715, + "step": 1902000 + }, + { + "epoch": 12.874215028150715, + "grad_norm": 0.38005658984184265, + "learning_rate": 4.871257849718493e-05, + "loss": 0.3705, + "step": 1902500 + }, + { + "epoch": 12.877598527501082, + "grad_norm": 0.35202455520629883, + "learning_rate": 4.8712240147249895e-05, + "loss": 0.371, + "step": 1903000 + }, + { + "epoch": 12.88098202685145, + "grad_norm": 0.3754706084728241, + "learning_rate": 4.871190179731486e-05, + "loss": 0.371, + "step": 1903500 + }, + { + "epoch": 12.884365526201819, + "grad_norm": 0.3649231195449829, + "learning_rate": 4.871156344737982e-05, + "loss": 0.3707, + "step": 1904000 + }, + { + "epoch": 12.887749025552187, + "grad_norm": 0.37613940238952637, + "learning_rate": 4.871122509744478e-05, + "loss": 0.3711, + "step": 1904500 + }, + { + "epoch": 12.891132524902556, + "grad_norm": 0.39613452553749084, + "learning_rate": 4.871088674750975e-05, + "loss": 0.3699, + "step": 1905000 + }, + { + "epoch": 12.894516024252923, + "grad_norm": 0.3531261384487152, + "learning_rate": 4.871054839757471e-05, + "loss": 0.3718, + "step": 1905500 + }, + { + "epoch": 12.897899523603291, + "grad_norm": 0.35395076870918274, + "learning_rate": 4.871021004763967e-05, + "loss": 0.3711, + "step": 1906000 + }, + { + "epoch": 12.90128302295366, + "grad_norm": 0.3228279948234558, + "learning_rate": 4.870987169770464e-05, + "loss": 0.3697, + "step": 1906500 + }, + { + "epoch": 12.904666522304028, + "grad_norm": 0.3967723548412323, + "learning_rate": 4.87095333477696e-05, + "loss": 0.372, + "step": 1907000 + }, + { + "epoch": 12.908050021654397, + "grad_norm": 0.3011283576488495, + "learning_rate": 4.870919499783456e-05, + "loss": 0.3712, + "step": 1907500 + }, + { + "epoch": 12.911433521004763, + "grad_norm": 0.330438494682312, + "learning_rate": 4.870885664789952e-05, + "loss": 0.3709, + "step": 1908000 + }, + { + "epoch": 12.914817020355132, + "grad_norm": 0.34110695123672485, + "learning_rate": 4.870851829796449e-05, + "loss": 0.3709, + "step": 1908500 + }, + { + "epoch": 12.9182005197055, + "grad_norm": 0.35048842430114746, + "learning_rate": 4.8708179948029454e-05, + "loss": 0.3697, + "step": 1909000 + }, + { + "epoch": 12.921584019055869, + "grad_norm": 0.3407607972621918, + "learning_rate": 4.8707841598094416e-05, + "loss": 0.3706, + "step": 1909500 + }, + { + "epoch": 12.924967518406236, + "grad_norm": 0.3866593539714813, + "learning_rate": 4.870750324815938e-05, + "loss": 0.372, + "step": 1910000 + }, + { + "epoch": 12.928351017756604, + "grad_norm": 0.3867066502571106, + "learning_rate": 4.870716489822435e-05, + "loss": 0.3726, + "step": 1910500 + }, + { + "epoch": 12.931734517106973, + "grad_norm": 0.3892397880554199, + "learning_rate": 4.87068265482893e-05, + "loss": 0.3711, + "step": 1911000 + }, + { + "epoch": 12.935118016457341, + "grad_norm": 0.3106846213340759, + "learning_rate": 4.8706488198354265e-05, + "loss": 0.3718, + "step": 1911500 + }, + { + "epoch": 12.938501515807708, + "grad_norm": 0.3602654039859772, + "learning_rate": 4.870614984841923e-05, + "loss": 0.3714, + "step": 1912000 + }, + { + "epoch": 12.941885015158077, + "grad_norm": 0.3535303771495819, + "learning_rate": 4.8705811498484196e-05, + "loss": 0.37, + "step": 1912500 + }, + { + "epoch": 12.945268514508445, + "grad_norm": 0.377194881439209, + "learning_rate": 4.870547314854916e-05, + "loss": 0.3693, + "step": 1913000 + }, + { + "epoch": 12.948652013858814, + "grad_norm": 0.35924094915390015, + "learning_rate": 4.870513479861412e-05, + "loss": 0.3705, + "step": 1913500 + }, + { + "epoch": 12.952035513209182, + "grad_norm": 0.3477979004383087, + "learning_rate": 4.870479644867908e-05, + "loss": 0.3705, + "step": 1914000 + }, + { + "epoch": 12.955419012559549, + "grad_norm": 0.31688621640205383, + "learning_rate": 4.870445809874405e-05, + "loss": 0.3705, + "step": 1914500 + }, + { + "epoch": 12.958802511909917, + "grad_norm": 0.35452088713645935, + "learning_rate": 4.870411974880901e-05, + "loss": 0.3714, + "step": 1915000 + }, + { + "epoch": 12.962186011260286, + "grad_norm": 0.3695243299007416, + "learning_rate": 4.870378139887397e-05, + "loss": 0.3708, + "step": 1915500 + }, + { + "epoch": 12.965569510610655, + "grad_norm": 0.34055230021476746, + "learning_rate": 4.870344304893894e-05, + "loss": 0.3721, + "step": 1916000 + }, + { + "epoch": 12.968953009961021, + "grad_norm": 0.3908543586730957, + "learning_rate": 4.87031046990039e-05, + "loss": 0.3714, + "step": 1916500 + }, + { + "epoch": 12.97233650931139, + "grad_norm": 0.3592712879180908, + "learning_rate": 4.870276634906886e-05, + "loss": 0.3723, + "step": 1917000 + }, + { + "epoch": 12.975720008661758, + "grad_norm": 0.39363932609558105, + "learning_rate": 4.8702427999133824e-05, + "loss": 0.3709, + "step": 1917500 + }, + { + "epoch": 12.979103508012127, + "grad_norm": 0.36408188939094543, + "learning_rate": 4.870208964919879e-05, + "loss": 0.3697, + "step": 1918000 + }, + { + "epoch": 12.982487007362495, + "grad_norm": 0.312065452337265, + "learning_rate": 4.8701751299263755e-05, + "loss": 0.371, + "step": 1918500 + }, + { + "epoch": 12.985870506712862, + "grad_norm": 0.3716977536678314, + "learning_rate": 4.870141294932872e-05, + "loss": 0.3704, + "step": 1919000 + }, + { + "epoch": 12.98925400606323, + "grad_norm": 0.3691862225532532, + "learning_rate": 4.870107459939368e-05, + "loss": 0.3712, + "step": 1919500 + }, + { + "epoch": 12.9926375054136, + "grad_norm": 0.37253594398498535, + "learning_rate": 4.870073624945865e-05, + "loss": 0.3723, + "step": 1920000 + }, + { + "epoch": 12.996021004763968, + "grad_norm": 0.36383548378944397, + "learning_rate": 4.87003978995236e-05, + "loss": 0.3716, + "step": 1920500 + }, + { + "epoch": 12.999404504114334, + "grad_norm": 0.34657612442970276, + "learning_rate": 4.8700059549588565e-05, + "loss": 0.3697, + "step": 1921000 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.8588460308752098, + "eval_loss": 0.5731586217880249, + "eval_runtime": 3378.9041, + "eval_samples_per_second": 86.047, + "eval_steps_per_second": 5.378, + "step": 1921088 + }, + { + "epoch": 13.002788003464703, + "grad_norm": 0.3782336413860321, + "learning_rate": 4.869972119965353e-05, + "loss": 0.3712, + "step": 1921500 + }, + { + "epoch": 13.006171502815072, + "grad_norm": 0.33048200607299805, + "learning_rate": 4.8699382849718496e-05, + "loss": 0.3693, + "step": 1922000 + }, + { + "epoch": 13.00955500216544, + "grad_norm": 0.36409974098205566, + "learning_rate": 4.869904449978346e-05, + "loss": 0.3674, + "step": 1922500 + }, + { + "epoch": 13.012938501515809, + "grad_norm": 0.40011271834373474, + "learning_rate": 4.869870614984842e-05, + "loss": 0.3693, + "step": 1923000 + }, + { + "epoch": 13.016322000866175, + "grad_norm": 0.38334009051322937, + "learning_rate": 4.869836779991338e-05, + "loss": 0.3686, + "step": 1923500 + }, + { + "epoch": 13.019705500216544, + "grad_norm": 0.37485963106155396, + "learning_rate": 4.869802944997835e-05, + "loss": 0.3702, + "step": 1924000 + }, + { + "epoch": 13.023088999566912, + "grad_norm": 0.3391663730144501, + "learning_rate": 4.8697691100043314e-05, + "loss": 0.3671, + "step": 1924500 + }, + { + "epoch": 13.02647249891728, + "grad_norm": 0.38333529233932495, + "learning_rate": 4.869735275010827e-05, + "loss": 0.3676, + "step": 1925000 + }, + { + "epoch": 13.029855998267648, + "grad_norm": 0.34279459714889526, + "learning_rate": 4.869701440017324e-05, + "loss": 0.3694, + "step": 1925500 + }, + { + "epoch": 13.033239497618016, + "grad_norm": 0.37504786252975464, + "learning_rate": 4.86966760502382e-05, + "loss": 0.3685, + "step": 1926000 + }, + { + "epoch": 13.036622996968385, + "grad_norm": 0.35027268528938293, + "learning_rate": 4.869633770030316e-05, + "loss": 0.368, + "step": 1926500 + }, + { + "epoch": 13.040006496318753, + "grad_norm": 0.36550816893577576, + "learning_rate": 4.8695999350368124e-05, + "loss": 0.3692, + "step": 1927000 + }, + { + "epoch": 13.04338999566912, + "grad_norm": 0.3510855734348297, + "learning_rate": 4.869566100043309e-05, + "loss": 0.3687, + "step": 1927500 + }, + { + "epoch": 13.046773495019488, + "grad_norm": 0.3407793939113617, + "learning_rate": 4.8695322650498055e-05, + "loss": 0.3707, + "step": 1928000 + }, + { + "epoch": 13.050156994369857, + "grad_norm": 0.38290151953697205, + "learning_rate": 4.869498430056302e-05, + "loss": 0.3698, + "step": 1928500 + }, + { + "epoch": 13.053540493720226, + "grad_norm": 0.3659246265888214, + "learning_rate": 4.869464595062798e-05, + "loss": 0.368, + "step": 1929000 + }, + { + "epoch": 13.056923993070594, + "grad_norm": 0.36423566937446594, + "learning_rate": 4.869430760069295e-05, + "loss": 0.37, + "step": 1929500 + }, + { + "epoch": 13.06030749242096, + "grad_norm": 0.3652285039424896, + "learning_rate": 4.8693969250757904e-05, + "loss": 0.3699, + "step": 1930000 + }, + { + "epoch": 13.06369099177133, + "grad_norm": 0.3280698359012604, + "learning_rate": 4.8693630900822866e-05, + "loss": 0.3685, + "step": 1930500 + }, + { + "epoch": 13.067074491121698, + "grad_norm": 0.41137126088142395, + "learning_rate": 4.869329255088783e-05, + "loss": 0.3701, + "step": 1931000 + }, + { + "epoch": 13.070457990472066, + "grad_norm": 0.3417060375213623, + "learning_rate": 4.86929542009528e-05, + "loss": 0.3677, + "step": 1931500 + }, + { + "epoch": 13.073841489822433, + "grad_norm": 0.3309522271156311, + "learning_rate": 4.869261585101776e-05, + "loss": 0.3697, + "step": 1932000 + }, + { + "epoch": 13.077224989172802, + "grad_norm": 0.3627679646015167, + "learning_rate": 4.869227750108272e-05, + "loss": 0.3699, + "step": 1932500 + }, + { + "epoch": 13.08060848852317, + "grad_norm": 0.34668660163879395, + "learning_rate": 4.8691939151147683e-05, + "loss": 0.3683, + "step": 1933000 + }, + { + "epoch": 13.083991987873539, + "grad_norm": 0.36464157700538635, + "learning_rate": 4.869160080121265e-05, + "loss": 0.3695, + "step": 1933500 + }, + { + "epoch": 13.087375487223907, + "grad_norm": 0.36991873383522034, + "learning_rate": 4.8691262451277614e-05, + "loss": 0.3695, + "step": 1934000 + }, + { + "epoch": 13.090758986574274, + "grad_norm": 0.37532421946525574, + "learning_rate": 4.8690924101342577e-05, + "loss": 0.3699, + "step": 1934500 + }, + { + "epoch": 13.094142485924642, + "grad_norm": 0.34363842010498047, + "learning_rate": 4.869058575140754e-05, + "loss": 0.3706, + "step": 1935000 + }, + { + "epoch": 13.097525985275011, + "grad_norm": 0.37082090973854065, + "learning_rate": 4.86902474014725e-05, + "loss": 0.3691, + "step": 1935500 + }, + { + "epoch": 13.10090948462538, + "grad_norm": 0.36466777324676514, + "learning_rate": 4.868990905153746e-05, + "loss": 0.37, + "step": 1936000 + }, + { + "epoch": 13.104292983975746, + "grad_norm": 0.40605488419532776, + "learning_rate": 4.8689570701602425e-05, + "loss": 0.3708, + "step": 1936500 + }, + { + "epoch": 13.107676483326115, + "grad_norm": 0.326335608959198, + "learning_rate": 4.8689232351667394e-05, + "loss": 0.3693, + "step": 1937000 + }, + { + "epoch": 13.111059982676483, + "grad_norm": 0.39977341890335083, + "learning_rate": 4.8688894001732356e-05, + "loss": 0.3699, + "step": 1937500 + }, + { + "epoch": 13.114443482026852, + "grad_norm": 0.3610125780105591, + "learning_rate": 4.868855565179732e-05, + "loss": 0.3698, + "step": 1938000 + }, + { + "epoch": 13.11782698137722, + "grad_norm": 0.352734237909317, + "learning_rate": 4.868821730186228e-05, + "loss": 0.3686, + "step": 1938500 + }, + { + "epoch": 13.121210480727587, + "grad_norm": 0.42013734579086304, + "learning_rate": 4.868787895192725e-05, + "loss": 0.3717, + "step": 1939000 + }, + { + "epoch": 13.124593980077956, + "grad_norm": 0.38568973541259766, + "learning_rate": 4.8687540601992205e-05, + "loss": 0.3703, + "step": 1939500 + }, + { + "epoch": 13.127977479428324, + "grad_norm": 0.3538978397846222, + "learning_rate": 4.868720225205717e-05, + "loss": 0.369, + "step": 1940000 + }, + { + "epoch": 13.131360978778693, + "grad_norm": 0.34834977984428406, + "learning_rate": 4.868686390212213e-05, + "loss": 0.3706, + "step": 1940500 + }, + { + "epoch": 13.13474447812906, + "grad_norm": 0.33336830139160156, + "learning_rate": 4.86865255521871e-05, + "loss": 0.3697, + "step": 1941000 + }, + { + "epoch": 13.138127977479428, + "grad_norm": 0.39493104815483093, + "learning_rate": 4.868618720225206e-05, + "loss": 0.3686, + "step": 1941500 + }, + { + "epoch": 13.141511476829796, + "grad_norm": 0.3420484662055969, + "learning_rate": 4.868584885231702e-05, + "loss": 0.3691, + "step": 1942000 + }, + { + "epoch": 13.144894976180165, + "grad_norm": 0.3717644214630127, + "learning_rate": 4.8685510502381984e-05, + "loss": 0.3705, + "step": 1942500 + }, + { + "epoch": 13.148278475530534, + "grad_norm": 0.3905476927757263, + "learning_rate": 4.868517215244695e-05, + "loss": 0.3688, + "step": 1943000 + }, + { + "epoch": 13.1516619748809, + "grad_norm": 0.36413437128067017, + "learning_rate": 4.8684833802511915e-05, + "loss": 0.3692, + "step": 1943500 + }, + { + "epoch": 13.155045474231269, + "grad_norm": 0.37018147110939026, + "learning_rate": 4.868449545257688e-05, + "loss": 0.3703, + "step": 1944000 + }, + { + "epoch": 13.158428973581637, + "grad_norm": 0.3624891936779022, + "learning_rate": 4.868415710264184e-05, + "loss": 0.37, + "step": 1944500 + }, + { + "epoch": 13.161812472932006, + "grad_norm": 0.3513813018798828, + "learning_rate": 4.86838187527068e-05, + "loss": 0.3692, + "step": 1945000 + }, + { + "epoch": 13.165195972282373, + "grad_norm": 0.35136500000953674, + "learning_rate": 4.8683480402771764e-05, + "loss": 0.3678, + "step": 1945500 + }, + { + "epoch": 13.168579471632741, + "grad_norm": 0.34195035696029663, + "learning_rate": 4.8683142052836726e-05, + "loss": 0.3689, + "step": 1946000 + }, + { + "epoch": 13.17196297098311, + "grad_norm": 0.353342205286026, + "learning_rate": 4.8682803702901695e-05, + "loss": 0.3708, + "step": 1946500 + }, + { + "epoch": 13.175346470333478, + "grad_norm": 0.33236193656921387, + "learning_rate": 4.868246535296666e-05, + "loss": 0.3697, + "step": 1947000 + }, + { + "epoch": 13.178729969683847, + "grad_norm": 0.40654829144477844, + "learning_rate": 4.868212700303162e-05, + "loss": 0.3708, + "step": 1947500 + }, + { + "epoch": 13.182113469034213, + "grad_norm": 0.38720816373825073, + "learning_rate": 4.868178865309658e-05, + "loss": 0.3699, + "step": 1948000 + }, + { + "epoch": 13.185496968384582, + "grad_norm": 0.37693819403648376, + "learning_rate": 4.868145030316155e-05, + "loss": 0.3685, + "step": 1948500 + }, + { + "epoch": 13.18888046773495, + "grad_norm": 0.35839956998825073, + "learning_rate": 4.8681111953226505e-05, + "loss": 0.3704, + "step": 1949000 + }, + { + "epoch": 13.192263967085319, + "grad_norm": 0.36030447483062744, + "learning_rate": 4.868077360329147e-05, + "loss": 0.3693, + "step": 1949500 + }, + { + "epoch": 13.195647466435686, + "grad_norm": 0.37780576944351196, + "learning_rate": 4.868043525335643e-05, + "loss": 0.3707, + "step": 1950000 + }, + { + "epoch": 13.199030965786054, + "grad_norm": 0.3539212942123413, + "learning_rate": 4.86800969034214e-05, + "loss": 0.3703, + "step": 1950500 + }, + { + "epoch": 13.202414465136423, + "grad_norm": 0.3429529666900635, + "learning_rate": 4.867975855348636e-05, + "loss": 0.3684, + "step": 1951000 + }, + { + "epoch": 13.205797964486791, + "grad_norm": 0.3764457106590271, + "learning_rate": 4.867942020355132e-05, + "loss": 0.3702, + "step": 1951500 + }, + { + "epoch": 13.209181463837158, + "grad_norm": 0.3694688379764557, + "learning_rate": 4.8679081853616285e-05, + "loss": 0.3694, + "step": 1952000 + }, + { + "epoch": 13.212564963187527, + "grad_norm": 0.3935447037220001, + "learning_rate": 4.8678743503681254e-05, + "loss": 0.3688, + "step": 1952500 + }, + { + "epoch": 13.215948462537895, + "grad_norm": 0.3366578221321106, + "learning_rate": 4.8678405153746216e-05, + "loss": 0.3678, + "step": 1953000 + }, + { + "epoch": 13.219331961888264, + "grad_norm": 0.3674514591693878, + "learning_rate": 4.867806680381118e-05, + "loss": 0.3702, + "step": 1953500 + }, + { + "epoch": 13.222715461238632, + "grad_norm": 0.3328007757663727, + "learning_rate": 4.867772845387614e-05, + "loss": 0.3699, + "step": 1954000 + }, + { + "epoch": 13.226098960588999, + "grad_norm": 0.3304157853126526, + "learning_rate": 4.86773901039411e-05, + "loss": 0.3688, + "step": 1954500 + }, + { + "epoch": 13.229482459939367, + "grad_norm": 0.3459080755710602, + "learning_rate": 4.8677051754006064e-05, + "loss": 0.3684, + "step": 1955000 + }, + { + "epoch": 13.232865959289736, + "grad_norm": 0.39367958903312683, + "learning_rate": 4.8676713404071026e-05, + "loss": 0.3703, + "step": 1955500 + }, + { + "epoch": 13.236249458640104, + "grad_norm": 0.37636101245880127, + "learning_rate": 4.8676375054135995e-05, + "loss": 0.3699, + "step": 1956000 + }, + { + "epoch": 13.239632957990471, + "grad_norm": 0.3529524505138397, + "learning_rate": 4.867603670420096e-05, + "loss": 0.3693, + "step": 1956500 + }, + { + "epoch": 13.24301645734084, + "grad_norm": 0.33474189043045044, + "learning_rate": 4.867569835426592e-05, + "loss": 0.3691, + "step": 1957000 + }, + { + "epoch": 13.246399956691208, + "grad_norm": 0.3529336750507355, + "learning_rate": 4.867536000433088e-05, + "loss": 0.371, + "step": 1957500 + }, + { + "epoch": 13.249783456041577, + "grad_norm": 0.38825368881225586, + "learning_rate": 4.8675021654395844e-05, + "loss": 0.3695, + "step": 1958000 + }, + { + "epoch": 13.253166955391945, + "grad_norm": 0.3401729166507721, + "learning_rate": 4.8674683304460806e-05, + "loss": 0.3701, + "step": 1958500 + }, + { + "epoch": 13.256550454742312, + "grad_norm": 0.3608418405056, + "learning_rate": 4.867434495452577e-05, + "loss": 0.369, + "step": 1959000 + }, + { + "epoch": 13.25993395409268, + "grad_norm": 0.5334931015968323, + "learning_rate": 4.867400660459073e-05, + "loss": 0.3689, + "step": 1959500 + }, + { + "epoch": 13.26331745344305, + "grad_norm": 0.35742655396461487, + "learning_rate": 4.86736682546557e-05, + "loss": 0.3714, + "step": 1960000 + }, + { + "epoch": 13.266700952793418, + "grad_norm": 0.369232714176178, + "learning_rate": 4.867332990472066e-05, + "loss": 0.3696, + "step": 1960500 + }, + { + "epoch": 13.270084452143784, + "grad_norm": 0.34640198945999146, + "learning_rate": 4.867299155478562e-05, + "loss": 0.37, + "step": 1961000 + }, + { + "epoch": 13.273467951494153, + "grad_norm": 0.3134201467037201, + "learning_rate": 4.8672653204850585e-05, + "loss": 0.3702, + "step": 1961500 + }, + { + "epoch": 13.276851450844521, + "grad_norm": 0.4063735604286194, + "learning_rate": 4.8672314854915554e-05, + "loss": 0.3702, + "step": 1962000 + }, + { + "epoch": 13.28023495019489, + "grad_norm": 0.40066906809806824, + "learning_rate": 4.8671976504980516e-05, + "loss": 0.3707, + "step": 1962500 + }, + { + "epoch": 13.283618449545258, + "grad_norm": 0.38528531789779663, + "learning_rate": 4.867163815504548e-05, + "loss": 0.3695, + "step": 1963000 + }, + { + "epoch": 13.287001948895625, + "grad_norm": 0.38084566593170166, + "learning_rate": 4.867129980511044e-05, + "loss": 0.3697, + "step": 1963500 + }, + { + "epoch": 13.290385448245994, + "grad_norm": 0.36008328199386597, + "learning_rate": 4.86709614551754e-05, + "loss": 0.3694, + "step": 1964000 + }, + { + "epoch": 13.293768947596362, + "grad_norm": 0.3995845317840576, + "learning_rate": 4.8670623105240365e-05, + "loss": 0.3701, + "step": 1964500 + }, + { + "epoch": 13.29715244694673, + "grad_norm": 0.38086098432540894, + "learning_rate": 4.867028475530533e-05, + "loss": 0.3692, + "step": 1965000 + }, + { + "epoch": 13.300535946297098, + "grad_norm": 0.3582462668418884, + "learning_rate": 4.866994640537029e-05, + "loss": 0.3695, + "step": 1965500 + }, + { + "epoch": 13.303919445647466, + "grad_norm": 0.38480857014656067, + "learning_rate": 4.866960805543526e-05, + "loss": 0.3686, + "step": 1966000 + }, + { + "epoch": 13.307302944997835, + "grad_norm": 0.36101385951042175, + "learning_rate": 4.866926970550022e-05, + "loss": 0.3694, + "step": 1966500 + }, + { + "epoch": 13.310686444348203, + "grad_norm": 0.3723582625389099, + "learning_rate": 4.866893135556518e-05, + "loss": 0.3689, + "step": 1967000 + }, + { + "epoch": 13.314069943698572, + "grad_norm": 0.38096871972084045, + "learning_rate": 4.8668593005630144e-05, + "loss": 0.37, + "step": 1967500 + }, + { + "epoch": 13.317453443048938, + "grad_norm": 0.42835044860839844, + "learning_rate": 4.8668254655695106e-05, + "loss": 0.3703, + "step": 1968000 + }, + { + "epoch": 13.320836942399307, + "grad_norm": 0.3604111671447754, + "learning_rate": 4.866791630576007e-05, + "loss": 0.3707, + "step": 1968500 + }, + { + "epoch": 13.324220441749675, + "grad_norm": 0.33632153272628784, + "learning_rate": 4.866757795582503e-05, + "loss": 0.3699, + "step": 1969000 + }, + { + "epoch": 13.327603941100044, + "grad_norm": 0.36421000957489014, + "learning_rate": 4.866723960589e-05, + "loss": 0.3713, + "step": 1969500 + }, + { + "epoch": 13.33098744045041, + "grad_norm": 0.37115344405174255, + "learning_rate": 4.866690125595496e-05, + "loss": 0.3711, + "step": 1970000 + }, + { + "epoch": 13.33437093980078, + "grad_norm": 0.33895623683929443, + "learning_rate": 4.8666562906019924e-05, + "loss": 0.3703, + "step": 1970500 + }, + { + "epoch": 13.337754439151148, + "grad_norm": 0.36736783385276794, + "learning_rate": 4.8666224556084886e-05, + "loss": 0.3697, + "step": 1971000 + }, + { + "epoch": 13.341137938501516, + "grad_norm": 0.3676416575908661, + "learning_rate": 4.8665886206149855e-05, + "loss": 0.3705, + "step": 1971500 + }, + { + "epoch": 13.344521437851885, + "grad_norm": 0.3712790012359619, + "learning_rate": 4.866554785621482e-05, + "loss": 0.3692, + "step": 1972000 + }, + { + "epoch": 13.347904937202252, + "grad_norm": 0.3409649431705475, + "learning_rate": 4.866520950627978e-05, + "loss": 0.3696, + "step": 1972500 + }, + { + "epoch": 13.35128843655262, + "grad_norm": 0.4139016568660736, + "learning_rate": 4.866487115634474e-05, + "loss": 0.3706, + "step": 1973000 + }, + { + "epoch": 13.354671935902989, + "grad_norm": 0.35674795508384705, + "learning_rate": 4.86645328064097e-05, + "loss": 0.37, + "step": 1973500 + }, + { + "epoch": 13.358055435253357, + "grad_norm": 0.3726387023925781, + "learning_rate": 4.8664194456474665e-05, + "loss": 0.3698, + "step": 1974000 + }, + { + "epoch": 13.361438934603724, + "grad_norm": 0.3655881881713867, + "learning_rate": 4.866385610653963e-05, + "loss": 0.3685, + "step": 1974500 + }, + { + "epoch": 13.364822433954092, + "grad_norm": 0.37517014145851135, + "learning_rate": 4.866351775660459e-05, + "loss": 0.3716, + "step": 1975000 + }, + { + "epoch": 13.368205933304461, + "grad_norm": 0.3558104932308197, + "learning_rate": 4.866317940666956e-05, + "loss": 0.3679, + "step": 1975500 + }, + { + "epoch": 13.37158943265483, + "grad_norm": 0.35728612542152405, + "learning_rate": 4.866284105673452e-05, + "loss": 0.3697, + "step": 1976000 + }, + { + "epoch": 13.374972932005196, + "grad_norm": 0.3204164505004883, + "learning_rate": 4.866250270679948e-05, + "loss": 0.37, + "step": 1976500 + }, + { + "epoch": 13.378356431355565, + "grad_norm": 0.37354809045791626, + "learning_rate": 4.8662164356864445e-05, + "loss": 0.3704, + "step": 1977000 + }, + { + "epoch": 13.381739930705933, + "grad_norm": 0.32038918137550354, + "learning_rate": 4.866182600692941e-05, + "loss": 0.3684, + "step": 1977500 + }, + { + "epoch": 13.385123430056302, + "grad_norm": 0.3417024314403534, + "learning_rate": 4.866148765699437e-05, + "loss": 0.3701, + "step": 1978000 + }, + { + "epoch": 13.38850692940667, + "grad_norm": 0.36151084303855896, + "learning_rate": 4.866114930705933e-05, + "loss": 0.3697, + "step": 1978500 + }, + { + "epoch": 13.391890428757037, + "grad_norm": 0.3163359761238098, + "learning_rate": 4.86608109571243e-05, + "loss": 0.3698, + "step": 1979000 + }, + { + "epoch": 13.395273928107406, + "grad_norm": 0.3717551529407501, + "learning_rate": 4.866047260718926e-05, + "loss": 0.3707, + "step": 1979500 + }, + { + "epoch": 13.398657427457774, + "grad_norm": 0.38001298904418945, + "learning_rate": 4.8660134257254225e-05, + "loss": 0.3697, + "step": 1980000 + }, + { + "epoch": 13.402040926808143, + "grad_norm": 0.3517325818538666, + "learning_rate": 4.865979590731919e-05, + "loss": 0.3699, + "step": 1980500 + }, + { + "epoch": 13.40542442615851, + "grad_norm": 0.33883053064346313, + "learning_rate": 4.8659457557384156e-05, + "loss": 0.3714, + "step": 1981000 + }, + { + "epoch": 13.408807925508878, + "grad_norm": 0.3610602617263794, + "learning_rate": 4.865911920744912e-05, + "loss": 0.3723, + "step": 1981500 + }, + { + "epoch": 13.412191424859246, + "grad_norm": 0.35467588901519775, + "learning_rate": 4.865878085751408e-05, + "loss": 0.3699, + "step": 1982000 + }, + { + "epoch": 13.415574924209615, + "grad_norm": 0.3825504183769226, + "learning_rate": 4.8658442507579035e-05, + "loss": 0.3719, + "step": 1982500 + }, + { + "epoch": 13.418958423559983, + "grad_norm": 0.38568857312202454, + "learning_rate": 4.8658104157644004e-05, + "loss": 0.3688, + "step": 1983000 + }, + { + "epoch": 13.42234192291035, + "grad_norm": 0.3631773591041565, + "learning_rate": 4.8657765807708966e-05, + "loss": 0.3697, + "step": 1983500 + }, + { + "epoch": 13.425725422260719, + "grad_norm": 0.35028496384620667, + "learning_rate": 4.865742745777393e-05, + "loss": 0.3695, + "step": 1984000 + }, + { + "epoch": 13.429108921611087, + "grad_norm": 0.3581047058105469, + "learning_rate": 4.865708910783889e-05, + "loss": 0.3707, + "step": 1984500 + }, + { + "epoch": 13.432492420961456, + "grad_norm": 0.35143864154815674, + "learning_rate": 4.865675075790386e-05, + "loss": 0.3699, + "step": 1985000 + }, + { + "epoch": 13.435875920311823, + "grad_norm": 0.3333075940608978, + "learning_rate": 4.865641240796882e-05, + "loss": 0.3703, + "step": 1985500 + }, + { + "epoch": 13.439259419662191, + "grad_norm": 0.35157662630081177, + "learning_rate": 4.8656074058033784e-05, + "loss": 0.3701, + "step": 1986000 + }, + { + "epoch": 13.44264291901256, + "grad_norm": 0.36174359917640686, + "learning_rate": 4.8655735708098746e-05, + "loss": 0.3699, + "step": 1986500 + }, + { + "epoch": 13.446026418362928, + "grad_norm": 0.36083289980888367, + "learning_rate": 4.865539735816371e-05, + "loss": 0.3692, + "step": 1987000 + }, + { + "epoch": 13.449409917713297, + "grad_norm": 0.3417918384075165, + "learning_rate": 4.865505900822867e-05, + "loss": 0.3702, + "step": 1987500 + }, + { + "epoch": 13.452793417063663, + "grad_norm": 0.3874722421169281, + "learning_rate": 4.865472065829363e-05, + "loss": 0.3703, + "step": 1988000 + }, + { + "epoch": 13.456176916414032, + "grad_norm": 0.36781999468803406, + "learning_rate": 4.86543823083586e-05, + "loss": 0.3702, + "step": 1988500 + }, + { + "epoch": 13.4595604157644, + "grad_norm": 0.3900182247161865, + "learning_rate": 4.865404395842356e-05, + "loss": 0.3708, + "step": 1989000 + }, + { + "epoch": 13.462943915114769, + "grad_norm": 0.36022305488586426, + "learning_rate": 4.8653705608488525e-05, + "loss": 0.3699, + "step": 1989500 + }, + { + "epoch": 13.466327414465136, + "grad_norm": 0.3545489013195038, + "learning_rate": 4.865336725855349e-05, + "loss": 0.3713, + "step": 1990000 + }, + { + "epoch": 13.469710913815504, + "grad_norm": 0.3593008518218994, + "learning_rate": 4.8653028908618456e-05, + "loss": 0.3697, + "step": 1990500 + }, + { + "epoch": 13.473094413165873, + "grad_norm": 0.3376884460449219, + "learning_rate": 4.865269055868342e-05, + "loss": 0.3716, + "step": 1991000 + }, + { + "epoch": 13.476477912516241, + "grad_norm": 0.3358532190322876, + "learning_rate": 4.865235220874838e-05, + "loss": 0.3691, + "step": 1991500 + }, + { + "epoch": 13.479861411866608, + "grad_norm": 0.3432691693305969, + "learning_rate": 4.8652013858813336e-05, + "loss": 0.3709, + "step": 1992000 + }, + { + "epoch": 13.483244911216977, + "grad_norm": 0.41430553793907166, + "learning_rate": 4.8651675508878305e-05, + "loss": 0.3699, + "step": 1992500 + }, + { + "epoch": 13.486628410567345, + "grad_norm": 0.34607023000717163, + "learning_rate": 4.865133715894327e-05, + "loss": 0.3695, + "step": 1993000 + }, + { + "epoch": 13.490011909917714, + "grad_norm": 0.3873082101345062, + "learning_rate": 4.865099880900823e-05, + "loss": 0.3688, + "step": 1993500 + }, + { + "epoch": 13.493395409268082, + "grad_norm": 0.3736827075481415, + "learning_rate": 4.865066045907319e-05, + "loss": 0.3709, + "step": 1994000 + }, + { + "epoch": 13.496778908618449, + "grad_norm": 0.33309677243232727, + "learning_rate": 4.865032210913816e-05, + "loss": 0.3695, + "step": 1994500 + }, + { + "epoch": 13.500162407968817, + "grad_norm": 0.3392011523246765, + "learning_rate": 4.864998375920312e-05, + "loss": 0.3705, + "step": 1995000 + }, + { + "epoch": 13.503545907319186, + "grad_norm": 0.36364486813545227, + "learning_rate": 4.8649645409268084e-05, + "loss": 0.37, + "step": 1995500 + }, + { + "epoch": 13.506929406669554, + "grad_norm": 0.36173754930496216, + "learning_rate": 4.8649307059333046e-05, + "loss": 0.3712, + "step": 1996000 + }, + { + "epoch": 13.510312906019923, + "grad_norm": 0.40453898906707764, + "learning_rate": 4.8648968709398015e-05, + "loss": 0.3699, + "step": 1996500 + }, + { + "epoch": 13.51369640537029, + "grad_norm": 0.369381308555603, + "learning_rate": 4.864863035946297e-05, + "loss": 0.3679, + "step": 1997000 + }, + { + "epoch": 13.517079904720658, + "grad_norm": 0.37291908264160156, + "learning_rate": 4.864829200952793e-05, + "loss": 0.3714, + "step": 1997500 + }, + { + "epoch": 13.520463404071027, + "grad_norm": 0.3864448368549347, + "learning_rate": 4.86479536595929e-05, + "loss": 0.3707, + "step": 1998000 + }, + { + "epoch": 13.523846903421395, + "grad_norm": 0.39676347374916077, + "learning_rate": 4.8647615309657864e-05, + "loss": 0.37, + "step": 1998500 + }, + { + "epoch": 13.527230402771762, + "grad_norm": 0.34468865394592285, + "learning_rate": 4.8647276959722826e-05, + "loss": 0.3713, + "step": 1999000 + }, + { + "epoch": 13.53061390212213, + "grad_norm": 0.34684211015701294, + "learning_rate": 4.864693860978779e-05, + "loss": 0.3702, + "step": 1999500 + }, + { + "epoch": 13.533997401472499, + "grad_norm": 0.3568776249885559, + "learning_rate": 4.864660025985276e-05, + "loss": 0.3699, + "step": 2000000 + }, + { + "epoch": 13.537380900822868, + "grad_norm": 0.36592113971710205, + "learning_rate": 4.864626190991772e-05, + "loss": 0.3716, + "step": 2000500 + }, + { + "epoch": 13.540764400173234, + "grad_norm": 0.348983496427536, + "learning_rate": 4.864592355998268e-05, + "loss": 0.3689, + "step": 2001000 + }, + { + "epoch": 13.544147899523603, + "grad_norm": 0.3736741542816162, + "learning_rate": 4.8645585210047636e-05, + "loss": 0.3713, + "step": 2001500 + }, + { + "epoch": 13.547531398873971, + "grad_norm": 0.33904919028282166, + "learning_rate": 4.8645246860112605e-05, + "loss": 0.3689, + "step": 2002000 + }, + { + "epoch": 13.55091489822434, + "grad_norm": 0.3362513482570648, + "learning_rate": 4.864490851017757e-05, + "loss": 0.3701, + "step": 2002500 + }, + { + "epoch": 13.554298397574708, + "grad_norm": 0.34666186571121216, + "learning_rate": 4.864457016024253e-05, + "loss": 0.3702, + "step": 2003000 + }, + { + "epoch": 13.557681896925075, + "grad_norm": 0.3963722586631775, + "learning_rate": 4.864423181030749e-05, + "loss": 0.3703, + "step": 2003500 + }, + { + "epoch": 13.561065396275444, + "grad_norm": 0.3478109538555145, + "learning_rate": 4.864389346037246e-05, + "loss": 0.3692, + "step": 2004000 + }, + { + "epoch": 13.564448895625812, + "grad_norm": 0.3706079423427582, + "learning_rate": 4.864355511043742e-05, + "loss": 0.3685, + "step": 2004500 + }, + { + "epoch": 13.56783239497618, + "grad_norm": 0.35787704586982727, + "learning_rate": 4.8643216760502385e-05, + "loss": 0.3693, + "step": 2005000 + }, + { + "epoch": 13.571215894326548, + "grad_norm": 0.4233979284763336, + "learning_rate": 4.864287841056735e-05, + "loss": 0.3718, + "step": 2005500 + }, + { + "epoch": 13.574599393676916, + "grad_norm": 0.3667444586753845, + "learning_rate": 4.8642540060632316e-05, + "loss": 0.3696, + "step": 2006000 + }, + { + "epoch": 13.577982893027285, + "grad_norm": 0.34215712547302246, + "learning_rate": 4.864220171069727e-05, + "loss": 0.3703, + "step": 2006500 + }, + { + "epoch": 13.581366392377653, + "grad_norm": 0.3960571587085724, + "learning_rate": 4.864186336076223e-05, + "loss": 0.3697, + "step": 2007000 + }, + { + "epoch": 13.584749891728022, + "grad_norm": 0.34247100353240967, + "learning_rate": 4.86415250108272e-05, + "loss": 0.3675, + "step": 2007500 + }, + { + "epoch": 13.588133391078388, + "grad_norm": 0.3906427323818207, + "learning_rate": 4.8641186660892164e-05, + "loss": 0.3712, + "step": 2008000 + }, + { + "epoch": 13.591516890428757, + "grad_norm": 0.36824533343315125, + "learning_rate": 4.8640848310957126e-05, + "loss": 0.3707, + "step": 2008500 + }, + { + "epoch": 13.594900389779125, + "grad_norm": 0.37552452087402344, + "learning_rate": 4.864050996102209e-05, + "loss": 0.3704, + "step": 2009000 + }, + { + "epoch": 13.598283889129494, + "grad_norm": 0.35757166147232056, + "learning_rate": 4.864017161108706e-05, + "loss": 0.3696, + "step": 2009500 + }, + { + "epoch": 13.60166738847986, + "grad_norm": 0.38713139295578003, + "learning_rate": 4.863983326115202e-05, + "loss": 0.3708, + "step": 2010000 + }, + { + "epoch": 13.60505088783023, + "grad_norm": 0.3677159547805786, + "learning_rate": 4.863949491121698e-05, + "loss": 0.3692, + "step": 2010500 + }, + { + "epoch": 13.608434387180598, + "grad_norm": 0.4078529477119446, + "learning_rate": 4.863915656128194e-05, + "loss": 0.3709, + "step": 2011000 + }, + { + "epoch": 13.611817886530966, + "grad_norm": 0.38677313923835754, + "learning_rate": 4.8638818211346906e-05, + "loss": 0.3703, + "step": 2011500 + }, + { + "epoch": 13.615201385881335, + "grad_norm": 0.35637909173965454, + "learning_rate": 4.863847986141187e-05, + "loss": 0.3705, + "step": 2012000 + }, + { + "epoch": 13.618584885231702, + "grad_norm": 0.3691534101963043, + "learning_rate": 4.863814151147683e-05, + "loss": 0.3681, + "step": 2012500 + }, + { + "epoch": 13.62196838458207, + "grad_norm": 0.40598565340042114, + "learning_rate": 4.863780316154179e-05, + "loss": 0.3702, + "step": 2013000 + }, + { + "epoch": 13.625351883932439, + "grad_norm": 0.34304094314575195, + "learning_rate": 4.863746481160676e-05, + "loss": 0.3701, + "step": 2013500 + }, + { + "epoch": 13.628735383282807, + "grad_norm": 0.3206089735031128, + "learning_rate": 4.863712646167172e-05, + "loss": 0.3704, + "step": 2014000 + }, + { + "epoch": 13.632118882633174, + "grad_norm": 0.32081344723701477, + "learning_rate": 4.8636788111736685e-05, + "loss": 0.3696, + "step": 2014500 + }, + { + "epoch": 13.635502381983542, + "grad_norm": 0.3509119749069214, + "learning_rate": 4.863644976180165e-05, + "loss": 0.3717, + "step": 2015000 + }, + { + "epoch": 13.63888588133391, + "grad_norm": 0.36864274740219116, + "learning_rate": 4.8636111411866616e-05, + "loss": 0.3698, + "step": 2015500 + }, + { + "epoch": 13.64226938068428, + "grad_norm": 0.3501897156238556, + "learning_rate": 4.863577306193157e-05, + "loss": 0.3711, + "step": 2016000 + }, + { + "epoch": 13.645652880034646, + "grad_norm": 0.4033876657485962, + "learning_rate": 4.8635434711996534e-05, + "loss": 0.3701, + "step": 2016500 + }, + { + "epoch": 13.649036379385015, + "grad_norm": 0.335446834564209, + "learning_rate": 4.86350963620615e-05, + "loss": 0.3679, + "step": 2017000 + }, + { + "epoch": 13.652419878735383, + "grad_norm": 0.3642156720161438, + "learning_rate": 4.8634758012126465e-05, + "loss": 0.3701, + "step": 2017500 + }, + { + "epoch": 13.655803378085752, + "grad_norm": 0.39847907423973083, + "learning_rate": 4.863441966219143e-05, + "loss": 0.3706, + "step": 2018000 + }, + { + "epoch": 13.65918687743612, + "grad_norm": 0.3453882038593292, + "learning_rate": 4.863408131225639e-05, + "loss": 0.3724, + "step": 2018500 + }, + { + "epoch": 13.662570376786487, + "grad_norm": 0.3945293128490448, + "learning_rate": 4.863374296232136e-05, + "loss": 0.3697, + "step": 2019000 + }, + { + "epoch": 13.665953876136856, + "grad_norm": 0.31794705986976624, + "learning_rate": 4.863340461238632e-05, + "loss": 0.3693, + "step": 2019500 + }, + { + "epoch": 13.669337375487224, + "grad_norm": 0.33316007256507874, + "learning_rate": 4.863306626245128e-05, + "loss": 0.3682, + "step": 2020000 + }, + { + "epoch": 13.672720874837593, + "grad_norm": 0.3561200499534607, + "learning_rate": 4.863272791251624e-05, + "loss": 0.3703, + "step": 2020500 + }, + { + "epoch": 13.676104374187961, + "grad_norm": 0.38469579815864563, + "learning_rate": 4.8632389562581207e-05, + "loss": 0.3722, + "step": 2021000 + }, + { + "epoch": 13.679487873538328, + "grad_norm": 0.3694473206996918, + "learning_rate": 4.863205121264617e-05, + "loss": 0.3709, + "step": 2021500 + }, + { + "epoch": 13.682871372888696, + "grad_norm": 0.39557352662086487, + "learning_rate": 4.863171286271113e-05, + "loss": 0.3718, + "step": 2022000 + }, + { + "epoch": 13.686254872239065, + "grad_norm": 0.31979459524154663, + "learning_rate": 4.863137451277609e-05, + "loss": 0.3715, + "step": 2022500 + }, + { + "epoch": 13.689638371589433, + "grad_norm": 0.3282807171344757, + "learning_rate": 4.863103616284106e-05, + "loss": 0.3696, + "step": 2023000 + }, + { + "epoch": 13.6930218709398, + "grad_norm": 0.3137219250202179, + "learning_rate": 4.8630697812906024e-05, + "loss": 0.3695, + "step": 2023500 + }, + { + "epoch": 13.696405370290169, + "grad_norm": 0.390055388212204, + "learning_rate": 4.8630359462970986e-05, + "loss": 0.3725, + "step": 2024000 + }, + { + "epoch": 13.699788869640537, + "grad_norm": 0.3728681206703186, + "learning_rate": 4.863002111303595e-05, + "loss": 0.3699, + "step": 2024500 + }, + { + "epoch": 13.703172368990906, + "grad_norm": 0.3733259439468384, + "learning_rate": 4.862968276310092e-05, + "loss": 0.3701, + "step": 2025000 + }, + { + "epoch": 13.706555868341272, + "grad_norm": 0.4084514379501343, + "learning_rate": 4.862934441316587e-05, + "loss": 0.3699, + "step": 2025500 + }, + { + "epoch": 13.709939367691641, + "grad_norm": 0.3510924279689789, + "learning_rate": 4.8629006063230835e-05, + "loss": 0.3704, + "step": 2026000 + }, + { + "epoch": 13.71332286704201, + "grad_norm": 0.3545098304748535, + "learning_rate": 4.8628667713295803e-05, + "loss": 0.3707, + "step": 2026500 + }, + { + "epoch": 13.716706366392378, + "grad_norm": 0.40483832359313965, + "learning_rate": 4.8628329363360766e-05, + "loss": 0.3691, + "step": 2027000 + }, + { + "epoch": 13.720089865742747, + "grad_norm": 0.4002268314361572, + "learning_rate": 4.862799101342573e-05, + "loss": 0.3711, + "step": 2027500 + }, + { + "epoch": 13.723473365093113, + "grad_norm": 0.3556060194969177, + "learning_rate": 4.862765266349069e-05, + "loss": 0.3708, + "step": 2028000 + }, + { + "epoch": 13.726856864443482, + "grad_norm": 0.3630557358264923, + "learning_rate": 4.862731431355565e-05, + "loss": 0.3715, + "step": 2028500 + }, + { + "epoch": 13.73024036379385, + "grad_norm": 0.3628033995628357, + "learning_rate": 4.862697596362062e-05, + "loss": 0.3716, + "step": 2029000 + }, + { + "epoch": 13.733623863144219, + "grad_norm": 0.3553501069545746, + "learning_rate": 4.862663761368558e-05, + "loss": 0.3696, + "step": 2029500 + }, + { + "epoch": 13.737007362494586, + "grad_norm": 0.3359854817390442, + "learning_rate": 4.862629926375054e-05, + "loss": 0.3685, + "step": 2030000 + }, + { + "epoch": 13.740390861844954, + "grad_norm": 0.3862282335758209, + "learning_rate": 4.862596091381551e-05, + "loss": 0.3711, + "step": 2030500 + }, + { + "epoch": 13.743774361195323, + "grad_norm": 0.37626171112060547, + "learning_rate": 4.862562256388047e-05, + "loss": 0.3689, + "step": 2031000 + }, + { + "epoch": 13.747157860545691, + "grad_norm": 0.3787476718425751, + "learning_rate": 4.862528421394543e-05, + "loss": 0.3714, + "step": 2031500 + }, + { + "epoch": 13.750541359896058, + "grad_norm": 0.34771743416786194, + "learning_rate": 4.8624945864010394e-05, + "loss": 0.3708, + "step": 2032000 + }, + { + "epoch": 13.753924859246426, + "grad_norm": 0.32347938418388367, + "learning_rate": 4.862460751407536e-05, + "loss": 0.371, + "step": 2032500 + }, + { + "epoch": 13.757308358596795, + "grad_norm": 0.35285308957099915, + "learning_rate": 4.8624269164140325e-05, + "loss": 0.3699, + "step": 2033000 + }, + { + "epoch": 13.760691857947164, + "grad_norm": 0.38633468747138977, + "learning_rate": 4.862393081420529e-05, + "loss": 0.3715, + "step": 2033500 + }, + { + "epoch": 13.764075357297532, + "grad_norm": 0.38737979531288147, + "learning_rate": 4.862359246427025e-05, + "loss": 0.3687, + "step": 2034000 + }, + { + "epoch": 13.767458856647899, + "grad_norm": 0.33574140071868896, + "learning_rate": 4.862325411433522e-05, + "loss": 0.3696, + "step": 2034500 + }, + { + "epoch": 13.770842355998267, + "grad_norm": 0.3829055726528168, + "learning_rate": 4.862291576440017e-05, + "loss": 0.3702, + "step": 2035000 + }, + { + "epoch": 13.774225855348636, + "grad_norm": 0.36047014594078064, + "learning_rate": 4.8622577414465135e-05, + "loss": 0.3702, + "step": 2035500 + }, + { + "epoch": 13.777609354699004, + "grad_norm": 0.34504133462905884, + "learning_rate": 4.8622239064530104e-05, + "loss": 0.37, + "step": 2036000 + }, + { + "epoch": 13.780992854049373, + "grad_norm": 0.3734630346298218, + "learning_rate": 4.8621900714595066e-05, + "loss": 0.3705, + "step": 2036500 + }, + { + "epoch": 13.78437635339974, + "grad_norm": 0.3799772262573242, + "learning_rate": 4.862156236466003e-05, + "loss": 0.3701, + "step": 2037000 + }, + { + "epoch": 13.787759852750108, + "grad_norm": 0.35689836740493774, + "learning_rate": 4.862122401472499e-05, + "loss": 0.3705, + "step": 2037500 + }, + { + "epoch": 13.791143352100477, + "grad_norm": 0.38452208042144775, + "learning_rate": 4.862088566478995e-05, + "loss": 0.3695, + "step": 2038000 + }, + { + "epoch": 13.794526851450845, + "grad_norm": 0.37008896470069885, + "learning_rate": 4.862054731485492e-05, + "loss": 0.3708, + "step": 2038500 + }, + { + "epoch": 13.797910350801212, + "grad_norm": 0.3450815677642822, + "learning_rate": 4.8620208964919884e-05, + "loss": 0.37, + "step": 2039000 + }, + { + "epoch": 13.80129385015158, + "grad_norm": 0.330727219581604, + "learning_rate": 4.861987061498484e-05, + "loss": 0.3714, + "step": 2039500 + }, + { + "epoch": 13.804677349501949, + "grad_norm": 0.37175223231315613, + "learning_rate": 4.861953226504981e-05, + "loss": 0.37, + "step": 2040000 + }, + { + "epoch": 13.808060848852318, + "grad_norm": 0.3831532299518585, + "learning_rate": 4.861919391511477e-05, + "loss": 0.3693, + "step": 2040500 + }, + { + "epoch": 13.811444348202684, + "grad_norm": 0.33013632893562317, + "learning_rate": 4.861885556517973e-05, + "loss": 0.3702, + "step": 2041000 + }, + { + "epoch": 13.814827847553053, + "grad_norm": 0.35006463527679443, + "learning_rate": 4.8618517215244694e-05, + "loss": 0.3711, + "step": 2041500 + }, + { + "epoch": 13.818211346903421, + "grad_norm": 0.37648430466651917, + "learning_rate": 4.861817886530966e-05, + "loss": 0.3702, + "step": 2042000 + }, + { + "epoch": 13.82159484625379, + "grad_norm": 0.35369744896888733, + "learning_rate": 4.8617840515374625e-05, + "loss": 0.3702, + "step": 2042500 + }, + { + "epoch": 13.824978345604158, + "grad_norm": 0.3736339211463928, + "learning_rate": 4.861750216543959e-05, + "loss": 0.3711, + "step": 2043000 + }, + { + "epoch": 13.828361844954525, + "grad_norm": 0.3667067885398865, + "learning_rate": 4.861716381550455e-05, + "loss": 0.3702, + "step": 2043500 + }, + { + "epoch": 13.831745344304894, + "grad_norm": 0.34860265254974365, + "learning_rate": 4.861682546556952e-05, + "loss": 0.3707, + "step": 2044000 + }, + { + "epoch": 13.835128843655262, + "grad_norm": 0.3995159864425659, + "learning_rate": 4.8616487115634474e-05, + "loss": 0.3717, + "step": 2044500 + }, + { + "epoch": 13.83851234300563, + "grad_norm": 0.34385403990745544, + "learning_rate": 4.8616148765699436e-05, + "loss": 0.3699, + "step": 2045000 + }, + { + "epoch": 13.841895842355997, + "grad_norm": 0.37954169511795044, + "learning_rate": 4.86158104157644e-05, + "loss": 0.3714, + "step": 2045500 + }, + { + "epoch": 13.845279341706366, + "grad_norm": 0.3389084041118622, + "learning_rate": 4.861547206582937e-05, + "loss": 0.3699, + "step": 2046000 + }, + { + "epoch": 13.848662841056735, + "grad_norm": 0.3550108075141907, + "learning_rate": 4.861513371589433e-05, + "loss": 0.3708, + "step": 2046500 + }, + { + "epoch": 13.852046340407103, + "grad_norm": 0.37702715396881104, + "learning_rate": 4.861479536595929e-05, + "loss": 0.369, + "step": 2047000 + }, + { + "epoch": 13.855429839757472, + "grad_norm": 0.37218254804611206, + "learning_rate": 4.861445701602425e-05, + "loss": 0.3706, + "step": 2047500 + }, + { + "epoch": 13.858813339107838, + "grad_norm": 0.37504270672798157, + "learning_rate": 4.861411866608922e-05, + "loss": 0.3699, + "step": 2048000 + }, + { + "epoch": 13.862196838458207, + "grad_norm": 0.3862540125846863, + "learning_rate": 4.8613780316154184e-05, + "loss": 0.3698, + "step": 2048500 + }, + { + "epoch": 13.865580337808575, + "grad_norm": 0.36661839485168457, + "learning_rate": 4.861344196621914e-05, + "loss": 0.3702, + "step": 2049000 + }, + { + "epoch": 13.868963837158944, + "grad_norm": 0.33589258790016174, + "learning_rate": 4.861310361628411e-05, + "loss": 0.3699, + "step": 2049500 + }, + { + "epoch": 13.87234733650931, + "grad_norm": 0.41161099076271057, + "learning_rate": 4.861276526634907e-05, + "loss": 0.3714, + "step": 2050000 + }, + { + "epoch": 13.87573083585968, + "grad_norm": 0.35029563307762146, + "learning_rate": 4.861242691641403e-05, + "loss": 0.3713, + "step": 2050500 + }, + { + "epoch": 13.879114335210048, + "grad_norm": 0.35823801159858704, + "learning_rate": 4.8612088566478995e-05, + "loss": 0.3708, + "step": 2051000 + }, + { + "epoch": 13.882497834560416, + "grad_norm": 0.3975470960140228, + "learning_rate": 4.8611750216543964e-05, + "loss": 0.3697, + "step": 2051500 + }, + { + "epoch": 13.885881333910785, + "grad_norm": 0.34708172082901, + "learning_rate": 4.8611411866608926e-05, + "loss": 0.37, + "step": 2052000 + }, + { + "epoch": 13.889264833261151, + "grad_norm": 0.390287846326828, + "learning_rate": 4.861107351667389e-05, + "loss": 0.3705, + "step": 2052500 + }, + { + "epoch": 13.89264833261152, + "grad_norm": 0.3469938039779663, + "learning_rate": 4.861073516673885e-05, + "loss": 0.3701, + "step": 2053000 + }, + { + "epoch": 13.896031831961889, + "grad_norm": 0.38159653544425964, + "learning_rate": 4.861039681680382e-05, + "loss": 0.3716, + "step": 2053500 + }, + { + "epoch": 13.899415331312257, + "grad_norm": 0.43335747718811035, + "learning_rate": 4.8610058466868774e-05, + "loss": 0.3691, + "step": 2054000 + }, + { + "epoch": 13.902798830662624, + "grad_norm": 0.3556252419948578, + "learning_rate": 4.8609720116933736e-05, + "loss": 0.3716, + "step": 2054500 + }, + { + "epoch": 13.906182330012992, + "grad_norm": 0.3873630166053772, + "learning_rate": 4.86093817669987e-05, + "loss": 0.3703, + "step": 2055000 + }, + { + "epoch": 13.90956582936336, + "grad_norm": 0.3356965482234955, + "learning_rate": 4.860904341706367e-05, + "loss": 0.3694, + "step": 2055500 + }, + { + "epoch": 13.91294932871373, + "grad_norm": 0.37060633301734924, + "learning_rate": 4.860870506712863e-05, + "loss": 0.3695, + "step": 2056000 + }, + { + "epoch": 13.916332828064096, + "grad_norm": 0.372291624546051, + "learning_rate": 4.860836671719359e-05, + "loss": 0.3683, + "step": 2056500 + }, + { + "epoch": 13.919716327414465, + "grad_norm": 0.34234097599983215, + "learning_rate": 4.8608028367258554e-05, + "loss": 0.3708, + "step": 2057000 + }, + { + "epoch": 13.923099826764833, + "grad_norm": 0.37905994057655334, + "learning_rate": 4.860769001732352e-05, + "loss": 0.3689, + "step": 2057500 + }, + { + "epoch": 13.926483326115202, + "grad_norm": 0.3877720534801483, + "learning_rate": 4.8607351667388485e-05, + "loss": 0.3704, + "step": 2058000 + }, + { + "epoch": 13.92986682546557, + "grad_norm": 0.32991376519203186, + "learning_rate": 4.860701331745345e-05, + "loss": 0.3694, + "step": 2058500 + }, + { + "epoch": 13.933250324815937, + "grad_norm": 0.36483174562454224, + "learning_rate": 4.860667496751841e-05, + "loss": 0.3689, + "step": 2059000 + }, + { + "epoch": 13.936633824166305, + "grad_norm": 0.4076962172985077, + "learning_rate": 4.860633661758337e-05, + "loss": 0.3707, + "step": 2059500 + }, + { + "epoch": 13.940017323516674, + "grad_norm": 0.38313883543014526, + "learning_rate": 4.860599826764833e-05, + "loss": 0.3684, + "step": 2060000 + }, + { + "epoch": 13.943400822867043, + "grad_norm": 0.34798476099967957, + "learning_rate": 4.8605659917713295e-05, + "loss": 0.3713, + "step": 2060500 + }, + { + "epoch": 13.946784322217411, + "grad_norm": 0.3725389540195465, + "learning_rate": 4.8605321567778264e-05, + "loss": 0.3698, + "step": 2061000 + }, + { + "epoch": 13.950167821567778, + "grad_norm": 0.40532901883125305, + "learning_rate": 4.8604983217843226e-05, + "loss": 0.3695, + "step": 2061500 + }, + { + "epoch": 13.953551320918146, + "grad_norm": 0.38492247462272644, + "learning_rate": 4.860464486790819e-05, + "loss": 0.3696, + "step": 2062000 + }, + { + "epoch": 13.956934820268515, + "grad_norm": 0.34857288002967834, + "learning_rate": 4.860430651797315e-05, + "loss": 0.369, + "step": 2062500 + }, + { + "epoch": 13.960318319618883, + "grad_norm": 0.3815152645111084, + "learning_rate": 4.860396816803812e-05, + "loss": 0.3716, + "step": 2063000 + }, + { + "epoch": 13.96370181896925, + "grad_norm": 0.35839033126831055, + "learning_rate": 4.8603629818103075e-05, + "loss": 0.3691, + "step": 2063500 + }, + { + "epoch": 13.967085318319619, + "grad_norm": 0.36373916268348694, + "learning_rate": 4.860329146816804e-05, + "loss": 0.3685, + "step": 2064000 + }, + { + "epoch": 13.970468817669987, + "grad_norm": 0.3275459408760071, + "learning_rate": 4.8602953118233e-05, + "loss": 0.37, + "step": 2064500 + }, + { + "epoch": 13.973852317020356, + "grad_norm": 0.3416304886341095, + "learning_rate": 4.860261476829797e-05, + "loss": 0.3697, + "step": 2065000 + }, + { + "epoch": 13.977235816370722, + "grad_norm": 0.34164056181907654, + "learning_rate": 4.860227641836293e-05, + "loss": 0.3706, + "step": 2065500 + }, + { + "epoch": 13.980619315721091, + "grad_norm": 0.37814369797706604, + "learning_rate": 4.860193806842789e-05, + "loss": 0.3698, + "step": 2066000 + }, + { + "epoch": 13.98400281507146, + "grad_norm": 0.395485520362854, + "learning_rate": 4.8601599718492854e-05, + "loss": 0.3692, + "step": 2066500 + }, + { + "epoch": 13.987386314421828, + "grad_norm": 0.38372567296028137, + "learning_rate": 4.860126136855782e-05, + "loss": 0.3712, + "step": 2067000 + }, + { + "epoch": 13.990769813772197, + "grad_norm": 0.3979295492172241, + "learning_rate": 4.8600923018622785e-05, + "loss": 0.3688, + "step": 2067500 + }, + { + "epoch": 13.994153313122563, + "grad_norm": 0.37074190378189087, + "learning_rate": 4.860058466868775e-05, + "loss": 0.371, + "step": 2068000 + }, + { + "epoch": 13.997536812472932, + "grad_norm": 0.37351343035697937, + "learning_rate": 4.860024631875271e-05, + "loss": 0.3691, + "step": 2068500 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.8588137544938473, + "eval_loss": 0.5739654302597046, + "eval_runtime": 3398.8028, + "eval_samples_per_second": 85.543, + "eval_steps_per_second": 5.347, + "step": 2068864 + }, + { + "epoch": 14.0009203118233, + "grad_norm": 0.3616331219673157, + "learning_rate": 4.859990796881767e-05, + "loss": 0.369, + "step": 2069000 + }, + { + "epoch": 14.004303811173669, + "grad_norm": 0.3238430619239807, + "learning_rate": 4.8599569618882634e-05, + "loss": 0.3689, + "step": 2069500 + }, + { + "epoch": 14.007687310524036, + "grad_norm": 0.39138802886009216, + "learning_rate": 4.8599231268947596e-05, + "loss": 0.368, + "step": 2070000 + }, + { + "epoch": 14.011070809874404, + "grad_norm": 0.3550621569156647, + "learning_rate": 4.8598892919012565e-05, + "loss": 0.3691, + "step": 2070500 + }, + { + "epoch": 14.014454309224773, + "grad_norm": 0.3459855914115906, + "learning_rate": 4.859855456907753e-05, + "loss": 0.3684, + "step": 2071000 + }, + { + "epoch": 14.017837808575141, + "grad_norm": 0.35226932168006897, + "learning_rate": 4.859821621914249e-05, + "loss": 0.368, + "step": 2071500 + }, + { + "epoch": 14.02122130792551, + "grad_norm": 0.3718564808368683, + "learning_rate": 4.859787786920745e-05, + "loss": 0.369, + "step": 2072000 + }, + { + "epoch": 14.024604807275876, + "grad_norm": 0.3818054497241974, + "learning_rate": 4.859753951927242e-05, + "loss": 0.3677, + "step": 2072500 + }, + { + "epoch": 14.027988306626245, + "grad_norm": 0.35006603598594666, + "learning_rate": 4.8597201169337376e-05, + "loss": 0.3679, + "step": 2073000 + }, + { + "epoch": 14.031371805976613, + "grad_norm": 0.387045681476593, + "learning_rate": 4.859686281940234e-05, + "loss": 0.3694, + "step": 2073500 + }, + { + "epoch": 14.034755305326982, + "grad_norm": 0.34844982624053955, + "learning_rate": 4.85965244694673e-05, + "loss": 0.3688, + "step": 2074000 + }, + { + "epoch": 14.038138804677349, + "grad_norm": 0.3560154139995575, + "learning_rate": 4.859618611953227e-05, + "loss": 0.3675, + "step": 2074500 + }, + { + "epoch": 14.041522304027717, + "grad_norm": 0.3833746910095215, + "learning_rate": 4.859584776959723e-05, + "loss": 0.369, + "step": 2075000 + }, + { + "epoch": 14.044905803378086, + "grad_norm": 0.33990758657455444, + "learning_rate": 4.859550941966219e-05, + "loss": 0.3686, + "step": 2075500 + }, + { + "epoch": 14.048289302728454, + "grad_norm": 0.3707942068576813, + "learning_rate": 4.8595171069727155e-05, + "loss": 0.3701, + "step": 2076000 + }, + { + "epoch": 14.051672802078823, + "grad_norm": 0.3842766582965851, + "learning_rate": 4.8594832719792124e-05, + "loss": 0.3693, + "step": 2076500 + }, + { + "epoch": 14.05505630142919, + "grad_norm": 0.3341391682624817, + "learning_rate": 4.8594494369857086e-05, + "loss": 0.3707, + "step": 2077000 + }, + { + "epoch": 14.058439800779558, + "grad_norm": 0.32680997252464294, + "learning_rate": 4.859415601992205e-05, + "loss": 0.3681, + "step": 2077500 + }, + { + "epoch": 14.061823300129927, + "grad_norm": 0.38530468940734863, + "learning_rate": 4.859381766998701e-05, + "loss": 0.3691, + "step": 2078000 + }, + { + "epoch": 14.065206799480295, + "grad_norm": 0.3662298321723938, + "learning_rate": 4.859347932005197e-05, + "loss": 0.3688, + "step": 2078500 + }, + { + "epoch": 14.068590298830662, + "grad_norm": 0.3635557293891907, + "learning_rate": 4.8593140970116935e-05, + "loss": 0.3687, + "step": 2079000 + }, + { + "epoch": 14.07197379818103, + "grad_norm": 0.34785518050193787, + "learning_rate": 4.85928026201819e-05, + "loss": 0.3696, + "step": 2079500 + }, + { + "epoch": 14.075357297531399, + "grad_norm": 0.35233691334724426, + "learning_rate": 4.8592464270246866e-05, + "loss": 0.3688, + "step": 2080000 + }, + { + "epoch": 14.078740796881767, + "grad_norm": 0.39892759919166565, + "learning_rate": 4.859212592031183e-05, + "loss": 0.3683, + "step": 2080500 + }, + { + "epoch": 14.082124296232134, + "grad_norm": 0.37727105617523193, + "learning_rate": 4.859178757037679e-05, + "loss": 0.3684, + "step": 2081000 + }, + { + "epoch": 14.085507795582503, + "grad_norm": 0.3854265511035919, + "learning_rate": 4.859144922044175e-05, + "loss": 0.3686, + "step": 2081500 + }, + { + "epoch": 14.088891294932871, + "grad_norm": 0.3632075786590576, + "learning_rate": 4.859111087050672e-05, + "loss": 0.3696, + "step": 2082000 + }, + { + "epoch": 14.09227479428324, + "grad_norm": 0.3490007221698761, + "learning_rate": 4.8590772520571676e-05, + "loss": 0.3694, + "step": 2082500 + }, + { + "epoch": 14.095658293633608, + "grad_norm": 0.385567843914032, + "learning_rate": 4.859043417063664e-05, + "loss": 0.3691, + "step": 2083000 + }, + { + "epoch": 14.099041792983975, + "grad_norm": 0.36982715129852295, + "learning_rate": 4.85900958207016e-05, + "loss": 0.3683, + "step": 2083500 + }, + { + "epoch": 14.102425292334344, + "grad_norm": 0.35927850008010864, + "learning_rate": 4.858975747076657e-05, + "loss": 0.3689, + "step": 2084000 + }, + { + "epoch": 14.105808791684712, + "grad_norm": 0.3497246205806732, + "learning_rate": 4.858941912083153e-05, + "loss": 0.3676, + "step": 2084500 + }, + { + "epoch": 14.10919229103508, + "grad_norm": 0.3736872971057892, + "learning_rate": 4.8589080770896494e-05, + "loss": 0.3699, + "step": 2085000 + }, + { + "epoch": 14.112575790385447, + "grad_norm": 0.3965182602405548, + "learning_rate": 4.8588742420961456e-05, + "loss": 0.3681, + "step": 2085500 + }, + { + "epoch": 14.115959289735816, + "grad_norm": 0.34779226779937744, + "learning_rate": 4.8588404071026425e-05, + "loss": 0.3699, + "step": 2086000 + }, + { + "epoch": 14.119342789086184, + "grad_norm": 0.3425229489803314, + "learning_rate": 4.858806572109139e-05, + "loss": 0.3683, + "step": 2086500 + }, + { + "epoch": 14.122726288436553, + "grad_norm": 0.39361563324928284, + "learning_rate": 4.858772737115635e-05, + "loss": 0.3694, + "step": 2087000 + }, + { + "epoch": 14.126109787786922, + "grad_norm": 0.39007148146629333, + "learning_rate": 4.858738902122131e-05, + "loss": 0.369, + "step": 2087500 + }, + { + "epoch": 14.129493287137288, + "grad_norm": 0.3457988202571869, + "learning_rate": 4.858705067128627e-05, + "loss": 0.3685, + "step": 2088000 + }, + { + "epoch": 14.132876786487657, + "grad_norm": 0.3679068982601166, + "learning_rate": 4.8586712321351235e-05, + "loss": 0.3685, + "step": 2088500 + }, + { + "epoch": 14.136260285838025, + "grad_norm": 0.35563474893569946, + "learning_rate": 4.85863739714162e-05, + "loss": 0.3691, + "step": 2089000 + }, + { + "epoch": 14.139643785188394, + "grad_norm": 0.3731387257575989, + "learning_rate": 4.8586035621481166e-05, + "loss": 0.3689, + "step": 2089500 + }, + { + "epoch": 14.14302728453876, + "grad_norm": 0.39115670323371887, + "learning_rate": 4.858569727154613e-05, + "loss": 0.3672, + "step": 2090000 + }, + { + "epoch": 14.146410783889129, + "grad_norm": 0.4126528203487396, + "learning_rate": 4.858535892161109e-05, + "loss": 0.3685, + "step": 2090500 + }, + { + "epoch": 14.149794283239498, + "grad_norm": 0.3236331045627594, + "learning_rate": 4.858502057167605e-05, + "loss": 0.3679, + "step": 2091000 + }, + { + "epoch": 14.153177782589866, + "grad_norm": 0.35798412561416626, + "learning_rate": 4.8584682221741015e-05, + "loss": 0.3687, + "step": 2091500 + }, + { + "epoch": 14.156561281940235, + "grad_norm": 0.3941161334514618, + "learning_rate": 4.858434387180598e-05, + "loss": 0.369, + "step": 2092000 + }, + { + "epoch": 14.159944781290601, + "grad_norm": 0.3767811059951782, + "learning_rate": 4.858400552187094e-05, + "loss": 0.369, + "step": 2092500 + }, + { + "epoch": 14.16332828064097, + "grad_norm": 0.3738722801208496, + "learning_rate": 4.85836671719359e-05, + "loss": 0.3686, + "step": 2093000 + }, + { + "epoch": 14.166711779991338, + "grad_norm": 0.4057631194591522, + "learning_rate": 4.858332882200087e-05, + "loss": 0.3702, + "step": 2093500 + }, + { + "epoch": 14.170095279341707, + "grad_norm": 0.36650463938713074, + "learning_rate": 4.858299047206583e-05, + "loss": 0.3685, + "step": 2094000 + }, + { + "epoch": 14.173478778692074, + "grad_norm": 0.3387785851955414, + "learning_rate": 4.8582652122130794e-05, + "loss": 0.3677, + "step": 2094500 + }, + { + "epoch": 14.176862278042442, + "grad_norm": 0.3528936803340912, + "learning_rate": 4.8582313772195756e-05, + "loss": 0.3683, + "step": 2095000 + }, + { + "epoch": 14.18024577739281, + "grad_norm": 0.3900843560695648, + "learning_rate": 4.8581975422260725e-05, + "loss": 0.3678, + "step": 2095500 + }, + { + "epoch": 14.18362927674318, + "grad_norm": 0.3487549424171448, + "learning_rate": 4.858163707232569e-05, + "loss": 0.3692, + "step": 2096000 + }, + { + "epoch": 14.187012776093548, + "grad_norm": 0.3960934281349182, + "learning_rate": 4.858129872239065e-05, + "loss": 0.3697, + "step": 2096500 + }, + { + "epoch": 14.190396275443915, + "grad_norm": 0.352027028799057, + "learning_rate": 4.858096037245561e-05, + "loss": 0.3702, + "step": 2097000 + }, + { + "epoch": 14.193779774794283, + "grad_norm": 0.3669641613960266, + "learning_rate": 4.8580622022520574e-05, + "loss": 0.3685, + "step": 2097500 + }, + { + "epoch": 14.197163274144652, + "grad_norm": 0.3863503634929657, + "learning_rate": 4.8580283672585536e-05, + "loss": 0.3684, + "step": 2098000 + }, + { + "epoch": 14.20054677349502, + "grad_norm": 0.3422660529613495, + "learning_rate": 4.85799453226505e-05, + "loss": 0.3679, + "step": 2098500 + }, + { + "epoch": 14.203930272845387, + "grad_norm": 0.3772645890712738, + "learning_rate": 4.857960697271546e-05, + "loss": 0.3693, + "step": 2099000 + }, + { + "epoch": 14.207313772195755, + "grad_norm": 0.3528118431568146, + "learning_rate": 4.857926862278043e-05, + "loss": 0.3682, + "step": 2099500 + }, + { + "epoch": 14.210697271546124, + "grad_norm": 0.40165719389915466, + "learning_rate": 4.857893027284539e-05, + "loss": 0.3695, + "step": 2100000 + }, + { + "epoch": 14.214080770896492, + "grad_norm": 0.38907134532928467, + "learning_rate": 4.857859192291035e-05, + "loss": 0.3677, + "step": 2100500 + }, + { + "epoch": 14.217464270246861, + "grad_norm": 0.38055655360221863, + "learning_rate": 4.8578253572975315e-05, + "loss": 0.3682, + "step": 2101000 + }, + { + "epoch": 14.220847769597228, + "grad_norm": 0.40473777055740356, + "learning_rate": 4.857791522304028e-05, + "loss": 0.369, + "step": 2101500 + }, + { + "epoch": 14.224231268947596, + "grad_norm": 0.37386074662208557, + "learning_rate": 4.857757687310524e-05, + "loss": 0.3682, + "step": 2102000 + }, + { + "epoch": 14.227614768297965, + "grad_norm": 0.3558364808559418, + "learning_rate": 4.85772385231702e-05, + "loss": 0.3685, + "step": 2102500 + }, + { + "epoch": 14.230998267648333, + "grad_norm": 0.38514620065689087, + "learning_rate": 4.857690017323517e-05, + "loss": 0.3713, + "step": 2103000 + }, + { + "epoch": 14.2343817669987, + "grad_norm": 0.3842502236366272, + "learning_rate": 4.857656182330013e-05, + "loss": 0.3699, + "step": 2103500 + }, + { + "epoch": 14.237765266349069, + "grad_norm": 0.398985892534256, + "learning_rate": 4.8576223473365095e-05, + "loss": 0.3701, + "step": 2104000 + }, + { + "epoch": 14.241148765699437, + "grad_norm": 0.3588099181652069, + "learning_rate": 4.857588512343006e-05, + "loss": 0.366, + "step": 2104500 + }, + { + "epoch": 14.244532265049806, + "grad_norm": 0.3762696087360382, + "learning_rate": 4.8575546773495026e-05, + "loss": 0.3698, + "step": 2105000 + }, + { + "epoch": 14.247915764400172, + "grad_norm": 0.34633252024650574, + "learning_rate": 4.857520842355999e-05, + "loss": 0.3694, + "step": 2105500 + }, + { + "epoch": 14.251299263750541, + "grad_norm": 0.33246076107025146, + "learning_rate": 4.857487007362495e-05, + "loss": 0.3684, + "step": 2106000 + }, + { + "epoch": 14.25468276310091, + "grad_norm": 0.37106186151504517, + "learning_rate": 4.857453172368991e-05, + "loss": 0.3678, + "step": 2106500 + }, + { + "epoch": 14.258066262451278, + "grad_norm": 0.342178076505661, + "learning_rate": 4.8574193373754874e-05, + "loss": 0.3701, + "step": 2107000 + }, + { + "epoch": 14.261449761801646, + "grad_norm": 0.3419799506664276, + "learning_rate": 4.8573855023819836e-05, + "loss": 0.3696, + "step": 2107500 + }, + { + "epoch": 14.264833261152013, + "grad_norm": 0.3432062566280365, + "learning_rate": 4.85735166738848e-05, + "loss": 0.3706, + "step": 2108000 + }, + { + "epoch": 14.268216760502382, + "grad_norm": 0.3578495383262634, + "learning_rate": 4.857317832394976e-05, + "loss": 0.3689, + "step": 2108500 + }, + { + "epoch": 14.27160025985275, + "grad_norm": 0.34382176399230957, + "learning_rate": 4.857283997401473e-05, + "loss": 0.3688, + "step": 2109000 + }, + { + "epoch": 14.274983759203119, + "grad_norm": 0.33303534984588623, + "learning_rate": 4.857250162407969e-05, + "loss": 0.3691, + "step": 2109500 + }, + { + "epoch": 14.278367258553486, + "grad_norm": 0.3536013066768646, + "learning_rate": 4.8572163274144654e-05, + "loss": 0.3709, + "step": 2110000 + }, + { + "epoch": 14.281750757903854, + "grad_norm": 0.3310689926147461, + "learning_rate": 4.8571824924209616e-05, + "loss": 0.3704, + "step": 2110500 + }, + { + "epoch": 14.285134257254223, + "grad_norm": 0.4020843505859375, + "learning_rate": 4.8571486574274585e-05, + "loss": 0.3699, + "step": 2111000 + }, + { + "epoch": 14.288517756604591, + "grad_norm": 0.3867824375629425, + "learning_rate": 4.857114822433954e-05, + "loss": 0.3683, + "step": 2111500 + }, + { + "epoch": 14.29190125595496, + "grad_norm": 0.3328765332698822, + "learning_rate": 4.85708098744045e-05, + "loss": 0.3685, + "step": 2112000 + }, + { + "epoch": 14.295284755305326, + "grad_norm": 0.33964329957962036, + "learning_rate": 4.857047152446947e-05, + "loss": 0.3702, + "step": 2112500 + }, + { + "epoch": 14.298668254655695, + "grad_norm": 0.4224208891391754, + "learning_rate": 4.857013317453443e-05, + "loss": 0.3694, + "step": 2113000 + }, + { + "epoch": 14.302051754006063, + "grad_norm": 0.4299384653568268, + "learning_rate": 4.8569794824599396e-05, + "loss": 0.368, + "step": 2113500 + }, + { + "epoch": 14.305435253356432, + "grad_norm": 0.36000317335128784, + "learning_rate": 4.856945647466436e-05, + "loss": 0.3698, + "step": 2114000 + }, + { + "epoch": 14.308818752706799, + "grad_norm": 0.3650577664375305, + "learning_rate": 4.8569118124729327e-05, + "loss": 0.3683, + "step": 2114500 + }, + { + "epoch": 14.312202252057167, + "grad_norm": 0.38082313537597656, + "learning_rate": 4.856877977479429e-05, + "loss": 0.3687, + "step": 2115000 + }, + { + "epoch": 14.315585751407536, + "grad_norm": 0.3912234604358673, + "learning_rate": 4.856844142485925e-05, + "loss": 0.3706, + "step": 2115500 + }, + { + "epoch": 14.318969250757904, + "grad_norm": 0.36391451954841614, + "learning_rate": 4.8568103074924206e-05, + "loss": 0.3691, + "step": 2116000 + }, + { + "epoch": 14.322352750108273, + "grad_norm": 0.36006873846054077, + "learning_rate": 4.8567764724989175e-05, + "loss": 0.3686, + "step": 2116500 + }, + { + "epoch": 14.32573624945864, + "grad_norm": 0.3425831198692322, + "learning_rate": 4.856742637505414e-05, + "loss": 0.3687, + "step": 2117000 + }, + { + "epoch": 14.329119748809008, + "grad_norm": 0.37592950463294983, + "learning_rate": 4.85670880251191e-05, + "loss": 0.3695, + "step": 2117500 + }, + { + "epoch": 14.332503248159377, + "grad_norm": 0.32281264662742615, + "learning_rate": 4.856674967518406e-05, + "loss": 0.3694, + "step": 2118000 + }, + { + "epoch": 14.335886747509745, + "grad_norm": 0.34113267064094543, + "learning_rate": 4.856641132524903e-05, + "loss": 0.3694, + "step": 2118500 + }, + { + "epoch": 14.339270246860112, + "grad_norm": 0.33674532175064087, + "learning_rate": 4.856607297531399e-05, + "loss": 0.3701, + "step": 2119000 + }, + { + "epoch": 14.34265374621048, + "grad_norm": 0.35087233781814575, + "learning_rate": 4.8565734625378955e-05, + "loss": 0.3689, + "step": 2119500 + }, + { + "epoch": 14.346037245560849, + "grad_norm": 0.3552713096141815, + "learning_rate": 4.856539627544392e-05, + "loss": 0.3694, + "step": 2120000 + }, + { + "epoch": 14.349420744911217, + "grad_norm": 0.3678264617919922, + "learning_rate": 4.8565057925508886e-05, + "loss": 0.3702, + "step": 2120500 + }, + { + "epoch": 14.352804244261584, + "grad_norm": 0.35346418619155884, + "learning_rate": 4.856471957557384e-05, + "loss": 0.3673, + "step": 2121000 + }, + { + "epoch": 14.356187743611953, + "grad_norm": 0.38516688346862793, + "learning_rate": 4.85643812256388e-05, + "loss": 0.369, + "step": 2121500 + }, + { + "epoch": 14.359571242962321, + "grad_norm": 0.3606177568435669, + "learning_rate": 4.856404287570377e-05, + "loss": 0.3725, + "step": 2122000 + }, + { + "epoch": 14.36295474231269, + "grad_norm": 0.3833155333995819, + "learning_rate": 4.8563704525768734e-05, + "loss": 0.3685, + "step": 2122500 + }, + { + "epoch": 14.366338241663058, + "grad_norm": 0.3655698001384735, + "learning_rate": 4.8563366175833696e-05, + "loss": 0.3677, + "step": 2123000 + }, + { + "epoch": 14.369721741013425, + "grad_norm": 0.4088969826698303, + "learning_rate": 4.856302782589866e-05, + "loss": 0.3699, + "step": 2123500 + }, + { + "epoch": 14.373105240363794, + "grad_norm": 0.34506741166114807, + "learning_rate": 4.856268947596363e-05, + "loss": 0.3679, + "step": 2124000 + }, + { + "epoch": 14.376488739714162, + "grad_norm": 0.32661932706832886, + "learning_rate": 4.856235112602859e-05, + "loss": 0.3687, + "step": 2124500 + }, + { + "epoch": 14.37987223906453, + "grad_norm": 0.367184042930603, + "learning_rate": 4.856201277609355e-05, + "loss": 0.3697, + "step": 2125000 + }, + { + "epoch": 14.3832557384149, + "grad_norm": 0.39475658535957336, + "learning_rate": 4.856167442615851e-05, + "loss": 0.369, + "step": 2125500 + }, + { + "epoch": 14.386639237765266, + "grad_norm": 0.38947799801826477, + "learning_rate": 4.8561336076223476e-05, + "loss": 0.3698, + "step": 2126000 + }, + { + "epoch": 14.390022737115634, + "grad_norm": 0.403063029050827, + "learning_rate": 4.856099772628844e-05, + "loss": 0.37, + "step": 2126500 + }, + { + "epoch": 14.393406236466003, + "grad_norm": 0.39779502153396606, + "learning_rate": 4.85606593763534e-05, + "loss": 0.3689, + "step": 2127000 + }, + { + "epoch": 14.396789735816371, + "grad_norm": 0.37461674213409424, + "learning_rate": 4.856032102641836e-05, + "loss": 0.3718, + "step": 2127500 + }, + { + "epoch": 14.400173235166738, + "grad_norm": 0.38416680693626404, + "learning_rate": 4.855998267648333e-05, + "loss": 0.3691, + "step": 2128000 + }, + { + "epoch": 14.403556734517107, + "grad_norm": 0.3756099343299866, + "learning_rate": 4.855964432654829e-05, + "loss": 0.3695, + "step": 2128500 + }, + { + "epoch": 14.406940233867475, + "grad_norm": 0.3770340085029602, + "learning_rate": 4.8559305976613255e-05, + "loss": 0.3689, + "step": 2129000 + }, + { + "epoch": 14.410323733217844, + "grad_norm": 0.3530448079109192, + "learning_rate": 4.855896762667822e-05, + "loss": 0.3697, + "step": 2129500 + }, + { + "epoch": 14.41370723256821, + "grad_norm": 0.37978580594062805, + "learning_rate": 4.8558629276743186e-05, + "loss": 0.3691, + "step": 2130000 + }, + { + "epoch": 14.417090731918579, + "grad_norm": 0.3957188129425049, + "learning_rate": 4.855829092680814e-05, + "loss": 0.3682, + "step": 2130500 + }, + { + "epoch": 14.420474231268948, + "grad_norm": 0.3946071267127991, + "learning_rate": 4.8557952576873104e-05, + "loss": 0.3695, + "step": 2131000 + }, + { + "epoch": 14.423857730619316, + "grad_norm": 0.3873770236968994, + "learning_rate": 4.855761422693807e-05, + "loss": 0.3687, + "step": 2131500 + }, + { + "epoch": 14.427241229969685, + "grad_norm": 0.3992931544780731, + "learning_rate": 4.8557275877003035e-05, + "loss": 0.3689, + "step": 2132000 + }, + { + "epoch": 14.430624729320051, + "grad_norm": 0.37176117300987244, + "learning_rate": 4.8556937527068e-05, + "loss": 0.3694, + "step": 2132500 + }, + { + "epoch": 14.43400822867042, + "grad_norm": 0.3461103141307831, + "learning_rate": 4.855659917713296e-05, + "loss": 0.3699, + "step": 2133000 + }, + { + "epoch": 14.437391728020788, + "grad_norm": 0.34357118606567383, + "learning_rate": 4.855626082719793e-05, + "loss": 0.3695, + "step": 2133500 + }, + { + "epoch": 14.440775227371157, + "grad_norm": 0.3594299256801605, + "learning_rate": 4.855592247726289e-05, + "loss": 0.3684, + "step": 2134000 + }, + { + "epoch": 14.444158726721524, + "grad_norm": 0.3565590977668762, + "learning_rate": 4.855558412732785e-05, + "loss": 0.37, + "step": 2134500 + }, + { + "epoch": 14.447542226071892, + "grad_norm": 0.3601624667644501, + "learning_rate": 4.855524577739281e-05, + "loss": 0.3694, + "step": 2135000 + }, + { + "epoch": 14.45092572542226, + "grad_norm": 0.36201897263526917, + "learning_rate": 4.8554907427457776e-05, + "loss": 0.3694, + "step": 2135500 + }, + { + "epoch": 14.45430922477263, + "grad_norm": 0.3696894645690918, + "learning_rate": 4.855456907752274e-05, + "loss": 0.3701, + "step": 2136000 + }, + { + "epoch": 14.457692724122998, + "grad_norm": 0.4050441086292267, + "learning_rate": 4.85542307275877e-05, + "loss": 0.3707, + "step": 2136500 + }, + { + "epoch": 14.461076223473365, + "grad_norm": 0.3355098068714142, + "learning_rate": 4.855389237765266e-05, + "loss": 0.3688, + "step": 2137000 + }, + { + "epoch": 14.464459722823733, + "grad_norm": 0.40232527256011963, + "learning_rate": 4.855355402771763e-05, + "loss": 0.3694, + "step": 2137500 + }, + { + "epoch": 14.467843222174102, + "grad_norm": 0.368574321269989, + "learning_rate": 4.8553215677782594e-05, + "loss": 0.3697, + "step": 2138000 + }, + { + "epoch": 14.47122672152447, + "grad_norm": 0.3811182677745819, + "learning_rate": 4.8552877327847556e-05, + "loss": 0.3685, + "step": 2138500 + }, + { + "epoch": 14.474610220874837, + "grad_norm": 0.39627721905708313, + "learning_rate": 4.855253897791252e-05, + "loss": 0.3691, + "step": 2139000 + }, + { + "epoch": 14.477993720225205, + "grad_norm": 0.34768345952033997, + "learning_rate": 4.855220062797749e-05, + "loss": 0.3691, + "step": 2139500 + }, + { + "epoch": 14.481377219575574, + "grad_norm": 0.38759613037109375, + "learning_rate": 4.855186227804244e-05, + "loss": 0.37, + "step": 2140000 + }, + { + "epoch": 14.484760718925942, + "grad_norm": 0.38205355405807495, + "learning_rate": 4.8551523928107404e-05, + "loss": 0.3697, + "step": 2140500 + }, + { + "epoch": 14.488144218276311, + "grad_norm": 0.3666626513004303, + "learning_rate": 4.855118557817237e-05, + "loss": 0.3668, + "step": 2141000 + }, + { + "epoch": 14.491527717626678, + "grad_norm": 0.3698650598526001, + "learning_rate": 4.8550847228237335e-05, + "loss": 0.3702, + "step": 2141500 + }, + { + "epoch": 14.494911216977046, + "grad_norm": 0.3574991524219513, + "learning_rate": 4.85505088783023e-05, + "loss": 0.3703, + "step": 2142000 + }, + { + "epoch": 14.498294716327415, + "grad_norm": 0.39458853006362915, + "learning_rate": 4.855017052836726e-05, + "loss": 0.3687, + "step": 2142500 + }, + { + "epoch": 14.501678215677783, + "grad_norm": 0.3711014986038208, + "learning_rate": 4.854983217843223e-05, + "loss": 0.3695, + "step": 2143000 + }, + { + "epoch": 14.50506171502815, + "grad_norm": 0.3292671740055084, + "learning_rate": 4.854949382849719e-05, + "loss": 0.3676, + "step": 2143500 + }, + { + "epoch": 14.508445214378519, + "grad_norm": 0.35296839475631714, + "learning_rate": 4.854915547856215e-05, + "loss": 0.368, + "step": 2144000 + }, + { + "epoch": 14.511828713728887, + "grad_norm": 0.37003955245018005, + "learning_rate": 4.854881712862711e-05, + "loss": 0.3692, + "step": 2144500 + }, + { + "epoch": 14.515212213079256, + "grad_norm": 0.4041885733604431, + "learning_rate": 4.854847877869208e-05, + "loss": 0.3697, + "step": 2145000 + }, + { + "epoch": 14.518595712429622, + "grad_norm": 0.3833051919937134, + "learning_rate": 4.854814042875704e-05, + "loss": 0.3674, + "step": 2145500 + }, + { + "epoch": 14.52197921177999, + "grad_norm": 0.3829428553581238, + "learning_rate": 4.8547802078822e-05, + "loss": 0.3701, + "step": 2146000 + }, + { + "epoch": 14.52536271113036, + "grad_norm": 0.3993237018585205, + "learning_rate": 4.854746372888696e-05, + "loss": 0.3693, + "step": 2146500 + }, + { + "epoch": 14.528746210480728, + "grad_norm": 0.36786895990371704, + "learning_rate": 4.854712537895193e-05, + "loss": 0.3695, + "step": 2147000 + }, + { + "epoch": 14.532129709831096, + "grad_norm": 0.34661757946014404, + "learning_rate": 4.8546787029016894e-05, + "loss": 0.3703, + "step": 2147500 + }, + { + "epoch": 14.535513209181463, + "grad_norm": 0.40853285789489746, + "learning_rate": 4.8546448679081856e-05, + "loss": 0.3699, + "step": 2148000 + }, + { + "epoch": 14.538896708531832, + "grad_norm": 0.3633783757686615, + "learning_rate": 4.854611032914682e-05, + "loss": 0.3705, + "step": 2148500 + }, + { + "epoch": 14.5422802078822, + "grad_norm": 0.3499980866909027, + "learning_rate": 4.854577197921179e-05, + "loss": 0.3685, + "step": 2149000 + }, + { + "epoch": 14.545663707232569, + "grad_norm": 0.3772374987602234, + "learning_rate": 4.854543362927674e-05, + "loss": 0.3696, + "step": 2149500 + }, + { + "epoch": 14.549047206582937, + "grad_norm": 0.3617144227027893, + "learning_rate": 4.8545095279341705e-05, + "loss": 0.37, + "step": 2150000 + }, + { + "epoch": 14.552430705933304, + "grad_norm": 0.3383565843105316, + "learning_rate": 4.8544756929406674e-05, + "loss": 0.3699, + "step": 2150500 + }, + { + "epoch": 14.555814205283673, + "grad_norm": 0.3805508017539978, + "learning_rate": 4.8544418579471636e-05, + "loss": 0.3696, + "step": 2151000 + }, + { + "epoch": 14.559197704634041, + "grad_norm": 0.39522701501846313, + "learning_rate": 4.85440802295366e-05, + "loss": 0.3694, + "step": 2151500 + }, + { + "epoch": 14.56258120398441, + "grad_norm": 0.39664092659950256, + "learning_rate": 4.854374187960156e-05, + "loss": 0.3666, + "step": 2152000 + }, + { + "epoch": 14.565964703334776, + "grad_norm": 0.3451291620731354, + "learning_rate": 4.854340352966653e-05, + "loss": 0.37, + "step": 2152500 + }, + { + "epoch": 14.569348202685145, + "grad_norm": 0.34200799465179443, + "learning_rate": 4.854306517973149e-05, + "loss": 0.3686, + "step": 2153000 + }, + { + "epoch": 14.572731702035513, + "grad_norm": 0.3482162356376648, + "learning_rate": 4.854272682979645e-05, + "loss": 0.3715, + "step": 2153500 + }, + { + "epoch": 14.576115201385882, + "grad_norm": 0.33394691348075867, + "learning_rate": 4.854238847986141e-05, + "loss": 0.3691, + "step": 2154000 + }, + { + "epoch": 14.579498700736249, + "grad_norm": 0.37948983907699585, + "learning_rate": 4.854205012992638e-05, + "loss": 0.3678, + "step": 2154500 + }, + { + "epoch": 14.582882200086617, + "grad_norm": 0.3694993257522583, + "learning_rate": 4.854171177999134e-05, + "loss": 0.3688, + "step": 2155000 + }, + { + "epoch": 14.586265699436986, + "grad_norm": 0.3592107594013214, + "learning_rate": 4.85413734300563e-05, + "loss": 0.3685, + "step": 2155500 + }, + { + "epoch": 14.589649198787354, + "grad_norm": 0.3389520049095154, + "learning_rate": 4.8541035080121264e-05, + "loss": 0.3698, + "step": 2156000 + }, + { + "epoch": 14.593032698137723, + "grad_norm": 0.3517674505710602, + "learning_rate": 4.854069673018623e-05, + "loss": 0.3683, + "step": 2156500 + }, + { + "epoch": 14.59641619748809, + "grad_norm": 0.32301872968673706, + "learning_rate": 4.8540358380251195e-05, + "loss": 0.3665, + "step": 2157000 + }, + { + "epoch": 14.599799696838458, + "grad_norm": 0.3524543046951294, + "learning_rate": 4.854002003031616e-05, + "loss": 0.3715, + "step": 2157500 + }, + { + "epoch": 14.603183196188827, + "grad_norm": 0.34795093536376953, + "learning_rate": 4.853968168038112e-05, + "loss": 0.3682, + "step": 2158000 + }, + { + "epoch": 14.606566695539195, + "grad_norm": 0.3424944281578064, + "learning_rate": 4.853934333044609e-05, + "loss": 0.3695, + "step": 2158500 + }, + { + "epoch": 14.609950194889562, + "grad_norm": 0.36514824628829956, + "learning_rate": 4.8539004980511043e-05, + "loss": 0.3689, + "step": 2159000 + }, + { + "epoch": 14.61333369423993, + "grad_norm": 0.3638770878314972, + "learning_rate": 4.8538666630576006e-05, + "loss": 0.3693, + "step": 2159500 + }, + { + "epoch": 14.616717193590299, + "grad_norm": 0.3456403911113739, + "learning_rate": 4.8538328280640974e-05, + "loss": 0.3702, + "step": 2160000 + }, + { + "epoch": 14.620100692940667, + "grad_norm": 0.3254307806491852, + "learning_rate": 4.8537989930705937e-05, + "loss": 0.3693, + "step": 2160500 + }, + { + "epoch": 14.623484192291036, + "grad_norm": 0.3652152121067047, + "learning_rate": 4.85376515807709e-05, + "loss": 0.3688, + "step": 2161000 + }, + { + "epoch": 14.626867691641403, + "grad_norm": 0.36455264687538147, + "learning_rate": 4.853731323083586e-05, + "loss": 0.369, + "step": 2161500 + }, + { + "epoch": 14.630251190991771, + "grad_norm": 0.37666502594947815, + "learning_rate": 4.853697488090082e-05, + "loss": 0.3691, + "step": 2162000 + }, + { + "epoch": 14.63363469034214, + "grad_norm": 0.355785071849823, + "learning_rate": 4.853663653096579e-05, + "loss": 0.3701, + "step": 2162500 + }, + { + "epoch": 14.637018189692508, + "grad_norm": 0.3691225051879883, + "learning_rate": 4.8536298181030754e-05, + "loss": 0.3695, + "step": 2163000 + }, + { + "epoch": 14.640401689042875, + "grad_norm": 0.3322974145412445, + "learning_rate": 4.853595983109571e-05, + "loss": 0.3691, + "step": 2163500 + }, + { + "epoch": 14.643785188393244, + "grad_norm": 0.3645572066307068, + "learning_rate": 4.853562148116068e-05, + "loss": 0.3697, + "step": 2164000 + }, + { + "epoch": 14.647168687743612, + "grad_norm": 0.3978794515132904, + "learning_rate": 4.853528313122564e-05, + "loss": 0.3689, + "step": 2164500 + }, + { + "epoch": 14.65055218709398, + "grad_norm": 0.33656492829322815, + "learning_rate": 4.85349447812906e-05, + "loss": 0.3694, + "step": 2165000 + }, + { + "epoch": 14.653935686444349, + "grad_norm": 0.326246052980423, + "learning_rate": 4.8534606431355565e-05, + "loss": 0.3688, + "step": 2165500 + }, + { + "epoch": 14.657319185794716, + "grad_norm": 0.3709580898284912, + "learning_rate": 4.8534268081420533e-05, + "loss": 0.369, + "step": 2166000 + }, + { + "epoch": 14.660702685145084, + "grad_norm": 0.39139363169670105, + "learning_rate": 4.8533929731485496e-05, + "loss": 0.3675, + "step": 2166500 + }, + { + "epoch": 14.664086184495453, + "grad_norm": 0.3772076964378357, + "learning_rate": 4.853359138155046e-05, + "loss": 0.3682, + "step": 2167000 + }, + { + "epoch": 14.667469683845821, + "grad_norm": 0.33439138531684875, + "learning_rate": 4.853325303161542e-05, + "loss": 0.3686, + "step": 2167500 + }, + { + "epoch": 14.670853183196188, + "grad_norm": 0.403716504573822, + "learning_rate": 4.853291468168039e-05, + "loss": 0.3683, + "step": 2168000 + }, + { + "epoch": 14.674236682546557, + "grad_norm": 0.4087928533554077, + "learning_rate": 4.8532576331745344e-05, + "loss": 0.3701, + "step": 2168500 + }, + { + "epoch": 14.677620181896925, + "grad_norm": 0.3306918442249298, + "learning_rate": 4.8532237981810306e-05, + "loss": 0.3677, + "step": 2169000 + }, + { + "epoch": 14.681003681247294, + "grad_norm": 0.35591837763786316, + "learning_rate": 4.853189963187527e-05, + "loss": 0.3686, + "step": 2169500 + }, + { + "epoch": 14.68438718059766, + "grad_norm": 0.35075968503952026, + "learning_rate": 4.853156128194024e-05, + "loss": 0.3688, + "step": 2170000 + }, + { + "epoch": 14.687770679948029, + "grad_norm": 0.35520946979522705, + "learning_rate": 4.85312229320052e-05, + "loss": 0.3693, + "step": 2170500 + }, + { + "epoch": 14.691154179298398, + "grad_norm": 0.3742755353450775, + "learning_rate": 4.853088458207016e-05, + "loss": 0.3685, + "step": 2171000 + }, + { + "epoch": 14.694537678648766, + "grad_norm": 0.3788643181324005, + "learning_rate": 4.8530546232135124e-05, + "loss": 0.3689, + "step": 2171500 + }, + { + "epoch": 14.697921177999135, + "grad_norm": 0.3427134156227112, + "learning_rate": 4.853020788220009e-05, + "loss": 0.3694, + "step": 2172000 + }, + { + "epoch": 14.701304677349501, + "grad_norm": 0.38204559683799744, + "learning_rate": 4.8529869532265055e-05, + "loss": 0.3698, + "step": 2172500 + }, + { + "epoch": 14.70468817669987, + "grad_norm": 0.38196712732315063, + "learning_rate": 4.852953118233002e-05, + "loss": 0.3705, + "step": 2173000 + }, + { + "epoch": 14.708071676050238, + "grad_norm": 0.3562172055244446, + "learning_rate": 4.852919283239498e-05, + "loss": 0.3688, + "step": 2173500 + }, + { + "epoch": 14.711455175400607, + "grad_norm": 0.37580806016921997, + "learning_rate": 4.852885448245994e-05, + "loss": 0.3687, + "step": 2174000 + }, + { + "epoch": 14.714838674750974, + "grad_norm": 0.30255070328712463, + "learning_rate": 4.85285161325249e-05, + "loss": 0.3678, + "step": 2174500 + }, + { + "epoch": 14.718222174101342, + "grad_norm": 0.37820175290107727, + "learning_rate": 4.8528177782589865e-05, + "loss": 0.3677, + "step": 2175000 + }, + { + "epoch": 14.72160567345171, + "grad_norm": 0.3466613292694092, + "learning_rate": 4.8527839432654834e-05, + "loss": 0.3693, + "step": 2175500 + }, + { + "epoch": 14.72498917280208, + "grad_norm": 0.377681702375412, + "learning_rate": 4.8527501082719796e-05, + "loss": 0.3687, + "step": 2176000 + }, + { + "epoch": 14.728372672152448, + "grad_norm": 0.3577273488044739, + "learning_rate": 4.852716273278476e-05, + "loss": 0.3701, + "step": 2176500 + }, + { + "epoch": 14.731756171502814, + "grad_norm": 0.3537616729736328, + "learning_rate": 4.852682438284972e-05, + "loss": 0.3699, + "step": 2177000 + }, + { + "epoch": 14.735139670853183, + "grad_norm": 0.3725182116031647, + "learning_rate": 4.852648603291469e-05, + "loss": 0.3699, + "step": 2177500 + }, + { + "epoch": 14.738523170203552, + "grad_norm": 0.3579446077346802, + "learning_rate": 4.8526147682979645e-05, + "loss": 0.368, + "step": 2178000 + }, + { + "epoch": 14.74190666955392, + "grad_norm": 0.35082072019577026, + "learning_rate": 4.852580933304461e-05, + "loss": 0.3694, + "step": 2178500 + }, + { + "epoch": 14.745290168904287, + "grad_norm": 0.38058724999427795, + "learning_rate": 4.852547098310957e-05, + "loss": 0.3704, + "step": 2179000 + }, + { + "epoch": 14.748673668254655, + "grad_norm": 0.35093605518341064, + "learning_rate": 4.852513263317454e-05, + "loss": 0.3703, + "step": 2179500 + }, + { + "epoch": 14.752057167605024, + "grad_norm": 0.37360599637031555, + "learning_rate": 4.85247942832395e-05, + "loss": 0.3681, + "step": 2180000 + }, + { + "epoch": 14.755440666955392, + "grad_norm": 0.382112592458725, + "learning_rate": 4.852445593330446e-05, + "loss": 0.3702, + "step": 2180500 + }, + { + "epoch": 14.75882416630576, + "grad_norm": 0.3928546905517578, + "learning_rate": 4.8524117583369424e-05, + "loss": 0.3705, + "step": 2181000 + }, + { + "epoch": 14.762207665656128, + "grad_norm": 0.3380775451660156, + "learning_rate": 4.852377923343439e-05, + "loss": 0.3692, + "step": 2181500 + }, + { + "epoch": 14.765591165006496, + "grad_norm": 0.3672342002391815, + "learning_rate": 4.8523440883499355e-05, + "loss": 0.3718, + "step": 2182000 + }, + { + "epoch": 14.768974664356865, + "grad_norm": 0.3561403751373291, + "learning_rate": 4.852310253356432e-05, + "loss": 0.3681, + "step": 2182500 + }, + { + "epoch": 14.772358163707233, + "grad_norm": 0.36243200302124023, + "learning_rate": 4.852276418362928e-05, + "loss": 0.369, + "step": 2183000 + }, + { + "epoch": 14.7757416630576, + "grad_norm": 0.39097335934638977, + "learning_rate": 4.852242583369424e-05, + "loss": 0.3683, + "step": 2183500 + }, + { + "epoch": 14.779125162407968, + "grad_norm": 0.36693957448005676, + "learning_rate": 4.8522087483759204e-05, + "loss": 0.3694, + "step": 2184000 + }, + { + "epoch": 14.782508661758337, + "grad_norm": 0.317962646484375, + "learning_rate": 4.8521749133824166e-05, + "loss": 0.3699, + "step": 2184500 + }, + { + "epoch": 14.785892161108706, + "grad_norm": 0.3652923107147217, + "learning_rate": 4.8521410783889135e-05, + "loss": 0.3696, + "step": 2185000 + }, + { + "epoch": 14.789275660459072, + "grad_norm": 0.3356926739215851, + "learning_rate": 4.85210724339541e-05, + "loss": 0.3707, + "step": 2185500 + }, + { + "epoch": 14.79265915980944, + "grad_norm": 0.3495100438594818, + "learning_rate": 4.852073408401906e-05, + "loss": 0.3709, + "step": 2186000 + }, + { + "epoch": 14.79604265915981, + "grad_norm": 0.3659546673297882, + "learning_rate": 4.852039573408402e-05, + "loss": 0.3698, + "step": 2186500 + }, + { + "epoch": 14.799426158510178, + "grad_norm": 0.40927523374557495, + "learning_rate": 4.852005738414899e-05, + "loss": 0.3686, + "step": 2187000 + }, + { + "epoch": 14.802809657860546, + "grad_norm": 0.3406883180141449, + "learning_rate": 4.8519719034213945e-05, + "loss": 0.3694, + "step": 2187500 + }, + { + "epoch": 14.806193157210913, + "grad_norm": 0.3734167516231537, + "learning_rate": 4.851938068427891e-05, + "loss": 0.3691, + "step": 2188000 + }, + { + "epoch": 14.809576656561282, + "grad_norm": 0.38106903433799744, + "learning_rate": 4.851904233434387e-05, + "loss": 0.3685, + "step": 2188500 + }, + { + "epoch": 14.81296015591165, + "grad_norm": 0.3433600068092346, + "learning_rate": 4.851870398440884e-05, + "loss": 0.3691, + "step": 2189000 + }, + { + "epoch": 14.816343655262019, + "grad_norm": 0.379891961812973, + "learning_rate": 4.85183656344738e-05, + "loss": 0.3676, + "step": 2189500 + }, + { + "epoch": 14.819727154612387, + "grad_norm": 0.3658895790576935, + "learning_rate": 4.851802728453876e-05, + "loss": 0.3674, + "step": 2190000 + }, + { + "epoch": 14.823110653962754, + "grad_norm": 0.3627364933490753, + "learning_rate": 4.8517688934603725e-05, + "loss": 0.3703, + "step": 2190500 + }, + { + "epoch": 14.826494153313122, + "grad_norm": 0.3883538842201233, + "learning_rate": 4.8517350584668694e-05, + "loss": 0.3684, + "step": 2191000 + }, + { + "epoch": 14.829877652663491, + "grad_norm": 0.3807075619697571, + "learning_rate": 4.8517012234733656e-05, + "loss": 0.37, + "step": 2191500 + }, + { + "epoch": 14.83326115201386, + "grad_norm": 0.3800852298736572, + "learning_rate": 4.851667388479862e-05, + "loss": 0.3701, + "step": 2192000 + }, + { + "epoch": 14.836644651364226, + "grad_norm": 0.365222305059433, + "learning_rate": 4.851633553486358e-05, + "loss": 0.3688, + "step": 2192500 + }, + { + "epoch": 14.840028150714595, + "grad_norm": 0.3683980405330658, + "learning_rate": 4.851599718492854e-05, + "loss": 0.3695, + "step": 2193000 + }, + { + "epoch": 14.843411650064963, + "grad_norm": 0.3718588054180145, + "learning_rate": 4.8515658834993504e-05, + "loss": 0.3703, + "step": 2193500 + }, + { + "epoch": 14.846795149415332, + "grad_norm": 0.36069953441619873, + "learning_rate": 4.8515320485058466e-05, + "loss": 0.3709, + "step": 2194000 + }, + { + "epoch": 14.850178648765699, + "grad_norm": 0.35505011677742004, + "learning_rate": 4.8514982135123435e-05, + "loss": 0.3698, + "step": 2194500 + }, + { + "epoch": 14.853562148116067, + "grad_norm": 0.37225234508514404, + "learning_rate": 4.85146437851884e-05, + "loss": 0.3695, + "step": 2195000 + }, + { + "epoch": 14.856945647466436, + "grad_norm": 0.43268582224845886, + "learning_rate": 4.851430543525336e-05, + "loss": 0.3705, + "step": 2195500 + }, + { + "epoch": 14.860329146816804, + "grad_norm": 0.33920058608055115, + "learning_rate": 4.851396708531832e-05, + "loss": 0.369, + "step": 2196000 + }, + { + "epoch": 14.863712646167173, + "grad_norm": 0.3335217833518982, + "learning_rate": 4.851362873538329e-05, + "loss": 0.3711, + "step": 2196500 + }, + { + "epoch": 14.86709614551754, + "grad_norm": 0.3459427058696747, + "learning_rate": 4.8513290385448246e-05, + "loss": 0.371, + "step": 2197000 + }, + { + "epoch": 14.870479644867908, + "grad_norm": 0.3620784878730774, + "learning_rate": 4.851295203551321e-05, + "loss": 0.3695, + "step": 2197500 + }, + { + "epoch": 14.873863144218276, + "grad_norm": 0.3312499523162842, + "learning_rate": 4.851261368557817e-05, + "loss": 0.3693, + "step": 2198000 + }, + { + "epoch": 14.877246643568645, + "grad_norm": 0.3384900689125061, + "learning_rate": 4.851227533564314e-05, + "loss": 0.3699, + "step": 2198500 + }, + { + "epoch": 14.880630142919012, + "grad_norm": 0.35046958923339844, + "learning_rate": 4.85119369857081e-05, + "loss": 0.3683, + "step": 2199000 + }, + { + "epoch": 14.88401364226938, + "grad_norm": 0.39812368154525757, + "learning_rate": 4.851159863577306e-05, + "loss": 0.3698, + "step": 2199500 + }, + { + "epoch": 14.887397141619749, + "grad_norm": 0.40305712819099426, + "learning_rate": 4.8511260285838025e-05, + "loss": 0.3705, + "step": 2200000 + }, + { + "epoch": 14.890780640970117, + "grad_norm": 0.341285765171051, + "learning_rate": 4.8510921935902994e-05, + "loss": 0.3688, + "step": 2200500 + }, + { + "epoch": 14.894164140320486, + "grad_norm": 0.39060279726982117, + "learning_rate": 4.8510583585967956e-05, + "loss": 0.3692, + "step": 2201000 + }, + { + "epoch": 14.897547639670853, + "grad_norm": 0.3980751633644104, + "learning_rate": 4.851024523603292e-05, + "loss": 0.3688, + "step": 2201500 + }, + { + "epoch": 14.900931139021221, + "grad_norm": 0.30645591020584106, + "learning_rate": 4.850990688609788e-05, + "loss": 0.3682, + "step": 2202000 + }, + { + "epoch": 14.90431463837159, + "grad_norm": 0.4017775356769562, + "learning_rate": 4.850956853616284e-05, + "loss": 0.3696, + "step": 2202500 + }, + { + "epoch": 14.907698137721958, + "grad_norm": 0.35003912448883057, + "learning_rate": 4.8509230186227805e-05, + "loss": 0.3688, + "step": 2203000 + }, + { + "epoch": 14.911081637072325, + "grad_norm": 0.34675323963165283, + "learning_rate": 4.850889183629277e-05, + "loss": 0.3691, + "step": 2203500 + }, + { + "epoch": 14.914465136422693, + "grad_norm": 0.42045390605926514, + "learning_rate": 4.8508553486357736e-05, + "loss": 0.3698, + "step": 2204000 + }, + { + "epoch": 14.917848635773062, + "grad_norm": 0.37545245885849, + "learning_rate": 4.85082151364227e-05, + "loss": 0.3705, + "step": 2204500 + }, + { + "epoch": 14.92123213512343, + "grad_norm": 0.3739573657512665, + "learning_rate": 4.850787678648766e-05, + "loss": 0.3689, + "step": 2205000 + }, + { + "epoch": 14.924615634473799, + "grad_norm": 0.35323527455329895, + "learning_rate": 4.850753843655262e-05, + "loss": 0.3689, + "step": 2205500 + }, + { + "epoch": 14.927999133824166, + "grad_norm": 0.362337589263916, + "learning_rate": 4.850720008661759e-05, + "loss": 0.3683, + "step": 2206000 + }, + { + "epoch": 14.931382633174534, + "grad_norm": 0.3333463668823242, + "learning_rate": 4.8506861736682547e-05, + "loss": 0.3698, + "step": 2206500 + }, + { + "epoch": 14.934766132524903, + "grad_norm": 0.36068710684776306, + "learning_rate": 4.850652338674751e-05, + "loss": 0.369, + "step": 2207000 + }, + { + "epoch": 14.938149631875271, + "grad_norm": 0.34819650650024414, + "learning_rate": 4.850618503681247e-05, + "loss": 0.3692, + "step": 2207500 + }, + { + "epoch": 14.941533131225638, + "grad_norm": 0.3557802438735962, + "learning_rate": 4.850584668687744e-05, + "loss": 0.3693, + "step": 2208000 + }, + { + "epoch": 14.944916630576007, + "grad_norm": 0.40220189094543457, + "learning_rate": 4.85055083369424e-05, + "loss": 0.3678, + "step": 2208500 + }, + { + "epoch": 14.948300129926375, + "grad_norm": 0.3456258773803711, + "learning_rate": 4.8505169987007364e-05, + "loss": 0.3718, + "step": 2209000 + }, + { + "epoch": 14.951683629276744, + "grad_norm": 0.36534014344215393, + "learning_rate": 4.8504831637072326e-05, + "loss": 0.3703, + "step": 2209500 + }, + { + "epoch": 14.95506712862711, + "grad_norm": 0.3737540543079376, + "learning_rate": 4.8504493287137295e-05, + "loss": 0.3694, + "step": 2210000 + }, + { + "epoch": 14.958450627977479, + "grad_norm": 0.3467683494091034, + "learning_rate": 4.850415493720226e-05, + "loss": 0.3675, + "step": 2210500 + }, + { + "epoch": 14.961834127327847, + "grad_norm": 0.38319334387779236, + "learning_rate": 4.850381658726722e-05, + "loss": 0.3689, + "step": 2211000 + }, + { + "epoch": 14.965217626678216, + "grad_norm": 0.39453256130218506, + "learning_rate": 4.850347823733218e-05, + "loss": 0.3698, + "step": 2211500 + }, + { + "epoch": 14.968601126028585, + "grad_norm": 0.3527160882949829, + "learning_rate": 4.8503139887397143e-05, + "loss": 0.3685, + "step": 2212000 + }, + { + "epoch": 14.971984625378951, + "grad_norm": 0.3644430935382843, + "learning_rate": 4.8502801537462106e-05, + "loss": 0.3706, + "step": 2212500 + }, + { + "epoch": 14.97536812472932, + "grad_norm": 0.3877614736557007, + "learning_rate": 4.850246318752707e-05, + "loss": 0.3692, + "step": 2213000 + }, + { + "epoch": 14.978751624079688, + "grad_norm": 0.34349948167800903, + "learning_rate": 4.8502124837592037e-05, + "loss": 0.3691, + "step": 2213500 + }, + { + "epoch": 14.982135123430057, + "grad_norm": 0.3915456235408783, + "learning_rate": 4.8501786487657e-05, + "loss": 0.3691, + "step": 2214000 + }, + { + "epoch": 14.985518622780425, + "grad_norm": 0.41945990920066833, + "learning_rate": 4.850144813772196e-05, + "loss": 0.368, + "step": 2214500 + }, + { + "epoch": 14.988902122130792, + "grad_norm": 0.38330912590026855, + "learning_rate": 4.850110978778692e-05, + "loss": 0.3693, + "step": 2215000 + }, + { + "epoch": 14.99228562148116, + "grad_norm": 0.3266220986843109, + "learning_rate": 4.8500771437851885e-05, + "loss": 0.3691, + "step": 2215500 + }, + { + "epoch": 14.99566912083153, + "grad_norm": 0.34072598814964294, + "learning_rate": 4.850043308791685e-05, + "loss": 0.3679, + "step": 2216000 + }, + { + "epoch": 14.999052620181898, + "grad_norm": 0.36264902353286743, + "learning_rate": 4.850009473798181e-05, + "loss": 0.3702, + "step": 2216500 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.8592597939115394, + "eval_loss": 0.5718047022819519, + "eval_runtime": 3550.6807, + "eval_samples_per_second": 81.884, + "eval_steps_per_second": 5.118, + "step": 2216640 + }, + { + "epoch": 15.002436119532264, + "grad_norm": 0.3186267912387848, + "learning_rate": 4.849975638804677e-05, + "loss": 0.3676, + "step": 2217000 + }, + { + "epoch": 15.005819618882633, + "grad_norm": 0.3836493492126465, + "learning_rate": 4.849941803811174e-05, + "loss": 0.3672, + "step": 2217500 + }, + { + "epoch": 15.009203118233001, + "grad_norm": 0.36411088705062866, + "learning_rate": 4.84990796881767e-05, + "loss": 0.3662, + "step": 2218000 + }, + { + "epoch": 15.01258661758337, + "grad_norm": 0.3376358151435852, + "learning_rate": 4.8498741338241665e-05, + "loss": 0.3674, + "step": 2218500 + }, + { + "epoch": 15.015970116933737, + "grad_norm": 0.36777323484420776, + "learning_rate": 4.849840298830663e-05, + "loss": 0.3679, + "step": 2219000 + }, + { + "epoch": 15.019353616284105, + "grad_norm": 0.3862101435661316, + "learning_rate": 4.8498064638371596e-05, + "loss": 0.3659, + "step": 2219500 + }, + { + "epoch": 15.022737115634474, + "grad_norm": 0.38136187195777893, + "learning_rate": 4.849772628843656e-05, + "loss": 0.3667, + "step": 2220000 + }, + { + "epoch": 15.026120614984842, + "grad_norm": 0.34917306900024414, + "learning_rate": 4.849738793850152e-05, + "loss": 0.3655, + "step": 2220500 + }, + { + "epoch": 15.02950411433521, + "grad_norm": 0.343822717666626, + "learning_rate": 4.849704958856648e-05, + "loss": 0.366, + "step": 2221000 + }, + { + "epoch": 15.032887613685578, + "grad_norm": 0.3669893145561218, + "learning_rate": 4.8496711238631444e-05, + "loss": 0.3674, + "step": 2221500 + }, + { + "epoch": 15.036271113035946, + "grad_norm": 0.37427571415901184, + "learning_rate": 4.8496372888696406e-05, + "loss": 0.3667, + "step": 2222000 + }, + { + "epoch": 15.039654612386315, + "grad_norm": 0.36124661564826965, + "learning_rate": 4.849603453876137e-05, + "loss": 0.3676, + "step": 2222500 + }, + { + "epoch": 15.043038111736683, + "grad_norm": 0.32961371541023254, + "learning_rate": 4.849569618882634e-05, + "loss": 0.3681, + "step": 2223000 + }, + { + "epoch": 15.04642161108705, + "grad_norm": 0.3570033013820648, + "learning_rate": 4.84953578388913e-05, + "loss": 0.3676, + "step": 2223500 + }, + { + "epoch": 15.049805110437418, + "grad_norm": 0.3314495384693146, + "learning_rate": 4.849501948895626e-05, + "loss": 0.3683, + "step": 2224000 + }, + { + "epoch": 15.053188609787787, + "grad_norm": 0.3179497718811035, + "learning_rate": 4.8494681139021224e-05, + "loss": 0.3681, + "step": 2224500 + }, + { + "epoch": 15.056572109138155, + "grad_norm": 0.3659362494945526, + "learning_rate": 4.8494342789086186e-05, + "loss": 0.3683, + "step": 2225000 + }, + { + "epoch": 15.059955608488524, + "grad_norm": 0.36493274569511414, + "learning_rate": 4.8494004439151155e-05, + "loss": 0.366, + "step": 2225500 + }, + { + "epoch": 15.06333910783889, + "grad_norm": 0.35716429352760315, + "learning_rate": 4.849366608921611e-05, + "loss": 0.3668, + "step": 2226000 + }, + { + "epoch": 15.06672260718926, + "grad_norm": 0.35063108801841736, + "learning_rate": 4.849332773928107e-05, + "loss": 0.3685, + "step": 2226500 + }, + { + "epoch": 15.070106106539628, + "grad_norm": 0.3515456020832062, + "learning_rate": 4.849298938934604e-05, + "loss": 0.3674, + "step": 2227000 + }, + { + "epoch": 15.073489605889996, + "grad_norm": 0.34294208884239197, + "learning_rate": 4.8492651039411e-05, + "loss": 0.3673, + "step": 2227500 + }, + { + "epoch": 15.076873105240363, + "grad_norm": 0.3997062146663666, + "learning_rate": 4.8492312689475965e-05, + "loss": 0.3675, + "step": 2228000 + }, + { + "epoch": 15.080256604590732, + "grad_norm": 0.337519109249115, + "learning_rate": 4.849197433954093e-05, + "loss": 0.3657, + "step": 2228500 + }, + { + "epoch": 15.0836401039411, + "grad_norm": 0.3725280463695526, + "learning_rate": 4.8491635989605896e-05, + "loss": 0.3667, + "step": 2229000 + }, + { + "epoch": 15.087023603291469, + "grad_norm": 0.32875075936317444, + "learning_rate": 4.849129763967086e-05, + "loss": 0.3693, + "step": 2229500 + }, + { + "epoch": 15.090407102641837, + "grad_norm": 0.3951806128025055, + "learning_rate": 4.849095928973582e-05, + "loss": 0.3676, + "step": 2230000 + }, + { + "epoch": 15.093790601992204, + "grad_norm": 0.41113314032554626, + "learning_rate": 4.849062093980078e-05, + "loss": 0.368, + "step": 2230500 + }, + { + "epoch": 15.097174101342572, + "grad_norm": 0.36592134833335876, + "learning_rate": 4.8490282589865745e-05, + "loss": 0.3675, + "step": 2231000 + }, + { + "epoch": 15.100557600692941, + "grad_norm": 0.3612339198589325, + "learning_rate": 4.848994423993071e-05, + "loss": 0.3686, + "step": 2231500 + }, + { + "epoch": 15.10394110004331, + "grad_norm": 0.34407153725624084, + "learning_rate": 4.848960588999567e-05, + "loss": 0.3675, + "step": 2232000 + }, + { + "epoch": 15.107324599393676, + "grad_norm": 0.35251736640930176, + "learning_rate": 4.848926754006063e-05, + "loss": 0.3669, + "step": 2232500 + }, + { + "epoch": 15.110708098744045, + "grad_norm": 0.3643178939819336, + "learning_rate": 4.84889291901256e-05, + "loss": 0.369, + "step": 2233000 + }, + { + "epoch": 15.114091598094413, + "grad_norm": 0.38118237257003784, + "learning_rate": 4.848859084019056e-05, + "loss": 0.3677, + "step": 2233500 + }, + { + "epoch": 15.117475097444782, + "grad_norm": 0.40105199813842773, + "learning_rate": 4.8488252490255524e-05, + "loss": 0.3693, + "step": 2234000 + }, + { + "epoch": 15.120858596795149, + "grad_norm": 0.35876044631004333, + "learning_rate": 4.8487914140320486e-05, + "loss": 0.3681, + "step": 2234500 + }, + { + "epoch": 15.124242096145517, + "grad_norm": 0.3273409903049469, + "learning_rate": 4.8487575790385455e-05, + "loss": 0.3664, + "step": 2235000 + }, + { + "epoch": 15.127625595495886, + "grad_norm": 0.38285356760025024, + "learning_rate": 4.848723744045041e-05, + "loss": 0.3695, + "step": 2235500 + }, + { + "epoch": 15.131009094846254, + "grad_norm": 0.34514716267585754, + "learning_rate": 4.848689909051537e-05, + "loss": 0.3667, + "step": 2236000 + }, + { + "epoch": 15.134392594196623, + "grad_norm": 0.3504362106323242, + "learning_rate": 4.848656074058034e-05, + "loss": 0.3701, + "step": 2236500 + }, + { + "epoch": 15.13777609354699, + "grad_norm": 0.3744419813156128, + "learning_rate": 4.8486222390645304e-05, + "loss": 0.3685, + "step": 2237000 + }, + { + "epoch": 15.141159592897358, + "grad_norm": 0.3694138526916504, + "learning_rate": 4.8485884040710266e-05, + "loss": 0.3671, + "step": 2237500 + }, + { + "epoch": 15.144543092247726, + "grad_norm": 0.35547423362731934, + "learning_rate": 4.848554569077523e-05, + "loss": 0.3678, + "step": 2238000 + }, + { + "epoch": 15.147926591598095, + "grad_norm": 0.3246602416038513, + "learning_rate": 4.84852073408402e-05, + "loss": 0.3677, + "step": 2238500 + }, + { + "epoch": 15.151310090948462, + "grad_norm": 0.3738870918750763, + "learning_rate": 4.848486899090516e-05, + "loss": 0.3687, + "step": 2239000 + }, + { + "epoch": 15.15469359029883, + "grad_norm": 0.3235304057598114, + "learning_rate": 4.848453064097012e-05, + "loss": 0.368, + "step": 2239500 + }, + { + "epoch": 15.158077089649199, + "grad_norm": 0.37630218267440796, + "learning_rate": 4.848419229103508e-05, + "loss": 0.3673, + "step": 2240000 + }, + { + "epoch": 15.161460588999567, + "grad_norm": 0.3362449109554291, + "learning_rate": 4.8483853941100045e-05, + "loss": 0.3666, + "step": 2240500 + }, + { + "epoch": 15.164844088349936, + "grad_norm": 0.37820273637771606, + "learning_rate": 4.848351559116501e-05, + "loss": 0.3685, + "step": 2241000 + }, + { + "epoch": 15.168227587700303, + "grad_norm": 0.3779197931289673, + "learning_rate": 4.848317724122997e-05, + "loss": 0.3677, + "step": 2241500 + }, + { + "epoch": 15.171611087050671, + "grad_norm": 0.3281296491622925, + "learning_rate": 4.848283889129493e-05, + "loss": 0.3668, + "step": 2242000 + }, + { + "epoch": 15.17499458640104, + "grad_norm": 0.34983110427856445, + "learning_rate": 4.84825005413599e-05, + "loss": 0.369, + "step": 2242500 + }, + { + "epoch": 15.178378085751408, + "grad_norm": 0.37141284346580505, + "learning_rate": 4.848216219142486e-05, + "loss": 0.3682, + "step": 2243000 + }, + { + "epoch": 15.181761585101775, + "grad_norm": 0.36924445629119873, + "learning_rate": 4.8481823841489825e-05, + "loss": 0.367, + "step": 2243500 + }, + { + "epoch": 15.185145084452143, + "grad_norm": 0.36089447140693665, + "learning_rate": 4.848148549155479e-05, + "loss": 0.3688, + "step": 2244000 + }, + { + "epoch": 15.188528583802512, + "grad_norm": 0.3452288508415222, + "learning_rate": 4.8481147141619756e-05, + "loss": 0.3685, + "step": 2244500 + }, + { + "epoch": 15.19191208315288, + "grad_norm": 0.34450557827949524, + "learning_rate": 4.848080879168471e-05, + "loss": 0.3685, + "step": 2245000 + }, + { + "epoch": 15.195295582503249, + "grad_norm": 0.35631123185157776, + "learning_rate": 4.848047044174967e-05, + "loss": 0.3672, + "step": 2245500 + }, + { + "epoch": 15.198679081853616, + "grad_norm": 0.3319762349128723, + "learning_rate": 4.848013209181464e-05, + "loss": 0.3677, + "step": 2246000 + }, + { + "epoch": 15.202062581203984, + "grad_norm": 0.3290312588214874, + "learning_rate": 4.8479793741879604e-05, + "loss": 0.3689, + "step": 2246500 + }, + { + "epoch": 15.205446080554353, + "grad_norm": 0.3664303719997406, + "learning_rate": 4.8479455391944567e-05, + "loss": 0.3681, + "step": 2247000 + }, + { + "epoch": 15.208829579904721, + "grad_norm": 0.3830331563949585, + "learning_rate": 4.847911704200953e-05, + "loss": 0.3695, + "step": 2247500 + }, + { + "epoch": 15.212213079255088, + "grad_norm": 0.38021984696388245, + "learning_rate": 4.84787786920745e-05, + "loss": 0.3675, + "step": 2248000 + }, + { + "epoch": 15.215596578605457, + "grad_norm": 0.4113265872001648, + "learning_rate": 4.847844034213946e-05, + "loss": 0.3674, + "step": 2248500 + }, + { + "epoch": 15.218980077955825, + "grad_norm": 0.3703368306159973, + "learning_rate": 4.847810199220442e-05, + "loss": 0.3688, + "step": 2249000 + }, + { + "epoch": 15.222363577306194, + "grad_norm": 0.39109688997268677, + "learning_rate": 4.847776364226938e-05, + "loss": 0.3687, + "step": 2249500 + }, + { + "epoch": 15.225747076656562, + "grad_norm": 0.37408167123794556, + "learning_rate": 4.8477425292334346e-05, + "loss": 0.3686, + "step": 2250000 + }, + { + "epoch": 15.229130576006929, + "grad_norm": 0.35299205780029297, + "learning_rate": 4.847708694239931e-05, + "loss": 0.369, + "step": 2250500 + }, + { + "epoch": 15.232514075357297, + "grad_norm": 0.464773565530777, + "learning_rate": 4.847674859246427e-05, + "loss": 0.3689, + "step": 2251000 + }, + { + "epoch": 15.235897574707666, + "grad_norm": 0.3952706456184387, + "learning_rate": 4.847641024252923e-05, + "loss": 0.3669, + "step": 2251500 + }, + { + "epoch": 15.239281074058034, + "grad_norm": 0.3495103716850281, + "learning_rate": 4.84760718925942e-05, + "loss": 0.3677, + "step": 2252000 + }, + { + "epoch": 15.242664573408401, + "grad_norm": 0.3670581877231598, + "learning_rate": 4.8475733542659163e-05, + "loss": 0.3685, + "step": 2252500 + }, + { + "epoch": 15.24604807275877, + "grad_norm": 0.37361136078834534, + "learning_rate": 4.8475395192724126e-05, + "loss": 0.3697, + "step": 2253000 + }, + { + "epoch": 15.249431572109138, + "grad_norm": 0.34565627574920654, + "learning_rate": 4.847505684278909e-05, + "loss": 0.3691, + "step": 2253500 + }, + { + "epoch": 15.252815071459507, + "grad_norm": 0.34091848134994507, + "learning_rate": 4.8474718492854057e-05, + "loss": 0.3673, + "step": 2254000 + }, + { + "epoch": 15.256198570809875, + "grad_norm": 0.3778747320175171, + "learning_rate": 4.847438014291901e-05, + "loss": 0.3674, + "step": 2254500 + }, + { + "epoch": 15.259582070160242, + "grad_norm": 0.3678615689277649, + "learning_rate": 4.8474041792983974e-05, + "loss": 0.3682, + "step": 2255000 + }, + { + "epoch": 15.26296556951061, + "grad_norm": 0.3665476441383362, + "learning_rate": 4.847370344304894e-05, + "loss": 0.3689, + "step": 2255500 + }, + { + "epoch": 15.266349068860979, + "grad_norm": 0.3415987491607666, + "learning_rate": 4.8473365093113905e-05, + "loss": 0.3695, + "step": 2256000 + }, + { + "epoch": 15.269732568211348, + "grad_norm": 0.3851405084133148, + "learning_rate": 4.847302674317887e-05, + "loss": 0.3672, + "step": 2256500 + }, + { + "epoch": 15.273116067561714, + "grad_norm": 0.3553013801574707, + "learning_rate": 4.847268839324383e-05, + "loss": 0.3683, + "step": 2257000 + }, + { + "epoch": 15.276499566912083, + "grad_norm": 0.38508784770965576, + "learning_rate": 4.84723500433088e-05, + "loss": 0.3674, + "step": 2257500 + }, + { + "epoch": 15.279883066262451, + "grad_norm": 0.36417558789253235, + "learning_rate": 4.847201169337376e-05, + "loss": 0.3689, + "step": 2258000 + }, + { + "epoch": 15.28326656561282, + "grad_norm": 0.35827988386154175, + "learning_rate": 4.847167334343872e-05, + "loss": 0.367, + "step": 2258500 + }, + { + "epoch": 15.286650064963187, + "grad_norm": 0.36032170057296753, + "learning_rate": 4.847133499350368e-05, + "loss": 0.3675, + "step": 2259000 + }, + { + "epoch": 15.290033564313555, + "grad_norm": 0.3674727976322174, + "learning_rate": 4.847099664356865e-05, + "loss": 0.3676, + "step": 2259500 + }, + { + "epoch": 15.293417063663924, + "grad_norm": 0.358148455619812, + "learning_rate": 4.847065829363361e-05, + "loss": 0.3683, + "step": 2260000 + }, + { + "epoch": 15.296800563014292, + "grad_norm": 0.36358699202537537, + "learning_rate": 4.847031994369857e-05, + "loss": 0.3677, + "step": 2260500 + }, + { + "epoch": 15.30018406236466, + "grad_norm": 0.4036157429218292, + "learning_rate": 4.846998159376353e-05, + "loss": 0.3679, + "step": 2261000 + }, + { + "epoch": 15.303567561715028, + "grad_norm": 0.38097384572029114, + "learning_rate": 4.84696432438285e-05, + "loss": 0.3687, + "step": 2261500 + }, + { + "epoch": 15.306951061065396, + "grad_norm": 0.4017247259616852, + "learning_rate": 4.8469304893893464e-05, + "loss": 0.3683, + "step": 2262000 + }, + { + "epoch": 15.310334560415765, + "grad_norm": 0.36650019884109497, + "learning_rate": 4.8468966543958426e-05, + "loss": 0.3679, + "step": 2262500 + }, + { + "epoch": 15.313718059766133, + "grad_norm": 0.34185999631881714, + "learning_rate": 4.846862819402339e-05, + "loss": 0.3686, + "step": 2263000 + }, + { + "epoch": 15.3171015591165, + "grad_norm": 0.4078949987888336, + "learning_rate": 4.846828984408836e-05, + "loss": 0.3673, + "step": 2263500 + }, + { + "epoch": 15.320485058466868, + "grad_norm": 0.3369295299053192, + "learning_rate": 4.846795149415331e-05, + "loss": 0.3695, + "step": 2264000 + }, + { + "epoch": 15.323868557817237, + "grad_norm": 0.3768553137779236, + "learning_rate": 4.8467613144218275e-05, + "loss": 0.3691, + "step": 2264500 + }, + { + "epoch": 15.327252057167605, + "grad_norm": 0.37157976627349854, + "learning_rate": 4.8467274794283244e-05, + "loss": 0.37, + "step": 2265000 + }, + { + "epoch": 15.330635556517974, + "grad_norm": 0.3911592364311218, + "learning_rate": 4.8466936444348206e-05, + "loss": 0.3686, + "step": 2265500 + }, + { + "epoch": 15.33401905586834, + "grad_norm": 0.32359597086906433, + "learning_rate": 4.846659809441317e-05, + "loss": 0.3665, + "step": 2266000 + }, + { + "epoch": 15.33740255521871, + "grad_norm": 0.33536919951438904, + "learning_rate": 4.846625974447813e-05, + "loss": 0.3689, + "step": 2266500 + }, + { + "epoch": 15.340786054569078, + "grad_norm": 0.3743899166584015, + "learning_rate": 4.84659213945431e-05, + "loss": 0.368, + "step": 2267000 + }, + { + "epoch": 15.344169553919446, + "grad_norm": 0.36726707220077515, + "learning_rate": 4.846558304460806e-05, + "loss": 0.3677, + "step": 2267500 + }, + { + "epoch": 15.347553053269813, + "grad_norm": 0.3498327434062958, + "learning_rate": 4.846524469467302e-05, + "loss": 0.3688, + "step": 2268000 + }, + { + "epoch": 15.350936552620182, + "grad_norm": 0.3115338981151581, + "learning_rate": 4.846490634473798e-05, + "loss": 0.3678, + "step": 2268500 + }, + { + "epoch": 15.35432005197055, + "grad_norm": 0.3821237087249756, + "learning_rate": 4.846456799480295e-05, + "loss": 0.3679, + "step": 2269000 + }, + { + "epoch": 15.357703551320919, + "grad_norm": 0.3773631155490875, + "learning_rate": 4.846422964486791e-05, + "loss": 0.3678, + "step": 2269500 + }, + { + "epoch": 15.361087050671287, + "grad_norm": 0.34869384765625, + "learning_rate": 4.846389129493287e-05, + "loss": 0.3671, + "step": 2270000 + }, + { + "epoch": 15.364470550021654, + "grad_norm": 0.3591691553592682, + "learning_rate": 4.8463552944997834e-05, + "loss": 0.3671, + "step": 2270500 + }, + { + "epoch": 15.367854049372022, + "grad_norm": 0.36942118406295776, + "learning_rate": 4.84632145950628e-05, + "loss": 0.3679, + "step": 2271000 + }, + { + "epoch": 15.371237548722391, + "grad_norm": 0.3710452616214752, + "learning_rate": 4.8462876245127765e-05, + "loss": 0.3668, + "step": 2271500 + }, + { + "epoch": 15.37462104807276, + "grad_norm": 0.3890515863895416, + "learning_rate": 4.846253789519273e-05, + "loss": 0.3696, + "step": 2272000 + }, + { + "epoch": 15.378004547423126, + "grad_norm": 0.3567555844783783, + "learning_rate": 4.846219954525769e-05, + "loss": 0.3677, + "step": 2272500 + }, + { + "epoch": 15.381388046773495, + "grad_norm": 0.3835057318210602, + "learning_rate": 4.846186119532266e-05, + "loss": 0.3691, + "step": 2273000 + }, + { + "epoch": 15.384771546123863, + "grad_norm": 0.3696700930595398, + "learning_rate": 4.846152284538761e-05, + "loss": 0.369, + "step": 2273500 + }, + { + "epoch": 15.388155045474232, + "grad_norm": 0.3335026800632477, + "learning_rate": 4.8461184495452575e-05, + "loss": 0.3687, + "step": 2274000 + }, + { + "epoch": 15.391538544824598, + "grad_norm": 0.37481197714805603, + "learning_rate": 4.8460846145517544e-05, + "loss": 0.37, + "step": 2274500 + }, + { + "epoch": 15.394922044174967, + "grad_norm": 0.3376085162162781, + "learning_rate": 4.8460507795582506e-05, + "loss": 0.3694, + "step": 2275000 + }, + { + "epoch": 15.398305543525336, + "grad_norm": 0.38262149691581726, + "learning_rate": 4.846016944564747e-05, + "loss": 0.3694, + "step": 2275500 + }, + { + "epoch": 15.401689042875704, + "grad_norm": 0.36028096079826355, + "learning_rate": 4.845983109571243e-05, + "loss": 0.3683, + "step": 2276000 + }, + { + "epoch": 15.405072542226073, + "grad_norm": 0.36225685477256775, + "learning_rate": 4.84594927457774e-05, + "loss": 0.369, + "step": 2276500 + }, + { + "epoch": 15.40845604157644, + "grad_norm": 0.361055463552475, + "learning_rate": 4.845915439584236e-05, + "loss": 0.3691, + "step": 2277000 + }, + { + "epoch": 15.411839540926808, + "grad_norm": 0.34308624267578125, + "learning_rate": 4.8458816045907324e-05, + "loss": 0.3685, + "step": 2277500 + }, + { + "epoch": 15.415223040277176, + "grad_norm": 0.37975791096687317, + "learning_rate": 4.845847769597228e-05, + "loss": 0.3675, + "step": 2278000 + }, + { + "epoch": 15.418606539627545, + "grad_norm": 0.3387218713760376, + "learning_rate": 4.845813934603725e-05, + "loss": 0.3674, + "step": 2278500 + }, + { + "epoch": 15.421990038977913, + "grad_norm": 0.3455009162425995, + "learning_rate": 4.845780099610221e-05, + "loss": 0.3689, + "step": 2279000 + }, + { + "epoch": 15.42537353832828, + "grad_norm": 0.3392178416252136, + "learning_rate": 4.845746264616717e-05, + "loss": 0.3668, + "step": 2279500 + }, + { + "epoch": 15.428757037678649, + "grad_norm": 0.37927737832069397, + "learning_rate": 4.8457124296232134e-05, + "loss": 0.3687, + "step": 2280000 + }, + { + "epoch": 15.432140537029017, + "grad_norm": 0.3556533455848694, + "learning_rate": 4.84567859462971e-05, + "loss": 0.3695, + "step": 2280500 + }, + { + "epoch": 15.435524036379386, + "grad_norm": 0.37136268615722656, + "learning_rate": 4.8456447596362065e-05, + "loss": 0.3674, + "step": 2281000 + }, + { + "epoch": 15.438907535729752, + "grad_norm": 0.39647194743156433, + "learning_rate": 4.845610924642703e-05, + "loss": 0.3677, + "step": 2281500 + }, + { + "epoch": 15.442291035080121, + "grad_norm": 0.39070430397987366, + "learning_rate": 4.845577089649199e-05, + "loss": 0.3681, + "step": 2282000 + }, + { + "epoch": 15.44567453443049, + "grad_norm": 0.34807637333869934, + "learning_rate": 4.845543254655696e-05, + "loss": 0.3685, + "step": 2282500 + }, + { + "epoch": 15.449058033780858, + "grad_norm": 0.3645128011703491, + "learning_rate": 4.8455094196621914e-05, + "loss": 0.3687, + "step": 2283000 + }, + { + "epoch": 15.452441533131225, + "grad_norm": 0.3826288878917694, + "learning_rate": 4.8454755846686876e-05, + "loss": 0.369, + "step": 2283500 + }, + { + "epoch": 15.455825032481593, + "grad_norm": 0.4068937301635742, + "learning_rate": 4.8454417496751845e-05, + "loss": 0.3673, + "step": 2284000 + }, + { + "epoch": 15.459208531831962, + "grad_norm": 0.35856378078460693, + "learning_rate": 4.845407914681681e-05, + "loss": 0.3678, + "step": 2284500 + }, + { + "epoch": 15.46259203118233, + "grad_norm": 0.4144749045372009, + "learning_rate": 4.845374079688177e-05, + "loss": 0.3679, + "step": 2285000 + }, + { + "epoch": 15.465975530532699, + "grad_norm": 0.36407583951950073, + "learning_rate": 4.845340244694673e-05, + "loss": 0.3676, + "step": 2285500 + }, + { + "epoch": 15.469359029883066, + "grad_norm": 0.33645474910736084, + "learning_rate": 4.84530640970117e-05, + "loss": 0.3682, + "step": 2286000 + }, + { + "epoch": 15.472742529233434, + "grad_norm": 0.36144402623176575, + "learning_rate": 4.845272574707666e-05, + "loss": 0.3676, + "step": 2286500 + }, + { + "epoch": 15.476126028583803, + "grad_norm": 0.3633367419242859, + "learning_rate": 4.8452387397141624e-05, + "loss": 0.3678, + "step": 2287000 + }, + { + "epoch": 15.479509527934171, + "grad_norm": 0.37776997685432434, + "learning_rate": 4.8452049047206586e-05, + "loss": 0.3683, + "step": 2287500 + }, + { + "epoch": 15.482893027284538, + "grad_norm": 0.35414132475852966, + "learning_rate": 4.845171069727155e-05, + "loss": 0.3685, + "step": 2288000 + }, + { + "epoch": 15.486276526634907, + "grad_norm": 0.3871769905090332, + "learning_rate": 4.845137234733651e-05, + "loss": 0.3685, + "step": 2288500 + }, + { + "epoch": 15.489660025985275, + "grad_norm": 0.37936681509017944, + "learning_rate": 4.845103399740147e-05, + "loss": 0.3688, + "step": 2289000 + }, + { + "epoch": 15.493043525335644, + "grad_norm": 0.37192994356155396, + "learning_rate": 4.8450695647466435e-05, + "loss": 0.3693, + "step": 2289500 + }, + { + "epoch": 15.496427024686012, + "grad_norm": 0.3410022258758545, + "learning_rate": 4.8450357297531404e-05, + "loss": 0.3688, + "step": 2290000 + }, + { + "epoch": 15.499810524036379, + "grad_norm": 0.3851412236690521, + "learning_rate": 4.8450018947596366e-05, + "loss": 0.369, + "step": 2290500 + }, + { + "epoch": 15.503194023386747, + "grad_norm": 0.39784419536590576, + "learning_rate": 4.844968059766133e-05, + "loss": 0.3679, + "step": 2291000 + }, + { + "epoch": 15.506577522737116, + "grad_norm": 0.37226787209510803, + "learning_rate": 4.844934224772629e-05, + "loss": 0.3684, + "step": 2291500 + }, + { + "epoch": 15.509961022087484, + "grad_norm": 0.37065237760543823, + "learning_rate": 4.844900389779126e-05, + "loss": 0.3684, + "step": 2292000 + }, + { + "epoch": 15.513344521437851, + "grad_norm": 0.33598220348358154, + "learning_rate": 4.8448665547856214e-05, + "loss": 0.3686, + "step": 2292500 + }, + { + "epoch": 15.51672802078822, + "grad_norm": 0.3342346251010895, + "learning_rate": 4.8448327197921177e-05, + "loss": 0.3668, + "step": 2293000 + }, + { + "epoch": 15.520111520138588, + "grad_norm": 0.32238996028900146, + "learning_rate": 4.8447988847986145e-05, + "loss": 0.3681, + "step": 2293500 + }, + { + "epoch": 15.523495019488957, + "grad_norm": 0.3525577783584595, + "learning_rate": 4.844765049805111e-05, + "loss": 0.3683, + "step": 2294000 + }, + { + "epoch": 15.526878518839325, + "grad_norm": 0.3491257429122925, + "learning_rate": 4.844731214811607e-05, + "loss": 0.368, + "step": 2294500 + }, + { + "epoch": 15.530262018189692, + "grad_norm": 0.3477993309497833, + "learning_rate": 4.844697379818103e-05, + "loss": 0.3685, + "step": 2295000 + }, + { + "epoch": 15.53364551754006, + "grad_norm": 0.35921916365623474, + "learning_rate": 4.8446635448245994e-05, + "loss": 0.3671, + "step": 2295500 + }, + { + "epoch": 15.537029016890429, + "grad_norm": 0.3641141951084137, + "learning_rate": 4.844629709831096e-05, + "loss": 0.3674, + "step": 2296000 + }, + { + "epoch": 15.540412516240798, + "grad_norm": 0.34084510803222656, + "learning_rate": 4.8445958748375925e-05, + "loss": 0.3687, + "step": 2296500 + }, + { + "epoch": 15.543796015591164, + "grad_norm": 0.3691849410533905, + "learning_rate": 4.844562039844089e-05, + "loss": 0.3685, + "step": 2297000 + }, + { + "epoch": 15.547179514941533, + "grad_norm": 0.3591060936450958, + "learning_rate": 4.844528204850585e-05, + "loss": 0.3695, + "step": 2297500 + }, + { + "epoch": 15.550563014291901, + "grad_norm": 0.3356313109397888, + "learning_rate": 4.844494369857081e-05, + "loss": 0.3687, + "step": 2298000 + }, + { + "epoch": 15.55394651364227, + "grad_norm": 0.35305818915367126, + "learning_rate": 4.8444605348635773e-05, + "loss": 0.3674, + "step": 2298500 + }, + { + "epoch": 15.557330012992637, + "grad_norm": 0.3080775737762451, + "learning_rate": 4.8444266998700736e-05, + "loss": 0.3696, + "step": 2299000 + }, + { + "epoch": 15.560713512343005, + "grad_norm": 0.3750038146972656, + "learning_rate": 4.8443928648765704e-05, + "loss": 0.3687, + "step": 2299500 + }, + { + "epoch": 15.564097011693374, + "grad_norm": 0.3531736135482788, + "learning_rate": 4.8443590298830667e-05, + "loss": 0.3688, + "step": 2300000 + }, + { + "epoch": 15.567480511043742, + "grad_norm": 0.37959063053131104, + "learning_rate": 4.844325194889563e-05, + "loss": 0.3695, + "step": 2300500 + }, + { + "epoch": 15.57086401039411, + "grad_norm": 0.37598180770874023, + "learning_rate": 4.844291359896059e-05, + "loss": 0.3693, + "step": 2301000 + }, + { + "epoch": 15.574247509744477, + "grad_norm": 0.3553810119628906, + "learning_rate": 4.844257524902556e-05, + "loss": 0.3679, + "step": 2301500 + }, + { + "epoch": 15.577631009094846, + "grad_norm": 0.38904568552970886, + "learning_rate": 4.8442236899090515e-05, + "loss": 0.3681, + "step": 2302000 + }, + { + "epoch": 15.581014508445215, + "grad_norm": 0.36272913217544556, + "learning_rate": 4.844189854915548e-05, + "loss": 0.3683, + "step": 2302500 + }, + { + "epoch": 15.584398007795583, + "grad_norm": 0.37295016646385193, + "learning_rate": 4.844156019922044e-05, + "loss": 0.3675, + "step": 2303000 + }, + { + "epoch": 15.587781507145952, + "grad_norm": 0.402294784784317, + "learning_rate": 4.844122184928541e-05, + "loss": 0.3685, + "step": 2303500 + }, + { + "epoch": 15.591165006496318, + "grad_norm": 0.32193323969841003, + "learning_rate": 4.844088349935037e-05, + "loss": 0.3689, + "step": 2304000 + }, + { + "epoch": 15.594548505846687, + "grad_norm": 0.35976263880729675, + "learning_rate": 4.844054514941533e-05, + "loss": 0.3669, + "step": 2304500 + }, + { + "epoch": 15.597932005197055, + "grad_norm": 0.38775837421417236, + "learning_rate": 4.8440206799480295e-05, + "loss": 0.3677, + "step": 2305000 + }, + { + "epoch": 15.601315504547424, + "grad_norm": 0.3899919092655182, + "learning_rate": 4.8439868449545263e-05, + "loss": 0.3672, + "step": 2305500 + }, + { + "epoch": 15.60469900389779, + "grad_norm": 0.3507950007915497, + "learning_rate": 4.8439530099610226e-05, + "loss": 0.3687, + "step": 2306000 + }, + { + "epoch": 15.60808250324816, + "grad_norm": 0.3544395864009857, + "learning_rate": 4.843919174967519e-05, + "loss": 0.3691, + "step": 2306500 + }, + { + "epoch": 15.611466002598528, + "grad_norm": 0.3738711476325989, + "learning_rate": 4.843885339974015e-05, + "loss": 0.3677, + "step": 2307000 + }, + { + "epoch": 15.614849501948896, + "grad_norm": 0.38061192631721497, + "learning_rate": 4.843851504980511e-05, + "loss": 0.3702, + "step": 2307500 + }, + { + "epoch": 15.618233001299263, + "grad_norm": 0.3381912112236023, + "learning_rate": 4.8438176699870074e-05, + "loss": 0.3675, + "step": 2308000 + }, + { + "epoch": 15.621616500649631, + "grad_norm": 0.3231547176837921, + "learning_rate": 4.8437838349935036e-05, + "loss": 0.3707, + "step": 2308500 + }, + { + "epoch": 15.625, + "grad_norm": 0.3673042356967926, + "learning_rate": 4.8437500000000005e-05, + "loss": 0.3693, + "step": 2309000 + }, + { + "epoch": 15.628383499350369, + "grad_norm": 0.3947595953941345, + "learning_rate": 4.843716165006497e-05, + "loss": 0.3686, + "step": 2309500 + }, + { + "epoch": 15.631766998700737, + "grad_norm": 0.4065065383911133, + "learning_rate": 4.843682330012993e-05, + "loss": 0.3685, + "step": 2310000 + }, + { + "epoch": 15.635150498051104, + "grad_norm": 0.37687912583351135, + "learning_rate": 4.843648495019489e-05, + "loss": 0.3673, + "step": 2310500 + }, + { + "epoch": 15.638533997401472, + "grad_norm": 0.38236334919929504, + "learning_rate": 4.843614660025986e-05, + "loss": 0.3674, + "step": 2311000 + }, + { + "epoch": 15.64191749675184, + "grad_norm": 0.3740776479244232, + "learning_rate": 4.8435808250324816e-05, + "loss": 0.3685, + "step": 2311500 + }, + { + "epoch": 15.64530099610221, + "grad_norm": 0.3825748562812805, + "learning_rate": 4.843546990038978e-05, + "loss": 0.3689, + "step": 2312000 + }, + { + "epoch": 15.648684495452576, + "grad_norm": 0.3572474718093872, + "learning_rate": 4.843513155045474e-05, + "loss": 0.3685, + "step": 2312500 + }, + { + "epoch": 15.652067994802945, + "grad_norm": 0.35315316915512085, + "learning_rate": 4.843479320051971e-05, + "loss": 0.3687, + "step": 2313000 + }, + { + "epoch": 15.655451494153313, + "grad_norm": 0.32699042558670044, + "learning_rate": 4.843445485058467e-05, + "loss": 0.3694, + "step": 2313500 + }, + { + "epoch": 15.658834993503682, + "grad_norm": 0.33419954776763916, + "learning_rate": 4.843411650064963e-05, + "loss": 0.3685, + "step": 2314000 + }, + { + "epoch": 15.662218492854048, + "grad_norm": 0.3580189347267151, + "learning_rate": 4.8433778150714595e-05, + "loss": 0.3693, + "step": 2314500 + }, + { + "epoch": 15.665601992204417, + "grad_norm": 0.3577320873737335, + "learning_rate": 4.8433439800779564e-05, + "loss": 0.369, + "step": 2315000 + }, + { + "epoch": 15.668985491554785, + "grad_norm": 0.4055269956588745, + "learning_rate": 4.8433101450844526e-05, + "loss": 0.3686, + "step": 2315500 + }, + { + "epoch": 15.672368990905154, + "grad_norm": 0.3679068088531494, + "learning_rate": 4.843276310090949e-05, + "loss": 0.3691, + "step": 2316000 + }, + { + "epoch": 15.675752490255523, + "grad_norm": 0.33523890376091003, + "learning_rate": 4.843242475097445e-05, + "loss": 0.3696, + "step": 2316500 + }, + { + "epoch": 15.67913598960589, + "grad_norm": 0.3535049259662628, + "learning_rate": 4.843208640103941e-05, + "loss": 0.368, + "step": 2317000 + }, + { + "epoch": 15.682519488956258, + "grad_norm": 0.3424280881881714, + "learning_rate": 4.8431748051104375e-05, + "loss": 0.3676, + "step": 2317500 + }, + { + "epoch": 15.685902988306626, + "grad_norm": 0.34147903323173523, + "learning_rate": 4.843140970116934e-05, + "loss": 0.368, + "step": 2318000 + }, + { + "epoch": 15.689286487656995, + "grad_norm": 0.4109104573726654, + "learning_rate": 4.8431071351234306e-05, + "loss": 0.3692, + "step": 2318500 + }, + { + "epoch": 15.692669987007363, + "grad_norm": 0.32174432277679443, + "learning_rate": 4.843073300129927e-05, + "loss": 0.3674, + "step": 2319000 + }, + { + "epoch": 15.69605348635773, + "grad_norm": 0.3250316083431244, + "learning_rate": 4.843039465136423e-05, + "loss": 0.3676, + "step": 2319500 + }, + { + "epoch": 15.699436985708099, + "grad_norm": 0.34445062279701233, + "learning_rate": 4.843005630142919e-05, + "loss": 0.3701, + "step": 2320000 + }, + { + "epoch": 15.702820485058467, + "grad_norm": 0.37600356340408325, + "learning_rate": 4.842971795149416e-05, + "loss": 0.3675, + "step": 2320500 + }, + { + "epoch": 15.706203984408836, + "grad_norm": 0.36351558566093445, + "learning_rate": 4.8429379601559116e-05, + "loss": 0.3684, + "step": 2321000 + }, + { + "epoch": 15.709587483759202, + "grad_norm": 0.36934009194374084, + "learning_rate": 4.842904125162408e-05, + "loss": 0.3696, + "step": 2321500 + }, + { + "epoch": 15.712970983109571, + "grad_norm": 0.34784290194511414, + "learning_rate": 4.842870290168904e-05, + "loss": 0.3683, + "step": 2322000 + }, + { + "epoch": 15.71635448245994, + "grad_norm": 0.36407607793807983, + "learning_rate": 4.842836455175401e-05, + "loss": 0.3685, + "step": 2322500 + }, + { + "epoch": 15.719737981810308, + "grad_norm": 0.3845270574092865, + "learning_rate": 4.842802620181897e-05, + "loss": 0.3685, + "step": 2323000 + }, + { + "epoch": 15.723121481160675, + "grad_norm": 0.39453384280204773, + "learning_rate": 4.8427687851883934e-05, + "loss": 0.3692, + "step": 2323500 + }, + { + "epoch": 15.726504980511043, + "grad_norm": 0.34942588210105896, + "learning_rate": 4.8427349501948896e-05, + "loss": 0.3684, + "step": 2324000 + }, + { + "epoch": 15.729888479861412, + "grad_norm": 0.37566226720809937, + "learning_rate": 4.8427011152013865e-05, + "loss": 0.3675, + "step": 2324500 + }, + { + "epoch": 15.73327197921178, + "grad_norm": 0.36661645770072937, + "learning_rate": 4.842667280207883e-05, + "loss": 0.3692, + "step": 2325000 + }, + { + "epoch": 15.736655478562149, + "grad_norm": 0.381184458732605, + "learning_rate": 4.842633445214379e-05, + "loss": 0.3693, + "step": 2325500 + }, + { + "epoch": 15.740038977912516, + "grad_norm": 0.3286978006362915, + "learning_rate": 4.842599610220875e-05, + "loss": 0.3691, + "step": 2326000 + }, + { + "epoch": 15.743422477262884, + "grad_norm": 0.3738749623298645, + "learning_rate": 4.842565775227371e-05, + "loss": 0.3677, + "step": 2326500 + }, + { + "epoch": 15.746805976613253, + "grad_norm": 0.3820296823978424, + "learning_rate": 4.8425319402338675e-05, + "loss": 0.3689, + "step": 2327000 + }, + { + "epoch": 15.750189475963621, + "grad_norm": 0.36072027683258057, + "learning_rate": 4.842498105240364e-05, + "loss": 0.368, + "step": 2327500 + }, + { + "epoch": 15.753572975313988, + "grad_norm": 0.3539668619632721, + "learning_rate": 4.8424642702468606e-05, + "loss": 0.37, + "step": 2328000 + }, + { + "epoch": 15.756956474664356, + "grad_norm": 0.360319584608078, + "learning_rate": 4.842430435253357e-05, + "loss": 0.3701, + "step": 2328500 + }, + { + "epoch": 15.760339974014725, + "grad_norm": 0.3770919442176819, + "learning_rate": 4.842396600259853e-05, + "loss": 0.3669, + "step": 2329000 + }, + { + "epoch": 15.763723473365093, + "grad_norm": 0.35376468300819397, + "learning_rate": 4.842362765266349e-05, + "loss": 0.3698, + "step": 2329500 + }, + { + "epoch": 15.767106972715462, + "grad_norm": 0.40981361269950867, + "learning_rate": 4.842328930272846e-05, + "loss": 0.3688, + "step": 2330000 + }, + { + "epoch": 15.770490472065829, + "grad_norm": 0.34691673517227173, + "learning_rate": 4.842295095279342e-05, + "loss": 0.3689, + "step": 2330500 + }, + { + "epoch": 15.773873971416197, + "grad_norm": 0.34768006205558777, + "learning_rate": 4.842261260285838e-05, + "loss": 0.3692, + "step": 2331000 + }, + { + "epoch": 15.777257470766566, + "grad_norm": 0.4068549573421478, + "learning_rate": 4.842227425292334e-05, + "loss": 0.3689, + "step": 2331500 + }, + { + "epoch": 15.780640970116934, + "grad_norm": 0.3581763803958893, + "learning_rate": 4.842193590298831e-05, + "loss": 0.368, + "step": 2332000 + }, + { + "epoch": 15.784024469467301, + "grad_norm": 0.3405434191226959, + "learning_rate": 4.842159755305327e-05, + "loss": 0.3684, + "step": 2332500 + }, + { + "epoch": 15.78740796881767, + "grad_norm": 0.3610570728778839, + "learning_rate": 4.8421259203118234e-05, + "loss": 0.3696, + "step": 2333000 + }, + { + "epoch": 15.790791468168038, + "grad_norm": 0.3928324282169342, + "learning_rate": 4.8420920853183196e-05, + "loss": 0.3683, + "step": 2333500 + }, + { + "epoch": 15.794174967518407, + "grad_norm": 0.3627435266971588, + "learning_rate": 4.8420582503248165e-05, + "loss": 0.3684, + "step": 2334000 + }, + { + "epoch": 15.797558466868775, + "grad_norm": 0.35874125361442566, + "learning_rate": 4.842024415331313e-05, + "loss": 0.369, + "step": 2334500 + }, + { + "epoch": 15.800941966219142, + "grad_norm": 0.34165313839912415, + "learning_rate": 4.841990580337809e-05, + "loss": 0.3695, + "step": 2335000 + }, + { + "epoch": 15.80432546556951, + "grad_norm": 0.33219292759895325, + "learning_rate": 4.841956745344305e-05, + "loss": 0.3682, + "step": 2335500 + }, + { + "epoch": 15.807708964919879, + "grad_norm": 0.3279459774494171, + "learning_rate": 4.8419229103508014e-05, + "loss": 0.37, + "step": 2336000 + }, + { + "epoch": 15.811092464270248, + "grad_norm": 0.3773161470890045, + "learning_rate": 4.8418890753572976e-05, + "loss": 0.3679, + "step": 2336500 + }, + { + "epoch": 15.814475963620614, + "grad_norm": 0.3741438388824463, + "learning_rate": 4.841855240363794e-05, + "loss": 0.3697, + "step": 2337000 + }, + { + "epoch": 15.817859462970983, + "grad_norm": 0.3438836932182312, + "learning_rate": 4.841821405370291e-05, + "loss": 0.3685, + "step": 2337500 + }, + { + "epoch": 15.821242962321351, + "grad_norm": 0.36285704374313354, + "learning_rate": 4.841787570376787e-05, + "loss": 0.3682, + "step": 2338000 + }, + { + "epoch": 15.82462646167172, + "grad_norm": 0.35527315735816956, + "learning_rate": 4.841753735383283e-05, + "loss": 0.3697, + "step": 2338500 + }, + { + "epoch": 15.828009961022087, + "grad_norm": 0.38485226035118103, + "learning_rate": 4.841719900389779e-05, + "loss": 0.3683, + "step": 2339000 + }, + { + "epoch": 15.831393460372455, + "grad_norm": 0.38870787620544434, + "learning_rate": 4.841686065396276e-05, + "loss": 0.3682, + "step": 2339500 + }, + { + "epoch": 15.834776959722824, + "grad_norm": 0.36204108595848083, + "learning_rate": 4.8416522304027724e-05, + "loss": 0.3687, + "step": 2340000 + }, + { + "epoch": 15.838160459073192, + "grad_norm": 0.37256479263305664, + "learning_rate": 4.841618395409268e-05, + "loss": 0.3673, + "step": 2340500 + }, + { + "epoch": 15.84154395842356, + "grad_norm": 0.37685421109199524, + "learning_rate": 4.841584560415764e-05, + "loss": 0.3684, + "step": 2341000 + }, + { + "epoch": 15.844927457773927, + "grad_norm": 0.36085960268974304, + "learning_rate": 4.841550725422261e-05, + "loss": 0.3691, + "step": 2341500 + }, + { + "epoch": 15.848310957124296, + "grad_norm": 0.3423760235309601, + "learning_rate": 4.841516890428757e-05, + "loss": 0.3691, + "step": 2342000 + }, + { + "epoch": 15.851694456474664, + "grad_norm": 0.383521169424057, + "learning_rate": 4.8414830554352535e-05, + "loss": 0.3692, + "step": 2342500 + }, + { + "epoch": 15.855077955825033, + "grad_norm": 0.38384073972702026, + "learning_rate": 4.84144922044175e-05, + "loss": 0.3678, + "step": 2343000 + }, + { + "epoch": 15.858461455175402, + "grad_norm": 0.36292794346809387, + "learning_rate": 4.8414153854482466e-05, + "loss": 0.3697, + "step": 2343500 + }, + { + "epoch": 15.861844954525768, + "grad_norm": 0.3684341013431549, + "learning_rate": 4.841381550454743e-05, + "loss": 0.3687, + "step": 2344000 + }, + { + "epoch": 15.865228453876137, + "grad_norm": 0.3589087128639221, + "learning_rate": 4.841347715461239e-05, + "loss": 0.368, + "step": 2344500 + }, + { + "epoch": 15.868611953226505, + "grad_norm": 0.37417420744895935, + "learning_rate": 4.841313880467735e-05, + "loss": 0.3659, + "step": 2345000 + }, + { + "epoch": 15.871995452576874, + "grad_norm": 0.3539438247680664, + "learning_rate": 4.8412800454742314e-05, + "loss": 0.3678, + "step": 2345500 + }, + { + "epoch": 15.87537895192724, + "grad_norm": 0.3440802991390228, + "learning_rate": 4.8412462104807277e-05, + "loss": 0.3679, + "step": 2346000 + }, + { + "epoch": 15.878762451277609, + "grad_norm": 0.3424714207649231, + "learning_rate": 4.841212375487224e-05, + "loss": 0.3692, + "step": 2346500 + }, + { + "epoch": 15.882145950627978, + "grad_norm": 0.39180028438568115, + "learning_rate": 4.841178540493721e-05, + "loss": 0.3677, + "step": 2347000 + }, + { + "epoch": 15.885529449978346, + "grad_norm": 0.34315669536590576, + "learning_rate": 4.841144705500217e-05, + "loss": 0.3693, + "step": 2347500 + }, + { + "epoch": 15.888912949328713, + "grad_norm": 0.3350779414176941, + "learning_rate": 4.841110870506713e-05, + "loss": 0.3676, + "step": 2348000 + }, + { + "epoch": 15.892296448679081, + "grad_norm": 0.32782721519470215, + "learning_rate": 4.8410770355132094e-05, + "loss": 0.3681, + "step": 2348500 + }, + { + "epoch": 15.89567994802945, + "grad_norm": 0.3709561228752136, + "learning_rate": 4.8410432005197056e-05, + "loss": 0.3676, + "step": 2349000 + }, + { + "epoch": 15.899063447379818, + "grad_norm": 0.3544650673866272, + "learning_rate": 4.8410093655262025e-05, + "loss": 0.3686, + "step": 2349500 + }, + { + "epoch": 15.902446946730187, + "grad_norm": 0.3804757297039032, + "learning_rate": 4.840975530532698e-05, + "loss": 0.3702, + "step": 2350000 + }, + { + "epoch": 15.905830446080554, + "grad_norm": 0.3296406865119934, + "learning_rate": 4.840941695539194e-05, + "loss": 0.3697, + "step": 2350500 + }, + { + "epoch": 15.909213945430922, + "grad_norm": 0.36700183153152466, + "learning_rate": 4.840907860545691e-05, + "loss": 0.3695, + "step": 2351000 + }, + { + "epoch": 15.91259744478129, + "grad_norm": 0.35151296854019165, + "learning_rate": 4.8408740255521873e-05, + "loss": 0.3676, + "step": 2351500 + }, + { + "epoch": 15.91598094413166, + "grad_norm": 0.34182360768318176, + "learning_rate": 4.8408401905586836e-05, + "loss": 0.3675, + "step": 2352000 + }, + { + "epoch": 15.919364443482026, + "grad_norm": 0.36365705728530884, + "learning_rate": 4.84080635556518e-05, + "loss": 0.3704, + "step": 2352500 + }, + { + "epoch": 15.922747942832395, + "grad_norm": 0.35310786962509155, + "learning_rate": 4.840772520571677e-05, + "loss": 0.3683, + "step": 2353000 + }, + { + "epoch": 15.926131442182763, + "grad_norm": 0.3463936448097229, + "learning_rate": 4.840738685578173e-05, + "loss": 0.3669, + "step": 2353500 + }, + { + "epoch": 15.929514941533132, + "grad_norm": 0.3467606008052826, + "learning_rate": 4.840704850584669e-05, + "loss": 0.3686, + "step": 2354000 + }, + { + "epoch": 15.9328984408835, + "grad_norm": 0.3721490502357483, + "learning_rate": 4.840671015591165e-05, + "loss": 0.3691, + "step": 2354500 + }, + { + "epoch": 15.936281940233867, + "grad_norm": 0.3566875457763672, + "learning_rate": 4.8406371805976615e-05, + "loss": 0.368, + "step": 2355000 + }, + { + "epoch": 15.939665439584235, + "grad_norm": 0.36238959431648254, + "learning_rate": 4.840603345604158e-05, + "loss": 0.3681, + "step": 2355500 + }, + { + "epoch": 15.943048938934604, + "grad_norm": 0.37380194664001465, + "learning_rate": 4.840569510610654e-05, + "loss": 0.3668, + "step": 2356000 + }, + { + "epoch": 15.946432438284972, + "grad_norm": 0.3661109209060669, + "learning_rate": 4.840535675617151e-05, + "loss": 0.3677, + "step": 2356500 + }, + { + "epoch": 15.94981593763534, + "grad_norm": 0.33019015192985535, + "learning_rate": 4.840501840623647e-05, + "loss": 0.3682, + "step": 2357000 + }, + { + "epoch": 15.953199436985708, + "grad_norm": 0.34557926654815674, + "learning_rate": 4.840468005630143e-05, + "loss": 0.3693, + "step": 2357500 + }, + { + "epoch": 15.956582936336076, + "grad_norm": 0.3640996813774109, + "learning_rate": 4.8404341706366395e-05, + "loss": 0.3666, + "step": 2358000 + }, + { + "epoch": 15.959966435686445, + "grad_norm": 0.3293427526950836, + "learning_rate": 4.840400335643136e-05, + "loss": 0.3678, + "step": 2358500 + }, + { + "epoch": 15.963349935036813, + "grad_norm": 0.39549505710601807, + "learning_rate": 4.8403665006496326e-05, + "loss": 0.3703, + "step": 2359000 + }, + { + "epoch": 15.96673343438718, + "grad_norm": 0.37706345319747925, + "learning_rate": 4.840332665656128e-05, + "loss": 0.3687, + "step": 2359500 + }, + { + "epoch": 15.970116933737549, + "grad_norm": 0.32542896270751953, + "learning_rate": 4.840298830662624e-05, + "loss": 0.369, + "step": 2360000 + }, + { + "epoch": 15.973500433087917, + "grad_norm": 0.3571685254573822, + "learning_rate": 4.840264995669121e-05, + "loss": 0.368, + "step": 2360500 + }, + { + "epoch": 15.976883932438286, + "grad_norm": 0.3591122627258301, + "learning_rate": 4.8402311606756174e-05, + "loss": 0.3688, + "step": 2361000 + }, + { + "epoch": 15.980267431788652, + "grad_norm": 0.38528913259506226, + "learning_rate": 4.8401973256821136e-05, + "loss": 0.3688, + "step": 2361500 + }, + { + "epoch": 15.983650931139021, + "grad_norm": 0.38601624965667725, + "learning_rate": 4.84016349068861e-05, + "loss": 0.3674, + "step": 2362000 + }, + { + "epoch": 15.98703443048939, + "grad_norm": 0.35946911573410034, + "learning_rate": 4.840129655695107e-05, + "loss": 0.3677, + "step": 2362500 + }, + { + "epoch": 15.990417929839758, + "grad_norm": 0.3463689386844635, + "learning_rate": 4.840095820701603e-05, + "loss": 0.3687, + "step": 2363000 + }, + { + "epoch": 15.993801429190125, + "grad_norm": 0.3598881661891937, + "learning_rate": 4.840061985708099e-05, + "loss": 0.3709, + "step": 2363500 + }, + { + "epoch": 15.997184928540493, + "grad_norm": 0.3293743431568146, + "learning_rate": 4.8400281507145954e-05, + "loss": 0.3671, + "step": 2364000 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.8596614364942428, + "eval_loss": 0.5696810483932495, + "eval_runtime": 3400.6592, + "eval_samples_per_second": 85.496, + "eval_steps_per_second": 5.344, + "step": 2364416 + }, + { + "epoch": 16.00056842789086, + "grad_norm": 0.3467516005039215, + "learning_rate": 4.8399943157210916e-05, + "loss": 0.3687, + "step": 2364500 + }, + { + "epoch": 16.00395192724123, + "grad_norm": 0.34219473600387573, + "learning_rate": 4.839960480727588e-05, + "loss": 0.3661, + "step": 2365000 + }, + { + "epoch": 16.0073354265916, + "grad_norm": 0.33582720160484314, + "learning_rate": 4.839926645734084e-05, + "loss": 0.3664, + "step": 2365500 + }, + { + "epoch": 16.010718925941966, + "grad_norm": 0.37527623772621155, + "learning_rate": 4.83989281074058e-05, + "loss": 0.3668, + "step": 2366000 + }, + { + "epoch": 16.014102425292336, + "grad_norm": 0.5134567618370056, + "learning_rate": 4.839858975747077e-05, + "loss": 0.3667, + "step": 2366500 + }, + { + "epoch": 16.017485924642703, + "grad_norm": 0.3311164081096649, + "learning_rate": 4.839825140753573e-05, + "loss": 0.3668, + "step": 2367000 + }, + { + "epoch": 16.02086942399307, + "grad_norm": 0.33676856756210327, + "learning_rate": 4.8397913057600695e-05, + "loss": 0.366, + "step": 2367500 + }, + { + "epoch": 16.02425292334344, + "grad_norm": 0.3555416166782379, + "learning_rate": 4.839757470766566e-05, + "loss": 0.3675, + "step": 2368000 + }, + { + "epoch": 16.027636422693806, + "grad_norm": 0.36959418654441833, + "learning_rate": 4.8397236357730626e-05, + "loss": 0.366, + "step": 2368500 + }, + { + "epoch": 16.031019922044177, + "grad_norm": 0.38383468985557556, + "learning_rate": 4.839689800779558e-05, + "loss": 0.3667, + "step": 2369000 + }, + { + "epoch": 16.034403421394543, + "grad_norm": 0.35724765062332153, + "learning_rate": 4.8396559657860544e-05, + "loss": 0.3681, + "step": 2369500 + }, + { + "epoch": 16.03778692074491, + "grad_norm": 0.37634825706481934, + "learning_rate": 4.839622130792551e-05, + "loss": 0.367, + "step": 2370000 + }, + { + "epoch": 16.04117042009528, + "grad_norm": 0.3529738485813141, + "learning_rate": 4.8395882957990475e-05, + "loss": 0.366, + "step": 2370500 + }, + { + "epoch": 16.044553919445647, + "grad_norm": 0.38263168931007385, + "learning_rate": 4.839554460805544e-05, + "loss": 0.3675, + "step": 2371000 + }, + { + "epoch": 16.047937418796014, + "grad_norm": 0.40001940727233887, + "learning_rate": 4.83952062581204e-05, + "loss": 0.3656, + "step": 2371500 + }, + { + "epoch": 16.051320918146384, + "grad_norm": 0.3931584656238556, + "learning_rate": 4.839486790818537e-05, + "loss": 0.365, + "step": 2372000 + }, + { + "epoch": 16.05470441749675, + "grad_norm": 0.38671112060546875, + "learning_rate": 4.839452955825033e-05, + "loss": 0.3669, + "step": 2372500 + }, + { + "epoch": 16.05808791684712, + "grad_norm": 0.3262976109981537, + "learning_rate": 4.839419120831529e-05, + "loss": 0.3669, + "step": 2373000 + }, + { + "epoch": 16.061471416197488, + "grad_norm": 0.387162983417511, + "learning_rate": 4.839385285838025e-05, + "loss": 0.3675, + "step": 2373500 + }, + { + "epoch": 16.064854915547855, + "grad_norm": 0.3413422405719757, + "learning_rate": 4.8393514508445216e-05, + "loss": 0.3656, + "step": 2374000 + }, + { + "epoch": 16.068238414898225, + "grad_norm": 0.39504578709602356, + "learning_rate": 4.839317615851018e-05, + "loss": 0.3666, + "step": 2374500 + }, + { + "epoch": 16.071621914248592, + "grad_norm": 0.38605839014053345, + "learning_rate": 4.839283780857514e-05, + "loss": 0.3657, + "step": 2375000 + }, + { + "epoch": 16.075005413598962, + "grad_norm": 0.3513329327106476, + "learning_rate": 4.83924994586401e-05, + "loss": 0.3655, + "step": 2375500 + }, + { + "epoch": 16.07838891294933, + "grad_norm": 0.3624356687068939, + "learning_rate": 4.839216110870507e-05, + "loss": 0.3661, + "step": 2376000 + }, + { + "epoch": 16.081772412299696, + "grad_norm": 0.3576614260673523, + "learning_rate": 4.8391822758770034e-05, + "loss": 0.3684, + "step": 2376500 + }, + { + "epoch": 16.085155911650066, + "grad_norm": 0.3729521930217743, + "learning_rate": 4.8391484408834996e-05, + "loss": 0.3679, + "step": 2377000 + }, + { + "epoch": 16.088539411000433, + "grad_norm": 0.3682872951030731, + "learning_rate": 4.839114605889996e-05, + "loss": 0.3664, + "step": 2377500 + }, + { + "epoch": 16.0919229103508, + "grad_norm": 0.34409353137016296, + "learning_rate": 4.839080770896493e-05, + "loss": 0.365, + "step": 2378000 + }, + { + "epoch": 16.09530640970117, + "grad_norm": 0.3514534831047058, + "learning_rate": 4.839046935902988e-05, + "loss": 0.3673, + "step": 2378500 + }, + { + "epoch": 16.098689909051537, + "grad_norm": 0.38848912715911865, + "learning_rate": 4.8390131009094844e-05, + "loss": 0.3658, + "step": 2379000 + }, + { + "epoch": 16.102073408401907, + "grad_norm": 0.3803500235080719, + "learning_rate": 4.838979265915981e-05, + "loss": 0.3671, + "step": 2379500 + }, + { + "epoch": 16.105456907752274, + "grad_norm": 0.3359062075614929, + "learning_rate": 4.8389454309224775e-05, + "loss": 0.3672, + "step": 2380000 + }, + { + "epoch": 16.10884040710264, + "grad_norm": 0.3859556019306183, + "learning_rate": 4.838911595928974e-05, + "loss": 0.3668, + "step": 2380500 + }, + { + "epoch": 16.11222390645301, + "grad_norm": 0.3473733067512512, + "learning_rate": 4.83887776093547e-05, + "loss": 0.3634, + "step": 2381000 + }, + { + "epoch": 16.115607405803377, + "grad_norm": 0.3759918212890625, + "learning_rate": 4.838843925941967e-05, + "loss": 0.3664, + "step": 2381500 + }, + { + "epoch": 16.118990905153748, + "grad_norm": 0.38334834575653076, + "learning_rate": 4.838810090948463e-05, + "loss": 0.3673, + "step": 2382000 + }, + { + "epoch": 16.122374404504114, + "grad_norm": 0.3702244162559509, + "learning_rate": 4.838776255954959e-05, + "loss": 0.3676, + "step": 2382500 + }, + { + "epoch": 16.12575790385448, + "grad_norm": 0.35715895891189575, + "learning_rate": 4.838742420961455e-05, + "loss": 0.3652, + "step": 2383000 + }, + { + "epoch": 16.12914140320485, + "grad_norm": 0.41490229964256287, + "learning_rate": 4.838708585967952e-05, + "loss": 0.3674, + "step": 2383500 + }, + { + "epoch": 16.132524902555218, + "grad_norm": 0.3774898946285248, + "learning_rate": 4.838674750974448e-05, + "loss": 0.3668, + "step": 2384000 + }, + { + "epoch": 16.13590840190559, + "grad_norm": 0.40797972679138184, + "learning_rate": 4.838640915980944e-05, + "loss": 0.3666, + "step": 2384500 + }, + { + "epoch": 16.139291901255955, + "grad_norm": 0.3960728943347931, + "learning_rate": 4.83860708098744e-05, + "loss": 0.368, + "step": 2385000 + }, + { + "epoch": 16.142675400606322, + "grad_norm": 0.36006104946136475, + "learning_rate": 4.838573245993937e-05, + "loss": 0.3683, + "step": 2385500 + }, + { + "epoch": 16.146058899956692, + "grad_norm": 0.3449351191520691, + "learning_rate": 4.8385394110004334e-05, + "loss": 0.3666, + "step": 2386000 + }, + { + "epoch": 16.14944239930706, + "grad_norm": 0.3677423596382141, + "learning_rate": 4.8385055760069297e-05, + "loss": 0.3676, + "step": 2386500 + }, + { + "epoch": 16.152825898657426, + "grad_norm": 0.33477944135665894, + "learning_rate": 4.838471741013426e-05, + "loss": 0.3684, + "step": 2387000 + }, + { + "epoch": 16.156209398007796, + "grad_norm": 0.36839577555656433, + "learning_rate": 4.838437906019923e-05, + "loss": 0.3658, + "step": 2387500 + }, + { + "epoch": 16.159592897358163, + "grad_norm": 0.3929869830608368, + "learning_rate": 4.838404071026418e-05, + "loss": 0.3675, + "step": 2388000 + }, + { + "epoch": 16.162976396708533, + "grad_norm": 0.38416117429733276, + "learning_rate": 4.8383702360329145e-05, + "loss": 0.3667, + "step": 2388500 + }, + { + "epoch": 16.1663598960589, + "grad_norm": 0.35896387696266174, + "learning_rate": 4.8383364010394114e-05, + "loss": 0.3664, + "step": 2389000 + }, + { + "epoch": 16.169743395409267, + "grad_norm": 0.41448551416397095, + "learning_rate": 4.8383025660459076e-05, + "loss": 0.3678, + "step": 2389500 + }, + { + "epoch": 16.173126894759637, + "grad_norm": 0.3747521638870239, + "learning_rate": 4.838268731052404e-05, + "loss": 0.3668, + "step": 2390000 + }, + { + "epoch": 16.176510394110004, + "grad_norm": 0.3750576078891754, + "learning_rate": 4.8382348960589e-05, + "loss": 0.3681, + "step": 2390500 + }, + { + "epoch": 16.179893893460374, + "grad_norm": 0.3896361291408539, + "learning_rate": 4.838201061065397e-05, + "loss": 0.368, + "step": 2391000 + }, + { + "epoch": 16.18327739281074, + "grad_norm": 0.36419275403022766, + "learning_rate": 4.838167226071893e-05, + "loss": 0.3669, + "step": 2391500 + }, + { + "epoch": 16.186660892161107, + "grad_norm": 0.3544306457042694, + "learning_rate": 4.8381333910783893e-05, + "loss": 0.3689, + "step": 2392000 + }, + { + "epoch": 16.190044391511478, + "grad_norm": 0.38578328490257263, + "learning_rate": 4.838099556084885e-05, + "loss": 0.3667, + "step": 2392500 + }, + { + "epoch": 16.193427890861845, + "grad_norm": 0.37322214245796204, + "learning_rate": 4.838065721091382e-05, + "loss": 0.3673, + "step": 2393000 + }, + { + "epoch": 16.196811390212215, + "grad_norm": 0.37914374470710754, + "learning_rate": 4.838031886097878e-05, + "loss": 0.3678, + "step": 2393500 + }, + { + "epoch": 16.20019488956258, + "grad_norm": 0.3337806761264801, + "learning_rate": 4.837998051104374e-05, + "loss": 0.3678, + "step": 2394000 + }, + { + "epoch": 16.20357838891295, + "grad_norm": 0.3879007399082184, + "learning_rate": 4.8379642161108704e-05, + "loss": 0.3678, + "step": 2394500 + }, + { + "epoch": 16.20696188826332, + "grad_norm": 0.3810998797416687, + "learning_rate": 4.837930381117367e-05, + "loss": 0.3675, + "step": 2395000 + }, + { + "epoch": 16.210345387613685, + "grad_norm": 0.4061427414417267, + "learning_rate": 4.8378965461238635e-05, + "loss": 0.3664, + "step": 2395500 + }, + { + "epoch": 16.213728886964052, + "grad_norm": 0.34380391240119934, + "learning_rate": 4.83786271113036e-05, + "loss": 0.3673, + "step": 2396000 + }, + { + "epoch": 16.217112386314422, + "grad_norm": 0.38658061623573303, + "learning_rate": 4.837828876136856e-05, + "loss": 0.3693, + "step": 2396500 + }, + { + "epoch": 16.22049588566479, + "grad_norm": 0.34300288558006287, + "learning_rate": 4.837795041143353e-05, + "loss": 0.3661, + "step": 2397000 + }, + { + "epoch": 16.22387938501516, + "grad_norm": 0.35389596223831177, + "learning_rate": 4.8377612061498484e-05, + "loss": 0.3669, + "step": 2397500 + }, + { + "epoch": 16.227262884365526, + "grad_norm": 0.34756699204444885, + "learning_rate": 4.8377273711563446e-05, + "loss": 0.3671, + "step": 2398000 + }, + { + "epoch": 16.230646383715893, + "grad_norm": 0.36334916949272156, + "learning_rate": 4.8376935361628415e-05, + "loss": 0.3666, + "step": 2398500 + }, + { + "epoch": 16.234029883066263, + "grad_norm": 0.36038801074028015, + "learning_rate": 4.837659701169338e-05, + "loss": 0.3669, + "step": 2399000 + }, + { + "epoch": 16.23741338241663, + "grad_norm": 0.38227176666259766, + "learning_rate": 4.837625866175834e-05, + "loss": 0.3676, + "step": 2399500 + }, + { + "epoch": 16.240796881767, + "grad_norm": 0.32234764099121094, + "learning_rate": 4.83759203118233e-05, + "loss": 0.3668, + "step": 2400000 + }, + { + "epoch": 16.244180381117367, + "grad_norm": 0.37640976905822754, + "learning_rate": 4.837558196188827e-05, + "loss": 0.3676, + "step": 2400500 + }, + { + "epoch": 16.247563880467734, + "grad_norm": 0.34978267550468445, + "learning_rate": 4.837524361195323e-05, + "loss": 0.3668, + "step": 2401000 + }, + { + "epoch": 16.250947379818104, + "grad_norm": 0.39871707558631897, + "learning_rate": 4.8374905262018194e-05, + "loss": 0.3687, + "step": 2401500 + }, + { + "epoch": 16.25433087916847, + "grad_norm": 0.37368524074554443, + "learning_rate": 4.8374566912083156e-05, + "loss": 0.3681, + "step": 2402000 + }, + { + "epoch": 16.257714378518838, + "grad_norm": 0.36890408396720886, + "learning_rate": 4.837422856214812e-05, + "loss": 0.3667, + "step": 2402500 + }, + { + "epoch": 16.261097877869208, + "grad_norm": 0.3481537103652954, + "learning_rate": 4.837389021221308e-05, + "loss": 0.3678, + "step": 2403000 + }, + { + "epoch": 16.264481377219575, + "grad_norm": 0.37822774052619934, + "learning_rate": 4.837355186227804e-05, + "loss": 0.3681, + "step": 2403500 + }, + { + "epoch": 16.267864876569945, + "grad_norm": 0.34673652052879333, + "learning_rate": 4.8373213512343005e-05, + "loss": 0.3662, + "step": 2404000 + }, + { + "epoch": 16.27124837592031, + "grad_norm": 0.342875212430954, + "learning_rate": 4.8372875162407974e-05, + "loss": 0.3661, + "step": 2404500 + }, + { + "epoch": 16.27463187527068, + "grad_norm": 0.3608645498752594, + "learning_rate": 4.8372536812472936e-05, + "loss": 0.366, + "step": 2405000 + }, + { + "epoch": 16.27801537462105, + "grad_norm": 0.39905846118927, + "learning_rate": 4.83721984625379e-05, + "loss": 0.3679, + "step": 2405500 + }, + { + "epoch": 16.281398873971415, + "grad_norm": 0.37270793318748474, + "learning_rate": 4.837186011260286e-05, + "loss": 0.368, + "step": 2406000 + }, + { + "epoch": 16.284782373321786, + "grad_norm": 0.33094844222068787, + "learning_rate": 4.837152176266783e-05, + "loss": 0.3661, + "step": 2406500 + }, + { + "epoch": 16.288165872672153, + "grad_norm": 0.3748573958873749, + "learning_rate": 4.8371183412732784e-05, + "loss": 0.3673, + "step": 2407000 + }, + { + "epoch": 16.29154937202252, + "grad_norm": 0.3888343870639801, + "learning_rate": 4.8370845062797746e-05, + "loss": 0.3674, + "step": 2407500 + }, + { + "epoch": 16.29493287137289, + "grad_norm": 0.3585543632507324, + "learning_rate": 4.8370506712862715e-05, + "loss": 0.3686, + "step": 2408000 + }, + { + "epoch": 16.298316370723256, + "grad_norm": 0.3523589074611664, + "learning_rate": 4.837016836292768e-05, + "loss": 0.3676, + "step": 2408500 + }, + { + "epoch": 16.301699870073627, + "grad_norm": 0.31586897373199463, + "learning_rate": 4.836983001299264e-05, + "loss": 0.3689, + "step": 2409000 + }, + { + "epoch": 16.305083369423993, + "grad_norm": 0.3635201156139374, + "learning_rate": 4.83694916630576e-05, + "loss": 0.3672, + "step": 2409500 + }, + { + "epoch": 16.30846686877436, + "grad_norm": 0.36935552954673767, + "learning_rate": 4.836915331312257e-05, + "loss": 0.367, + "step": 2410000 + }, + { + "epoch": 16.31185036812473, + "grad_norm": 0.3307409882545471, + "learning_rate": 4.836881496318753e-05, + "loss": 0.3668, + "step": 2410500 + }, + { + "epoch": 16.315233867475097, + "grad_norm": 0.36583974957466125, + "learning_rate": 4.8368476613252495e-05, + "loss": 0.3669, + "step": 2411000 + }, + { + "epoch": 16.318617366825464, + "grad_norm": 0.4040462076663971, + "learning_rate": 4.836813826331746e-05, + "loss": 0.3675, + "step": 2411500 + }, + { + "epoch": 16.322000866175834, + "grad_norm": 0.35600608587265015, + "learning_rate": 4.836779991338242e-05, + "loss": 0.3666, + "step": 2412000 + }, + { + "epoch": 16.3253843655262, + "grad_norm": 0.3632058799266815, + "learning_rate": 4.836746156344738e-05, + "loss": 0.3674, + "step": 2412500 + }, + { + "epoch": 16.32876786487657, + "grad_norm": 0.376446396112442, + "learning_rate": 4.836712321351234e-05, + "loss": 0.3676, + "step": 2413000 + }, + { + "epoch": 16.332151364226938, + "grad_norm": 0.39327868819236755, + "learning_rate": 4.8366784863577305e-05, + "loss": 0.3667, + "step": 2413500 + }, + { + "epoch": 16.335534863577305, + "grad_norm": 0.37273505330085754, + "learning_rate": 4.8366446513642274e-05, + "loss": 0.3685, + "step": 2414000 + }, + { + "epoch": 16.338918362927675, + "grad_norm": 0.3481425940990448, + "learning_rate": 4.8366108163707236e-05, + "loss": 0.3668, + "step": 2414500 + }, + { + "epoch": 16.342301862278042, + "grad_norm": 0.3970904052257538, + "learning_rate": 4.83657698137722e-05, + "loss": 0.3669, + "step": 2415000 + }, + { + "epoch": 16.345685361628412, + "grad_norm": 0.38457170128822327, + "learning_rate": 4.836543146383716e-05, + "loss": 0.3669, + "step": 2415500 + }, + { + "epoch": 16.34906886097878, + "grad_norm": 0.36586880683898926, + "learning_rate": 4.836509311390213e-05, + "loss": 0.3686, + "step": 2416000 + }, + { + "epoch": 16.352452360329146, + "grad_norm": 0.3561878800392151, + "learning_rate": 4.8364754763967085e-05, + "loss": 0.3678, + "step": 2416500 + }, + { + "epoch": 16.355835859679516, + "grad_norm": 0.3339352309703827, + "learning_rate": 4.836441641403205e-05, + "loss": 0.3691, + "step": 2417000 + }, + { + "epoch": 16.359219359029883, + "grad_norm": 0.33959805965423584, + "learning_rate": 4.8364078064097016e-05, + "loss": 0.3679, + "step": 2417500 + }, + { + "epoch": 16.36260285838025, + "grad_norm": 0.38634565472602844, + "learning_rate": 4.836373971416198e-05, + "loss": 0.37, + "step": 2418000 + }, + { + "epoch": 16.36598635773062, + "grad_norm": 0.3656438887119293, + "learning_rate": 4.836340136422694e-05, + "loss": 0.3675, + "step": 2418500 + }, + { + "epoch": 16.369369857080986, + "grad_norm": 0.393478661775589, + "learning_rate": 4.83630630142919e-05, + "loss": 0.3671, + "step": 2419000 + }, + { + "epoch": 16.372753356431357, + "grad_norm": 0.3794505000114441, + "learning_rate": 4.8362724664356864e-05, + "loss": 0.3674, + "step": 2419500 + }, + { + "epoch": 16.376136855781724, + "grad_norm": 0.34691929817199707, + "learning_rate": 4.836238631442183e-05, + "loss": 0.3674, + "step": 2420000 + }, + { + "epoch": 16.37952035513209, + "grad_norm": 0.3692687749862671, + "learning_rate": 4.8362047964486795e-05, + "loss": 0.3692, + "step": 2420500 + }, + { + "epoch": 16.38290385448246, + "grad_norm": 0.3891196846961975, + "learning_rate": 4.836170961455176e-05, + "loss": 0.3673, + "step": 2421000 + }, + { + "epoch": 16.386287353832827, + "grad_norm": 0.3694235682487488, + "learning_rate": 4.836137126461672e-05, + "loss": 0.3674, + "step": 2421500 + }, + { + "epoch": 16.389670853183198, + "grad_norm": 0.3857375383377075, + "learning_rate": 4.836103291468168e-05, + "loss": 0.3679, + "step": 2422000 + }, + { + "epoch": 16.393054352533564, + "grad_norm": 0.3707408905029297, + "learning_rate": 4.8360694564746644e-05, + "loss": 0.3666, + "step": 2422500 + }, + { + "epoch": 16.39643785188393, + "grad_norm": 0.30295616388320923, + "learning_rate": 4.8360356214811606e-05, + "loss": 0.3677, + "step": 2423000 + }, + { + "epoch": 16.3998213512343, + "grad_norm": 0.3722294270992279, + "learning_rate": 4.8360017864876575e-05, + "loss": 0.3694, + "step": 2423500 + }, + { + "epoch": 16.403204850584668, + "grad_norm": 0.37082570791244507, + "learning_rate": 4.835967951494154e-05, + "loss": 0.3673, + "step": 2424000 + }, + { + "epoch": 16.40658834993504, + "grad_norm": 0.378887802362442, + "learning_rate": 4.83593411650065e-05, + "loss": 0.3675, + "step": 2424500 + }, + { + "epoch": 16.409971849285405, + "grad_norm": 0.3331888020038605, + "learning_rate": 4.835900281507146e-05, + "loss": 0.3684, + "step": 2425000 + }, + { + "epoch": 16.413355348635772, + "grad_norm": 0.3269572854042053, + "learning_rate": 4.835866446513643e-05, + "loss": 0.3676, + "step": 2425500 + }, + { + "epoch": 16.416738847986142, + "grad_norm": 0.36776217818260193, + "learning_rate": 4.8358326115201385e-05, + "loss": 0.3668, + "step": 2426000 + }, + { + "epoch": 16.42012234733651, + "grad_norm": 0.40682822465896606, + "learning_rate": 4.835798776526635e-05, + "loss": 0.3665, + "step": 2426500 + }, + { + "epoch": 16.423505846686876, + "grad_norm": 0.37275636196136475, + "learning_rate": 4.8357649415331316e-05, + "loss": 0.3677, + "step": 2427000 + }, + { + "epoch": 16.426889346037246, + "grad_norm": 0.35203757882118225, + "learning_rate": 4.835731106539628e-05, + "loss": 0.3682, + "step": 2427500 + }, + { + "epoch": 16.430272845387613, + "grad_norm": 0.363955020904541, + "learning_rate": 4.835697271546124e-05, + "loss": 0.3677, + "step": 2428000 + }, + { + "epoch": 16.433656344737983, + "grad_norm": 0.3390071988105774, + "learning_rate": 4.83566343655262e-05, + "loss": 0.37, + "step": 2428500 + }, + { + "epoch": 16.43703984408835, + "grad_norm": 0.36319419741630554, + "learning_rate": 4.8356296015591165e-05, + "loss": 0.3693, + "step": 2429000 + }, + { + "epoch": 16.440423343438717, + "grad_norm": 0.35319650173187256, + "learning_rate": 4.8355957665656134e-05, + "loss": 0.367, + "step": 2429500 + }, + { + "epoch": 16.443806842789087, + "grad_norm": 0.37532761693000793, + "learning_rate": 4.8355619315721096e-05, + "loss": 0.3679, + "step": 2430000 + }, + { + "epoch": 16.447190342139454, + "grad_norm": 0.3767538070678711, + "learning_rate": 4.835528096578606e-05, + "loss": 0.3682, + "step": 2430500 + }, + { + "epoch": 16.450573841489824, + "grad_norm": 0.3882306218147278, + "learning_rate": 4.835494261585102e-05, + "loss": 0.3688, + "step": 2431000 + }, + { + "epoch": 16.45395734084019, + "grad_norm": 0.3863130509853363, + "learning_rate": 4.835460426591598e-05, + "loss": 0.3693, + "step": 2431500 + }, + { + "epoch": 16.457340840190557, + "grad_norm": 0.3632519543170929, + "learning_rate": 4.8354265915980944e-05, + "loss": 0.3676, + "step": 2432000 + }, + { + "epoch": 16.460724339540928, + "grad_norm": 0.34005188941955566, + "learning_rate": 4.8353927566045907e-05, + "loss": 0.3674, + "step": 2432500 + }, + { + "epoch": 16.464107838891294, + "grad_norm": 0.33168646693229675, + "learning_rate": 4.8353589216110875e-05, + "loss": 0.3675, + "step": 2433000 + }, + { + "epoch": 16.467491338241665, + "grad_norm": 0.35770660638809204, + "learning_rate": 4.835325086617584e-05, + "loss": 0.3679, + "step": 2433500 + }, + { + "epoch": 16.47087483759203, + "grad_norm": 0.37699154019355774, + "learning_rate": 4.83529125162408e-05, + "loss": 0.368, + "step": 2434000 + }, + { + "epoch": 16.4742583369424, + "grad_norm": 0.3676162362098694, + "learning_rate": 4.835257416630576e-05, + "loss": 0.3683, + "step": 2434500 + }, + { + "epoch": 16.47764183629277, + "grad_norm": 0.30709290504455566, + "learning_rate": 4.835223581637073e-05, + "loss": 0.3688, + "step": 2435000 + }, + { + "epoch": 16.481025335643135, + "grad_norm": 0.36708635091781616, + "learning_rate": 4.8351897466435686e-05, + "loss": 0.3679, + "step": 2435500 + }, + { + "epoch": 16.484408834993502, + "grad_norm": 0.31738120317459106, + "learning_rate": 4.835155911650065e-05, + "loss": 0.3666, + "step": 2436000 + }, + { + "epoch": 16.487792334343872, + "grad_norm": 0.3448878228664398, + "learning_rate": 4.835122076656561e-05, + "loss": 0.368, + "step": 2436500 + }, + { + "epoch": 16.49117583369424, + "grad_norm": 0.33406171202659607, + "learning_rate": 4.835088241663058e-05, + "loss": 0.3687, + "step": 2437000 + }, + { + "epoch": 16.49455933304461, + "grad_norm": 0.3607676923274994, + "learning_rate": 4.835054406669554e-05, + "loss": 0.3683, + "step": 2437500 + }, + { + "epoch": 16.497942832394976, + "grad_norm": 0.38623955845832825, + "learning_rate": 4.8350205716760503e-05, + "loss": 0.3677, + "step": 2438000 + }, + { + "epoch": 16.501326331745343, + "grad_norm": 0.3449345827102661, + "learning_rate": 4.8349867366825466e-05, + "loss": 0.368, + "step": 2438500 + }, + { + "epoch": 16.504709831095713, + "grad_norm": 0.36246615648269653, + "learning_rate": 4.8349529016890434e-05, + "loss": 0.368, + "step": 2439000 + }, + { + "epoch": 16.50809333044608, + "grad_norm": 0.343545526266098, + "learning_rate": 4.8349190666955397e-05, + "loss": 0.3685, + "step": 2439500 + }, + { + "epoch": 16.51147682979645, + "grad_norm": 0.3407546281814575, + "learning_rate": 4.834885231702036e-05, + "loss": 0.367, + "step": 2440000 + }, + { + "epoch": 16.514860329146817, + "grad_norm": 0.417532354593277, + "learning_rate": 4.834851396708532e-05, + "loss": 0.3684, + "step": 2440500 + }, + { + "epoch": 16.518243828497184, + "grad_norm": 0.3835320472717285, + "learning_rate": 4.834817561715028e-05, + "loss": 0.3657, + "step": 2441000 + }, + { + "epoch": 16.521627327847554, + "grad_norm": 0.37053796648979187, + "learning_rate": 4.8347837267215245e-05, + "loss": 0.3679, + "step": 2441500 + }, + { + "epoch": 16.52501082719792, + "grad_norm": 0.34645986557006836, + "learning_rate": 4.834749891728021e-05, + "loss": 0.3664, + "step": 2442000 + }, + { + "epoch": 16.528394326548288, + "grad_norm": 0.3670058846473694, + "learning_rate": 4.8347160567345176e-05, + "loss": 0.3688, + "step": 2442500 + }, + { + "epoch": 16.531777825898658, + "grad_norm": 0.33533594012260437, + "learning_rate": 4.834682221741014e-05, + "loss": 0.3687, + "step": 2443000 + }, + { + "epoch": 16.535161325249025, + "grad_norm": 0.3722659647464752, + "learning_rate": 4.83464838674751e-05, + "loss": 0.3668, + "step": 2443500 + }, + { + "epoch": 16.538544824599395, + "grad_norm": 0.3470999300479889, + "learning_rate": 4.834614551754006e-05, + "loss": 0.3669, + "step": 2444000 + }, + { + "epoch": 16.54192832394976, + "grad_norm": 0.3363528549671173, + "learning_rate": 4.834580716760503e-05, + "loss": 0.3668, + "step": 2444500 + }, + { + "epoch": 16.54531182330013, + "grad_norm": 0.35689017176628113, + "learning_rate": 4.834546881766999e-05, + "loss": 0.3677, + "step": 2445000 + }, + { + "epoch": 16.5486953226505, + "grad_norm": 0.3551176190376282, + "learning_rate": 4.834513046773495e-05, + "loss": 0.3685, + "step": 2445500 + }, + { + "epoch": 16.552078822000865, + "grad_norm": 0.39262300729751587, + "learning_rate": 4.834479211779991e-05, + "loss": 0.3675, + "step": 2446000 + }, + { + "epoch": 16.555462321351236, + "grad_norm": 0.3882499933242798, + "learning_rate": 4.834445376786488e-05, + "loss": 0.3675, + "step": 2446500 + }, + { + "epoch": 16.558845820701602, + "grad_norm": 0.3415895700454712, + "learning_rate": 4.834411541792984e-05, + "loss": 0.3682, + "step": 2447000 + }, + { + "epoch": 16.56222932005197, + "grad_norm": 0.35380828380584717, + "learning_rate": 4.8343777067994804e-05, + "loss": 0.3674, + "step": 2447500 + }, + { + "epoch": 16.56561281940234, + "grad_norm": 0.35650962591171265, + "learning_rate": 4.8343438718059766e-05, + "loss": 0.3676, + "step": 2448000 + }, + { + "epoch": 16.568996318752706, + "grad_norm": 0.34146806597709656, + "learning_rate": 4.8343100368124735e-05, + "loss": 0.3697, + "step": 2448500 + }, + { + "epoch": 16.572379818103077, + "grad_norm": 0.3345761299133301, + "learning_rate": 4.83427620181897e-05, + "loss": 0.3672, + "step": 2449000 + }, + { + "epoch": 16.575763317453443, + "grad_norm": 0.34446024894714355, + "learning_rate": 4.834242366825466e-05, + "loss": 0.3684, + "step": 2449500 + }, + { + "epoch": 16.57914681680381, + "grad_norm": 0.47220537066459656, + "learning_rate": 4.834208531831962e-05, + "loss": 0.3672, + "step": 2450000 + }, + { + "epoch": 16.58253031615418, + "grad_norm": 0.35243332386016846, + "learning_rate": 4.8341746968384584e-05, + "loss": 0.3676, + "step": 2450500 + }, + { + "epoch": 16.585913815504547, + "grad_norm": 0.3121185898780823, + "learning_rate": 4.8341408618449546e-05, + "loss": 0.3677, + "step": 2451000 + }, + { + "epoch": 16.589297314854914, + "grad_norm": 0.3816813826560974, + "learning_rate": 4.834107026851451e-05, + "loss": 0.3666, + "step": 2451500 + }, + { + "epoch": 16.592680814205284, + "grad_norm": 0.3435656726360321, + "learning_rate": 4.834073191857948e-05, + "loss": 0.368, + "step": 2452000 + }, + { + "epoch": 16.59606431355565, + "grad_norm": 0.387119859457016, + "learning_rate": 4.834039356864444e-05, + "loss": 0.3682, + "step": 2452500 + }, + { + "epoch": 16.59944781290602, + "grad_norm": 0.39400357007980347, + "learning_rate": 4.83400552187094e-05, + "loss": 0.3669, + "step": 2453000 + }, + { + "epoch": 16.602831312256388, + "grad_norm": 0.3987778425216675, + "learning_rate": 4.833971686877436e-05, + "loss": 0.3669, + "step": 2453500 + }, + { + "epoch": 16.606214811606755, + "grad_norm": 0.3489267826080322, + "learning_rate": 4.833937851883933e-05, + "loss": 0.3695, + "step": 2454000 + }, + { + "epoch": 16.609598310957125, + "grad_norm": 0.3362281918525696, + "learning_rate": 4.8339040168904294e-05, + "loss": 0.3668, + "step": 2454500 + }, + { + "epoch": 16.61298181030749, + "grad_norm": 0.3904046416282654, + "learning_rate": 4.833870181896925e-05, + "loss": 0.3685, + "step": 2455000 + }, + { + "epoch": 16.616365309657862, + "grad_norm": 0.3561997711658478, + "learning_rate": 4.833836346903421e-05, + "loss": 0.3683, + "step": 2455500 + }, + { + "epoch": 16.61974880900823, + "grad_norm": 0.36367231607437134, + "learning_rate": 4.833802511909918e-05, + "loss": 0.368, + "step": 2456000 + }, + { + "epoch": 16.623132308358596, + "grad_norm": 0.3647955358028412, + "learning_rate": 4.833768676916414e-05, + "loss": 0.3669, + "step": 2456500 + }, + { + "epoch": 16.626515807708966, + "grad_norm": 0.3903414309024811, + "learning_rate": 4.8337348419229105e-05, + "loss": 0.3678, + "step": 2457000 + }, + { + "epoch": 16.629899307059333, + "grad_norm": 0.36575034260749817, + "learning_rate": 4.833701006929407e-05, + "loss": 0.3687, + "step": 2457500 + }, + { + "epoch": 16.633282806409703, + "grad_norm": 0.3615660071372986, + "learning_rate": 4.8336671719359036e-05, + "loss": 0.3662, + "step": 2458000 + }, + { + "epoch": 16.63666630576007, + "grad_norm": 0.38607707619667053, + "learning_rate": 4.8336333369424e-05, + "loss": 0.3696, + "step": 2458500 + }, + { + "epoch": 16.640049805110436, + "grad_norm": 0.3894058167934418, + "learning_rate": 4.833599501948896e-05, + "loss": 0.3656, + "step": 2459000 + }, + { + "epoch": 16.643433304460807, + "grad_norm": 0.396003395318985, + "learning_rate": 4.833565666955392e-05, + "loss": 0.369, + "step": 2459500 + }, + { + "epoch": 16.646816803811173, + "grad_norm": 0.3519524335861206, + "learning_rate": 4.8335318319618884e-05, + "loss": 0.3685, + "step": 2460000 + }, + { + "epoch": 16.65020030316154, + "grad_norm": 0.3568274676799774, + "learning_rate": 4.8334979969683846e-05, + "loss": 0.3694, + "step": 2460500 + }, + { + "epoch": 16.65358380251191, + "grad_norm": 0.3595027029514313, + "learning_rate": 4.833464161974881e-05, + "loss": 0.3678, + "step": 2461000 + }, + { + "epoch": 16.656967301862277, + "grad_norm": 0.40519922971725464, + "learning_rate": 4.833430326981378e-05, + "loss": 0.3663, + "step": 2461500 + }, + { + "epoch": 16.660350801212648, + "grad_norm": 0.35902488231658936, + "learning_rate": 4.833396491987874e-05, + "loss": 0.3669, + "step": 2462000 + }, + { + "epoch": 16.663734300563014, + "grad_norm": 0.3957083225250244, + "learning_rate": 4.83336265699437e-05, + "loss": 0.3676, + "step": 2462500 + }, + { + "epoch": 16.66711779991338, + "grad_norm": 0.4037426710128784, + "learning_rate": 4.8333288220008664e-05, + "loss": 0.3681, + "step": 2463000 + }, + { + "epoch": 16.67050129926375, + "grad_norm": 0.3176875412464142, + "learning_rate": 4.833294987007363e-05, + "loss": 0.3684, + "step": 2463500 + }, + { + "epoch": 16.673884798614118, + "grad_norm": 0.40276309847831726, + "learning_rate": 4.8332611520138595e-05, + "loss": 0.3683, + "step": 2464000 + }, + { + "epoch": 16.67726829796449, + "grad_norm": 0.3713991940021515, + "learning_rate": 4.833227317020355e-05, + "loss": 0.367, + "step": 2464500 + }, + { + "epoch": 16.680651797314855, + "grad_norm": 0.3815590441226959, + "learning_rate": 4.833193482026851e-05, + "loss": 0.369, + "step": 2465000 + }, + { + "epoch": 16.684035296665222, + "grad_norm": 0.3649478554725647, + "learning_rate": 4.833159647033348e-05, + "loss": 0.3668, + "step": 2465500 + }, + { + "epoch": 16.687418796015592, + "grad_norm": 0.3144074082374573, + "learning_rate": 4.833125812039844e-05, + "loss": 0.3672, + "step": 2466000 + }, + { + "epoch": 16.69080229536596, + "grad_norm": 0.3706476092338562, + "learning_rate": 4.8330919770463405e-05, + "loss": 0.3676, + "step": 2466500 + }, + { + "epoch": 16.694185794716326, + "grad_norm": 0.4185875356197357, + "learning_rate": 4.833058142052837e-05, + "loss": 0.3683, + "step": 2467000 + }, + { + "epoch": 16.697569294066696, + "grad_norm": 0.38653209805488586, + "learning_rate": 4.8330243070593336e-05, + "loss": 0.3682, + "step": 2467500 + }, + { + "epoch": 16.700952793417063, + "grad_norm": 0.37850135564804077, + "learning_rate": 4.83299047206583e-05, + "loss": 0.3678, + "step": 2468000 + }, + { + "epoch": 16.704336292767433, + "grad_norm": 0.3965655267238617, + "learning_rate": 4.832956637072326e-05, + "loss": 0.3673, + "step": 2468500 + }, + { + "epoch": 16.7077197921178, + "grad_norm": 0.3785822093486786, + "learning_rate": 4.832922802078822e-05, + "loss": 0.368, + "step": 2469000 + }, + { + "epoch": 16.711103291468167, + "grad_norm": 0.36220183968544006, + "learning_rate": 4.8328889670853185e-05, + "loss": 0.3664, + "step": 2469500 + }, + { + "epoch": 16.714486790818537, + "grad_norm": 0.34872257709503174, + "learning_rate": 4.832855132091815e-05, + "loss": 0.3685, + "step": 2470000 + }, + { + "epoch": 16.717870290168904, + "grad_norm": 0.4009423553943634, + "learning_rate": 4.832821297098311e-05, + "loss": 0.3687, + "step": 2470500 + }, + { + "epoch": 16.721253789519274, + "grad_norm": 0.3455296754837036, + "learning_rate": 4.832787462104808e-05, + "loss": 0.3667, + "step": 2471000 + }, + { + "epoch": 16.72463728886964, + "grad_norm": 0.3740684986114502, + "learning_rate": 4.832753627111304e-05, + "loss": 0.3686, + "step": 2471500 + }, + { + "epoch": 16.728020788220007, + "grad_norm": 0.35268333554267883, + "learning_rate": 4.8327197921178e-05, + "loss": 0.3693, + "step": 2472000 + }, + { + "epoch": 16.731404287570378, + "grad_norm": 0.3672443628311157, + "learning_rate": 4.8326859571242964e-05, + "loss": 0.3679, + "step": 2472500 + }, + { + "epoch": 16.734787786920744, + "grad_norm": 0.34308382868766785, + "learning_rate": 4.832652122130793e-05, + "loss": 0.3671, + "step": 2473000 + }, + { + "epoch": 16.738171286271115, + "grad_norm": 0.3512164056301117, + "learning_rate": 4.8326182871372895e-05, + "loss": 0.3678, + "step": 2473500 + }, + { + "epoch": 16.74155478562148, + "grad_norm": 0.3707292377948761, + "learning_rate": 4.832584452143785e-05, + "loss": 0.3675, + "step": 2474000 + }, + { + "epoch": 16.744938284971848, + "grad_norm": 0.3685651421546936, + "learning_rate": 4.832550617150281e-05, + "loss": 0.367, + "step": 2474500 + }, + { + "epoch": 16.74832178432222, + "grad_norm": 0.3733166754245758, + "learning_rate": 4.832516782156778e-05, + "loss": 0.3669, + "step": 2475000 + }, + { + "epoch": 16.751705283672585, + "grad_norm": 0.3869015872478485, + "learning_rate": 4.8324829471632744e-05, + "loss": 0.3678, + "step": 2475500 + }, + { + "epoch": 16.755088783022952, + "grad_norm": 0.35414817929267883, + "learning_rate": 4.8324491121697706e-05, + "loss": 0.3685, + "step": 2476000 + }, + { + "epoch": 16.758472282373322, + "grad_norm": 0.4135025143623352, + "learning_rate": 4.832415277176267e-05, + "loss": 0.3693, + "step": 2476500 + }, + { + "epoch": 16.76185578172369, + "grad_norm": 0.3596659302711487, + "learning_rate": 4.832381442182764e-05, + "loss": 0.3672, + "step": 2477000 + }, + { + "epoch": 16.76523928107406, + "grad_norm": 0.35768067836761475, + "learning_rate": 4.83234760718926e-05, + "loss": 0.3668, + "step": 2477500 + }, + { + "epoch": 16.768622780424426, + "grad_norm": 0.4043077826499939, + "learning_rate": 4.832313772195756e-05, + "loss": 0.3664, + "step": 2478000 + }, + { + "epoch": 16.772006279774793, + "grad_norm": 0.3664889931678772, + "learning_rate": 4.832279937202252e-05, + "loss": 0.368, + "step": 2478500 + }, + { + "epoch": 16.775389779125163, + "grad_norm": 0.3519807457923889, + "learning_rate": 4.8322461022087485e-05, + "loss": 0.3679, + "step": 2479000 + }, + { + "epoch": 16.77877327847553, + "grad_norm": 0.3843582570552826, + "learning_rate": 4.832212267215245e-05, + "loss": 0.3675, + "step": 2479500 + }, + { + "epoch": 16.7821567778259, + "grad_norm": 0.3271946907043457, + "learning_rate": 4.832178432221741e-05, + "loss": 0.3677, + "step": 2480000 + }, + { + "epoch": 16.785540277176267, + "grad_norm": 0.389017790555954, + "learning_rate": 4.832144597228238e-05, + "loss": 0.367, + "step": 2480500 + }, + { + "epoch": 16.788923776526634, + "grad_norm": 0.34948909282684326, + "learning_rate": 4.832110762234734e-05, + "loss": 0.367, + "step": 2481000 + }, + { + "epoch": 16.792307275877004, + "grad_norm": 0.3824010193347931, + "learning_rate": 4.83207692724123e-05, + "loss": 0.3672, + "step": 2481500 + }, + { + "epoch": 16.79569077522737, + "grad_norm": 0.3278599679470062, + "learning_rate": 4.8320430922477265e-05, + "loss": 0.3669, + "step": 2482000 + }, + { + "epoch": 16.79907427457774, + "grad_norm": 0.35566702485084534, + "learning_rate": 4.832009257254223e-05, + "loss": 0.3665, + "step": 2482500 + }, + { + "epoch": 16.802457773928108, + "grad_norm": 0.36212462186813354, + "learning_rate": 4.8319754222607196e-05, + "loss": 0.3684, + "step": 2483000 + }, + { + "epoch": 16.805841273278475, + "grad_norm": 0.3928414285182953, + "learning_rate": 4.831941587267215e-05, + "loss": 0.3678, + "step": 2483500 + }, + { + "epoch": 16.809224772628845, + "grad_norm": 0.4236493408679962, + "learning_rate": 4.8319077522737113e-05, + "loss": 0.3681, + "step": 2484000 + }, + { + "epoch": 16.81260827197921, + "grad_norm": 0.4181719422340393, + "learning_rate": 4.831873917280208e-05, + "loss": 0.3672, + "step": 2484500 + }, + { + "epoch": 16.81599177132958, + "grad_norm": 0.338945597410202, + "learning_rate": 4.8318400822867044e-05, + "loss": 0.3671, + "step": 2485000 + }, + { + "epoch": 16.81937527067995, + "grad_norm": 0.355094313621521, + "learning_rate": 4.8318062472932007e-05, + "loss": 0.3686, + "step": 2485500 + }, + { + "epoch": 16.822758770030315, + "grad_norm": 0.37627944350242615, + "learning_rate": 4.831772412299697e-05, + "loss": 0.3672, + "step": 2486000 + }, + { + "epoch": 16.826142269380686, + "grad_norm": 0.3468860983848572, + "learning_rate": 4.831738577306194e-05, + "loss": 0.3663, + "step": 2486500 + }, + { + "epoch": 16.829525768731052, + "grad_norm": 0.3257055878639221, + "learning_rate": 4.83170474231269e-05, + "loss": 0.3667, + "step": 2487000 + }, + { + "epoch": 16.83290926808142, + "grad_norm": 0.38776686787605286, + "learning_rate": 4.831670907319186e-05, + "loss": 0.3666, + "step": 2487500 + }, + { + "epoch": 16.83629276743179, + "grad_norm": 0.35589778423309326, + "learning_rate": 4.8316370723256824e-05, + "loss": 0.3669, + "step": 2488000 + }, + { + "epoch": 16.839676266782156, + "grad_norm": 0.3393418490886688, + "learning_rate": 4.8316032373321786e-05, + "loss": 0.3676, + "step": 2488500 + }, + { + "epoch": 16.843059766132527, + "grad_norm": 0.38172590732574463, + "learning_rate": 4.831569402338675e-05, + "loss": 0.3677, + "step": 2489000 + }, + { + "epoch": 16.846443265482893, + "grad_norm": 0.358637273311615, + "learning_rate": 4.831535567345171e-05, + "loss": 0.3677, + "step": 2489500 + }, + { + "epoch": 16.84982676483326, + "grad_norm": 0.34321945905685425, + "learning_rate": 4.831501732351668e-05, + "loss": 0.3664, + "step": 2490000 + }, + { + "epoch": 16.85321026418363, + "grad_norm": 0.36980679631233215, + "learning_rate": 4.831467897358164e-05, + "loss": 0.3673, + "step": 2490500 + }, + { + "epoch": 16.856593763533997, + "grad_norm": 0.3730469048023224, + "learning_rate": 4.8314340623646603e-05, + "loss": 0.3684, + "step": 2491000 + }, + { + "epoch": 16.859977262884364, + "grad_norm": 0.3678283095359802, + "learning_rate": 4.8314002273711566e-05, + "loss": 0.3677, + "step": 2491500 + }, + { + "epoch": 16.863360762234734, + "grad_norm": 0.3648795783519745, + "learning_rate": 4.831366392377653e-05, + "loss": 0.3661, + "step": 2492000 + }, + { + "epoch": 16.8667442615851, + "grad_norm": 0.3346042335033417, + "learning_rate": 4.83133255738415e-05, + "loss": 0.3682, + "step": 2492500 + }, + { + "epoch": 16.87012776093547, + "grad_norm": 0.3473329246044159, + "learning_rate": 4.831298722390645e-05, + "loss": 0.3669, + "step": 2493000 + }, + { + "epoch": 16.873511260285838, + "grad_norm": 0.3240760266780853, + "learning_rate": 4.8312648873971414e-05, + "loss": 0.368, + "step": 2493500 + }, + { + "epoch": 16.876894759636205, + "grad_norm": 0.36841729283332825, + "learning_rate": 4.831231052403638e-05, + "loss": 0.3681, + "step": 2494000 + }, + { + "epoch": 16.880278258986575, + "grad_norm": 0.37722286581993103, + "learning_rate": 4.8311972174101345e-05, + "loss": 0.3679, + "step": 2494500 + }, + { + "epoch": 16.88366175833694, + "grad_norm": 0.3509593904018402, + "learning_rate": 4.831163382416631e-05, + "loss": 0.3664, + "step": 2495000 + }, + { + "epoch": 16.887045257687312, + "grad_norm": 0.347510427236557, + "learning_rate": 4.831129547423127e-05, + "loss": 0.3677, + "step": 2495500 + }, + { + "epoch": 16.89042875703768, + "grad_norm": 0.34216421842575073, + "learning_rate": 4.831095712429624e-05, + "loss": 0.3684, + "step": 2496000 + }, + { + "epoch": 16.893812256388046, + "grad_norm": 0.3292529881000519, + "learning_rate": 4.83106187743612e-05, + "loss": 0.3684, + "step": 2496500 + }, + { + "epoch": 16.897195755738416, + "grad_norm": 0.3332473337650299, + "learning_rate": 4.831028042442616e-05, + "loss": 0.3673, + "step": 2497000 + }, + { + "epoch": 16.900579255088783, + "grad_norm": 0.3342098593711853, + "learning_rate": 4.8309942074491125e-05, + "loss": 0.3659, + "step": 2497500 + }, + { + "epoch": 16.903962754439153, + "grad_norm": 0.36247438192367554, + "learning_rate": 4.830960372455609e-05, + "loss": 0.367, + "step": 2498000 + }, + { + "epoch": 16.90734625378952, + "grad_norm": 0.3494985103607178, + "learning_rate": 4.830926537462105e-05, + "loss": 0.3672, + "step": 2498500 + }, + { + "epoch": 16.910729753139886, + "grad_norm": 0.41263943910598755, + "learning_rate": 4.830892702468601e-05, + "loss": 0.3685, + "step": 2499000 + }, + { + "epoch": 16.914113252490257, + "grad_norm": 0.33395126461982727, + "learning_rate": 4.830858867475097e-05, + "loss": 0.3686, + "step": 2499500 + }, + { + "epoch": 16.917496751840623, + "grad_norm": 0.3553406298160553, + "learning_rate": 4.830825032481594e-05, + "loss": 0.3677, + "step": 2500000 + }, + { + "epoch": 16.92088025119099, + "grad_norm": 0.33235299587249756, + "learning_rate": 4.8307911974880904e-05, + "loss": 0.3674, + "step": 2500500 + }, + { + "epoch": 16.92426375054136, + "grad_norm": 0.35224485397338867, + "learning_rate": 4.8307573624945866e-05, + "loss": 0.3702, + "step": 2501000 + }, + { + "epoch": 16.927647249891727, + "grad_norm": 0.39313584566116333, + "learning_rate": 4.830723527501083e-05, + "loss": 0.3684, + "step": 2501500 + }, + { + "epoch": 16.931030749242097, + "grad_norm": 0.3749732971191406, + "learning_rate": 4.83068969250758e-05, + "loss": 0.3676, + "step": 2502000 + }, + { + "epoch": 16.934414248592464, + "grad_norm": 0.34019801020622253, + "learning_rate": 4.830655857514075e-05, + "loss": 0.3677, + "step": 2502500 + }, + { + "epoch": 16.93779774794283, + "grad_norm": 0.32792386412620544, + "learning_rate": 4.8306220225205715e-05, + "loss": 0.3668, + "step": 2503000 + }, + { + "epoch": 16.9411812472932, + "grad_norm": 0.37263306975364685, + "learning_rate": 4.8305881875270684e-05, + "loss": 0.3673, + "step": 2503500 + }, + { + "epoch": 16.944564746643568, + "grad_norm": 0.3549545705318451, + "learning_rate": 4.8305543525335646e-05, + "loss": 0.3674, + "step": 2504000 + }, + { + "epoch": 16.94794824599394, + "grad_norm": 0.3565514385700226, + "learning_rate": 4.830520517540061e-05, + "loss": 0.3675, + "step": 2504500 + }, + { + "epoch": 16.951331745344305, + "grad_norm": 0.3621446490287781, + "learning_rate": 4.830486682546557e-05, + "loss": 0.3667, + "step": 2505000 + }, + { + "epoch": 16.954715244694672, + "grad_norm": 0.3795997202396393, + "learning_rate": 4.830452847553054e-05, + "loss": 0.3671, + "step": 2505500 + }, + { + "epoch": 16.958098744045042, + "grad_norm": 0.3907493054866791, + "learning_rate": 4.83041901255955e-05, + "loss": 0.3692, + "step": 2506000 + }, + { + "epoch": 16.96148224339541, + "grad_norm": 0.3782593607902527, + "learning_rate": 4.830385177566046e-05, + "loss": 0.3667, + "step": 2506500 + }, + { + "epoch": 16.96486574274578, + "grad_norm": 0.31225937604904175, + "learning_rate": 4.830351342572542e-05, + "loss": 0.3679, + "step": 2507000 + }, + { + "epoch": 16.968249242096146, + "grad_norm": 0.37021970748901367, + "learning_rate": 4.830317507579039e-05, + "loss": 0.3705, + "step": 2507500 + }, + { + "epoch": 16.971632741446513, + "grad_norm": 0.3210141360759735, + "learning_rate": 4.830283672585535e-05, + "loss": 0.3672, + "step": 2508000 + }, + { + "epoch": 16.975016240796883, + "grad_norm": 0.3552591800689697, + "learning_rate": 4.830249837592031e-05, + "loss": 0.3691, + "step": 2508500 + }, + { + "epoch": 16.97839974014725, + "grad_norm": 0.3715352416038513, + "learning_rate": 4.8302160025985274e-05, + "loss": 0.3689, + "step": 2509000 + }, + { + "epoch": 16.981783239497616, + "grad_norm": 0.3898894190788269, + "learning_rate": 4.830182167605024e-05, + "loss": 0.3682, + "step": 2509500 + }, + { + "epoch": 16.985166738847987, + "grad_norm": 0.3399691581726074, + "learning_rate": 4.8301483326115205e-05, + "loss": 0.3672, + "step": 2510000 + }, + { + "epoch": 16.988550238198354, + "grad_norm": 0.36466071009635925, + "learning_rate": 4.830114497618017e-05, + "loss": 0.3674, + "step": 2510500 + }, + { + "epoch": 16.991933737548724, + "grad_norm": 0.38154199719429016, + "learning_rate": 4.830080662624513e-05, + "loss": 0.3693, + "step": 2511000 + }, + { + "epoch": 16.99531723689909, + "grad_norm": 0.3445392847061157, + "learning_rate": 4.83004682763101e-05, + "loss": 0.3676, + "step": 2511500 + }, + { + "epoch": 16.998700736249457, + "grad_norm": 0.33373844623565674, + "learning_rate": 4.830012992637505e-05, + "loss": 0.3675, + "step": 2512000 + }, + { + "epoch": 17.0, + "eval_accuracy": 0.859953776456311, + "eval_loss": 0.5690349340438843, + "eval_runtime": 3383.2894, + "eval_samples_per_second": 85.935, + "eval_steps_per_second": 5.371, + "step": 2512192 + }, + { + "epoch": 17.002084235599828, + "grad_norm": 0.3574970066547394, + "learning_rate": 4.8299791576440015e-05, + "loss": 0.3654, + "step": 2512500 + }, + { + "epoch": 17.005467734950194, + "grad_norm": 0.3643225133419037, + "learning_rate": 4.8299453226504984e-05, + "loss": 0.3667, + "step": 2513000 + }, + { + "epoch": 17.008851234300565, + "grad_norm": 0.36455971002578735, + "learning_rate": 4.8299114876569946e-05, + "loss": 0.3638, + "step": 2513500 + }, + { + "epoch": 17.01223473365093, + "grad_norm": 0.3444270193576813, + "learning_rate": 4.829877652663491e-05, + "loss": 0.366, + "step": 2514000 + }, + { + "epoch": 17.015618233001298, + "grad_norm": 0.3640088438987732, + "learning_rate": 4.829843817669987e-05, + "loss": 0.3669, + "step": 2514500 + }, + { + "epoch": 17.01900173235167, + "grad_norm": 0.35751673579216003, + "learning_rate": 4.829809982676484e-05, + "loss": 0.3653, + "step": 2515000 + }, + { + "epoch": 17.022385231702035, + "grad_norm": 0.38286295533180237, + "learning_rate": 4.82977614768298e-05, + "loss": 0.3648, + "step": 2515500 + }, + { + "epoch": 17.025768731052402, + "grad_norm": 0.37575405836105347, + "learning_rate": 4.8297423126894764e-05, + "loss": 0.3652, + "step": 2516000 + }, + { + "epoch": 17.029152230402772, + "grad_norm": 0.35940518975257874, + "learning_rate": 4.8297084776959726e-05, + "loss": 0.3667, + "step": 2516500 + }, + { + "epoch": 17.03253572975314, + "grad_norm": 0.37399694323539734, + "learning_rate": 4.829674642702469e-05, + "loss": 0.3644, + "step": 2517000 + }, + { + "epoch": 17.03591922910351, + "grad_norm": 0.4155278205871582, + "learning_rate": 4.829640807708965e-05, + "loss": 0.3662, + "step": 2517500 + }, + { + "epoch": 17.039302728453876, + "grad_norm": 0.3764472007751465, + "learning_rate": 4.829606972715461e-05, + "loss": 0.3656, + "step": 2518000 + }, + { + "epoch": 17.042686227804243, + "grad_norm": 0.36024290323257446, + "learning_rate": 4.8295731377219574e-05, + "loss": 0.3648, + "step": 2518500 + }, + { + "epoch": 17.046069727154613, + "grad_norm": 0.37095972895622253, + "learning_rate": 4.829539302728454e-05, + "loss": 0.3655, + "step": 2519000 + }, + { + "epoch": 17.04945322650498, + "grad_norm": 0.36604204773902893, + "learning_rate": 4.8295054677349505e-05, + "loss": 0.3653, + "step": 2519500 + }, + { + "epoch": 17.05283672585535, + "grad_norm": 0.37407541275024414, + "learning_rate": 4.829471632741447e-05, + "loss": 0.3658, + "step": 2520000 + }, + { + "epoch": 17.056220225205717, + "grad_norm": 0.3428348898887634, + "learning_rate": 4.829437797747943e-05, + "loss": 0.3648, + "step": 2520500 + }, + { + "epoch": 17.059603724556084, + "grad_norm": 0.3618278205394745, + "learning_rate": 4.82940396275444e-05, + "loss": 0.3655, + "step": 2521000 + }, + { + "epoch": 17.062987223906454, + "grad_norm": 0.353392094373703, + "learning_rate": 4.8293701277609354e-05, + "loss": 0.3656, + "step": 2521500 + }, + { + "epoch": 17.06637072325682, + "grad_norm": 0.403253972530365, + "learning_rate": 4.8293362927674316e-05, + "loss": 0.3664, + "step": 2522000 + }, + { + "epoch": 17.06975422260719, + "grad_norm": 0.3712442219257355, + "learning_rate": 4.8293024577739285e-05, + "loss": 0.3659, + "step": 2522500 + }, + { + "epoch": 17.073137721957558, + "grad_norm": 0.36106806993484497, + "learning_rate": 4.829268622780425e-05, + "loss": 0.3656, + "step": 2523000 + }, + { + "epoch": 17.076521221307924, + "grad_norm": 0.3803187608718872, + "learning_rate": 4.829234787786921e-05, + "loss": 0.3661, + "step": 2523500 + }, + { + "epoch": 17.079904720658295, + "grad_norm": 0.3683875799179077, + "learning_rate": 4.829200952793417e-05, + "loss": 0.3663, + "step": 2524000 + }, + { + "epoch": 17.08328822000866, + "grad_norm": 0.379422128200531, + "learning_rate": 4.829167117799914e-05, + "loss": 0.3666, + "step": 2524500 + }, + { + "epoch": 17.08667171935903, + "grad_norm": 0.3635571300983429, + "learning_rate": 4.82913328280641e-05, + "loss": 0.3658, + "step": 2525000 + }, + { + "epoch": 17.0900552187094, + "grad_norm": 0.3490724265575409, + "learning_rate": 4.8290994478129064e-05, + "loss": 0.3658, + "step": 2525500 + }, + { + "epoch": 17.093438718059765, + "grad_norm": 0.37665268778800964, + "learning_rate": 4.8290656128194027e-05, + "loss": 0.3676, + "step": 2526000 + }, + { + "epoch": 17.096822217410136, + "grad_norm": 0.369052529335022, + "learning_rate": 4.829031777825899e-05, + "loss": 0.3675, + "step": 2526500 + }, + { + "epoch": 17.100205716760502, + "grad_norm": 0.34089598059654236, + "learning_rate": 4.828997942832395e-05, + "loss": 0.3674, + "step": 2527000 + }, + { + "epoch": 17.10358921611087, + "grad_norm": 0.3988218903541565, + "learning_rate": 4.828964107838891e-05, + "loss": 0.3665, + "step": 2527500 + }, + { + "epoch": 17.10697271546124, + "grad_norm": 0.3494698107242584, + "learning_rate": 4.8289302728453875e-05, + "loss": 0.3673, + "step": 2528000 + }, + { + "epoch": 17.110356214811606, + "grad_norm": 0.3407789468765259, + "learning_rate": 4.8288964378518844e-05, + "loss": 0.3663, + "step": 2528500 + }, + { + "epoch": 17.113739714161976, + "grad_norm": 0.3363821506500244, + "learning_rate": 4.8288626028583806e-05, + "loss": 0.3662, + "step": 2529000 + }, + { + "epoch": 17.117123213512343, + "grad_norm": 0.37021976709365845, + "learning_rate": 4.828828767864877e-05, + "loss": 0.3649, + "step": 2529500 + }, + { + "epoch": 17.12050671286271, + "grad_norm": 0.3477225601673126, + "learning_rate": 4.828794932871373e-05, + "loss": 0.3663, + "step": 2530000 + }, + { + "epoch": 17.12389021221308, + "grad_norm": 0.35830751061439514, + "learning_rate": 4.82876109787787e-05, + "loss": 0.3673, + "step": 2530500 + }, + { + "epoch": 17.127273711563447, + "grad_norm": 0.3866226077079773, + "learning_rate": 4.8287272628843655e-05, + "loss": 0.368, + "step": 2531000 + }, + { + "epoch": 17.130657210913814, + "grad_norm": 0.36625948548316956, + "learning_rate": 4.828693427890862e-05, + "loss": 0.3662, + "step": 2531500 + }, + { + "epoch": 17.134040710264184, + "grad_norm": 0.3588632047176361, + "learning_rate": 4.8286595928973586e-05, + "loss": 0.3661, + "step": 2532000 + }, + { + "epoch": 17.13742420961455, + "grad_norm": 0.36328765749931335, + "learning_rate": 4.828625757903855e-05, + "loss": 0.365, + "step": 2532500 + }, + { + "epoch": 17.14080770896492, + "grad_norm": 0.37686029076576233, + "learning_rate": 4.828591922910351e-05, + "loss": 0.368, + "step": 2533000 + }, + { + "epoch": 17.144191208315288, + "grad_norm": 0.3592396378517151, + "learning_rate": 4.828558087916847e-05, + "loss": 0.3661, + "step": 2533500 + }, + { + "epoch": 17.147574707665655, + "grad_norm": 0.35530906915664673, + "learning_rate": 4.828524252923344e-05, + "loss": 0.3674, + "step": 2534000 + }, + { + "epoch": 17.150958207016025, + "grad_norm": 0.39262640476226807, + "learning_rate": 4.82849041792984e-05, + "loss": 0.3652, + "step": 2534500 + }, + { + "epoch": 17.15434170636639, + "grad_norm": 0.3824649751186371, + "learning_rate": 4.8284565829363365e-05, + "loss": 0.366, + "step": 2535000 + }, + { + "epoch": 17.157725205716762, + "grad_norm": 0.3323943316936493, + "learning_rate": 4.828422747942833e-05, + "loss": 0.3656, + "step": 2535500 + }, + { + "epoch": 17.16110870506713, + "grad_norm": 0.3842216432094574, + "learning_rate": 4.828388912949329e-05, + "loss": 0.3655, + "step": 2536000 + }, + { + "epoch": 17.164492204417495, + "grad_norm": 0.4065781831741333, + "learning_rate": 4.828355077955825e-05, + "loss": 0.3659, + "step": 2536500 + }, + { + "epoch": 17.167875703767866, + "grad_norm": 0.35870862007141113, + "learning_rate": 4.8283212429623214e-05, + "loss": 0.3662, + "step": 2537000 + }, + { + "epoch": 17.171259203118233, + "grad_norm": 0.3968873620033264, + "learning_rate": 4.8282874079688176e-05, + "loss": 0.3658, + "step": 2537500 + }, + { + "epoch": 17.174642702468603, + "grad_norm": 0.35877475142478943, + "learning_rate": 4.8282535729753145e-05, + "loss": 0.3658, + "step": 2538000 + }, + { + "epoch": 17.17802620181897, + "grad_norm": 0.37040311098098755, + "learning_rate": 4.828219737981811e-05, + "loss": 0.3665, + "step": 2538500 + }, + { + "epoch": 17.181409701169336, + "grad_norm": 0.3944379985332489, + "learning_rate": 4.828185902988307e-05, + "loss": 0.3668, + "step": 2539000 + }, + { + "epoch": 17.184793200519707, + "grad_norm": 0.3358082175254822, + "learning_rate": 4.828152067994803e-05, + "loss": 0.3666, + "step": 2539500 + }, + { + "epoch": 17.188176699870073, + "grad_norm": 0.35811847448349, + "learning_rate": 4.8281182330013e-05, + "loss": 0.3667, + "step": 2540000 + }, + { + "epoch": 17.19156019922044, + "grad_norm": 0.3784216642379761, + "learning_rate": 4.8280843980077955e-05, + "loss": 0.3668, + "step": 2540500 + }, + { + "epoch": 17.19494369857081, + "grad_norm": 0.3526269495487213, + "learning_rate": 4.828050563014292e-05, + "loss": 0.3677, + "step": 2541000 + }, + { + "epoch": 17.198327197921177, + "grad_norm": 0.36154642701148987, + "learning_rate": 4.8280167280207886e-05, + "loss": 0.366, + "step": 2541500 + }, + { + "epoch": 17.201710697271547, + "grad_norm": 0.3573142886161804, + "learning_rate": 4.827982893027285e-05, + "loss": 0.3664, + "step": 2542000 + }, + { + "epoch": 17.205094196621914, + "grad_norm": 0.3573852777481079, + "learning_rate": 4.827949058033781e-05, + "loss": 0.3658, + "step": 2542500 + }, + { + "epoch": 17.20847769597228, + "grad_norm": 0.3711315393447876, + "learning_rate": 4.827915223040277e-05, + "loss": 0.3673, + "step": 2543000 + }, + { + "epoch": 17.21186119532265, + "grad_norm": 0.35421717166900635, + "learning_rate": 4.827881388046774e-05, + "loss": 0.3667, + "step": 2543500 + }, + { + "epoch": 17.215244694673018, + "grad_norm": 0.3692740499973297, + "learning_rate": 4.8278475530532704e-05, + "loss": 0.3665, + "step": 2544000 + }, + { + "epoch": 17.21862819402339, + "grad_norm": 0.33599853515625, + "learning_rate": 4.8278137180597666e-05, + "loss": 0.367, + "step": 2544500 + }, + { + "epoch": 17.222011693373755, + "grad_norm": 0.356893390417099, + "learning_rate": 4.827779883066263e-05, + "loss": 0.3671, + "step": 2545000 + }, + { + "epoch": 17.225395192724122, + "grad_norm": 0.33157333731651306, + "learning_rate": 4.827746048072759e-05, + "loss": 0.3675, + "step": 2545500 + }, + { + "epoch": 17.228778692074492, + "grad_norm": 0.3692445456981659, + "learning_rate": 4.827712213079255e-05, + "loss": 0.366, + "step": 2546000 + }, + { + "epoch": 17.23216219142486, + "grad_norm": 0.35876113176345825, + "learning_rate": 4.8276783780857514e-05, + "loss": 0.3665, + "step": 2546500 + }, + { + "epoch": 17.23554569077523, + "grad_norm": 0.37952202558517456, + "learning_rate": 4.8276445430922476e-05, + "loss": 0.3661, + "step": 2547000 + }, + { + "epoch": 17.238929190125596, + "grad_norm": 0.34852099418640137, + "learning_rate": 4.8276107080987445e-05, + "loss": 0.3665, + "step": 2547500 + }, + { + "epoch": 17.242312689475963, + "grad_norm": 0.32389768958091736, + "learning_rate": 4.827576873105241e-05, + "loss": 0.367, + "step": 2548000 + }, + { + "epoch": 17.245696188826333, + "grad_norm": 0.36108294129371643, + "learning_rate": 4.827543038111737e-05, + "loss": 0.3679, + "step": 2548500 + }, + { + "epoch": 17.2490796881767, + "grad_norm": 0.3562123775482178, + "learning_rate": 4.827509203118233e-05, + "loss": 0.3675, + "step": 2549000 + }, + { + "epoch": 17.252463187527066, + "grad_norm": 0.33895066380500793, + "learning_rate": 4.82747536812473e-05, + "loss": 0.3666, + "step": 2549500 + }, + { + "epoch": 17.255846686877437, + "grad_norm": 0.39131903648376465, + "learning_rate": 4.8274415331312256e-05, + "loss": 0.366, + "step": 2550000 + }, + { + "epoch": 17.259230186227803, + "grad_norm": 0.3654680848121643, + "learning_rate": 4.827407698137722e-05, + "loss": 0.3667, + "step": 2550500 + }, + { + "epoch": 17.262613685578174, + "grad_norm": 0.35606712102890015, + "learning_rate": 4.827373863144219e-05, + "loss": 0.3661, + "step": 2551000 + }, + { + "epoch": 17.26599718492854, + "grad_norm": 0.38734468817710876, + "learning_rate": 4.827340028150715e-05, + "loss": 0.3663, + "step": 2551500 + }, + { + "epoch": 17.269380684278907, + "grad_norm": 0.35687586665153503, + "learning_rate": 4.827306193157211e-05, + "loss": 0.3674, + "step": 2552000 + }, + { + "epoch": 17.272764183629278, + "grad_norm": 0.3550911843776703, + "learning_rate": 4.827272358163707e-05, + "loss": 0.368, + "step": 2552500 + }, + { + "epoch": 17.276147682979644, + "grad_norm": 0.3244481384754181, + "learning_rate": 4.8272385231702035e-05, + "loss": 0.3672, + "step": 2553000 + }, + { + "epoch": 17.279531182330015, + "grad_norm": 0.35181885957717896, + "learning_rate": 4.8272046881767004e-05, + "loss": 0.3679, + "step": 2553500 + }, + { + "epoch": 17.28291468168038, + "grad_norm": 0.37776389718055725, + "learning_rate": 4.8271708531831966e-05, + "loss": 0.3679, + "step": 2554000 + }, + { + "epoch": 17.286298181030748, + "grad_norm": 0.36275383830070496, + "learning_rate": 4.827137018189693e-05, + "loss": 0.3667, + "step": 2554500 + }, + { + "epoch": 17.28968168038112, + "grad_norm": 0.3706962466239929, + "learning_rate": 4.827103183196189e-05, + "loss": 0.3683, + "step": 2555000 + }, + { + "epoch": 17.293065179731485, + "grad_norm": 0.35722899436950684, + "learning_rate": 4.827069348202685e-05, + "loss": 0.367, + "step": 2555500 + }, + { + "epoch": 17.296448679081852, + "grad_norm": 0.3421380817890167, + "learning_rate": 4.8270355132091815e-05, + "loss": 0.3664, + "step": 2556000 + }, + { + "epoch": 17.299832178432222, + "grad_norm": 0.3910340666770935, + "learning_rate": 4.827001678215678e-05, + "loss": 0.3669, + "step": 2556500 + }, + { + "epoch": 17.30321567778259, + "grad_norm": 0.34034547209739685, + "learning_rate": 4.8269678432221746e-05, + "loss": 0.3681, + "step": 2557000 + }, + { + "epoch": 17.30659917713296, + "grad_norm": 0.36404237151145935, + "learning_rate": 4.826934008228671e-05, + "loss": 0.3671, + "step": 2557500 + }, + { + "epoch": 17.309982676483326, + "grad_norm": 0.3977389931678772, + "learning_rate": 4.826900173235167e-05, + "loss": 0.3665, + "step": 2558000 + }, + { + "epoch": 17.313366175833693, + "grad_norm": 0.34738966822624207, + "learning_rate": 4.826866338241663e-05, + "loss": 0.3667, + "step": 2558500 + }, + { + "epoch": 17.316749675184063, + "grad_norm": 0.34294068813323975, + "learning_rate": 4.82683250324816e-05, + "loss": 0.3668, + "step": 2559000 + }, + { + "epoch": 17.32013317453443, + "grad_norm": 0.38207390904426575, + "learning_rate": 4.8267986682546556e-05, + "loss": 0.3667, + "step": 2559500 + }, + { + "epoch": 17.3235166738848, + "grad_norm": 0.36952969431877136, + "learning_rate": 4.826764833261152e-05, + "loss": 0.3659, + "step": 2560000 + }, + { + "epoch": 17.326900173235167, + "grad_norm": 0.4127836525440216, + "learning_rate": 4.826730998267649e-05, + "loss": 0.3674, + "step": 2560500 + }, + { + "epoch": 17.330283672585534, + "grad_norm": 0.35767117142677307, + "learning_rate": 4.826697163274145e-05, + "loss": 0.366, + "step": 2561000 + }, + { + "epoch": 17.333667171935904, + "grad_norm": 0.3588179051876068, + "learning_rate": 4.826663328280641e-05, + "loss": 0.3661, + "step": 2561500 + }, + { + "epoch": 17.33705067128627, + "grad_norm": 0.3567115068435669, + "learning_rate": 4.8266294932871374e-05, + "loss": 0.3669, + "step": 2562000 + }, + { + "epoch": 17.34043417063664, + "grad_norm": 0.3763863742351532, + "learning_rate": 4.8265956582936336e-05, + "loss": 0.3652, + "step": 2562500 + }, + { + "epoch": 17.343817669987008, + "grad_norm": 0.4155043661594391, + "learning_rate": 4.8265618233001305e-05, + "loss": 0.3667, + "step": 2563000 + }, + { + "epoch": 17.347201169337374, + "grad_norm": 0.3805868625640869, + "learning_rate": 4.826527988306627e-05, + "loss": 0.3666, + "step": 2563500 + }, + { + "epoch": 17.350584668687745, + "grad_norm": 0.36847659945487976, + "learning_rate": 4.826494153313123e-05, + "loss": 0.3671, + "step": 2564000 + }, + { + "epoch": 17.35396816803811, + "grad_norm": 0.36805492639541626, + "learning_rate": 4.826460318319619e-05, + "loss": 0.3665, + "step": 2564500 + }, + { + "epoch": 17.35735166738848, + "grad_norm": 0.3310345411300659, + "learning_rate": 4.826426483326115e-05, + "loss": 0.3666, + "step": 2565000 + }, + { + "epoch": 17.36073516673885, + "grad_norm": 0.38131046295166016, + "learning_rate": 4.8263926483326115e-05, + "loss": 0.3673, + "step": 2565500 + }, + { + "epoch": 17.364118666089215, + "grad_norm": 0.3580634891986847, + "learning_rate": 4.826358813339108e-05, + "loss": 0.3671, + "step": 2566000 + }, + { + "epoch": 17.367502165439586, + "grad_norm": 0.3496025502681732, + "learning_rate": 4.8263249783456046e-05, + "loss": 0.3669, + "step": 2566500 + }, + { + "epoch": 17.370885664789952, + "grad_norm": 0.3704014718532562, + "learning_rate": 4.826291143352101e-05, + "loss": 0.3651, + "step": 2567000 + }, + { + "epoch": 17.37426916414032, + "grad_norm": 0.37614890933036804, + "learning_rate": 4.826257308358597e-05, + "loss": 0.3672, + "step": 2567500 + }, + { + "epoch": 17.37765266349069, + "grad_norm": 0.37118321657180786, + "learning_rate": 4.826223473365093e-05, + "loss": 0.3661, + "step": 2568000 + }, + { + "epoch": 17.381036162841056, + "grad_norm": 0.35460785031318665, + "learning_rate": 4.82618963837159e-05, + "loss": 0.3661, + "step": 2568500 + }, + { + "epoch": 17.384419662191426, + "grad_norm": 0.351251482963562, + "learning_rate": 4.8261558033780864e-05, + "loss": 0.3674, + "step": 2569000 + }, + { + "epoch": 17.387803161541793, + "grad_norm": 0.3240748345851898, + "learning_rate": 4.826121968384582e-05, + "loss": 0.3655, + "step": 2569500 + }, + { + "epoch": 17.39118666089216, + "grad_norm": 0.3724314272403717, + "learning_rate": 4.826088133391078e-05, + "loss": 0.3679, + "step": 2570000 + }, + { + "epoch": 17.39457016024253, + "grad_norm": 0.3763544261455536, + "learning_rate": 4.826054298397575e-05, + "loss": 0.3663, + "step": 2570500 + }, + { + "epoch": 17.397953659592897, + "grad_norm": 0.42186710238456726, + "learning_rate": 4.826020463404071e-05, + "loss": 0.3676, + "step": 2571000 + }, + { + "epoch": 17.401337158943264, + "grad_norm": 0.32255861163139343, + "learning_rate": 4.8259866284105674e-05, + "loss": 0.367, + "step": 2571500 + }, + { + "epoch": 17.404720658293634, + "grad_norm": 0.35966476798057556, + "learning_rate": 4.8259527934170637e-05, + "loss": 0.3661, + "step": 2572000 + }, + { + "epoch": 17.408104157644, + "grad_norm": 0.39646032452583313, + "learning_rate": 4.8259189584235605e-05, + "loss": 0.3663, + "step": 2572500 + }, + { + "epoch": 17.41148765699437, + "grad_norm": 0.3326699435710907, + "learning_rate": 4.825885123430057e-05, + "loss": 0.367, + "step": 2573000 + }, + { + "epoch": 17.414871156344738, + "grad_norm": 0.34483450651168823, + "learning_rate": 4.825851288436553e-05, + "loss": 0.367, + "step": 2573500 + }, + { + "epoch": 17.418254655695105, + "grad_norm": 0.3655441403388977, + "learning_rate": 4.825817453443049e-05, + "loss": 0.3675, + "step": 2574000 + }, + { + "epoch": 17.421638155045475, + "grad_norm": 0.32843220233917236, + "learning_rate": 4.8257836184495454e-05, + "loss": 0.3682, + "step": 2574500 + }, + { + "epoch": 17.42502165439584, + "grad_norm": 0.3860529661178589, + "learning_rate": 4.8257497834560416e-05, + "loss": 0.367, + "step": 2575000 + }, + { + "epoch": 17.428405153746212, + "grad_norm": 0.3711206614971161, + "learning_rate": 4.825715948462538e-05, + "loss": 0.3666, + "step": 2575500 + }, + { + "epoch": 17.43178865309658, + "grad_norm": 0.3526160418987274, + "learning_rate": 4.825682113469035e-05, + "loss": 0.3671, + "step": 2576000 + }, + { + "epoch": 17.435172152446945, + "grad_norm": 0.39998453855514526, + "learning_rate": 4.825648278475531e-05, + "loss": 0.3671, + "step": 2576500 + }, + { + "epoch": 17.438555651797316, + "grad_norm": 0.3498559296131134, + "learning_rate": 4.825614443482027e-05, + "loss": 0.3676, + "step": 2577000 + }, + { + "epoch": 17.441939151147682, + "grad_norm": 0.37197813391685486, + "learning_rate": 4.8255806084885233e-05, + "loss": 0.3663, + "step": 2577500 + }, + { + "epoch": 17.445322650498053, + "grad_norm": 0.3847373127937317, + "learning_rate": 4.82554677349502e-05, + "loss": 0.367, + "step": 2578000 + }, + { + "epoch": 17.44870614984842, + "grad_norm": 0.4149755835533142, + "learning_rate": 4.8255129385015164e-05, + "loss": 0.3673, + "step": 2578500 + }, + { + "epoch": 17.452089649198786, + "grad_norm": 0.37201008200645447, + "learning_rate": 4.825479103508012e-05, + "loss": 0.3656, + "step": 2579000 + }, + { + "epoch": 17.455473148549157, + "grad_norm": 0.3404081165790558, + "learning_rate": 4.825445268514508e-05, + "loss": 0.3684, + "step": 2579500 + }, + { + "epoch": 17.458856647899523, + "grad_norm": 0.3775407671928406, + "learning_rate": 4.825411433521005e-05, + "loss": 0.3671, + "step": 2580000 + }, + { + "epoch": 17.46224014724989, + "grad_norm": 0.316976934671402, + "learning_rate": 4.825377598527501e-05, + "loss": 0.3654, + "step": 2580500 + }, + { + "epoch": 17.46562364660026, + "grad_norm": 0.3777703046798706, + "learning_rate": 4.8253437635339975e-05, + "loss": 0.3667, + "step": 2581000 + }, + { + "epoch": 17.469007145950627, + "grad_norm": 0.34554705023765564, + "learning_rate": 4.825309928540494e-05, + "loss": 0.3679, + "step": 2581500 + }, + { + "epoch": 17.472390645300997, + "grad_norm": 0.36104902625083923, + "learning_rate": 4.8252760935469906e-05, + "loss": 0.3671, + "step": 2582000 + }, + { + "epoch": 17.475774144651364, + "grad_norm": 0.3420993983745575, + "learning_rate": 4.825242258553487e-05, + "loss": 0.3663, + "step": 2582500 + }, + { + "epoch": 17.47915764400173, + "grad_norm": 0.3704248368740082, + "learning_rate": 4.825208423559983e-05, + "loss": 0.3674, + "step": 2583000 + }, + { + "epoch": 17.4825411433521, + "grad_norm": 0.36432328820228577, + "learning_rate": 4.825174588566479e-05, + "loss": 0.3676, + "step": 2583500 + }, + { + "epoch": 17.485924642702468, + "grad_norm": 0.3504055440425873, + "learning_rate": 4.8251407535729755e-05, + "loss": 0.3682, + "step": 2584000 + }, + { + "epoch": 17.48930814205284, + "grad_norm": 0.37538814544677734, + "learning_rate": 4.825106918579472e-05, + "loss": 0.3659, + "step": 2584500 + }, + { + "epoch": 17.492691641403205, + "grad_norm": 0.350363552570343, + "learning_rate": 4.825073083585968e-05, + "loss": 0.3682, + "step": 2585000 + }, + { + "epoch": 17.49607514075357, + "grad_norm": 0.3279978930950165, + "learning_rate": 4.825039248592465e-05, + "loss": 0.3666, + "step": 2585500 + }, + { + "epoch": 17.499458640103942, + "grad_norm": 0.37351611256599426, + "learning_rate": 4.825005413598961e-05, + "loss": 0.3669, + "step": 2586000 + }, + { + "epoch": 17.50284213945431, + "grad_norm": 0.3720766305923462, + "learning_rate": 4.824971578605457e-05, + "loss": 0.3674, + "step": 2586500 + }, + { + "epoch": 17.50622563880468, + "grad_norm": 0.38092654943466187, + "learning_rate": 4.8249377436119534e-05, + "loss": 0.3669, + "step": 2587000 + }, + { + "epoch": 17.509609138155046, + "grad_norm": 0.3415418267250061, + "learning_rate": 4.82490390861845e-05, + "loss": 0.3662, + "step": 2587500 + }, + { + "epoch": 17.512992637505413, + "grad_norm": 0.35386013984680176, + "learning_rate": 4.8248700736249465e-05, + "loss": 0.3678, + "step": 2588000 + }, + { + "epoch": 17.516376136855783, + "grad_norm": 0.37551605701446533, + "learning_rate": 4.824836238631442e-05, + "loss": 0.3669, + "step": 2588500 + }, + { + "epoch": 17.51975963620615, + "grad_norm": 0.36551395058631897, + "learning_rate": 4.824802403637938e-05, + "loss": 0.3674, + "step": 2589000 + }, + { + "epoch": 17.523143135556516, + "grad_norm": 0.3686535656452179, + "learning_rate": 4.824768568644435e-05, + "loss": 0.3673, + "step": 2589500 + }, + { + "epoch": 17.526526634906887, + "grad_norm": 0.4045044779777527, + "learning_rate": 4.8247347336509314e-05, + "loss": 0.3673, + "step": 2590000 + }, + { + "epoch": 17.529910134257253, + "grad_norm": 0.3256719708442688, + "learning_rate": 4.8247008986574276e-05, + "loss": 0.3662, + "step": 2590500 + }, + { + "epoch": 17.533293633607624, + "grad_norm": 0.3593628406524658, + "learning_rate": 4.824667063663924e-05, + "loss": 0.3675, + "step": 2591000 + }, + { + "epoch": 17.53667713295799, + "grad_norm": 0.3674556612968445, + "learning_rate": 4.824633228670421e-05, + "loss": 0.366, + "step": 2591500 + }, + { + "epoch": 17.540060632308357, + "grad_norm": 0.39123180508613586, + "learning_rate": 4.824599393676917e-05, + "loss": 0.3659, + "step": 2592000 + }, + { + "epoch": 17.543444131658728, + "grad_norm": 0.36383578181266785, + "learning_rate": 4.824565558683413e-05, + "loss": 0.3674, + "step": 2592500 + }, + { + "epoch": 17.546827631009094, + "grad_norm": 0.3695049583911896, + "learning_rate": 4.824531723689909e-05, + "loss": 0.3666, + "step": 2593000 + }, + { + "epoch": 17.550211130359465, + "grad_norm": 0.38377198576927185, + "learning_rate": 4.8244978886964055e-05, + "loss": 0.368, + "step": 2593500 + }, + { + "epoch": 17.55359462970983, + "grad_norm": 0.35402801632881165, + "learning_rate": 4.824464053702902e-05, + "loss": 0.366, + "step": 2594000 + }, + { + "epoch": 17.556978129060198, + "grad_norm": 0.3696456253528595, + "learning_rate": 4.824430218709398e-05, + "loss": 0.3675, + "step": 2594500 + }, + { + "epoch": 17.56036162841057, + "grad_norm": 0.3580005168914795, + "learning_rate": 4.824396383715895e-05, + "loss": 0.3653, + "step": 2595000 + }, + { + "epoch": 17.563745127760935, + "grad_norm": 0.3534056842327118, + "learning_rate": 4.824362548722391e-05, + "loss": 0.3658, + "step": 2595500 + }, + { + "epoch": 17.567128627111302, + "grad_norm": 0.3255242705345154, + "learning_rate": 4.824328713728887e-05, + "loss": 0.3667, + "step": 2596000 + }, + { + "epoch": 17.570512126461672, + "grad_norm": 0.3532315194606781, + "learning_rate": 4.8242948787353835e-05, + "loss": 0.3673, + "step": 2596500 + }, + { + "epoch": 17.57389562581204, + "grad_norm": 0.37919074296951294, + "learning_rate": 4.8242610437418804e-05, + "loss": 0.3668, + "step": 2597000 + }, + { + "epoch": 17.57727912516241, + "grad_norm": 0.36233749985694885, + "learning_rate": 4.8242272087483766e-05, + "loss": 0.3649, + "step": 2597500 + }, + { + "epoch": 17.580662624512776, + "grad_norm": 0.3300325572490692, + "learning_rate": 4.824193373754872e-05, + "loss": 0.3679, + "step": 2598000 + }, + { + "epoch": 17.584046123863143, + "grad_norm": 0.3588019609451294, + "learning_rate": 4.824159538761368e-05, + "loss": 0.3661, + "step": 2598500 + }, + { + "epoch": 17.587429623213513, + "grad_norm": 0.3422711193561554, + "learning_rate": 4.824125703767865e-05, + "loss": 0.3669, + "step": 2599000 + }, + { + "epoch": 17.59081312256388, + "grad_norm": 0.4143356680870056, + "learning_rate": 4.8240918687743614e-05, + "loss": 0.3669, + "step": 2599500 + }, + { + "epoch": 17.59419662191425, + "grad_norm": 0.36779430508613586, + "learning_rate": 4.8240580337808576e-05, + "loss": 0.3681, + "step": 2600000 + }, + { + "epoch": 17.597580121264617, + "grad_norm": 0.33779770135879517, + "learning_rate": 4.824024198787354e-05, + "loss": 0.3686, + "step": 2600500 + }, + { + "epoch": 17.600963620614984, + "grad_norm": 0.3325331509113312, + "learning_rate": 4.823990363793851e-05, + "loss": 0.3663, + "step": 2601000 + }, + { + "epoch": 17.604347119965354, + "grad_norm": 0.34794822335243225, + "learning_rate": 4.823956528800347e-05, + "loss": 0.3684, + "step": 2601500 + }, + { + "epoch": 17.60773061931572, + "grad_norm": 0.3359128534793854, + "learning_rate": 4.823922693806843e-05, + "loss": 0.3673, + "step": 2602000 + }, + { + "epoch": 17.61111411866609, + "grad_norm": 0.3586720824241638, + "learning_rate": 4.8238888588133394e-05, + "loss": 0.3658, + "step": 2602500 + }, + { + "epoch": 17.614497618016458, + "grad_norm": 0.3466437757015228, + "learning_rate": 4.8238550238198356e-05, + "loss": 0.3666, + "step": 2603000 + }, + { + "epoch": 17.617881117366824, + "grad_norm": 0.3655886948108673, + "learning_rate": 4.823821188826332e-05, + "loss": 0.3676, + "step": 2603500 + }, + { + "epoch": 17.621264616717195, + "grad_norm": 0.4002399146556854, + "learning_rate": 4.823787353832828e-05, + "loss": 0.3677, + "step": 2604000 + }, + { + "epoch": 17.62464811606756, + "grad_norm": 0.4021463096141815, + "learning_rate": 4.823753518839325e-05, + "loss": 0.3668, + "step": 2604500 + }, + { + "epoch": 17.628031615417928, + "grad_norm": 0.34503015875816345, + "learning_rate": 4.823719683845821e-05, + "loss": 0.3688, + "step": 2605000 + }, + { + "epoch": 17.6314151147683, + "grad_norm": 0.35058820247650146, + "learning_rate": 4.823685848852317e-05, + "loss": 0.3682, + "step": 2605500 + }, + { + "epoch": 17.634798614118665, + "grad_norm": 0.36215049028396606, + "learning_rate": 4.8236520138588135e-05, + "loss": 0.3677, + "step": 2606000 + }, + { + "epoch": 17.638182113469036, + "grad_norm": 0.3813348114490509, + "learning_rate": 4.8236181788653104e-05, + "loss": 0.3666, + "step": 2606500 + }, + { + "epoch": 17.641565612819402, + "grad_norm": 0.3682672679424286, + "learning_rate": 4.8235843438718066e-05, + "loss": 0.3684, + "step": 2607000 + }, + { + "epoch": 17.64494911216977, + "grad_norm": 0.39241448044776917, + "learning_rate": 4.823550508878302e-05, + "loss": 0.3652, + "step": 2607500 + }, + { + "epoch": 17.64833261152014, + "grad_norm": 0.32166317105293274, + "learning_rate": 4.8235166738847984e-05, + "loss": 0.3687, + "step": 2608000 + }, + { + "epoch": 17.651716110870506, + "grad_norm": 0.3621087074279785, + "learning_rate": 4.823482838891295e-05, + "loss": 0.3664, + "step": 2608500 + }, + { + "epoch": 17.655099610220876, + "grad_norm": 0.3711774945259094, + "learning_rate": 4.8234490038977915e-05, + "loss": 0.3659, + "step": 2609000 + }, + { + "epoch": 17.658483109571243, + "grad_norm": 0.3362772762775421, + "learning_rate": 4.823415168904288e-05, + "loss": 0.3681, + "step": 2609500 + }, + { + "epoch": 17.66186660892161, + "grad_norm": 0.34222641587257385, + "learning_rate": 4.823381333910784e-05, + "loss": 0.3666, + "step": 2610000 + }, + { + "epoch": 17.66525010827198, + "grad_norm": 0.34946516156196594, + "learning_rate": 4.823347498917281e-05, + "loss": 0.3678, + "step": 2610500 + }, + { + "epoch": 17.668633607622347, + "grad_norm": 0.3598966598510742, + "learning_rate": 4.823313663923777e-05, + "loss": 0.3682, + "step": 2611000 + }, + { + "epoch": 17.672017106972717, + "grad_norm": 0.39595934748649597, + "learning_rate": 4.823279828930273e-05, + "loss": 0.3652, + "step": 2611500 + }, + { + "epoch": 17.675400606323084, + "grad_norm": 0.38074570894241333, + "learning_rate": 4.8232459939367694e-05, + "loss": 0.3651, + "step": 2612000 + }, + { + "epoch": 17.67878410567345, + "grad_norm": 0.37233513593673706, + "learning_rate": 4.8232121589432656e-05, + "loss": 0.3662, + "step": 2612500 + }, + { + "epoch": 17.68216760502382, + "grad_norm": 0.33832278847694397, + "learning_rate": 4.823178323949762e-05, + "loss": 0.3648, + "step": 2613000 + }, + { + "epoch": 17.685551104374188, + "grad_norm": 0.34705302119255066, + "learning_rate": 4.823144488956258e-05, + "loss": 0.3664, + "step": 2613500 + }, + { + "epoch": 17.688934603724555, + "grad_norm": 0.3784879148006439, + "learning_rate": 4.823110653962755e-05, + "loss": 0.3674, + "step": 2614000 + }, + { + "epoch": 17.692318103074925, + "grad_norm": 0.4426981210708618, + "learning_rate": 4.823076818969251e-05, + "loss": 0.3671, + "step": 2614500 + }, + { + "epoch": 17.69570160242529, + "grad_norm": 0.3958585560321808, + "learning_rate": 4.8230429839757474e-05, + "loss": 0.3674, + "step": 2615000 + }, + { + "epoch": 17.699085101775662, + "grad_norm": 0.3677558898925781, + "learning_rate": 4.8230091489822436e-05, + "loss": 0.3658, + "step": 2615500 + }, + { + "epoch": 17.70246860112603, + "grad_norm": 0.35928767919540405, + "learning_rate": 4.82297531398874e-05, + "loss": 0.3667, + "step": 2616000 + }, + { + "epoch": 17.705852100476395, + "grad_norm": 0.3817287087440491, + "learning_rate": 4.822941478995237e-05, + "loss": 0.3664, + "step": 2616500 + }, + { + "epoch": 17.709235599826766, + "grad_norm": 0.3287027180194855, + "learning_rate": 4.822907644001732e-05, + "loss": 0.3669, + "step": 2617000 + }, + { + "epoch": 17.712619099177132, + "grad_norm": 0.34923413395881653, + "learning_rate": 4.8228738090082284e-05, + "loss": 0.3657, + "step": 2617500 + }, + { + "epoch": 17.716002598527503, + "grad_norm": 0.3792766332626343, + "learning_rate": 4.822839974014725e-05, + "loss": 0.3679, + "step": 2618000 + }, + { + "epoch": 17.71938609787787, + "grad_norm": 0.3916323482990265, + "learning_rate": 4.8228061390212215e-05, + "loss": 0.3668, + "step": 2618500 + }, + { + "epoch": 17.722769597228236, + "grad_norm": 0.3880055248737335, + "learning_rate": 4.822772304027718e-05, + "loss": 0.3677, + "step": 2619000 + }, + { + "epoch": 17.726153096578606, + "grad_norm": 0.3902827799320221, + "learning_rate": 4.822738469034214e-05, + "loss": 0.3646, + "step": 2619500 + }, + { + "epoch": 17.729536595928973, + "grad_norm": 0.352867066860199, + "learning_rate": 4.822704634040711e-05, + "loss": 0.3683, + "step": 2620000 + }, + { + "epoch": 17.73292009527934, + "grad_norm": 0.37308165431022644, + "learning_rate": 4.822670799047207e-05, + "loss": 0.3675, + "step": 2620500 + }, + { + "epoch": 17.73630359462971, + "grad_norm": 0.3646025061607361, + "learning_rate": 4.822636964053703e-05, + "loss": 0.3675, + "step": 2621000 + }, + { + "epoch": 17.739687093980077, + "grad_norm": 0.3406023383140564, + "learning_rate": 4.8226031290601995e-05, + "loss": 0.3677, + "step": 2621500 + }, + { + "epoch": 17.743070593330447, + "grad_norm": 0.37065035104751587, + "learning_rate": 4.822569294066696e-05, + "loss": 0.3666, + "step": 2622000 + }, + { + "epoch": 17.746454092680814, + "grad_norm": 0.32739582657814026, + "learning_rate": 4.822535459073192e-05, + "loss": 0.3655, + "step": 2622500 + }, + { + "epoch": 17.74983759203118, + "grad_norm": 0.3511997163295746, + "learning_rate": 4.822501624079688e-05, + "loss": 0.3664, + "step": 2623000 + }, + { + "epoch": 17.75322109138155, + "grad_norm": 0.3391745090484619, + "learning_rate": 4.8224677890861843e-05, + "loss": 0.3678, + "step": 2623500 + }, + { + "epoch": 17.756604590731918, + "grad_norm": 0.2988252341747284, + "learning_rate": 4.822433954092681e-05, + "loss": 0.3673, + "step": 2624000 + }, + { + "epoch": 17.759988090082288, + "grad_norm": 0.36709460616111755, + "learning_rate": 4.8224001190991774e-05, + "loss": 0.3661, + "step": 2624500 + }, + { + "epoch": 17.763371589432655, + "grad_norm": 0.36869344115257263, + "learning_rate": 4.8223662841056737e-05, + "loss": 0.3672, + "step": 2625000 + }, + { + "epoch": 17.76675508878302, + "grad_norm": 0.3532327711582184, + "learning_rate": 4.82233244911217e-05, + "loss": 0.3677, + "step": 2625500 + }, + { + "epoch": 17.770138588133392, + "grad_norm": 0.36017414927482605, + "learning_rate": 4.822298614118667e-05, + "loss": 0.3687, + "step": 2626000 + }, + { + "epoch": 17.77352208748376, + "grad_norm": 0.3668579161167145, + "learning_rate": 4.822264779125162e-05, + "loss": 0.3665, + "step": 2626500 + }, + { + "epoch": 17.77690558683413, + "grad_norm": 0.3817662000656128, + "learning_rate": 4.8222309441316585e-05, + "loss": 0.3671, + "step": 2627000 + }, + { + "epoch": 17.780289086184496, + "grad_norm": 0.3410915434360504, + "learning_rate": 4.8221971091381554e-05, + "loss": 0.3671, + "step": 2627500 + }, + { + "epoch": 17.783672585534863, + "grad_norm": 0.36917775869369507, + "learning_rate": 4.8221632741446516e-05, + "loss": 0.3665, + "step": 2628000 + }, + { + "epoch": 17.787056084885233, + "grad_norm": 0.3521798253059387, + "learning_rate": 4.822129439151148e-05, + "loss": 0.3673, + "step": 2628500 + }, + { + "epoch": 17.7904395842356, + "grad_norm": 0.35278838872909546, + "learning_rate": 4.822095604157644e-05, + "loss": 0.3668, + "step": 2629000 + }, + { + "epoch": 17.793823083585966, + "grad_norm": 0.3609314560890198, + "learning_rate": 4.822061769164141e-05, + "loss": 0.3674, + "step": 2629500 + }, + { + "epoch": 17.797206582936337, + "grad_norm": 0.3256959021091461, + "learning_rate": 4.822027934170637e-05, + "loss": 0.367, + "step": 2630000 + }, + { + "epoch": 17.800590082286703, + "grad_norm": 0.3367217779159546, + "learning_rate": 4.8219940991771333e-05, + "loss": 0.3672, + "step": 2630500 + }, + { + "epoch": 17.803973581637074, + "grad_norm": 0.35630953311920166, + "learning_rate": 4.8219602641836296e-05, + "loss": 0.368, + "step": 2631000 + }, + { + "epoch": 17.80735708098744, + "grad_norm": 0.3785656690597534, + "learning_rate": 4.821926429190126e-05, + "loss": 0.3672, + "step": 2631500 + }, + { + "epoch": 17.810740580337807, + "grad_norm": 0.40297931432724, + "learning_rate": 4.821892594196622e-05, + "loss": 0.3675, + "step": 2632000 + }, + { + "epoch": 17.814124079688177, + "grad_norm": 0.39543208479881287, + "learning_rate": 4.821858759203118e-05, + "loss": 0.367, + "step": 2632500 + }, + { + "epoch": 17.817507579038544, + "grad_norm": 0.3973096013069153, + "learning_rate": 4.8218249242096144e-05, + "loss": 0.3665, + "step": 2633000 + }, + { + "epoch": 17.820891078388915, + "grad_norm": 0.3314948081970215, + "learning_rate": 4.821791089216111e-05, + "loss": 0.3676, + "step": 2633500 + }, + { + "epoch": 17.82427457773928, + "grad_norm": 0.34036216139793396, + "learning_rate": 4.8217572542226075e-05, + "loss": 0.3666, + "step": 2634000 + }, + { + "epoch": 17.827658077089648, + "grad_norm": 0.3349308669567108, + "learning_rate": 4.821723419229104e-05, + "loss": 0.3656, + "step": 2634500 + }, + { + "epoch": 17.83104157644002, + "grad_norm": 0.38641485571861267, + "learning_rate": 4.8216895842356e-05, + "loss": 0.3682, + "step": 2635000 + }, + { + "epoch": 17.834425075790385, + "grad_norm": 0.3417682349681854, + "learning_rate": 4.821655749242097e-05, + "loss": 0.3677, + "step": 2635500 + }, + { + "epoch": 17.837808575140755, + "grad_norm": 0.37354832887649536, + "learning_rate": 4.8216219142485924e-05, + "loss": 0.3661, + "step": 2636000 + }, + { + "epoch": 17.841192074491122, + "grad_norm": 0.3448849022388458, + "learning_rate": 4.8215880792550886e-05, + "loss": 0.3676, + "step": 2636500 + }, + { + "epoch": 17.84457557384149, + "grad_norm": 0.3836000859737396, + "learning_rate": 4.8215542442615855e-05, + "loss": 0.3661, + "step": 2637000 + }, + { + "epoch": 17.84795907319186, + "grad_norm": 0.3253054618835449, + "learning_rate": 4.821520409268082e-05, + "loss": 0.3653, + "step": 2637500 + }, + { + "epoch": 17.851342572542226, + "grad_norm": 0.3134402334690094, + "learning_rate": 4.821486574274578e-05, + "loss": 0.3652, + "step": 2638000 + }, + { + "epoch": 17.854726071892593, + "grad_norm": 0.3837721049785614, + "learning_rate": 4.821452739281074e-05, + "loss": 0.3662, + "step": 2638500 + }, + { + "epoch": 17.858109571242963, + "grad_norm": 0.36032843589782715, + "learning_rate": 4.821418904287571e-05, + "loss": 0.3664, + "step": 2639000 + }, + { + "epoch": 17.86149307059333, + "grad_norm": 0.3683348298072815, + "learning_rate": 4.821385069294067e-05, + "loss": 0.3665, + "step": 2639500 + }, + { + "epoch": 17.8648765699437, + "grad_norm": 0.37075766921043396, + "learning_rate": 4.8213512343005634e-05, + "loss": 0.3676, + "step": 2640000 + }, + { + "epoch": 17.868260069294067, + "grad_norm": 0.37984272837638855, + "learning_rate": 4.8213173993070596e-05, + "loss": 0.3673, + "step": 2640500 + }, + { + "epoch": 17.871643568644433, + "grad_norm": 0.3673933148384094, + "learning_rate": 4.821283564313556e-05, + "loss": 0.3673, + "step": 2641000 + }, + { + "epoch": 17.875027067994804, + "grad_norm": 0.315318763256073, + "learning_rate": 4.821249729320052e-05, + "loss": 0.3677, + "step": 2641500 + }, + { + "epoch": 17.87841056734517, + "grad_norm": 0.3659175932407379, + "learning_rate": 4.821215894326548e-05, + "loss": 0.3683, + "step": 2642000 + }, + { + "epoch": 17.88179406669554, + "grad_norm": 0.35016539692878723, + "learning_rate": 4.8211820593330445e-05, + "loss": 0.3658, + "step": 2642500 + }, + { + "epoch": 17.885177566045908, + "grad_norm": 0.3711670935153961, + "learning_rate": 4.8211482243395414e-05, + "loss": 0.3676, + "step": 2643000 + }, + { + "epoch": 17.888561065396274, + "grad_norm": 0.3717668056488037, + "learning_rate": 4.8211143893460376e-05, + "loss": 0.3664, + "step": 2643500 + }, + { + "epoch": 17.891944564746645, + "grad_norm": 0.3931821286678314, + "learning_rate": 4.821080554352534e-05, + "loss": 0.3668, + "step": 2644000 + }, + { + "epoch": 17.89532806409701, + "grad_norm": 0.3632790446281433, + "learning_rate": 4.82104671935903e-05, + "loss": 0.3674, + "step": 2644500 + }, + { + "epoch": 17.898711563447378, + "grad_norm": 0.3724515736103058, + "learning_rate": 4.821012884365527e-05, + "loss": 0.3663, + "step": 2645000 + }, + { + "epoch": 17.90209506279775, + "grad_norm": 0.33584630489349365, + "learning_rate": 4.8209790493720224e-05, + "loss": 0.3662, + "step": 2645500 + }, + { + "epoch": 17.905478562148115, + "grad_norm": 0.321268230676651, + "learning_rate": 4.8209452143785186e-05, + "loss": 0.3681, + "step": 2646000 + }, + { + "epoch": 17.908862061498485, + "grad_norm": 0.3851405084133148, + "learning_rate": 4.8209113793850155e-05, + "loss": 0.3669, + "step": 2646500 + }, + { + "epoch": 17.912245560848852, + "grad_norm": 0.3688462972640991, + "learning_rate": 4.820877544391512e-05, + "loss": 0.3674, + "step": 2647000 + }, + { + "epoch": 17.91562906019922, + "grad_norm": 0.3237268030643463, + "learning_rate": 4.820843709398008e-05, + "loss": 0.3664, + "step": 2647500 + }, + { + "epoch": 17.91901255954959, + "grad_norm": 0.36310896277427673, + "learning_rate": 4.820809874404504e-05, + "loss": 0.3677, + "step": 2648000 + }, + { + "epoch": 17.922396058899956, + "grad_norm": 0.38640767335891724, + "learning_rate": 4.820776039411001e-05, + "loss": 0.3666, + "step": 2648500 + }, + { + "epoch": 17.925779558250326, + "grad_norm": 0.36768364906311035, + "learning_rate": 4.820742204417497e-05, + "loss": 0.3673, + "step": 2649000 + }, + { + "epoch": 17.929163057600693, + "grad_norm": 0.3582685589790344, + "learning_rate": 4.8207083694239935e-05, + "loss": 0.3657, + "step": 2649500 + }, + { + "epoch": 17.93254655695106, + "grad_norm": 0.3740227520465851, + "learning_rate": 4.82067453443049e-05, + "loss": 0.3695, + "step": 2650000 + }, + { + "epoch": 17.93593005630143, + "grad_norm": 0.3254905641078949, + "learning_rate": 4.820640699436986e-05, + "loss": 0.3678, + "step": 2650500 + }, + { + "epoch": 17.939313555651797, + "grad_norm": 0.3392098546028137, + "learning_rate": 4.820606864443482e-05, + "loss": 0.366, + "step": 2651000 + }, + { + "epoch": 17.942697055002164, + "grad_norm": 0.31783849000930786, + "learning_rate": 4.820573029449978e-05, + "loss": 0.3673, + "step": 2651500 + }, + { + "epoch": 17.946080554352534, + "grad_norm": 0.37822237610816956, + "learning_rate": 4.8205391944564745e-05, + "loss": 0.3669, + "step": 2652000 + }, + { + "epoch": 17.9494640537029, + "grad_norm": 0.3652763366699219, + "learning_rate": 4.8205053594629714e-05, + "loss": 0.3684, + "step": 2652500 + }, + { + "epoch": 17.95284755305327, + "grad_norm": 0.39992907643318176, + "learning_rate": 4.8204715244694676e-05, + "loss": 0.367, + "step": 2653000 + }, + { + "epoch": 17.956231052403638, + "grad_norm": 0.34198036789894104, + "learning_rate": 4.820437689475964e-05, + "loss": 0.3671, + "step": 2653500 + }, + { + "epoch": 17.959614551754004, + "grad_norm": 0.3602445423603058, + "learning_rate": 4.82040385448246e-05, + "loss": 0.3675, + "step": 2654000 + }, + { + "epoch": 17.962998051104375, + "grad_norm": 0.3950427770614624, + "learning_rate": 4.820370019488957e-05, + "loss": 0.3671, + "step": 2654500 + }, + { + "epoch": 17.96638155045474, + "grad_norm": 0.3671565055847168, + "learning_rate": 4.8203361844954525e-05, + "loss": 0.3686, + "step": 2655000 + }, + { + "epoch": 17.969765049805112, + "grad_norm": 0.36367377638816833, + "learning_rate": 4.820302349501949e-05, + "loss": 0.3686, + "step": 2655500 + }, + { + "epoch": 17.97314854915548, + "grad_norm": 0.36010125279426575, + "learning_rate": 4.8202685145084456e-05, + "loss": 0.3658, + "step": 2656000 + }, + { + "epoch": 17.976532048505845, + "grad_norm": 0.338571697473526, + "learning_rate": 4.820234679514942e-05, + "loss": 0.3666, + "step": 2656500 + }, + { + "epoch": 17.979915547856216, + "grad_norm": 0.3533976972103119, + "learning_rate": 4.820200844521438e-05, + "loss": 0.3663, + "step": 2657000 + }, + { + "epoch": 17.983299047206582, + "grad_norm": 0.3369833827018738, + "learning_rate": 4.820167009527934e-05, + "loss": 0.3677, + "step": 2657500 + }, + { + "epoch": 17.986682546556953, + "grad_norm": 0.36669063568115234, + "learning_rate": 4.820133174534431e-05, + "loss": 0.3685, + "step": 2658000 + }, + { + "epoch": 17.99006604590732, + "grad_norm": 0.3408244550228119, + "learning_rate": 4.820099339540927e-05, + "loss": 0.3669, + "step": 2658500 + }, + { + "epoch": 17.993449545257686, + "grad_norm": 0.3309365212917328, + "learning_rate": 4.8200655045474235e-05, + "loss": 0.3663, + "step": 2659000 + }, + { + "epoch": 17.996833044608056, + "grad_norm": 0.38393697142601013, + "learning_rate": 4.82003166955392e-05, + "loss": 0.3675, + "step": 2659500 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.8601398565191122, + "eval_loss": 0.5673038959503174, + "eval_runtime": 3700.8712, + "eval_samples_per_second": 78.561, + "eval_steps_per_second": 4.91, + "step": 2659968 + }, + { + "epoch": 18.000216543958423, + "grad_norm": 0.3817974925041199, + "learning_rate": 4.819997834560416e-05, + "loss": 0.3665, + "step": 2660000 + }, + { + "epoch": 18.00360004330879, + "grad_norm": 0.3546946942806244, + "learning_rate": 4.819963999566912e-05, + "loss": 0.3656, + "step": 2660500 + }, + { + "epoch": 18.00698354265916, + "grad_norm": 0.375627338886261, + "learning_rate": 4.8199301645734084e-05, + "loss": 0.3641, + "step": 2661000 + }, + { + "epoch": 18.010367042009527, + "grad_norm": 0.37911269068717957, + "learning_rate": 4.8198963295799046e-05, + "loss": 0.3645, + "step": 2661500 + }, + { + "epoch": 18.013750541359897, + "grad_norm": 0.3864506185054779, + "learning_rate": 4.8198624945864015e-05, + "loss": 0.3647, + "step": 2662000 + }, + { + "epoch": 18.017134040710264, + "grad_norm": 0.40366044640541077, + "learning_rate": 4.819828659592898e-05, + "loss": 0.3652, + "step": 2662500 + }, + { + "epoch": 18.02051754006063, + "grad_norm": 0.3945550322532654, + "learning_rate": 4.819794824599394e-05, + "loss": 0.3658, + "step": 2663000 + }, + { + "epoch": 18.023901039411, + "grad_norm": 0.3441663980484009, + "learning_rate": 4.81976098960589e-05, + "loss": 0.3642, + "step": 2663500 + }, + { + "epoch": 18.027284538761368, + "grad_norm": 0.38696587085723877, + "learning_rate": 4.819727154612387e-05, + "loss": 0.3652, + "step": 2664000 + }, + { + "epoch": 18.030668038111738, + "grad_norm": 0.3462171256542206, + "learning_rate": 4.8196933196188826e-05, + "loss": 0.3646, + "step": 2664500 + }, + { + "epoch": 18.034051537462105, + "grad_norm": 0.39164283871650696, + "learning_rate": 4.819659484625379e-05, + "loss": 0.3655, + "step": 2665000 + }, + { + "epoch": 18.03743503681247, + "grad_norm": 0.3807593584060669, + "learning_rate": 4.8196256496318757e-05, + "loss": 0.367, + "step": 2665500 + }, + { + "epoch": 18.040818536162842, + "grad_norm": 0.36958304047584534, + "learning_rate": 4.819591814638372e-05, + "loss": 0.3652, + "step": 2666000 + }, + { + "epoch": 18.04420203551321, + "grad_norm": 0.3937586545944214, + "learning_rate": 4.819557979644868e-05, + "loss": 0.3652, + "step": 2666500 + }, + { + "epoch": 18.04758553486358, + "grad_norm": 0.3388778269290924, + "learning_rate": 4.819524144651364e-05, + "loss": 0.3659, + "step": 2667000 + }, + { + "epoch": 18.050969034213946, + "grad_norm": 0.374830961227417, + "learning_rate": 4.819490309657861e-05, + "loss": 0.3653, + "step": 2667500 + }, + { + "epoch": 18.054352533564312, + "grad_norm": 0.39372459053993225, + "learning_rate": 4.8194564746643574e-05, + "loss": 0.3661, + "step": 2668000 + }, + { + "epoch": 18.057736032914683, + "grad_norm": 0.36588943004608154, + "learning_rate": 4.8194226396708536e-05, + "loss": 0.3661, + "step": 2668500 + }, + { + "epoch": 18.06111953226505, + "grad_norm": 0.4009133577346802, + "learning_rate": 4.81938880467735e-05, + "loss": 0.3659, + "step": 2669000 + }, + { + "epoch": 18.064503031615416, + "grad_norm": 0.3981277644634247, + "learning_rate": 4.819354969683846e-05, + "loss": 0.3633, + "step": 2669500 + }, + { + "epoch": 18.067886530965787, + "grad_norm": 0.38319653272628784, + "learning_rate": 4.819321134690342e-05, + "loss": 0.3659, + "step": 2670000 + }, + { + "epoch": 18.071270030316153, + "grad_norm": 0.38970398902893066, + "learning_rate": 4.8192872996968385e-05, + "loss": 0.3655, + "step": 2670500 + }, + { + "epoch": 18.074653529666524, + "grad_norm": 0.3901421129703522, + "learning_rate": 4.819253464703335e-05, + "loss": 0.3658, + "step": 2671000 + }, + { + "epoch": 18.07803702901689, + "grad_norm": 0.37622517347335815, + "learning_rate": 4.8192196297098316e-05, + "loss": 0.3652, + "step": 2671500 + }, + { + "epoch": 18.081420528367257, + "grad_norm": 0.33386629819869995, + "learning_rate": 4.819185794716328e-05, + "loss": 0.3662, + "step": 2672000 + }, + { + "epoch": 18.084804027717627, + "grad_norm": 0.3417004942893982, + "learning_rate": 4.819151959722824e-05, + "loss": 0.3661, + "step": 2672500 + }, + { + "epoch": 18.088187527067994, + "grad_norm": 0.35943761467933655, + "learning_rate": 4.81911812472932e-05, + "loss": 0.3651, + "step": 2673000 + }, + { + "epoch": 18.091571026418364, + "grad_norm": 0.40275195240974426, + "learning_rate": 4.819084289735817e-05, + "loss": 0.3656, + "step": 2673500 + }, + { + "epoch": 18.09495452576873, + "grad_norm": 0.367982417345047, + "learning_rate": 4.8190504547423126e-05, + "loss": 0.3662, + "step": 2674000 + }, + { + "epoch": 18.098338025119098, + "grad_norm": 0.35256427526474, + "learning_rate": 4.819016619748809e-05, + "loss": 0.3649, + "step": 2674500 + }, + { + "epoch": 18.10172152446947, + "grad_norm": 0.4005764424800873, + "learning_rate": 4.818982784755306e-05, + "loss": 0.366, + "step": 2675000 + }, + { + "epoch": 18.105105023819835, + "grad_norm": 0.3787810206413269, + "learning_rate": 4.818948949761802e-05, + "loss": 0.3648, + "step": 2675500 + }, + { + "epoch": 18.108488523170205, + "grad_norm": 0.3439185619354248, + "learning_rate": 4.818915114768298e-05, + "loss": 0.3677, + "step": 2676000 + }, + { + "epoch": 18.111872022520572, + "grad_norm": 0.38821837306022644, + "learning_rate": 4.8188812797747944e-05, + "loss": 0.3654, + "step": 2676500 + }, + { + "epoch": 18.11525552187094, + "grad_norm": 0.37852784991264343, + "learning_rate": 4.818847444781291e-05, + "loss": 0.3649, + "step": 2677000 + }, + { + "epoch": 18.11863902122131, + "grad_norm": 0.36914563179016113, + "learning_rate": 4.8188136097877875e-05, + "loss": 0.3669, + "step": 2677500 + }, + { + "epoch": 18.122022520571676, + "grad_norm": 0.3899284303188324, + "learning_rate": 4.818779774794284e-05, + "loss": 0.3661, + "step": 2678000 + }, + { + "epoch": 18.125406019922043, + "grad_norm": 0.37456607818603516, + "learning_rate": 4.81874593980078e-05, + "loss": 0.3662, + "step": 2678500 + }, + { + "epoch": 18.128789519272413, + "grad_norm": 0.34672752022743225, + "learning_rate": 4.818712104807276e-05, + "loss": 0.3648, + "step": 2679000 + }, + { + "epoch": 18.13217301862278, + "grad_norm": 0.37629690766334534, + "learning_rate": 4.818678269813772e-05, + "loss": 0.3654, + "step": 2679500 + }, + { + "epoch": 18.13555651797315, + "grad_norm": 0.3695160448551178, + "learning_rate": 4.8186444348202685e-05, + "loss": 0.3671, + "step": 2680000 + }, + { + "epoch": 18.138940017323517, + "grad_norm": 0.37204939126968384, + "learning_rate": 4.818610599826765e-05, + "loss": 0.3655, + "step": 2680500 + }, + { + "epoch": 18.142323516673883, + "grad_norm": 0.3446868658065796, + "learning_rate": 4.8185767648332616e-05, + "loss": 0.3649, + "step": 2681000 + }, + { + "epoch": 18.145707016024254, + "grad_norm": 0.3619040846824646, + "learning_rate": 4.818542929839758e-05, + "loss": 0.3663, + "step": 2681500 + }, + { + "epoch": 18.14909051537462, + "grad_norm": 0.3531608283519745, + "learning_rate": 4.818509094846254e-05, + "loss": 0.3654, + "step": 2682000 + }, + { + "epoch": 18.15247401472499, + "grad_norm": 0.36895817518234253, + "learning_rate": 4.81847525985275e-05, + "loss": 0.366, + "step": 2682500 + }, + { + "epoch": 18.155857514075358, + "grad_norm": 0.36233237385749817, + "learning_rate": 4.818441424859247e-05, + "loss": 0.3669, + "step": 2683000 + }, + { + "epoch": 18.159241013425724, + "grad_norm": 0.40669670701026917, + "learning_rate": 4.818407589865743e-05, + "loss": 0.3663, + "step": 2683500 + }, + { + "epoch": 18.162624512776095, + "grad_norm": 0.42166778445243835, + "learning_rate": 4.818373754872239e-05, + "loss": 0.3667, + "step": 2684000 + }, + { + "epoch": 18.16600801212646, + "grad_norm": 0.3649405539035797, + "learning_rate": 4.818339919878736e-05, + "loss": 0.3652, + "step": 2684500 + }, + { + "epoch": 18.169391511476828, + "grad_norm": 0.3400033712387085, + "learning_rate": 4.818306084885232e-05, + "loss": 0.3653, + "step": 2685000 + }, + { + "epoch": 18.1727750108272, + "grad_norm": 0.35990405082702637, + "learning_rate": 4.818272249891728e-05, + "loss": 0.3661, + "step": 2685500 + }, + { + "epoch": 18.176158510177565, + "grad_norm": 0.3466646075248718, + "learning_rate": 4.8182384148982244e-05, + "loss": 0.3663, + "step": 2686000 + }, + { + "epoch": 18.179542009527935, + "grad_norm": 0.37089553475379944, + "learning_rate": 4.8182045799047206e-05, + "loss": 0.3678, + "step": 2686500 + }, + { + "epoch": 18.182925508878302, + "grad_norm": 0.3663167953491211, + "learning_rate": 4.8181707449112175e-05, + "loss": 0.3675, + "step": 2687000 + }, + { + "epoch": 18.18630900822867, + "grad_norm": 0.37911930680274963, + "learning_rate": 4.818136909917714e-05, + "loss": 0.3657, + "step": 2687500 + }, + { + "epoch": 18.18969250757904, + "grad_norm": 0.35394448041915894, + "learning_rate": 4.81810307492421e-05, + "loss": 0.3653, + "step": 2688000 + }, + { + "epoch": 18.193076006929406, + "grad_norm": 0.37933778762817383, + "learning_rate": 4.818069239930706e-05, + "loss": 0.3652, + "step": 2688500 + }, + { + "epoch": 18.196459506279776, + "grad_norm": 0.35863587260246277, + "learning_rate": 4.8180354049372024e-05, + "loss": 0.366, + "step": 2689000 + }, + { + "epoch": 18.199843005630143, + "grad_norm": 0.33558905124664307, + "learning_rate": 4.8180015699436986e-05, + "loss": 0.3654, + "step": 2689500 + }, + { + "epoch": 18.20322650498051, + "grad_norm": 0.37256914377212524, + "learning_rate": 4.817967734950195e-05, + "loss": 0.3667, + "step": 2690000 + }, + { + "epoch": 18.20661000433088, + "grad_norm": 0.387002557516098, + "learning_rate": 4.817933899956692e-05, + "loss": 0.365, + "step": 2690500 + }, + { + "epoch": 18.209993503681247, + "grad_norm": 0.36478158831596375, + "learning_rate": 4.817900064963188e-05, + "loss": 0.3664, + "step": 2691000 + }, + { + "epoch": 18.213377003031617, + "grad_norm": 0.3580193817615509, + "learning_rate": 4.817866229969684e-05, + "loss": 0.3653, + "step": 2691500 + }, + { + "epoch": 18.216760502381984, + "grad_norm": 0.3662140667438507, + "learning_rate": 4.81783239497618e-05, + "loss": 0.367, + "step": 2692000 + }, + { + "epoch": 18.22014400173235, + "grad_norm": 0.43125787377357483, + "learning_rate": 4.817798559982677e-05, + "loss": 0.3661, + "step": 2692500 + }, + { + "epoch": 18.22352750108272, + "grad_norm": 0.37605369091033936, + "learning_rate": 4.8177647249891734e-05, + "loss": 0.3671, + "step": 2693000 + }, + { + "epoch": 18.226911000433088, + "grad_norm": 0.3609592914581299, + "learning_rate": 4.817730889995669e-05, + "loss": 0.3662, + "step": 2693500 + }, + { + "epoch": 18.230294499783454, + "grad_norm": 0.3572835326194763, + "learning_rate": 4.817697055002166e-05, + "loss": 0.3663, + "step": 2694000 + }, + { + "epoch": 18.233677999133825, + "grad_norm": 0.3766094148159027, + "learning_rate": 4.817663220008662e-05, + "loss": 0.3659, + "step": 2694500 + }, + { + "epoch": 18.23706149848419, + "grad_norm": 0.35266968607902527, + "learning_rate": 4.817629385015158e-05, + "loss": 0.3666, + "step": 2695000 + }, + { + "epoch": 18.24044499783456, + "grad_norm": 0.386359840631485, + "learning_rate": 4.8175955500216545e-05, + "loss": 0.3663, + "step": 2695500 + }, + { + "epoch": 18.24382849718493, + "grad_norm": 0.3682827055454254, + "learning_rate": 4.817561715028151e-05, + "loss": 0.3668, + "step": 2696000 + }, + { + "epoch": 18.247211996535295, + "grad_norm": 0.34724515676498413, + "learning_rate": 4.8175278800346476e-05, + "loss": 0.3641, + "step": 2696500 + }, + { + "epoch": 18.250595495885666, + "grad_norm": 0.36000531911849976, + "learning_rate": 4.817494045041144e-05, + "loss": 0.3666, + "step": 2697000 + }, + { + "epoch": 18.253978995236032, + "grad_norm": 0.3692454397678375, + "learning_rate": 4.81746021004764e-05, + "loss": 0.3679, + "step": 2697500 + }, + { + "epoch": 18.257362494586403, + "grad_norm": 0.36783865094184875, + "learning_rate": 4.817426375054136e-05, + "loss": 0.3648, + "step": 2698000 + }, + { + "epoch": 18.26074599393677, + "grad_norm": 0.30626559257507324, + "learning_rate": 4.8173925400606324e-05, + "loss": 0.3648, + "step": 2698500 + }, + { + "epoch": 18.264129493287136, + "grad_norm": 0.34672948718070984, + "learning_rate": 4.8173587050671286e-05, + "loss": 0.366, + "step": 2699000 + }, + { + "epoch": 18.267512992637506, + "grad_norm": 0.3682321012020111, + "learning_rate": 4.817324870073625e-05, + "loss": 0.368, + "step": 2699500 + }, + { + "epoch": 18.270896491987873, + "grad_norm": 0.34346839785575867, + "learning_rate": 4.817291035080122e-05, + "loss": 0.366, + "step": 2700000 + }, + { + "epoch": 18.27427999133824, + "grad_norm": 0.39888525009155273, + "learning_rate": 4.817257200086618e-05, + "loss": 0.364, + "step": 2700500 + }, + { + "epoch": 18.27766349068861, + "grad_norm": 0.3761538863182068, + "learning_rate": 4.817223365093114e-05, + "loss": 0.3653, + "step": 2701000 + }, + { + "epoch": 18.281046990038977, + "grad_norm": 0.36459478735923767, + "learning_rate": 4.8171895300996104e-05, + "loss": 0.3651, + "step": 2701500 + }, + { + "epoch": 18.284430489389347, + "grad_norm": 0.41893574595451355, + "learning_rate": 4.817155695106107e-05, + "loss": 0.3662, + "step": 2702000 + }, + { + "epoch": 18.287813988739714, + "grad_norm": 0.3921719193458557, + "learning_rate": 4.8171218601126035e-05, + "loss": 0.3664, + "step": 2702500 + }, + { + "epoch": 18.29119748809008, + "grad_norm": 0.3538754880428314, + "learning_rate": 4.817088025119099e-05, + "loss": 0.3661, + "step": 2703000 + }, + { + "epoch": 18.29458098744045, + "grad_norm": 0.3760707974433899, + "learning_rate": 4.817054190125595e-05, + "loss": 0.3666, + "step": 2703500 + }, + { + "epoch": 18.297964486790818, + "grad_norm": 0.3739316761493683, + "learning_rate": 4.817020355132092e-05, + "loss": 0.3674, + "step": 2704000 + }, + { + "epoch": 18.301347986141188, + "grad_norm": 0.3687141537666321, + "learning_rate": 4.816986520138588e-05, + "loss": 0.3655, + "step": 2704500 + }, + { + "epoch": 18.304731485491555, + "grad_norm": 0.3798815608024597, + "learning_rate": 4.8169526851450845e-05, + "loss": 0.3652, + "step": 2705000 + }, + { + "epoch": 18.30811498484192, + "grad_norm": 0.38236284255981445, + "learning_rate": 4.816918850151581e-05, + "loss": 0.3652, + "step": 2705500 + }, + { + "epoch": 18.311498484192292, + "grad_norm": 0.3626120090484619, + "learning_rate": 4.8168850151580776e-05, + "loss": 0.3656, + "step": 2706000 + }, + { + "epoch": 18.31488198354266, + "grad_norm": 0.3783361315727234, + "learning_rate": 4.816851180164574e-05, + "loss": 0.366, + "step": 2706500 + }, + { + "epoch": 18.31826548289303, + "grad_norm": 0.39285773038864136, + "learning_rate": 4.81681734517107e-05, + "loss": 0.3667, + "step": 2707000 + }, + { + "epoch": 18.321648982243396, + "grad_norm": 0.3508833646774292, + "learning_rate": 4.816783510177566e-05, + "loss": 0.3663, + "step": 2707500 + }, + { + "epoch": 18.325032481593762, + "grad_norm": 0.3748737871646881, + "learning_rate": 4.8167496751840625e-05, + "loss": 0.3653, + "step": 2708000 + }, + { + "epoch": 18.328415980944133, + "grad_norm": 0.39258819818496704, + "learning_rate": 4.816715840190559e-05, + "loss": 0.3671, + "step": 2708500 + }, + { + "epoch": 18.3317994802945, + "grad_norm": 0.3650103807449341, + "learning_rate": 4.816682005197055e-05, + "loss": 0.3672, + "step": 2709000 + }, + { + "epoch": 18.335182979644866, + "grad_norm": 0.3767535090446472, + "learning_rate": 4.816648170203552e-05, + "loss": 0.3671, + "step": 2709500 + }, + { + "epoch": 18.338566478995237, + "grad_norm": 0.37702834606170654, + "learning_rate": 4.816614335210048e-05, + "loss": 0.368, + "step": 2710000 + }, + { + "epoch": 18.341949978345603, + "grad_norm": 0.37456855177879333, + "learning_rate": 4.816580500216544e-05, + "loss": 0.3672, + "step": 2710500 + }, + { + "epoch": 18.345333477695974, + "grad_norm": 0.3798463046550751, + "learning_rate": 4.8165466652230404e-05, + "loss": 0.3658, + "step": 2711000 + }, + { + "epoch": 18.34871697704634, + "grad_norm": 0.32513168454170227, + "learning_rate": 4.816512830229537e-05, + "loss": 0.3647, + "step": 2711500 + }, + { + "epoch": 18.352100476396707, + "grad_norm": 0.37949857115745544, + "learning_rate": 4.8164789952360335e-05, + "loss": 0.3661, + "step": 2712000 + }, + { + "epoch": 18.355483975747077, + "grad_norm": 0.32526543736457825, + "learning_rate": 4.816445160242529e-05, + "loss": 0.3653, + "step": 2712500 + }, + { + "epoch": 18.358867475097444, + "grad_norm": 0.3609720766544342, + "learning_rate": 4.816411325249025e-05, + "loss": 0.3653, + "step": 2713000 + }, + { + "epoch": 18.362250974447814, + "grad_norm": 0.34838616847991943, + "learning_rate": 4.816377490255522e-05, + "loss": 0.3669, + "step": 2713500 + }, + { + "epoch": 18.36563447379818, + "grad_norm": 0.3667636513710022, + "learning_rate": 4.8163436552620184e-05, + "loss": 0.3648, + "step": 2714000 + }, + { + "epoch": 18.369017973148548, + "grad_norm": 0.37113574147224426, + "learning_rate": 4.8163098202685146e-05, + "loss": 0.3647, + "step": 2714500 + }, + { + "epoch": 18.372401472498918, + "grad_norm": 0.37193408608436584, + "learning_rate": 4.816275985275011e-05, + "loss": 0.3677, + "step": 2715000 + }, + { + "epoch": 18.375784971849285, + "grad_norm": 0.36754679679870605, + "learning_rate": 4.816242150281508e-05, + "loss": 0.3657, + "step": 2715500 + }, + { + "epoch": 18.379168471199655, + "grad_norm": 0.404785692691803, + "learning_rate": 4.816208315288004e-05, + "loss": 0.3669, + "step": 2716000 + }, + { + "epoch": 18.382551970550022, + "grad_norm": 0.3446834087371826, + "learning_rate": 4.8161744802945e-05, + "loss": 0.3655, + "step": 2716500 + }, + { + "epoch": 18.38593546990039, + "grad_norm": 0.3776063621044159, + "learning_rate": 4.8161406453009963e-05, + "loss": 0.3664, + "step": 2717000 + }, + { + "epoch": 18.38931896925076, + "grad_norm": 0.34708327054977417, + "learning_rate": 4.8161068103074926e-05, + "loss": 0.366, + "step": 2717500 + }, + { + "epoch": 18.392702468601126, + "grad_norm": 0.35970473289489746, + "learning_rate": 4.816072975313989e-05, + "loss": 0.3676, + "step": 2718000 + }, + { + "epoch": 18.396085967951493, + "grad_norm": 0.40329545736312866, + "learning_rate": 4.816039140320485e-05, + "loss": 0.3653, + "step": 2718500 + }, + { + "epoch": 18.399469467301863, + "grad_norm": 0.34806376695632935, + "learning_rate": 4.816005305326982e-05, + "loss": 0.3675, + "step": 2719000 + }, + { + "epoch": 18.40285296665223, + "grad_norm": 0.362699031829834, + "learning_rate": 4.815971470333478e-05, + "loss": 0.3684, + "step": 2719500 + }, + { + "epoch": 18.4062364660026, + "grad_norm": 0.32820168137550354, + "learning_rate": 4.815937635339974e-05, + "loss": 0.3647, + "step": 2720000 + }, + { + "epoch": 18.409619965352967, + "grad_norm": 0.33602628111839294, + "learning_rate": 4.8159038003464705e-05, + "loss": 0.3652, + "step": 2720500 + }, + { + "epoch": 18.413003464703333, + "grad_norm": 0.3572433888912201, + "learning_rate": 4.8158699653529674e-05, + "loss": 0.368, + "step": 2721000 + }, + { + "epoch": 18.416386964053704, + "grad_norm": 0.38326042890548706, + "learning_rate": 4.8158361303594636e-05, + "loss": 0.3665, + "step": 2721500 + }, + { + "epoch": 18.41977046340407, + "grad_norm": 0.36697304248809814, + "learning_rate": 4.815802295365959e-05, + "loss": 0.3656, + "step": 2722000 + }, + { + "epoch": 18.42315396275444, + "grad_norm": 0.3343600034713745, + "learning_rate": 4.8157684603724554e-05, + "loss": 0.3661, + "step": 2722500 + }, + { + "epoch": 18.426537462104807, + "grad_norm": 0.36516448855400085, + "learning_rate": 4.815734625378952e-05, + "loss": 0.3659, + "step": 2723000 + }, + { + "epoch": 18.429920961455174, + "grad_norm": 0.4176952540874481, + "learning_rate": 4.8157007903854485e-05, + "loss": 0.366, + "step": 2723500 + }, + { + "epoch": 18.433304460805545, + "grad_norm": 0.37972500920295715, + "learning_rate": 4.815666955391945e-05, + "loss": 0.3656, + "step": 2724000 + }, + { + "epoch": 18.43668796015591, + "grad_norm": 0.38399654626846313, + "learning_rate": 4.815633120398441e-05, + "loss": 0.3665, + "step": 2724500 + }, + { + "epoch": 18.440071459506278, + "grad_norm": 0.3755989968776703, + "learning_rate": 4.815599285404938e-05, + "loss": 0.3657, + "step": 2725000 + }, + { + "epoch": 18.44345495885665, + "grad_norm": 0.32659873366355896, + "learning_rate": 4.815565450411434e-05, + "loss": 0.3656, + "step": 2725500 + }, + { + "epoch": 18.446838458207015, + "grad_norm": 0.3238185942173004, + "learning_rate": 4.81553161541793e-05, + "loss": 0.3657, + "step": 2726000 + }, + { + "epoch": 18.450221957557385, + "grad_norm": 0.3889681100845337, + "learning_rate": 4.8154977804244264e-05, + "loss": 0.3654, + "step": 2726500 + }, + { + "epoch": 18.453605456907752, + "grad_norm": 0.38696062564849854, + "learning_rate": 4.8154639454309226e-05, + "loss": 0.3651, + "step": 2727000 + }, + { + "epoch": 18.45698895625812, + "grad_norm": 0.35800743103027344, + "learning_rate": 4.815430110437419e-05, + "loss": 0.3662, + "step": 2727500 + }, + { + "epoch": 18.46037245560849, + "grad_norm": 0.3667149841785431, + "learning_rate": 4.815396275443915e-05, + "loss": 0.3665, + "step": 2728000 + }, + { + "epoch": 18.463755954958856, + "grad_norm": 0.37348583340644836, + "learning_rate": 4.815362440450412e-05, + "loss": 0.367, + "step": 2728500 + }, + { + "epoch": 18.467139454309226, + "grad_norm": 0.3520568311214447, + "learning_rate": 4.815328605456908e-05, + "loss": 0.3678, + "step": 2729000 + }, + { + "epoch": 18.470522953659593, + "grad_norm": 0.38284870982170105, + "learning_rate": 4.8152947704634044e-05, + "loss": 0.3672, + "step": 2729500 + }, + { + "epoch": 18.47390645300996, + "grad_norm": 0.34518930315971375, + "learning_rate": 4.8152609354699006e-05, + "loss": 0.3655, + "step": 2730000 + }, + { + "epoch": 18.47728995236033, + "grad_norm": 0.34147876501083374, + "learning_rate": 4.8152271004763975e-05, + "loss": 0.3666, + "step": 2730500 + }, + { + "epoch": 18.480673451710697, + "grad_norm": 0.37820854783058167, + "learning_rate": 4.815193265482894e-05, + "loss": 0.3648, + "step": 2731000 + }, + { + "epoch": 18.484056951061067, + "grad_norm": 0.372206449508667, + "learning_rate": 4.815159430489389e-05, + "loss": 0.367, + "step": 2731500 + }, + { + "epoch": 18.487440450411434, + "grad_norm": 0.36140644550323486, + "learning_rate": 4.8151255954958854e-05, + "loss": 0.3664, + "step": 2732000 + }, + { + "epoch": 18.4908239497618, + "grad_norm": 0.366239994764328, + "learning_rate": 4.815091760502382e-05, + "loss": 0.3672, + "step": 2732500 + }, + { + "epoch": 18.49420744911217, + "grad_norm": 0.3318920135498047, + "learning_rate": 4.8150579255088785e-05, + "loss": 0.3668, + "step": 2733000 + }, + { + "epoch": 18.497590948462538, + "grad_norm": 0.32766205072402954, + "learning_rate": 4.815024090515375e-05, + "loss": 0.3669, + "step": 2733500 + }, + { + "epoch": 18.500974447812904, + "grad_norm": 0.3653565049171448, + "learning_rate": 4.814990255521871e-05, + "loss": 0.3665, + "step": 2734000 + }, + { + "epoch": 18.504357947163275, + "grad_norm": 0.38248080015182495, + "learning_rate": 4.814956420528368e-05, + "loss": 0.367, + "step": 2734500 + }, + { + "epoch": 18.50774144651364, + "grad_norm": 0.3788929283618927, + "learning_rate": 4.814922585534864e-05, + "loss": 0.3654, + "step": 2735000 + }, + { + "epoch": 18.51112494586401, + "grad_norm": 0.3633365035057068, + "learning_rate": 4.81488875054136e-05, + "loss": 0.3667, + "step": 2735500 + }, + { + "epoch": 18.51450844521438, + "grad_norm": 0.3879588842391968, + "learning_rate": 4.8148549155478565e-05, + "loss": 0.3664, + "step": 2736000 + }, + { + "epoch": 18.517891944564745, + "grad_norm": 0.38161173462867737, + "learning_rate": 4.814821080554353e-05, + "loss": 0.3671, + "step": 2736500 + }, + { + "epoch": 18.521275443915115, + "grad_norm": 0.36614733934402466, + "learning_rate": 4.814787245560849e-05, + "loss": 0.3667, + "step": 2737000 + }, + { + "epoch": 18.524658943265482, + "grad_norm": 0.3792756497859955, + "learning_rate": 4.814753410567345e-05, + "loss": 0.3666, + "step": 2737500 + }, + { + "epoch": 18.528042442615853, + "grad_norm": 0.38750314712524414, + "learning_rate": 4.814719575573842e-05, + "loss": 0.3658, + "step": 2738000 + }, + { + "epoch": 18.53142594196622, + "grad_norm": 0.3520932197570801, + "learning_rate": 4.814685740580338e-05, + "loss": 0.3656, + "step": 2738500 + }, + { + "epoch": 18.534809441316586, + "grad_norm": 0.3735598027706146, + "learning_rate": 4.8146519055868344e-05, + "loss": 0.3659, + "step": 2739000 + }, + { + "epoch": 18.538192940666956, + "grad_norm": 0.3451687693595886, + "learning_rate": 4.8146180705933306e-05, + "loss": 0.3658, + "step": 2739500 + }, + { + "epoch": 18.541576440017323, + "grad_norm": 0.33136609196662903, + "learning_rate": 4.8145842355998275e-05, + "loss": 0.3664, + "step": 2740000 + }, + { + "epoch": 18.544959939367693, + "grad_norm": 0.379218190908432, + "learning_rate": 4.814550400606324e-05, + "loss": 0.3673, + "step": 2740500 + }, + { + "epoch": 18.54834343871806, + "grad_norm": 0.3787924647331238, + "learning_rate": 4.814516565612819e-05, + "loss": 0.3666, + "step": 2741000 + }, + { + "epoch": 18.551726938068427, + "grad_norm": 0.3593429625034332, + "learning_rate": 4.8144827306193155e-05, + "loss": 0.3656, + "step": 2741500 + }, + { + "epoch": 18.555110437418797, + "grad_norm": 0.3921399414539337, + "learning_rate": 4.8144488956258124e-05, + "loss": 0.3652, + "step": 2742000 + }, + { + "epoch": 18.558493936769164, + "grad_norm": 0.34136930108070374, + "learning_rate": 4.8144150606323086e-05, + "loss": 0.3679, + "step": 2742500 + }, + { + "epoch": 18.56187743611953, + "grad_norm": 0.33634859323501587, + "learning_rate": 4.814381225638805e-05, + "loss": 0.3658, + "step": 2743000 + }, + { + "epoch": 18.5652609354699, + "grad_norm": 0.3452155292034149, + "learning_rate": 4.814347390645301e-05, + "loss": 0.3647, + "step": 2743500 + }, + { + "epoch": 18.568644434820268, + "grad_norm": 0.3631720542907715, + "learning_rate": 4.814313555651798e-05, + "loss": 0.3668, + "step": 2744000 + }, + { + "epoch": 18.572027934170638, + "grad_norm": 0.3850167989730835, + "learning_rate": 4.814279720658294e-05, + "loss": 0.3664, + "step": 2744500 + }, + { + "epoch": 18.575411433521005, + "grad_norm": 0.39307355880737305, + "learning_rate": 4.81424588566479e-05, + "loss": 0.3674, + "step": 2745000 + }, + { + "epoch": 18.57879493287137, + "grad_norm": 0.36546698212623596, + "learning_rate": 4.8142120506712865e-05, + "loss": 0.3669, + "step": 2745500 + }, + { + "epoch": 18.582178432221742, + "grad_norm": 0.3274228870868683, + "learning_rate": 4.814178215677783e-05, + "loss": 0.366, + "step": 2746000 + }, + { + "epoch": 18.58556193157211, + "grad_norm": 0.37010127305984497, + "learning_rate": 4.814144380684279e-05, + "loss": 0.3654, + "step": 2746500 + }, + { + "epoch": 18.58894543092248, + "grad_norm": 0.36521145701408386, + "learning_rate": 4.814110545690775e-05, + "loss": 0.3652, + "step": 2747000 + }, + { + "epoch": 18.592328930272846, + "grad_norm": 0.3667973279953003, + "learning_rate": 4.814076710697272e-05, + "loss": 0.3666, + "step": 2747500 + }, + { + "epoch": 18.595712429623212, + "grad_norm": 0.3821118474006653, + "learning_rate": 4.814042875703768e-05, + "loss": 0.3669, + "step": 2748000 + }, + { + "epoch": 18.599095928973583, + "grad_norm": 0.34617286920547485, + "learning_rate": 4.8140090407102645e-05, + "loss": 0.3651, + "step": 2748500 + }, + { + "epoch": 18.60247942832395, + "grad_norm": 0.349868506193161, + "learning_rate": 4.813975205716761e-05, + "loss": 0.3654, + "step": 2749000 + }, + { + "epoch": 18.605862927674316, + "grad_norm": 0.3502577543258667, + "learning_rate": 4.813941370723257e-05, + "loss": 0.3661, + "step": 2749500 + }, + { + "epoch": 18.609246427024686, + "grad_norm": 0.34174036979675293, + "learning_rate": 4.813907535729754e-05, + "loss": 0.3655, + "step": 2750000 + }, + { + "epoch": 18.612629926375053, + "grad_norm": 0.3498634696006775, + "learning_rate": 4.813873700736249e-05, + "loss": 0.3649, + "step": 2750500 + }, + { + "epoch": 18.616013425725424, + "grad_norm": 0.3564428985118866, + "learning_rate": 4.8138398657427455e-05, + "loss": 0.3681, + "step": 2751000 + }, + { + "epoch": 18.61939692507579, + "grad_norm": 0.32354769110679626, + "learning_rate": 4.8138060307492424e-05, + "loss": 0.3672, + "step": 2751500 + }, + { + "epoch": 18.622780424426157, + "grad_norm": 0.37490779161453247, + "learning_rate": 4.8137721957557386e-05, + "loss": 0.3666, + "step": 2752000 + }, + { + "epoch": 18.626163923776527, + "grad_norm": 0.3808179497718811, + "learning_rate": 4.813738360762235e-05, + "loss": 0.3667, + "step": 2752500 + }, + { + "epoch": 18.629547423126894, + "grad_norm": 0.34879276156425476, + "learning_rate": 4.813704525768731e-05, + "loss": 0.3662, + "step": 2753000 + }, + { + "epoch": 18.632930922477264, + "grad_norm": 0.35445043444633484, + "learning_rate": 4.813670690775228e-05, + "loss": 0.3673, + "step": 2753500 + }, + { + "epoch": 18.63631442182763, + "grad_norm": 0.37173983454704285, + "learning_rate": 4.813636855781724e-05, + "loss": 0.3656, + "step": 2754000 + }, + { + "epoch": 18.639697921177998, + "grad_norm": 0.33357104659080505, + "learning_rate": 4.8136030207882204e-05, + "loss": 0.3657, + "step": 2754500 + }, + { + "epoch": 18.643081420528368, + "grad_norm": 0.3388036787509918, + "learning_rate": 4.8135691857947166e-05, + "loss": 0.3662, + "step": 2755000 + }, + { + "epoch": 18.646464919878735, + "grad_norm": 0.3628353476524353, + "learning_rate": 4.813535350801213e-05, + "loss": 0.3674, + "step": 2755500 + }, + { + "epoch": 18.649848419229105, + "grad_norm": 0.37852683663368225, + "learning_rate": 4.813501515807709e-05, + "loss": 0.3661, + "step": 2756000 + }, + { + "epoch": 18.653231918579472, + "grad_norm": 0.3644959628582001, + "learning_rate": 4.813467680814205e-05, + "loss": 0.3662, + "step": 2756500 + }, + { + "epoch": 18.65661541792984, + "grad_norm": 0.371595174074173, + "learning_rate": 4.8134338458207014e-05, + "loss": 0.3665, + "step": 2757000 + }, + { + "epoch": 18.65999891728021, + "grad_norm": 0.3815184235572815, + "learning_rate": 4.813400010827198e-05, + "loss": 0.3663, + "step": 2757500 + }, + { + "epoch": 18.663382416630576, + "grad_norm": 0.36756274104118347, + "learning_rate": 4.8133661758336945e-05, + "loss": 0.3657, + "step": 2758000 + }, + { + "epoch": 18.666765915980942, + "grad_norm": 0.3418610692024231, + "learning_rate": 4.813332340840191e-05, + "loss": 0.3664, + "step": 2758500 + }, + { + "epoch": 18.670149415331313, + "grad_norm": 0.36101487278938293, + "learning_rate": 4.813298505846687e-05, + "loss": 0.3677, + "step": 2759000 + }, + { + "epoch": 18.67353291468168, + "grad_norm": 0.39173388481140137, + "learning_rate": 4.813264670853184e-05, + "loss": 0.3649, + "step": 2759500 + }, + { + "epoch": 18.67691641403205, + "grad_norm": 0.37323135137557983, + "learning_rate": 4.8132308358596794e-05, + "loss": 0.3657, + "step": 2760000 + }, + { + "epoch": 18.680299913382417, + "grad_norm": 0.36500459909439087, + "learning_rate": 4.8131970008661756e-05, + "loss": 0.3652, + "step": 2760500 + }, + { + "epoch": 18.683683412732783, + "grad_norm": 0.35408034920692444, + "learning_rate": 4.8131631658726725e-05, + "loss": 0.3656, + "step": 2761000 + }, + { + "epoch": 18.687066912083154, + "grad_norm": 0.3520357012748718, + "learning_rate": 4.813129330879169e-05, + "loss": 0.3658, + "step": 2761500 + }, + { + "epoch": 18.69045041143352, + "grad_norm": 0.35334864258766174, + "learning_rate": 4.813095495885665e-05, + "loss": 0.3673, + "step": 2762000 + }, + { + "epoch": 18.69383391078389, + "grad_norm": 0.36210882663726807, + "learning_rate": 4.813061660892161e-05, + "loss": 0.3659, + "step": 2762500 + }, + { + "epoch": 18.697217410134257, + "grad_norm": 0.3763342797756195, + "learning_rate": 4.813027825898658e-05, + "loss": 0.3674, + "step": 2763000 + }, + { + "epoch": 18.700600909484624, + "grad_norm": 0.33555757999420166, + "learning_rate": 4.812993990905154e-05, + "loss": 0.3667, + "step": 2763500 + }, + { + "epoch": 18.703984408834994, + "grad_norm": 0.3518519103527069, + "learning_rate": 4.8129601559116504e-05, + "loss": 0.3662, + "step": 2764000 + }, + { + "epoch": 18.70736790818536, + "grad_norm": 0.3487462103366852, + "learning_rate": 4.812926320918147e-05, + "loss": 0.3674, + "step": 2764500 + }, + { + "epoch": 18.71075140753573, + "grad_norm": 0.3702090382575989, + "learning_rate": 4.812892485924643e-05, + "loss": 0.3642, + "step": 2765000 + }, + { + "epoch": 18.7141349068861, + "grad_norm": 0.348533034324646, + "learning_rate": 4.812858650931139e-05, + "loss": 0.3669, + "step": 2765500 + }, + { + "epoch": 18.717518406236465, + "grad_norm": 0.360711008310318, + "learning_rate": 4.812824815937635e-05, + "loss": 0.3657, + "step": 2766000 + }, + { + "epoch": 18.720901905586835, + "grad_norm": 0.3330981731414795, + "learning_rate": 4.8127909809441315e-05, + "loss": 0.366, + "step": 2766500 + }, + { + "epoch": 18.724285404937202, + "grad_norm": 0.3385585844516754, + "learning_rate": 4.8127571459506284e-05, + "loss": 0.3654, + "step": 2767000 + }, + { + "epoch": 18.72766890428757, + "grad_norm": 0.3667827546596527, + "learning_rate": 4.8127233109571246e-05, + "loss": 0.3645, + "step": 2767500 + }, + { + "epoch": 18.73105240363794, + "grad_norm": 0.34755051136016846, + "learning_rate": 4.812689475963621e-05, + "loss": 0.366, + "step": 2768000 + }, + { + "epoch": 18.734435902988306, + "grad_norm": 0.3477209508419037, + "learning_rate": 4.812655640970117e-05, + "loss": 0.3666, + "step": 2768500 + }, + { + "epoch": 18.737819402338676, + "grad_norm": 0.4002993702888489, + "learning_rate": 4.812621805976614e-05, + "loss": 0.367, + "step": 2769000 + }, + { + "epoch": 18.741202901689043, + "grad_norm": 0.34906914830207825, + "learning_rate": 4.8125879709831095e-05, + "loss": 0.3671, + "step": 2769500 + }, + { + "epoch": 18.74458640103941, + "grad_norm": 0.38176172971725464, + "learning_rate": 4.812554135989606e-05, + "loss": 0.3663, + "step": 2770000 + }, + { + "epoch": 18.74796990038978, + "grad_norm": 0.36529573798179626, + "learning_rate": 4.8125203009961026e-05, + "loss": 0.3669, + "step": 2770500 + }, + { + "epoch": 18.751353399740147, + "grad_norm": 0.3871072828769684, + "learning_rate": 4.812486466002599e-05, + "loss": 0.3656, + "step": 2771000 + }, + { + "epoch": 18.754736899090517, + "grad_norm": 0.37509897351264954, + "learning_rate": 4.812452631009095e-05, + "loss": 0.3665, + "step": 2771500 + }, + { + "epoch": 18.758120398440884, + "grad_norm": 0.3503206968307495, + "learning_rate": 4.812418796015591e-05, + "loss": 0.365, + "step": 2772000 + }, + { + "epoch": 18.76150389779125, + "grad_norm": 0.38853317499160767, + "learning_rate": 4.812384961022088e-05, + "loss": 0.3665, + "step": 2772500 + }, + { + "epoch": 18.76488739714162, + "grad_norm": 0.33158519864082336, + "learning_rate": 4.812351126028584e-05, + "loss": 0.3657, + "step": 2773000 + }, + { + "epoch": 18.768270896491988, + "grad_norm": 0.39370110630989075, + "learning_rate": 4.8123172910350805e-05, + "loss": 0.3664, + "step": 2773500 + }, + { + "epoch": 18.771654395842354, + "grad_norm": 0.3575500547885895, + "learning_rate": 4.812283456041577e-05, + "loss": 0.3663, + "step": 2774000 + }, + { + "epoch": 18.775037895192725, + "grad_norm": 0.373933345079422, + "learning_rate": 4.812249621048073e-05, + "loss": 0.3667, + "step": 2774500 + }, + { + "epoch": 18.77842139454309, + "grad_norm": 0.3317835330963135, + "learning_rate": 4.812215786054569e-05, + "loss": 0.3655, + "step": 2775000 + }, + { + "epoch": 18.78180489389346, + "grad_norm": 0.3341616690158844, + "learning_rate": 4.8121819510610654e-05, + "loss": 0.3662, + "step": 2775500 + }, + { + "epoch": 18.78518839324383, + "grad_norm": 0.39613327383995056, + "learning_rate": 4.8121481160675616e-05, + "loss": 0.3664, + "step": 2776000 + }, + { + "epoch": 18.788571892594195, + "grad_norm": 0.36104127764701843, + "learning_rate": 4.8121142810740585e-05, + "loss": 0.3658, + "step": 2776500 + }, + { + "epoch": 18.791955391944565, + "grad_norm": 0.34897303581237793, + "learning_rate": 4.812080446080555e-05, + "loss": 0.367, + "step": 2777000 + }, + { + "epoch": 18.795338891294932, + "grad_norm": 0.40071144700050354, + "learning_rate": 4.812046611087051e-05, + "loss": 0.366, + "step": 2777500 + }, + { + "epoch": 18.798722390645302, + "grad_norm": 0.3589742183685303, + "learning_rate": 4.812012776093547e-05, + "loss": 0.3656, + "step": 2778000 + }, + { + "epoch": 18.80210588999567, + "grad_norm": 0.3539280593395233, + "learning_rate": 4.811978941100044e-05, + "loss": 0.3672, + "step": 2778500 + }, + { + "epoch": 18.805489389346036, + "grad_norm": 0.3695991635322571, + "learning_rate": 4.8119451061065395e-05, + "loss": 0.3681, + "step": 2779000 + }, + { + "epoch": 18.808872888696406, + "grad_norm": 0.3592228293418884, + "learning_rate": 4.811911271113036e-05, + "loss": 0.3675, + "step": 2779500 + }, + { + "epoch": 18.812256388046773, + "grad_norm": 0.3480539321899414, + "learning_rate": 4.8118774361195326e-05, + "loss": 0.3642, + "step": 2780000 + }, + { + "epoch": 18.815639887397143, + "grad_norm": 0.36234450340270996, + "learning_rate": 4.811843601126029e-05, + "loss": 0.3668, + "step": 2780500 + }, + { + "epoch": 18.81902338674751, + "grad_norm": 0.37986135482788086, + "learning_rate": 4.811809766132525e-05, + "loss": 0.3665, + "step": 2781000 + }, + { + "epoch": 18.822406886097877, + "grad_norm": 0.3787166476249695, + "learning_rate": 4.811775931139021e-05, + "loss": 0.365, + "step": 2781500 + }, + { + "epoch": 18.825790385448247, + "grad_norm": 0.3391520082950592, + "learning_rate": 4.811742096145518e-05, + "loss": 0.3658, + "step": 2782000 + }, + { + "epoch": 18.829173884798614, + "grad_norm": 0.37807825207710266, + "learning_rate": 4.8117082611520144e-05, + "loss": 0.3663, + "step": 2782500 + }, + { + "epoch": 18.83255738414898, + "grad_norm": 0.36283302307128906, + "learning_rate": 4.8116744261585106e-05, + "loss": 0.366, + "step": 2783000 + }, + { + "epoch": 18.83594088349935, + "grad_norm": 0.37602904438972473, + "learning_rate": 4.811640591165007e-05, + "loss": 0.3669, + "step": 2783500 + }, + { + "epoch": 18.839324382849718, + "grad_norm": 0.3637160062789917, + "learning_rate": 4.811606756171503e-05, + "loss": 0.3663, + "step": 2784000 + }, + { + "epoch": 18.842707882200088, + "grad_norm": 0.35866454243659973, + "learning_rate": 4.811572921177999e-05, + "loss": 0.3663, + "step": 2784500 + }, + { + "epoch": 18.846091381550455, + "grad_norm": 0.32873547077178955, + "learning_rate": 4.8115390861844954e-05, + "loss": 0.3651, + "step": 2785000 + }, + { + "epoch": 18.84947488090082, + "grad_norm": 0.36401212215423584, + "learning_rate": 4.8115052511909916e-05, + "loss": 0.365, + "step": 2785500 + }, + { + "epoch": 18.85285838025119, + "grad_norm": 0.3384600281715393, + "learning_rate": 4.8114714161974885e-05, + "loss": 0.3655, + "step": 2786000 + }, + { + "epoch": 18.85624187960156, + "grad_norm": 0.34413713216781616, + "learning_rate": 4.811437581203985e-05, + "loss": 0.3652, + "step": 2786500 + }, + { + "epoch": 18.85962537895193, + "grad_norm": 0.3725583255290985, + "learning_rate": 4.811403746210481e-05, + "loss": 0.3665, + "step": 2787000 + }, + { + "epoch": 18.863008878302296, + "grad_norm": 0.35329461097717285, + "learning_rate": 4.811369911216977e-05, + "loss": 0.3659, + "step": 2787500 + }, + { + "epoch": 18.866392377652662, + "grad_norm": 0.33690500259399414, + "learning_rate": 4.811336076223474e-05, + "loss": 0.3668, + "step": 2788000 + }, + { + "epoch": 18.869775877003033, + "grad_norm": 0.3894781470298767, + "learning_rate": 4.8113022412299696e-05, + "loss": 0.3658, + "step": 2788500 + }, + { + "epoch": 18.8731593763534, + "grad_norm": 0.38862013816833496, + "learning_rate": 4.811268406236466e-05, + "loss": 0.365, + "step": 2789000 + }, + { + "epoch": 18.87654287570377, + "grad_norm": 0.3516341745853424, + "learning_rate": 4.811234571242963e-05, + "loss": 0.3675, + "step": 2789500 + }, + { + "epoch": 18.879926375054136, + "grad_norm": 0.3779483735561371, + "learning_rate": 4.811200736249459e-05, + "loss": 0.367, + "step": 2790000 + }, + { + "epoch": 18.883309874404503, + "grad_norm": 0.3754669725894928, + "learning_rate": 4.811166901255955e-05, + "loss": 0.3672, + "step": 2790500 + }, + { + "epoch": 18.886693373754873, + "grad_norm": 0.38591325283050537, + "learning_rate": 4.811133066262451e-05, + "loss": 0.3681, + "step": 2791000 + }, + { + "epoch": 18.89007687310524, + "grad_norm": 0.3474923372268677, + "learning_rate": 4.811099231268948e-05, + "loss": 0.3664, + "step": 2791500 + }, + { + "epoch": 18.893460372455607, + "grad_norm": 0.3471725285053253, + "learning_rate": 4.8110653962754444e-05, + "loss": 0.3652, + "step": 2792000 + }, + { + "epoch": 18.896843871805977, + "grad_norm": 0.3508303761482239, + "learning_rate": 4.8110315612819406e-05, + "loss": 0.3644, + "step": 2792500 + }, + { + "epoch": 18.900227371156344, + "grad_norm": 0.3836657404899597, + "learning_rate": 4.810997726288437e-05, + "loss": 0.3679, + "step": 2793000 + }, + { + "epoch": 18.903610870506714, + "grad_norm": 0.37254008650779724, + "learning_rate": 4.810963891294933e-05, + "loss": 0.3657, + "step": 2793500 + }, + { + "epoch": 18.90699436985708, + "grad_norm": 0.35349971055984497, + "learning_rate": 4.810930056301429e-05, + "loss": 0.3657, + "step": 2794000 + }, + { + "epoch": 18.910377869207448, + "grad_norm": 0.3606826961040497, + "learning_rate": 4.8108962213079255e-05, + "loss": 0.3673, + "step": 2794500 + }, + { + "epoch": 18.913761368557818, + "grad_norm": 0.3424413502216339, + "learning_rate": 4.810862386314422e-05, + "loss": 0.3678, + "step": 2795000 + }, + { + "epoch": 18.917144867908185, + "grad_norm": 0.3713364601135254, + "learning_rate": 4.8108285513209186e-05, + "loss": 0.3675, + "step": 2795500 + }, + { + "epoch": 18.920528367258555, + "grad_norm": 0.36504119634628296, + "learning_rate": 4.810794716327415e-05, + "loss": 0.3664, + "step": 2796000 + }, + { + "epoch": 18.923911866608922, + "grad_norm": 0.3482113182544708, + "learning_rate": 4.810760881333911e-05, + "loss": 0.3657, + "step": 2796500 + }, + { + "epoch": 18.92729536595929, + "grad_norm": 0.36139917373657227, + "learning_rate": 4.810727046340407e-05, + "loss": 0.3671, + "step": 2797000 + }, + { + "epoch": 18.93067886530966, + "grad_norm": 0.3819946050643921, + "learning_rate": 4.810693211346904e-05, + "loss": 0.3658, + "step": 2797500 + }, + { + "epoch": 18.934062364660026, + "grad_norm": 0.37418124079704285, + "learning_rate": 4.8106593763533997e-05, + "loss": 0.3654, + "step": 2798000 + }, + { + "epoch": 18.937445864010392, + "grad_norm": 0.37340039014816284, + "learning_rate": 4.810625541359896e-05, + "loss": 0.3667, + "step": 2798500 + }, + { + "epoch": 18.940829363360763, + "grad_norm": 0.3686635494232178, + "learning_rate": 4.810591706366393e-05, + "loss": 0.3671, + "step": 2799000 + }, + { + "epoch": 18.94421286271113, + "grad_norm": 0.34484514594078064, + "learning_rate": 4.810557871372889e-05, + "loss": 0.3666, + "step": 2799500 + }, + { + "epoch": 18.9475963620615, + "grad_norm": 0.3572694957256317, + "learning_rate": 4.810524036379385e-05, + "loss": 0.3643, + "step": 2800000 + }, + { + "epoch": 18.950979861411867, + "grad_norm": 0.35780635476112366, + "learning_rate": 4.8104902013858814e-05, + "loss": 0.3677, + "step": 2800500 + }, + { + "epoch": 18.954363360762233, + "grad_norm": 0.34452852606773376, + "learning_rate": 4.810456366392378e-05, + "loss": 0.3664, + "step": 2801000 + }, + { + "epoch": 18.957746860112604, + "grad_norm": 0.3469793498516083, + "learning_rate": 4.8104225313988745e-05, + "loss": 0.3674, + "step": 2801500 + }, + { + "epoch": 18.96113035946297, + "grad_norm": 0.39285343885421753, + "learning_rate": 4.810388696405371e-05, + "loss": 0.3659, + "step": 2802000 + }, + { + "epoch": 18.96451385881334, + "grad_norm": 0.39713558554649353, + "learning_rate": 4.810354861411867e-05, + "loss": 0.3672, + "step": 2802500 + }, + { + "epoch": 18.967897358163707, + "grad_norm": 0.3319869935512543, + "learning_rate": 4.810321026418363e-05, + "loss": 0.3648, + "step": 2803000 + }, + { + "epoch": 18.971280857514074, + "grad_norm": 0.39673224091529846, + "learning_rate": 4.8102871914248593e-05, + "loss": 0.3661, + "step": 2803500 + }, + { + "epoch": 18.974664356864444, + "grad_norm": 0.3552807867527008, + "learning_rate": 4.8102533564313556e-05, + "loss": 0.3668, + "step": 2804000 + }, + { + "epoch": 18.97804785621481, + "grad_norm": 0.3664064109325409, + "learning_rate": 4.810219521437852e-05, + "loss": 0.3664, + "step": 2804500 + }, + { + "epoch": 18.981431355565178, + "grad_norm": 0.3757137954235077, + "learning_rate": 4.8101856864443487e-05, + "loss": 0.3663, + "step": 2805000 + }, + { + "epoch": 18.984814854915548, + "grad_norm": 0.3732067048549652, + "learning_rate": 4.810151851450845e-05, + "loss": 0.3664, + "step": 2805500 + }, + { + "epoch": 18.988198354265915, + "grad_norm": 0.34929749369621277, + "learning_rate": 4.810118016457341e-05, + "loss": 0.3685, + "step": 2806000 + }, + { + "epoch": 18.991581853616285, + "grad_norm": 0.3506418764591217, + "learning_rate": 4.810084181463837e-05, + "loss": 0.3647, + "step": 2806500 + }, + { + "epoch": 18.994965352966652, + "grad_norm": 0.3531164824962616, + "learning_rate": 4.810050346470334e-05, + "loss": 0.3657, + "step": 2807000 + }, + { + "epoch": 18.99834885231702, + "grad_norm": 0.35706275701522827, + "learning_rate": 4.8100165114768304e-05, + "loss": 0.3659, + "step": 2807500 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.8607044074124969, + "eval_loss": 0.5654244422912598, + "eval_runtime": 3394.0031, + "eval_samples_per_second": 85.664, + "eval_steps_per_second": 5.354, + "step": 2807744 + }, + { + "epoch": 19.00173235166739, + "grad_norm": 0.34599050879478455, + "learning_rate": 4.809982676483326e-05, + "loss": 0.3665, + "step": 2808000 + }, + { + "epoch": 19.005115851017756, + "grad_norm": 0.376055508852005, + "learning_rate": 4.809948841489823e-05, + "loss": 0.3639, + "step": 2808500 + }, + { + "epoch": 19.008499350368126, + "grad_norm": 0.39492684602737427, + "learning_rate": 4.809915006496319e-05, + "loss": 0.3646, + "step": 2809000 + }, + { + "epoch": 19.011882849718493, + "grad_norm": 0.3591514527797699, + "learning_rate": 4.809881171502815e-05, + "loss": 0.3638, + "step": 2809500 + }, + { + "epoch": 19.01526634906886, + "grad_norm": 0.3865591287612915, + "learning_rate": 4.8098473365093115e-05, + "loss": 0.3626, + "step": 2810000 + }, + { + "epoch": 19.01864984841923, + "grad_norm": 0.3449145257472992, + "learning_rate": 4.8098135015158083e-05, + "loss": 0.3637, + "step": 2810500 + }, + { + "epoch": 19.022033347769597, + "grad_norm": 0.4153270423412323, + "learning_rate": 4.8097796665223046e-05, + "loss": 0.3634, + "step": 2811000 + }, + { + "epoch": 19.025416847119967, + "grad_norm": 0.38772618770599365, + "learning_rate": 4.809745831528801e-05, + "loss": 0.3647, + "step": 2811500 + }, + { + "epoch": 19.028800346470334, + "grad_norm": 0.34968364238739014, + "learning_rate": 4.809711996535297e-05, + "loss": 0.3651, + "step": 2812000 + }, + { + "epoch": 19.0321838458207, + "grad_norm": 0.3845962584018707, + "learning_rate": 4.809678161541793e-05, + "loss": 0.3647, + "step": 2812500 + }, + { + "epoch": 19.03556734517107, + "grad_norm": 0.350651353597641, + "learning_rate": 4.8096443265482894e-05, + "loss": 0.3639, + "step": 2813000 + }, + { + "epoch": 19.038950844521437, + "grad_norm": 0.3638920187950134, + "learning_rate": 4.8096104915547856e-05, + "loss": 0.3651, + "step": 2813500 + }, + { + "epoch": 19.042334343871804, + "grad_norm": 0.3764038681983948, + "learning_rate": 4.809576656561282e-05, + "loss": 0.3643, + "step": 2814000 + }, + { + "epoch": 19.045717843222175, + "grad_norm": 0.37085476517677307, + "learning_rate": 4.809542821567779e-05, + "loss": 0.3635, + "step": 2814500 + }, + { + "epoch": 19.04910134257254, + "grad_norm": 0.3822312653064728, + "learning_rate": 4.809508986574275e-05, + "loss": 0.3665, + "step": 2815000 + }, + { + "epoch": 19.05248484192291, + "grad_norm": 0.34775885939598083, + "learning_rate": 4.809475151580771e-05, + "loss": 0.3657, + "step": 2815500 + }, + { + "epoch": 19.05586834127328, + "grad_norm": 0.3567921817302704, + "learning_rate": 4.8094413165872674e-05, + "loss": 0.3643, + "step": 2816000 + }, + { + "epoch": 19.059251840623645, + "grad_norm": 0.39007171988487244, + "learning_rate": 4.809407481593764e-05, + "loss": 0.3645, + "step": 2816500 + }, + { + "epoch": 19.062635339974015, + "grad_norm": 0.40277397632598877, + "learning_rate": 4.8093736466002605e-05, + "loss": 0.3656, + "step": 2817000 + }, + { + "epoch": 19.066018839324382, + "grad_norm": 0.3536837100982666, + "learning_rate": 4.809339811606756e-05, + "loss": 0.3646, + "step": 2817500 + }, + { + "epoch": 19.069402338674752, + "grad_norm": 0.36523324251174927, + "learning_rate": 4.809305976613253e-05, + "loss": 0.3655, + "step": 2818000 + }, + { + "epoch": 19.07278583802512, + "grad_norm": 0.35634082555770874, + "learning_rate": 4.809272141619749e-05, + "loss": 0.3645, + "step": 2818500 + }, + { + "epoch": 19.076169337375486, + "grad_norm": 0.36509403586387634, + "learning_rate": 4.809238306626245e-05, + "loss": 0.365, + "step": 2819000 + }, + { + "epoch": 19.079552836725856, + "grad_norm": 0.3636913299560547, + "learning_rate": 4.8092044716327415e-05, + "loss": 0.3647, + "step": 2819500 + }, + { + "epoch": 19.082936336076223, + "grad_norm": 0.35474660992622375, + "learning_rate": 4.809170636639238e-05, + "loss": 0.3653, + "step": 2820000 + }, + { + "epoch": 19.086319835426593, + "grad_norm": 0.3720208406448364, + "learning_rate": 4.8091368016457346e-05, + "loss": 0.3631, + "step": 2820500 + }, + { + "epoch": 19.08970333477696, + "grad_norm": 0.40283653140068054, + "learning_rate": 4.809102966652231e-05, + "loss": 0.3651, + "step": 2821000 + }, + { + "epoch": 19.093086834127327, + "grad_norm": 0.36578333377838135, + "learning_rate": 4.809069131658727e-05, + "loss": 0.3652, + "step": 2821500 + }, + { + "epoch": 19.096470333477697, + "grad_norm": 0.3494338095188141, + "learning_rate": 4.809035296665223e-05, + "loss": 0.366, + "step": 2822000 + }, + { + "epoch": 19.099853832828064, + "grad_norm": 0.39220380783081055, + "learning_rate": 4.8090014616717195e-05, + "loss": 0.3632, + "step": 2822500 + }, + { + "epoch": 19.10323733217843, + "grad_norm": 0.39391595125198364, + "learning_rate": 4.808967626678216e-05, + "loss": 0.3654, + "step": 2823000 + }, + { + "epoch": 19.1066208315288, + "grad_norm": 0.36776575446128845, + "learning_rate": 4.808933791684712e-05, + "loss": 0.3645, + "step": 2823500 + }, + { + "epoch": 19.110004330879168, + "grad_norm": 0.3435305058956146, + "learning_rate": 4.808899956691209e-05, + "loss": 0.3663, + "step": 2824000 + }, + { + "epoch": 19.113387830229538, + "grad_norm": 0.412218302488327, + "learning_rate": 4.808866121697705e-05, + "loss": 0.366, + "step": 2824500 + }, + { + "epoch": 19.116771329579905, + "grad_norm": 0.3410913944244385, + "learning_rate": 4.808832286704201e-05, + "loss": 0.3666, + "step": 2825000 + }, + { + "epoch": 19.12015482893027, + "grad_norm": 0.3647315204143524, + "learning_rate": 4.8087984517106974e-05, + "loss": 0.3653, + "step": 2825500 + }, + { + "epoch": 19.12353832828064, + "grad_norm": 0.3654209077358246, + "learning_rate": 4.808764616717194e-05, + "loss": 0.3651, + "step": 2826000 + }, + { + "epoch": 19.12692182763101, + "grad_norm": 0.37966057658195496, + "learning_rate": 4.8087307817236905e-05, + "loss": 0.3647, + "step": 2826500 + }, + { + "epoch": 19.13030532698138, + "grad_norm": 0.3871251940727234, + "learning_rate": 4.808696946730186e-05, + "loss": 0.365, + "step": 2827000 + }, + { + "epoch": 19.133688826331746, + "grad_norm": 0.3365837037563324, + "learning_rate": 4.808663111736682e-05, + "loss": 0.3656, + "step": 2827500 + }, + { + "epoch": 19.137072325682112, + "grad_norm": 0.3533678650856018, + "learning_rate": 4.808629276743179e-05, + "loss": 0.3637, + "step": 2828000 + }, + { + "epoch": 19.140455825032483, + "grad_norm": 0.3549516499042511, + "learning_rate": 4.8085954417496754e-05, + "loss": 0.365, + "step": 2828500 + }, + { + "epoch": 19.14383932438285, + "grad_norm": 0.3680468797683716, + "learning_rate": 4.8085616067561716e-05, + "loss": 0.3649, + "step": 2829000 + }, + { + "epoch": 19.14722282373322, + "grad_norm": 0.37863093614578247, + "learning_rate": 4.808527771762668e-05, + "loss": 0.3652, + "step": 2829500 + }, + { + "epoch": 19.150606323083586, + "grad_norm": 0.41318458318710327, + "learning_rate": 4.808493936769165e-05, + "loss": 0.3633, + "step": 2830000 + }, + { + "epoch": 19.153989822433953, + "grad_norm": 0.37374892830848694, + "learning_rate": 4.808460101775661e-05, + "loss": 0.3639, + "step": 2830500 + }, + { + "epoch": 19.157373321784323, + "grad_norm": 0.371120423078537, + "learning_rate": 4.808426266782157e-05, + "loss": 0.3647, + "step": 2831000 + }, + { + "epoch": 19.16075682113469, + "grad_norm": 0.3479118347167969, + "learning_rate": 4.808392431788653e-05, + "loss": 0.3659, + "step": 2831500 + }, + { + "epoch": 19.164140320485057, + "grad_norm": 0.350894957780838, + "learning_rate": 4.8083585967951495e-05, + "loss": 0.3657, + "step": 2832000 + }, + { + "epoch": 19.167523819835427, + "grad_norm": 0.3924980163574219, + "learning_rate": 4.808324761801646e-05, + "loss": 0.3654, + "step": 2832500 + }, + { + "epoch": 19.170907319185794, + "grad_norm": 0.349345862865448, + "learning_rate": 4.808290926808142e-05, + "loss": 0.3649, + "step": 2833000 + }, + { + "epoch": 19.174290818536164, + "grad_norm": 0.35752519965171814, + "learning_rate": 4.808257091814639e-05, + "loss": 0.3648, + "step": 2833500 + }, + { + "epoch": 19.17767431788653, + "grad_norm": 0.36245429515838623, + "learning_rate": 4.808223256821135e-05, + "loss": 0.3653, + "step": 2834000 + }, + { + "epoch": 19.181057817236898, + "grad_norm": 0.3374299705028534, + "learning_rate": 4.808189421827631e-05, + "loss": 0.3647, + "step": 2834500 + }, + { + "epoch": 19.184441316587268, + "grad_norm": 0.33912578225135803, + "learning_rate": 4.8081555868341275e-05, + "loss": 0.3641, + "step": 2835000 + }, + { + "epoch": 19.187824815937635, + "grad_norm": 0.387525349855423, + "learning_rate": 4.8081217518406244e-05, + "loss": 0.3654, + "step": 2835500 + }, + { + "epoch": 19.191208315288005, + "grad_norm": 0.32535794377326965, + "learning_rate": 4.8080879168471206e-05, + "loss": 0.3649, + "step": 2836000 + }, + { + "epoch": 19.194591814638372, + "grad_norm": 0.38029584288597107, + "learning_rate": 4.808054081853616e-05, + "loss": 0.3645, + "step": 2836500 + }, + { + "epoch": 19.19797531398874, + "grad_norm": 0.3509443700313568, + "learning_rate": 4.808020246860112e-05, + "loss": 0.3646, + "step": 2837000 + }, + { + "epoch": 19.20135881333911, + "grad_norm": 0.38972824811935425, + "learning_rate": 4.807986411866609e-05, + "loss": 0.3639, + "step": 2837500 + }, + { + "epoch": 19.204742312689476, + "grad_norm": 0.38763338327407837, + "learning_rate": 4.8079525768731054e-05, + "loss": 0.3647, + "step": 2838000 + }, + { + "epoch": 19.208125812039842, + "grad_norm": 0.35395440459251404, + "learning_rate": 4.8079187418796016e-05, + "loss": 0.3647, + "step": 2838500 + }, + { + "epoch": 19.211509311390213, + "grad_norm": 0.36853599548339844, + "learning_rate": 4.807884906886098e-05, + "loss": 0.3642, + "step": 2839000 + }, + { + "epoch": 19.21489281074058, + "grad_norm": 0.34220248460769653, + "learning_rate": 4.807851071892595e-05, + "loss": 0.3656, + "step": 2839500 + }, + { + "epoch": 19.21827631009095, + "grad_norm": 0.38692814111709595, + "learning_rate": 4.807817236899091e-05, + "loss": 0.3646, + "step": 2840000 + }, + { + "epoch": 19.221659809441316, + "grad_norm": 0.3987424969673157, + "learning_rate": 4.807783401905587e-05, + "loss": 0.3647, + "step": 2840500 + }, + { + "epoch": 19.225043308791683, + "grad_norm": 0.3585709035396576, + "learning_rate": 4.8077495669120834e-05, + "loss": 0.366, + "step": 2841000 + }, + { + "epoch": 19.228426808142054, + "grad_norm": 0.39497193694114685, + "learning_rate": 4.8077157319185796e-05, + "loss": 0.3672, + "step": 2841500 + }, + { + "epoch": 19.23181030749242, + "grad_norm": 0.33567896485328674, + "learning_rate": 4.807681896925076e-05, + "loss": 0.3651, + "step": 2842000 + }, + { + "epoch": 19.23519380684279, + "grad_norm": 0.34376466274261475, + "learning_rate": 4.807648061931572e-05, + "loss": 0.3659, + "step": 2842500 + }, + { + "epoch": 19.238577306193157, + "grad_norm": 0.39341068267822266, + "learning_rate": 4.807614226938069e-05, + "loss": 0.3652, + "step": 2843000 + }, + { + "epoch": 19.241960805543524, + "grad_norm": 0.3692178428173065, + "learning_rate": 4.807580391944565e-05, + "loss": 0.3647, + "step": 2843500 + }, + { + "epoch": 19.245344304893894, + "grad_norm": 0.350510835647583, + "learning_rate": 4.807546556951061e-05, + "loss": 0.3657, + "step": 2844000 + }, + { + "epoch": 19.24872780424426, + "grad_norm": 0.35085275769233704, + "learning_rate": 4.8075127219575575e-05, + "loss": 0.3645, + "step": 2844500 + }, + { + "epoch": 19.25211130359463, + "grad_norm": 0.3581259846687317, + "learning_rate": 4.8074788869640544e-05, + "loss": 0.3664, + "step": 2845000 + }, + { + "epoch": 19.255494802944998, + "grad_norm": 0.3346034586429596, + "learning_rate": 4.8074450519705506e-05, + "loss": 0.3654, + "step": 2845500 + }, + { + "epoch": 19.258878302295365, + "grad_norm": 0.41722241044044495, + "learning_rate": 4.807411216977046e-05, + "loss": 0.3657, + "step": 2846000 + }, + { + "epoch": 19.262261801645735, + "grad_norm": 0.33750683069229126, + "learning_rate": 4.8073773819835424e-05, + "loss": 0.366, + "step": 2846500 + }, + { + "epoch": 19.265645300996102, + "grad_norm": 0.3772087097167969, + "learning_rate": 4.807343546990039e-05, + "loss": 0.3657, + "step": 2847000 + }, + { + "epoch": 19.26902880034647, + "grad_norm": 0.3495555818080902, + "learning_rate": 4.8073097119965355e-05, + "loss": 0.3654, + "step": 2847500 + }, + { + "epoch": 19.27241229969684, + "grad_norm": 0.37127164006233215, + "learning_rate": 4.807275877003032e-05, + "loss": 0.3642, + "step": 2848000 + }, + { + "epoch": 19.275795799047206, + "grad_norm": 0.3762498199939728, + "learning_rate": 4.807242042009528e-05, + "loss": 0.3648, + "step": 2848500 + }, + { + "epoch": 19.279179298397576, + "grad_norm": 0.378359317779541, + "learning_rate": 4.807208207016025e-05, + "loss": 0.3642, + "step": 2849000 + }, + { + "epoch": 19.282562797747943, + "grad_norm": 0.38522207736968994, + "learning_rate": 4.807174372022521e-05, + "loss": 0.366, + "step": 2849500 + }, + { + "epoch": 19.28594629709831, + "grad_norm": 0.38753610849380493, + "learning_rate": 4.807140537029017e-05, + "loss": 0.3647, + "step": 2850000 + }, + { + "epoch": 19.28932979644868, + "grad_norm": 0.3656563460826874, + "learning_rate": 4.8071067020355134e-05, + "loss": 0.3651, + "step": 2850500 + }, + { + "epoch": 19.292713295799047, + "grad_norm": 0.3563537001609802, + "learning_rate": 4.8070728670420097e-05, + "loss": 0.3657, + "step": 2851000 + }, + { + "epoch": 19.296096795149417, + "grad_norm": 0.3515377342700958, + "learning_rate": 4.807039032048506e-05, + "loss": 0.3671, + "step": 2851500 + }, + { + "epoch": 19.299480294499784, + "grad_norm": 0.335602343082428, + "learning_rate": 4.807005197055002e-05, + "loss": 0.365, + "step": 2852000 + }, + { + "epoch": 19.30286379385015, + "grad_norm": 0.35789671540260315, + "learning_rate": 4.806971362061499e-05, + "loss": 0.3666, + "step": 2852500 + }, + { + "epoch": 19.30624729320052, + "grad_norm": 0.36818790435791016, + "learning_rate": 4.806937527067995e-05, + "loss": 0.3642, + "step": 2853000 + }, + { + "epoch": 19.309630792550887, + "grad_norm": 0.34332209825515747, + "learning_rate": 4.8069036920744914e-05, + "loss": 0.3665, + "step": 2853500 + }, + { + "epoch": 19.313014291901254, + "grad_norm": 0.3529587388038635, + "learning_rate": 4.8068698570809876e-05, + "loss": 0.3648, + "step": 2854000 + }, + { + "epoch": 19.316397791251624, + "grad_norm": 0.3891531527042389, + "learning_rate": 4.8068360220874845e-05, + "loss": 0.3652, + "step": 2854500 + }, + { + "epoch": 19.31978129060199, + "grad_norm": 0.3630481958389282, + "learning_rate": 4.806802187093981e-05, + "loss": 0.3655, + "step": 2855000 + }, + { + "epoch": 19.32316478995236, + "grad_norm": 0.3677315413951874, + "learning_rate": 4.806768352100476e-05, + "loss": 0.3662, + "step": 2855500 + }, + { + "epoch": 19.32654828930273, + "grad_norm": 0.3780626654624939, + "learning_rate": 4.8067345171069725e-05, + "loss": 0.3641, + "step": 2856000 + }, + { + "epoch": 19.329931788653095, + "grad_norm": 0.33556923270225525, + "learning_rate": 4.8067006821134693e-05, + "loss": 0.3641, + "step": 2856500 + }, + { + "epoch": 19.333315288003465, + "grad_norm": 0.3398982584476471, + "learning_rate": 4.8066668471199656e-05, + "loss": 0.366, + "step": 2857000 + }, + { + "epoch": 19.336698787353832, + "grad_norm": 0.33337876200675964, + "learning_rate": 4.806633012126462e-05, + "loss": 0.3649, + "step": 2857500 + }, + { + "epoch": 19.340082286704202, + "grad_norm": 0.3427415192127228, + "learning_rate": 4.806599177132958e-05, + "loss": 0.365, + "step": 2858000 + }, + { + "epoch": 19.34346578605457, + "grad_norm": 0.33809319138526917, + "learning_rate": 4.806565342139455e-05, + "loss": 0.364, + "step": 2858500 + }, + { + "epoch": 19.346849285404936, + "grad_norm": 0.35444408655166626, + "learning_rate": 4.806531507145951e-05, + "loss": 0.3666, + "step": 2859000 + }, + { + "epoch": 19.350232784755306, + "grad_norm": 0.3796485364437103, + "learning_rate": 4.806497672152447e-05, + "loss": 0.3656, + "step": 2859500 + }, + { + "epoch": 19.353616284105673, + "grad_norm": 0.4037618637084961, + "learning_rate": 4.8064638371589435e-05, + "loss": 0.3653, + "step": 2860000 + }, + { + "epoch": 19.356999783456043, + "grad_norm": 0.35624685883522034, + "learning_rate": 4.80643000216544e-05, + "loss": 0.3633, + "step": 2860500 + }, + { + "epoch": 19.36038328280641, + "grad_norm": 0.39349284768104553, + "learning_rate": 4.806396167171936e-05, + "loss": 0.366, + "step": 2861000 + }, + { + "epoch": 19.363766782156777, + "grad_norm": 0.4229692220687866, + "learning_rate": 4.806362332178432e-05, + "loss": 0.3653, + "step": 2861500 + }, + { + "epoch": 19.367150281507147, + "grad_norm": 0.40164726972579956, + "learning_rate": 4.806328497184929e-05, + "loss": 0.3657, + "step": 2862000 + }, + { + "epoch": 19.370533780857514, + "grad_norm": 0.37175437808036804, + "learning_rate": 4.806294662191425e-05, + "loss": 0.365, + "step": 2862500 + }, + { + "epoch": 19.37391728020788, + "grad_norm": 0.3736785352230072, + "learning_rate": 4.8062608271979215e-05, + "loss": 0.3643, + "step": 2863000 + }, + { + "epoch": 19.37730077955825, + "grad_norm": 0.3389967978000641, + "learning_rate": 4.806226992204418e-05, + "loss": 0.3655, + "step": 2863500 + }, + { + "epoch": 19.380684278908618, + "grad_norm": 0.35902392864227295, + "learning_rate": 4.8061931572109146e-05, + "loss": 0.3646, + "step": 2864000 + }, + { + "epoch": 19.384067778258988, + "grad_norm": 0.37433475255966187, + "learning_rate": 4.806159322217411e-05, + "loss": 0.3665, + "step": 2864500 + }, + { + "epoch": 19.387451277609355, + "grad_norm": 0.3930208384990692, + "learning_rate": 4.806125487223906e-05, + "loss": 0.3655, + "step": 2865000 + }, + { + "epoch": 19.39083477695972, + "grad_norm": 0.32924091815948486, + "learning_rate": 4.8060916522304025e-05, + "loss": 0.3652, + "step": 2865500 + }, + { + "epoch": 19.39421827631009, + "grad_norm": 0.3700878322124481, + "learning_rate": 4.8060578172368994e-05, + "loss": 0.3646, + "step": 2866000 + }, + { + "epoch": 19.39760177566046, + "grad_norm": 0.36699455976486206, + "learning_rate": 4.8060239822433956e-05, + "loss": 0.364, + "step": 2866500 + }, + { + "epoch": 19.40098527501083, + "grad_norm": 0.3643014132976532, + "learning_rate": 4.805990147249892e-05, + "loss": 0.3655, + "step": 2867000 + }, + { + "epoch": 19.404368774361195, + "grad_norm": 0.4105173349380493, + "learning_rate": 4.805956312256388e-05, + "loss": 0.3665, + "step": 2867500 + }, + { + "epoch": 19.407752273711562, + "grad_norm": 0.35919809341430664, + "learning_rate": 4.805922477262885e-05, + "loss": 0.3663, + "step": 2868000 + }, + { + "epoch": 19.411135773061932, + "grad_norm": 0.3556428551673889, + "learning_rate": 4.805888642269381e-05, + "loss": 0.3648, + "step": 2868500 + }, + { + "epoch": 19.4145192724123, + "grad_norm": 0.37149539589881897, + "learning_rate": 4.8058548072758774e-05, + "loss": 0.3666, + "step": 2869000 + }, + { + "epoch": 19.41790277176267, + "grad_norm": 0.39399322867393494, + "learning_rate": 4.8058209722823736e-05, + "loss": 0.3653, + "step": 2869500 + }, + { + "epoch": 19.421286271113036, + "grad_norm": 0.33901286125183105, + "learning_rate": 4.80578713728887e-05, + "loss": 0.366, + "step": 2870000 + }, + { + "epoch": 19.424669770463403, + "grad_norm": 0.33520957827568054, + "learning_rate": 4.805753302295366e-05, + "loss": 0.3644, + "step": 2870500 + }, + { + "epoch": 19.428053269813773, + "grad_norm": 0.3945138454437256, + "learning_rate": 4.805719467301862e-05, + "loss": 0.3666, + "step": 2871000 + }, + { + "epoch": 19.43143676916414, + "grad_norm": 0.40912050008773804, + "learning_rate": 4.805685632308359e-05, + "loss": 0.3657, + "step": 2871500 + }, + { + "epoch": 19.434820268514507, + "grad_norm": 0.3628746271133423, + "learning_rate": 4.805651797314855e-05, + "loss": 0.3666, + "step": 2872000 + }, + { + "epoch": 19.438203767864877, + "grad_norm": 0.3478717803955078, + "learning_rate": 4.8056179623213515e-05, + "loss": 0.3663, + "step": 2872500 + }, + { + "epoch": 19.441587267215244, + "grad_norm": 0.38110673427581787, + "learning_rate": 4.805584127327848e-05, + "loss": 0.3654, + "step": 2873000 + }, + { + "epoch": 19.444970766565614, + "grad_norm": 0.36483538150787354, + "learning_rate": 4.805550292334344e-05, + "loss": 0.3645, + "step": 2873500 + }, + { + "epoch": 19.44835426591598, + "grad_norm": 0.41633760929107666, + "learning_rate": 4.805516457340841e-05, + "loss": 0.3666, + "step": 2874000 + }, + { + "epoch": 19.451737765266348, + "grad_norm": 0.4003651440143585, + "learning_rate": 4.8054826223473364e-05, + "loss": 0.3666, + "step": 2874500 + }, + { + "epoch": 19.455121264616718, + "grad_norm": 0.359640508890152, + "learning_rate": 4.8054487873538326e-05, + "loss": 0.3648, + "step": 2875000 + }, + { + "epoch": 19.458504763967085, + "grad_norm": 0.34756699204444885, + "learning_rate": 4.8054149523603295e-05, + "loss": 0.3643, + "step": 2875500 + }, + { + "epoch": 19.461888263317455, + "grad_norm": 0.3257155418395996, + "learning_rate": 4.805381117366826e-05, + "loss": 0.3645, + "step": 2876000 + }, + { + "epoch": 19.46527176266782, + "grad_norm": 0.3670973479747772, + "learning_rate": 4.805347282373322e-05, + "loss": 0.3655, + "step": 2876500 + }, + { + "epoch": 19.46865526201819, + "grad_norm": 0.39196884632110596, + "learning_rate": 4.805313447379818e-05, + "loss": 0.3677, + "step": 2877000 + }, + { + "epoch": 19.47203876136856, + "grad_norm": 0.3847583532333374, + "learning_rate": 4.805279612386315e-05, + "loss": 0.3656, + "step": 2877500 + }, + { + "epoch": 19.475422260718926, + "grad_norm": 0.3999863862991333, + "learning_rate": 4.805245777392811e-05, + "loss": 0.3662, + "step": 2878000 + }, + { + "epoch": 19.478805760069292, + "grad_norm": 0.35730212926864624, + "learning_rate": 4.8052119423993074e-05, + "loss": 0.3657, + "step": 2878500 + }, + { + "epoch": 19.482189259419663, + "grad_norm": 0.390097439289093, + "learning_rate": 4.8051781074058036e-05, + "loss": 0.3641, + "step": 2879000 + }, + { + "epoch": 19.48557275877003, + "grad_norm": 0.33165496587753296, + "learning_rate": 4.8051442724123e-05, + "loss": 0.3666, + "step": 2879500 + }, + { + "epoch": 19.4889562581204, + "grad_norm": 0.396045058965683, + "learning_rate": 4.805110437418796e-05, + "loss": 0.3648, + "step": 2880000 + }, + { + "epoch": 19.492339757470766, + "grad_norm": 0.32602620124816895, + "learning_rate": 4.805076602425292e-05, + "loss": 0.3654, + "step": 2880500 + }, + { + "epoch": 19.495723256821133, + "grad_norm": 0.362203985452652, + "learning_rate": 4.805042767431789e-05, + "loss": 0.3661, + "step": 2881000 + }, + { + "epoch": 19.499106756171503, + "grad_norm": 0.38408005237579346, + "learning_rate": 4.8050089324382854e-05, + "loss": 0.3656, + "step": 2881500 + }, + { + "epoch": 19.50249025552187, + "grad_norm": 0.3891410827636719, + "learning_rate": 4.8049750974447816e-05, + "loss": 0.3658, + "step": 2882000 + }, + { + "epoch": 19.50587375487224, + "grad_norm": 0.4252306818962097, + "learning_rate": 4.804941262451278e-05, + "loss": 0.3663, + "step": 2882500 + }, + { + "epoch": 19.509257254222607, + "grad_norm": 0.3757934868335724, + "learning_rate": 4.804907427457774e-05, + "loss": 0.3662, + "step": 2883000 + }, + { + "epoch": 19.512640753572974, + "grad_norm": 0.36446207761764526, + "learning_rate": 4.804873592464271e-05, + "loss": 0.3638, + "step": 2883500 + }, + { + "epoch": 19.516024252923344, + "grad_norm": 0.38249266147613525, + "learning_rate": 4.8048397574707664e-05, + "loss": 0.3657, + "step": 2884000 + }, + { + "epoch": 19.51940775227371, + "grad_norm": 0.37207794189453125, + "learning_rate": 4.8048059224772626e-05, + "loss": 0.3671, + "step": 2884500 + }, + { + "epoch": 19.52279125162408, + "grad_norm": 0.3630044162273407, + "learning_rate": 4.8047720874837595e-05, + "loss": 0.3663, + "step": 2885000 + }, + { + "epoch": 19.526174750974448, + "grad_norm": 0.3826695382595062, + "learning_rate": 4.804738252490256e-05, + "loss": 0.3647, + "step": 2885500 + }, + { + "epoch": 19.529558250324815, + "grad_norm": 0.4040972590446472, + "learning_rate": 4.804704417496752e-05, + "loss": 0.3647, + "step": 2886000 + }, + { + "epoch": 19.532941749675185, + "grad_norm": 0.37629270553588867, + "learning_rate": 4.804670582503248e-05, + "loss": 0.3667, + "step": 2886500 + }, + { + "epoch": 19.536325249025552, + "grad_norm": 0.3865208625793457, + "learning_rate": 4.804636747509745e-05, + "loss": 0.3645, + "step": 2887000 + }, + { + "epoch": 19.53970874837592, + "grad_norm": 0.3766942024230957, + "learning_rate": 4.804602912516241e-05, + "loss": 0.3654, + "step": 2887500 + }, + { + "epoch": 19.54309224772629, + "grad_norm": 0.3663296401500702, + "learning_rate": 4.8045690775227375e-05, + "loss": 0.3639, + "step": 2888000 + }, + { + "epoch": 19.546475747076656, + "grad_norm": 0.3530994951725006, + "learning_rate": 4.804535242529234e-05, + "loss": 0.3671, + "step": 2888500 + }, + { + "epoch": 19.549859246427026, + "grad_norm": 0.4040890336036682, + "learning_rate": 4.80450140753573e-05, + "loss": 0.3653, + "step": 2889000 + }, + { + "epoch": 19.553242745777393, + "grad_norm": 0.33798158168792725, + "learning_rate": 4.804467572542226e-05, + "loss": 0.3655, + "step": 2889500 + }, + { + "epoch": 19.55662624512776, + "grad_norm": 0.3921116292476654, + "learning_rate": 4.804433737548722e-05, + "loss": 0.3654, + "step": 2890000 + }, + { + "epoch": 19.56000974447813, + "grad_norm": 0.3624851405620575, + "learning_rate": 4.8043999025552185e-05, + "loss": 0.3671, + "step": 2890500 + }, + { + "epoch": 19.563393243828497, + "grad_norm": 0.4179016947746277, + "learning_rate": 4.8043660675617154e-05, + "loss": 0.367, + "step": 2891000 + }, + { + "epoch": 19.566776743178867, + "grad_norm": 0.34648746252059937, + "learning_rate": 4.8043322325682116e-05, + "loss": 0.3662, + "step": 2891500 + }, + { + "epoch": 19.570160242529234, + "grad_norm": 0.3729505240917206, + "learning_rate": 4.804298397574708e-05, + "loss": 0.3669, + "step": 2892000 + }, + { + "epoch": 19.5735437418796, + "grad_norm": 0.3661282956600189, + "learning_rate": 4.804264562581204e-05, + "loss": 0.365, + "step": 2892500 + }, + { + "epoch": 19.57692724122997, + "grad_norm": 0.38361188769340515, + "learning_rate": 4.804230727587701e-05, + "loss": 0.3654, + "step": 2893000 + }, + { + "epoch": 19.580310740580337, + "grad_norm": 0.397332102060318, + "learning_rate": 4.8041968925941965e-05, + "loss": 0.3665, + "step": 2893500 + }, + { + "epoch": 19.583694239930708, + "grad_norm": 0.3681514263153076, + "learning_rate": 4.804163057600693e-05, + "loss": 0.3643, + "step": 2894000 + }, + { + "epoch": 19.587077739281074, + "grad_norm": 0.3427649736404419, + "learning_rate": 4.8041292226071896e-05, + "loss": 0.365, + "step": 2894500 + }, + { + "epoch": 19.59046123863144, + "grad_norm": 0.3386278748512268, + "learning_rate": 4.804095387613686e-05, + "loss": 0.3677, + "step": 2895000 + }, + { + "epoch": 19.59384473798181, + "grad_norm": 0.3502010703086853, + "learning_rate": 4.804061552620182e-05, + "loss": 0.3662, + "step": 2895500 + }, + { + "epoch": 19.59722823733218, + "grad_norm": 0.34507283568382263, + "learning_rate": 4.804027717626678e-05, + "loss": 0.3665, + "step": 2896000 + }, + { + "epoch": 19.600611736682545, + "grad_norm": 0.33213046193122864, + "learning_rate": 4.803993882633175e-05, + "loss": 0.3658, + "step": 2896500 + }, + { + "epoch": 19.603995236032915, + "grad_norm": 0.3481467068195343, + "learning_rate": 4.803960047639671e-05, + "loss": 0.3659, + "step": 2897000 + }, + { + "epoch": 19.607378735383282, + "grad_norm": 0.3790886402130127, + "learning_rate": 4.8039262126461675e-05, + "loss": 0.3658, + "step": 2897500 + }, + { + "epoch": 19.610762234733652, + "grad_norm": 0.35485225915908813, + "learning_rate": 4.803892377652664e-05, + "loss": 0.3659, + "step": 2898000 + }, + { + "epoch": 19.61414573408402, + "grad_norm": 0.37360668182373047, + "learning_rate": 4.80385854265916e-05, + "loss": 0.3668, + "step": 2898500 + }, + { + "epoch": 19.617529233434386, + "grad_norm": 0.3755984604358673, + "learning_rate": 4.803824707665656e-05, + "loss": 0.3652, + "step": 2899000 + }, + { + "epoch": 19.620912732784756, + "grad_norm": 0.3325406014919281, + "learning_rate": 4.8037908726721524e-05, + "loss": 0.3646, + "step": 2899500 + }, + { + "epoch": 19.624296232135123, + "grad_norm": 0.38012993335723877, + "learning_rate": 4.8037570376786486e-05, + "loss": 0.3671, + "step": 2900000 + }, + { + "epoch": 19.627679731485493, + "grad_norm": 0.39566442370414734, + "learning_rate": 4.8037232026851455e-05, + "loss": 0.3646, + "step": 2900500 + }, + { + "epoch": 19.63106323083586, + "grad_norm": 0.35954439640045166, + "learning_rate": 4.803689367691642e-05, + "loss": 0.3654, + "step": 2901000 + }, + { + "epoch": 19.634446730186227, + "grad_norm": 0.41132161021232605, + "learning_rate": 4.803655532698138e-05, + "loss": 0.3667, + "step": 2901500 + }, + { + "epoch": 19.637830229536597, + "grad_norm": 0.3903202712535858, + "learning_rate": 4.803621697704634e-05, + "loss": 0.3663, + "step": 2902000 + }, + { + "epoch": 19.641213728886964, + "grad_norm": 0.35856491327285767, + "learning_rate": 4.803587862711131e-05, + "loss": 0.3661, + "step": 2902500 + }, + { + "epoch": 19.64459722823733, + "grad_norm": 0.3773479163646698, + "learning_rate": 4.8035540277176266e-05, + "loss": 0.3658, + "step": 2903000 + }, + { + "epoch": 19.6479807275877, + "grad_norm": 0.3696669638156891, + "learning_rate": 4.803520192724123e-05, + "loss": 0.3672, + "step": 2903500 + }, + { + "epoch": 19.651364226938068, + "grad_norm": 0.3732285797595978, + "learning_rate": 4.80348635773062e-05, + "loss": 0.3653, + "step": 2904000 + }, + { + "epoch": 19.654747726288438, + "grad_norm": 0.367210328578949, + "learning_rate": 4.803452522737116e-05, + "loss": 0.3669, + "step": 2904500 + }, + { + "epoch": 19.658131225638805, + "grad_norm": 0.3591116964817047, + "learning_rate": 4.803418687743612e-05, + "loss": 0.3665, + "step": 2905000 + }, + { + "epoch": 19.66151472498917, + "grad_norm": 0.3639654815196991, + "learning_rate": 4.803384852750108e-05, + "loss": 0.3657, + "step": 2905500 + }, + { + "epoch": 19.66489822433954, + "grad_norm": 0.36325597763061523, + "learning_rate": 4.803351017756605e-05, + "loss": 0.3672, + "step": 2906000 + }, + { + "epoch": 19.66828172368991, + "grad_norm": 0.3457540273666382, + "learning_rate": 4.8033171827631014e-05, + "loss": 0.3655, + "step": 2906500 + }, + { + "epoch": 19.67166522304028, + "grad_norm": 0.3496015965938568, + "learning_rate": 4.8032833477695976e-05, + "loss": 0.3639, + "step": 2907000 + }, + { + "epoch": 19.675048722390645, + "grad_norm": 0.338682621717453, + "learning_rate": 4.803249512776094e-05, + "loss": 0.3669, + "step": 2907500 + }, + { + "epoch": 19.678432221741012, + "grad_norm": 0.3519893288612366, + "learning_rate": 4.80321567778259e-05, + "loss": 0.3663, + "step": 2908000 + }, + { + "epoch": 19.681815721091382, + "grad_norm": 0.345056414604187, + "learning_rate": 4.803181842789086e-05, + "loss": 0.3656, + "step": 2908500 + }, + { + "epoch": 19.68519922044175, + "grad_norm": 0.37303248047828674, + "learning_rate": 4.8031480077955825e-05, + "loss": 0.3655, + "step": 2909000 + }, + { + "epoch": 19.68858271979212, + "grad_norm": 0.355040043592453, + "learning_rate": 4.803114172802079e-05, + "loss": 0.3666, + "step": 2909500 + }, + { + "epoch": 19.691966219142486, + "grad_norm": 0.3853912949562073, + "learning_rate": 4.8030803378085756e-05, + "loss": 0.3664, + "step": 2910000 + }, + { + "epoch": 19.695349718492853, + "grad_norm": 0.3670165240764618, + "learning_rate": 4.803046502815072e-05, + "loss": 0.3664, + "step": 2910500 + }, + { + "epoch": 19.698733217843223, + "grad_norm": 0.37371906638145447, + "learning_rate": 4.803012667821568e-05, + "loss": 0.3643, + "step": 2911000 + }, + { + "epoch": 19.70211671719359, + "grad_norm": 0.34766170382499695, + "learning_rate": 4.802978832828064e-05, + "loss": 0.3672, + "step": 2911500 + }, + { + "epoch": 19.705500216543957, + "grad_norm": 0.3667513430118561, + "learning_rate": 4.802944997834561e-05, + "loss": 0.3639, + "step": 2912000 + }, + { + "epoch": 19.708883715894327, + "grad_norm": 0.3334568440914154, + "learning_rate": 4.8029111628410566e-05, + "loss": 0.3669, + "step": 2912500 + }, + { + "epoch": 19.712267215244694, + "grad_norm": 0.3696806728839874, + "learning_rate": 4.802877327847553e-05, + "loss": 0.3657, + "step": 2913000 + }, + { + "epoch": 19.715650714595064, + "grad_norm": 0.3852570056915283, + "learning_rate": 4.80284349285405e-05, + "loss": 0.3645, + "step": 2913500 + }, + { + "epoch": 19.71903421394543, + "grad_norm": 0.3736285865306854, + "learning_rate": 4.802809657860546e-05, + "loss": 0.3647, + "step": 2914000 + }, + { + "epoch": 19.722417713295798, + "grad_norm": 0.33064740896224976, + "learning_rate": 4.802775822867042e-05, + "loss": 0.365, + "step": 2914500 + }, + { + "epoch": 19.725801212646168, + "grad_norm": 0.3868357837200165, + "learning_rate": 4.8027419878735384e-05, + "loss": 0.3642, + "step": 2915000 + }, + { + "epoch": 19.729184711996535, + "grad_norm": 0.39555037021636963, + "learning_rate": 4.802708152880035e-05, + "loss": 0.3654, + "step": 2915500 + }, + { + "epoch": 19.732568211346905, + "grad_norm": 0.33659178018569946, + "learning_rate": 4.8026743178865315e-05, + "loss": 0.3672, + "step": 2916000 + }, + { + "epoch": 19.73595171069727, + "grad_norm": 0.38515806198120117, + "learning_rate": 4.802640482893028e-05, + "loss": 0.3665, + "step": 2916500 + }, + { + "epoch": 19.73933521004764, + "grad_norm": 0.36636361479759216, + "learning_rate": 4.802606647899524e-05, + "loss": 0.3656, + "step": 2917000 + }, + { + "epoch": 19.74271870939801, + "grad_norm": 0.3609832227230072, + "learning_rate": 4.80257281290602e-05, + "loss": 0.3658, + "step": 2917500 + }, + { + "epoch": 19.746102208748376, + "grad_norm": 0.34398800134658813, + "learning_rate": 4.802538977912516e-05, + "loss": 0.368, + "step": 2918000 + }, + { + "epoch": 19.749485708098746, + "grad_norm": 0.3340972363948822, + "learning_rate": 4.8025051429190125e-05, + "loss": 0.3649, + "step": 2918500 + }, + { + "epoch": 19.752869207449113, + "grad_norm": 0.35565194487571716, + "learning_rate": 4.802471307925509e-05, + "loss": 0.3646, + "step": 2919000 + }, + { + "epoch": 19.75625270679948, + "grad_norm": 0.3764093518257141, + "learning_rate": 4.8024374729320056e-05, + "loss": 0.3663, + "step": 2919500 + }, + { + "epoch": 19.75963620614985, + "grad_norm": 0.3450486660003662, + "learning_rate": 4.802403637938502e-05, + "loss": 0.3665, + "step": 2920000 + }, + { + "epoch": 19.763019705500216, + "grad_norm": 0.3341296911239624, + "learning_rate": 4.802369802944998e-05, + "loss": 0.367, + "step": 2920500 + }, + { + "epoch": 19.766403204850583, + "grad_norm": 0.3815435469150543, + "learning_rate": 4.802335967951494e-05, + "loss": 0.3658, + "step": 2921000 + }, + { + "epoch": 19.769786704200953, + "grad_norm": 0.3538355231285095, + "learning_rate": 4.802302132957991e-05, + "loss": 0.366, + "step": 2921500 + }, + { + "epoch": 19.77317020355132, + "grad_norm": 0.3608250916004181, + "learning_rate": 4.8022682979644874e-05, + "loss": 0.3665, + "step": 2922000 + }, + { + "epoch": 19.77655370290169, + "grad_norm": 0.3507607579231262, + "learning_rate": 4.802234462970983e-05, + "loss": 0.3648, + "step": 2922500 + }, + { + "epoch": 19.779937202252057, + "grad_norm": 0.3902914524078369, + "learning_rate": 4.80220062797748e-05, + "loss": 0.3662, + "step": 2923000 + }, + { + "epoch": 19.783320701602424, + "grad_norm": 0.37399566173553467, + "learning_rate": 4.802166792983976e-05, + "loss": 0.3657, + "step": 2923500 + }, + { + "epoch": 19.786704200952794, + "grad_norm": 0.3999204635620117, + "learning_rate": 4.802132957990472e-05, + "loss": 0.3642, + "step": 2924000 + }, + { + "epoch": 19.79008770030316, + "grad_norm": 0.37231069803237915, + "learning_rate": 4.8020991229969684e-05, + "loss": 0.3668, + "step": 2924500 + }, + { + "epoch": 19.79347119965353, + "grad_norm": 0.3625221252441406, + "learning_rate": 4.802065288003465e-05, + "loss": 0.3644, + "step": 2925000 + }, + { + "epoch": 19.796854699003898, + "grad_norm": 0.3390725255012512, + "learning_rate": 4.8020314530099615e-05, + "loss": 0.3655, + "step": 2925500 + }, + { + "epoch": 19.800238198354265, + "grad_norm": 0.3890676498413086, + "learning_rate": 4.801997618016458e-05, + "loss": 0.3676, + "step": 2926000 + }, + { + "epoch": 19.803621697704635, + "grad_norm": 0.35610461235046387, + "learning_rate": 4.801963783022954e-05, + "loss": 0.364, + "step": 2926500 + }, + { + "epoch": 19.807005197055002, + "grad_norm": 0.3813265264034271, + "learning_rate": 4.80192994802945e-05, + "loss": 0.3663, + "step": 2927000 + }, + { + "epoch": 19.81038869640537, + "grad_norm": 0.3406016230583191, + "learning_rate": 4.8018961130359464e-05, + "loss": 0.3662, + "step": 2927500 + }, + { + "epoch": 19.81377219575574, + "grad_norm": 0.34513765573501587, + "learning_rate": 4.8018622780424426e-05, + "loss": 0.3654, + "step": 2928000 + }, + { + "epoch": 19.817155695106106, + "grad_norm": 0.38791200518608093, + "learning_rate": 4.801828443048939e-05, + "loss": 0.3656, + "step": 2928500 + }, + { + "epoch": 19.820539194456476, + "grad_norm": 0.39310234785079956, + "learning_rate": 4.801794608055436e-05, + "loss": 0.3659, + "step": 2929000 + }, + { + "epoch": 19.823922693806843, + "grad_norm": 0.3613438606262207, + "learning_rate": 4.801760773061932e-05, + "loss": 0.3658, + "step": 2929500 + }, + { + "epoch": 19.82730619315721, + "grad_norm": 0.3606795370578766, + "learning_rate": 4.801726938068428e-05, + "loss": 0.3651, + "step": 2930000 + }, + { + "epoch": 19.83068969250758, + "grad_norm": 0.37441298365592957, + "learning_rate": 4.801693103074924e-05, + "loss": 0.3659, + "step": 2930500 + }, + { + "epoch": 19.834073191857946, + "grad_norm": 0.3620903789997101, + "learning_rate": 4.801659268081421e-05, + "loss": 0.3655, + "step": 2931000 + }, + { + "epoch": 19.837456691208317, + "grad_norm": 0.3679297864437103, + "learning_rate": 4.8016254330879174e-05, + "loss": 0.3661, + "step": 2931500 + }, + { + "epoch": 19.840840190558684, + "grad_norm": 0.37004727125167847, + "learning_rate": 4.801591598094413e-05, + "loss": 0.3663, + "step": 2932000 + }, + { + "epoch": 19.84422368990905, + "grad_norm": 0.3652530312538147, + "learning_rate": 4.80155776310091e-05, + "loss": 0.3652, + "step": 2932500 + }, + { + "epoch": 19.84760718925942, + "grad_norm": 0.35250890254974365, + "learning_rate": 4.801523928107406e-05, + "loss": 0.3646, + "step": 2933000 + }, + { + "epoch": 19.850990688609787, + "grad_norm": 0.36383000016212463, + "learning_rate": 4.801490093113902e-05, + "loss": 0.3659, + "step": 2933500 + }, + { + "epoch": 19.854374187960158, + "grad_norm": 0.38137707114219666, + "learning_rate": 4.8014562581203985e-05, + "loss": 0.3663, + "step": 2934000 + }, + { + "epoch": 19.857757687310524, + "grad_norm": 0.37094008922576904, + "learning_rate": 4.8014224231268954e-05, + "loss": 0.3646, + "step": 2934500 + }, + { + "epoch": 19.86114118666089, + "grad_norm": 0.3024352490901947, + "learning_rate": 4.8013885881333916e-05, + "loss": 0.3663, + "step": 2935000 + }, + { + "epoch": 19.86452468601126, + "grad_norm": 0.35161951184272766, + "learning_rate": 4.801354753139888e-05, + "loss": 0.3655, + "step": 2935500 + }, + { + "epoch": 19.867908185361628, + "grad_norm": 0.37565305829048157, + "learning_rate": 4.801320918146384e-05, + "loss": 0.3672, + "step": 2936000 + }, + { + "epoch": 19.871291684711995, + "grad_norm": 0.3331560790538788, + "learning_rate": 4.80128708315288e-05, + "loss": 0.3673, + "step": 2936500 + }, + { + "epoch": 19.874675184062365, + "grad_norm": 0.3546275496482849, + "learning_rate": 4.8012532481593764e-05, + "loss": 0.3664, + "step": 2937000 + }, + { + "epoch": 19.878058683412732, + "grad_norm": 0.39799919724464417, + "learning_rate": 4.8012194131658727e-05, + "loss": 0.3662, + "step": 2937500 + }, + { + "epoch": 19.881442182763102, + "grad_norm": 0.3795362114906311, + "learning_rate": 4.801185578172369e-05, + "loss": 0.3659, + "step": 2938000 + }, + { + "epoch": 19.88482568211347, + "grad_norm": 0.3724011480808258, + "learning_rate": 4.801151743178866e-05, + "loss": 0.3656, + "step": 2938500 + }, + { + "epoch": 19.888209181463836, + "grad_norm": 0.3911020755767822, + "learning_rate": 4.801117908185362e-05, + "loss": 0.3666, + "step": 2939000 + }, + { + "epoch": 19.891592680814206, + "grad_norm": 0.3608606159687042, + "learning_rate": 4.801084073191858e-05, + "loss": 0.3657, + "step": 2939500 + }, + { + "epoch": 19.894976180164573, + "grad_norm": 0.35787469148635864, + "learning_rate": 4.8010502381983544e-05, + "loss": 0.3667, + "step": 2940000 + }, + { + "epoch": 19.898359679514943, + "grad_norm": 0.40856972336769104, + "learning_rate": 4.801016403204851e-05, + "loss": 0.3656, + "step": 2940500 + }, + { + "epoch": 19.90174317886531, + "grad_norm": 0.3542422354221344, + "learning_rate": 4.8009825682113475e-05, + "loss": 0.3636, + "step": 2941000 + }, + { + "epoch": 19.905126678215677, + "grad_norm": 0.3684796690940857, + "learning_rate": 4.800948733217843e-05, + "loss": 0.3663, + "step": 2941500 + }, + { + "epoch": 19.908510177566047, + "grad_norm": 0.34546706080436707, + "learning_rate": 4.80091489822434e-05, + "loss": 0.3653, + "step": 2942000 + }, + { + "epoch": 19.911893676916414, + "grad_norm": 0.3739299476146698, + "learning_rate": 4.800881063230836e-05, + "loss": 0.3658, + "step": 2942500 + }, + { + "epoch": 19.915277176266784, + "grad_norm": 0.37413647770881653, + "learning_rate": 4.8008472282373323e-05, + "loss": 0.3663, + "step": 2943000 + }, + { + "epoch": 19.91866067561715, + "grad_norm": 0.38979628682136536, + "learning_rate": 4.8008133932438286e-05, + "loss": 0.3651, + "step": 2943500 + }, + { + "epoch": 19.922044174967517, + "grad_norm": 0.35113048553466797, + "learning_rate": 4.8007795582503254e-05, + "loss": 0.3655, + "step": 2944000 + }, + { + "epoch": 19.925427674317888, + "grad_norm": 0.3997516930103302, + "learning_rate": 4.8007457232568217e-05, + "loss": 0.3657, + "step": 2944500 + }, + { + "epoch": 19.928811173668254, + "grad_norm": 0.3631218671798706, + "learning_rate": 4.800711888263318e-05, + "loss": 0.3655, + "step": 2945000 + }, + { + "epoch": 19.93219467301862, + "grad_norm": 0.38995862007141113, + "learning_rate": 4.800678053269814e-05, + "loss": 0.3652, + "step": 2945500 + }, + { + "epoch": 19.93557817236899, + "grad_norm": 0.36848723888397217, + "learning_rate": 4.80064421827631e-05, + "loss": 0.3672, + "step": 2946000 + }, + { + "epoch": 19.93896167171936, + "grad_norm": 0.3414466381072998, + "learning_rate": 4.8006103832828065e-05, + "loss": 0.3664, + "step": 2946500 + }, + { + "epoch": 19.94234517106973, + "grad_norm": 0.3705518841743469, + "learning_rate": 4.800576548289303e-05, + "loss": 0.3662, + "step": 2947000 + }, + { + "epoch": 19.945728670420095, + "grad_norm": 0.35034966468811035, + "learning_rate": 4.800542713295799e-05, + "loss": 0.3654, + "step": 2947500 + }, + { + "epoch": 19.949112169770462, + "grad_norm": 0.41538190841674805, + "learning_rate": 4.800508878302296e-05, + "loss": 0.3658, + "step": 2948000 + }, + { + "epoch": 19.952495669120832, + "grad_norm": 0.32482677698135376, + "learning_rate": 4.800475043308792e-05, + "loss": 0.3678, + "step": 2948500 + }, + { + "epoch": 19.9558791684712, + "grad_norm": 0.36166203022003174, + "learning_rate": 4.800441208315288e-05, + "loss": 0.3667, + "step": 2949000 + }, + { + "epoch": 19.95926266782157, + "grad_norm": 0.38970285654067993, + "learning_rate": 4.8004073733217845e-05, + "loss": 0.3656, + "step": 2949500 + }, + { + "epoch": 19.962646167171936, + "grad_norm": 0.35561996698379517, + "learning_rate": 4.8003735383282813e-05, + "loss": 0.3664, + "step": 2950000 + }, + { + "epoch": 19.966029666522303, + "grad_norm": 0.34169068932533264, + "learning_rate": 4.8003397033347776e-05, + "loss": 0.3662, + "step": 2950500 + }, + { + "epoch": 19.969413165872673, + "grad_norm": 0.3757932484149933, + "learning_rate": 4.800305868341273e-05, + "loss": 0.3633, + "step": 2951000 + }, + { + "epoch": 19.97279666522304, + "grad_norm": 0.3738752603530884, + "learning_rate": 4.80027203334777e-05, + "loss": 0.3663, + "step": 2951500 + }, + { + "epoch": 19.976180164573407, + "grad_norm": 0.3486514091491699, + "learning_rate": 4.800238198354266e-05, + "loss": 0.3656, + "step": 2952000 + }, + { + "epoch": 19.979563663923777, + "grad_norm": 0.3921927809715271, + "learning_rate": 4.8002043633607624e-05, + "loss": 0.3639, + "step": 2952500 + }, + { + "epoch": 19.982947163274144, + "grad_norm": 0.3608473241329193, + "learning_rate": 4.8001705283672586e-05, + "loss": 0.3655, + "step": 2953000 + }, + { + "epoch": 19.986330662624514, + "grad_norm": 0.3112834095954895, + "learning_rate": 4.800136693373755e-05, + "loss": 0.3672, + "step": 2953500 + }, + { + "epoch": 19.98971416197488, + "grad_norm": 0.36548370122909546, + "learning_rate": 4.800102858380252e-05, + "loss": 0.3642, + "step": 2954000 + }, + { + "epoch": 19.993097661325248, + "grad_norm": 0.34996843338012695, + "learning_rate": 4.800069023386748e-05, + "loss": 0.3673, + "step": 2954500 + }, + { + "epoch": 19.996481160675618, + "grad_norm": 0.3586348295211792, + "learning_rate": 4.800035188393244e-05, + "loss": 0.3665, + "step": 2955000 + }, + { + "epoch": 19.999864660025985, + "grad_norm": 0.35456401109695435, + "learning_rate": 4.8000013533997404e-05, + "loss": 0.3659, + "step": 2955500 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.8606411793262569, + "eval_loss": 0.5660711526870728, + "eval_runtime": 3389.5524, + "eval_samples_per_second": 85.777, + "eval_steps_per_second": 5.361, + "step": 2955520 + }, + { + "epoch": 20.003248159376355, + "grad_norm": 0.37346386909484863, + "learning_rate": 4.7999675184062366e-05, + "loss": 0.3626, + "step": 2956000 + }, + { + "epoch": 20.00663165872672, + "grad_norm": 0.345569908618927, + "learning_rate": 4.799933683412733e-05, + "loss": 0.3636, + "step": 2956500 + }, + { + "epoch": 20.01001515807709, + "grad_norm": 0.38589319586753845, + "learning_rate": 4.799899848419229e-05, + "loss": 0.3638, + "step": 2957000 + }, + { + "epoch": 20.01339865742746, + "grad_norm": 0.36453381180763245, + "learning_rate": 4.799866013425726e-05, + "loss": 0.3631, + "step": 2957500 + }, + { + "epoch": 20.016782156777825, + "grad_norm": 0.38249197602272034, + "learning_rate": 4.799832178432222e-05, + "loss": 0.3627, + "step": 2958000 + }, + { + "epoch": 20.020165656128196, + "grad_norm": 0.3455698788166046, + "learning_rate": 4.799798343438718e-05, + "loss": 0.3636, + "step": 2958500 + }, + { + "epoch": 20.023549155478563, + "grad_norm": 0.3930585980415344, + "learning_rate": 4.7997645084452145e-05, + "loss": 0.3642, + "step": 2959000 + }, + { + "epoch": 20.02693265482893, + "grad_norm": 0.38215509057044983, + "learning_rate": 4.7997306734517114e-05, + "loss": 0.3632, + "step": 2959500 + }, + { + "epoch": 20.0303161541793, + "grad_norm": 0.3310193717479706, + "learning_rate": 4.7996968384582076e-05, + "loss": 0.3644, + "step": 2960000 + }, + { + "epoch": 20.033699653529666, + "grad_norm": 0.3496049642562866, + "learning_rate": 4.799663003464703e-05, + "loss": 0.3626, + "step": 2960500 + }, + { + "epoch": 20.037083152880033, + "grad_norm": 0.3632880449295044, + "learning_rate": 4.7996291684711994e-05, + "loss": 0.3648, + "step": 2961000 + }, + { + "epoch": 20.040466652230403, + "grad_norm": 0.4196126461029053, + "learning_rate": 4.799595333477696e-05, + "loss": 0.3644, + "step": 2961500 + }, + { + "epoch": 20.04385015158077, + "grad_norm": 0.43219631910324097, + "learning_rate": 4.7995614984841925e-05, + "loss": 0.3636, + "step": 2962000 + }, + { + "epoch": 20.04723365093114, + "grad_norm": 0.35603439807891846, + "learning_rate": 4.799527663490689e-05, + "loss": 0.3657, + "step": 2962500 + }, + { + "epoch": 20.050617150281507, + "grad_norm": 0.39890220761299133, + "learning_rate": 4.799493828497185e-05, + "loss": 0.3638, + "step": 2963000 + }, + { + "epoch": 20.054000649631874, + "grad_norm": 0.351046621799469, + "learning_rate": 4.799459993503682e-05, + "loss": 0.3647, + "step": 2963500 + }, + { + "epoch": 20.057384148982244, + "grad_norm": 0.3713405728340149, + "learning_rate": 4.799426158510178e-05, + "loss": 0.365, + "step": 2964000 + }, + { + "epoch": 20.06076764833261, + "grad_norm": 0.3425780236721039, + "learning_rate": 4.799392323516674e-05, + "loss": 0.364, + "step": 2964500 + }, + { + "epoch": 20.06415114768298, + "grad_norm": 0.359878808259964, + "learning_rate": 4.7993584885231704e-05, + "loss": 0.3654, + "step": 2965000 + }, + { + "epoch": 20.067534647033348, + "grad_norm": 0.38747385144233704, + "learning_rate": 4.7993246535296666e-05, + "loss": 0.3633, + "step": 2965500 + }, + { + "epoch": 20.070918146383715, + "grad_norm": 0.3746489882469177, + "learning_rate": 4.799290818536163e-05, + "loss": 0.3642, + "step": 2966000 + }, + { + "epoch": 20.074301645734085, + "grad_norm": 0.41375428438186646, + "learning_rate": 4.799256983542659e-05, + "loss": 0.3643, + "step": 2966500 + }, + { + "epoch": 20.077685145084452, + "grad_norm": 0.4087056517601013, + "learning_rate": 4.799223148549156e-05, + "loss": 0.365, + "step": 2967000 + }, + { + "epoch": 20.08106864443482, + "grad_norm": 0.3827906847000122, + "learning_rate": 4.799189313555652e-05, + "loss": 0.3635, + "step": 2967500 + }, + { + "epoch": 20.08445214378519, + "grad_norm": 0.34499284625053406, + "learning_rate": 4.7991554785621484e-05, + "loss": 0.3636, + "step": 2968000 + }, + { + "epoch": 20.087835643135556, + "grad_norm": 0.3452380895614624, + "learning_rate": 4.7991216435686446e-05, + "loss": 0.3636, + "step": 2968500 + }, + { + "epoch": 20.091219142485926, + "grad_norm": 0.3407035171985626, + "learning_rate": 4.7990878085751415e-05, + "loss": 0.3647, + "step": 2969000 + }, + { + "epoch": 20.094602641836293, + "grad_norm": 0.37500280141830444, + "learning_rate": 4.799053973581638e-05, + "loss": 0.3651, + "step": 2969500 + }, + { + "epoch": 20.09798614118666, + "grad_norm": 0.3894490897655487, + "learning_rate": 4.799020138588133e-05, + "loss": 0.3635, + "step": 2970000 + }, + { + "epoch": 20.10136964053703, + "grad_norm": 0.3936595618724823, + "learning_rate": 4.7989863035946294e-05, + "loss": 0.3644, + "step": 2970500 + }, + { + "epoch": 20.104753139887396, + "grad_norm": 0.37825560569763184, + "learning_rate": 4.798952468601126e-05, + "loss": 0.3646, + "step": 2971000 + }, + { + "epoch": 20.108136639237767, + "grad_norm": 0.3469005823135376, + "learning_rate": 4.7989186336076225e-05, + "loss": 0.3655, + "step": 2971500 + }, + { + "epoch": 20.111520138588133, + "grad_norm": 0.37107598781585693, + "learning_rate": 4.798884798614119e-05, + "loss": 0.3671, + "step": 2972000 + }, + { + "epoch": 20.1149036379385, + "grad_norm": 0.3491670489311218, + "learning_rate": 4.798850963620615e-05, + "loss": 0.3643, + "step": 2972500 + }, + { + "epoch": 20.11828713728887, + "grad_norm": 0.3483146131038666, + "learning_rate": 4.798817128627112e-05, + "loss": 0.3628, + "step": 2973000 + }, + { + "epoch": 20.121670636639237, + "grad_norm": 0.35180410742759705, + "learning_rate": 4.798783293633608e-05, + "loss": 0.3641, + "step": 2973500 + }, + { + "epoch": 20.125054135989608, + "grad_norm": 0.35226669907569885, + "learning_rate": 4.798749458640104e-05, + "loss": 0.3652, + "step": 2974000 + }, + { + "epoch": 20.128437635339974, + "grad_norm": 0.3892536461353302, + "learning_rate": 4.7987156236466005e-05, + "loss": 0.3654, + "step": 2974500 + }, + { + "epoch": 20.13182113469034, + "grad_norm": 0.3841457962989807, + "learning_rate": 4.798681788653097e-05, + "loss": 0.3636, + "step": 2975000 + }, + { + "epoch": 20.13520463404071, + "grad_norm": 0.3708195090293884, + "learning_rate": 4.798647953659593e-05, + "loss": 0.3651, + "step": 2975500 + }, + { + "epoch": 20.138588133391078, + "grad_norm": 0.3810982406139374, + "learning_rate": 4.798614118666089e-05, + "loss": 0.3646, + "step": 2976000 + }, + { + "epoch": 20.141971632741445, + "grad_norm": 0.4043063521385193, + "learning_rate": 4.798580283672586e-05, + "loss": 0.3651, + "step": 2976500 + }, + { + "epoch": 20.145355132091815, + "grad_norm": 0.345688134431839, + "learning_rate": 4.798546448679082e-05, + "loss": 0.365, + "step": 2977000 + }, + { + "epoch": 20.148738631442182, + "grad_norm": 0.3402092456817627, + "learning_rate": 4.7985126136855784e-05, + "loss": 0.3649, + "step": 2977500 + }, + { + "epoch": 20.152122130792552, + "grad_norm": 0.3475629985332489, + "learning_rate": 4.7984787786920746e-05, + "loss": 0.3645, + "step": 2978000 + }, + { + "epoch": 20.15550563014292, + "grad_norm": 0.38040003180503845, + "learning_rate": 4.7984449436985715e-05, + "loss": 0.3662, + "step": 2978500 + }, + { + "epoch": 20.158889129493286, + "grad_norm": 0.3826071619987488, + "learning_rate": 4.798411108705068e-05, + "loss": 0.3642, + "step": 2979000 + }, + { + "epoch": 20.162272628843656, + "grad_norm": 0.3706953823566437, + "learning_rate": 4.798377273711563e-05, + "loss": 0.365, + "step": 2979500 + }, + { + "epoch": 20.165656128194023, + "grad_norm": 0.3419247269630432, + "learning_rate": 4.7983434387180595e-05, + "loss": 0.3654, + "step": 2980000 + }, + { + "epoch": 20.169039627544393, + "grad_norm": 0.4026300013065338, + "learning_rate": 4.7983096037245564e-05, + "loss": 0.3651, + "step": 2980500 + }, + { + "epoch": 20.17242312689476, + "grad_norm": 0.39176565408706665, + "learning_rate": 4.7982757687310526e-05, + "loss": 0.3645, + "step": 2981000 + }, + { + "epoch": 20.175806626245127, + "grad_norm": 0.3783055245876312, + "learning_rate": 4.798241933737549e-05, + "loss": 0.3656, + "step": 2981500 + }, + { + "epoch": 20.179190125595497, + "grad_norm": 0.39230743050575256, + "learning_rate": 4.798208098744045e-05, + "loss": 0.3649, + "step": 2982000 + }, + { + "epoch": 20.182573624945864, + "grad_norm": 0.3609783351421356, + "learning_rate": 4.798174263750542e-05, + "loss": 0.3641, + "step": 2982500 + }, + { + "epoch": 20.18595712429623, + "grad_norm": 0.34281569719314575, + "learning_rate": 4.798140428757038e-05, + "loss": 0.3645, + "step": 2983000 + }, + { + "epoch": 20.1893406236466, + "grad_norm": 0.35994818806648254, + "learning_rate": 4.798106593763534e-05, + "loss": 0.3657, + "step": 2983500 + }, + { + "epoch": 20.192724122996967, + "grad_norm": 0.3607349991798401, + "learning_rate": 4.7980727587700305e-05, + "loss": 0.3642, + "step": 2984000 + }, + { + "epoch": 20.196107622347338, + "grad_norm": 0.36213696002960205, + "learning_rate": 4.798038923776527e-05, + "loss": 0.3649, + "step": 2984500 + }, + { + "epoch": 20.199491121697704, + "grad_norm": 0.37555092573165894, + "learning_rate": 4.798005088783023e-05, + "loss": 0.3648, + "step": 2985000 + }, + { + "epoch": 20.20287462104807, + "grad_norm": 0.381056547164917, + "learning_rate": 4.797971253789519e-05, + "loss": 0.3665, + "step": 2985500 + }, + { + "epoch": 20.20625812039844, + "grad_norm": 0.375205397605896, + "learning_rate": 4.797937418796016e-05, + "loss": 0.3645, + "step": 2986000 + }, + { + "epoch": 20.20964161974881, + "grad_norm": 0.3704332113265991, + "learning_rate": 4.797903583802512e-05, + "loss": 0.3648, + "step": 2986500 + }, + { + "epoch": 20.21302511909918, + "grad_norm": 0.359117329120636, + "learning_rate": 4.7978697488090085e-05, + "loss": 0.3661, + "step": 2987000 + }, + { + "epoch": 20.216408618449545, + "grad_norm": 0.34534206986427307, + "learning_rate": 4.797835913815505e-05, + "loss": 0.3656, + "step": 2987500 + }, + { + "epoch": 20.219792117799912, + "grad_norm": 0.33024686574935913, + "learning_rate": 4.7978020788220016e-05, + "loss": 0.3643, + "step": 2988000 + }, + { + "epoch": 20.223175617150282, + "grad_norm": 0.3749881982803345, + "learning_rate": 4.797768243828498e-05, + "loss": 0.3648, + "step": 2988500 + }, + { + "epoch": 20.22655911650065, + "grad_norm": 0.35957151651382446, + "learning_rate": 4.7977344088349933e-05, + "loss": 0.3644, + "step": 2989000 + }, + { + "epoch": 20.22994261585102, + "grad_norm": 0.3546516001224518, + "learning_rate": 4.7977005738414896e-05, + "loss": 0.3654, + "step": 2989500 + }, + { + "epoch": 20.233326115201386, + "grad_norm": 0.4022500514984131, + "learning_rate": 4.7976667388479864e-05, + "loss": 0.3661, + "step": 2990000 + }, + { + "epoch": 20.236709614551753, + "grad_norm": 0.3377239406108856, + "learning_rate": 4.7976329038544827e-05, + "loss": 0.365, + "step": 2990500 + }, + { + "epoch": 20.240093113902123, + "grad_norm": 0.3646054267883301, + "learning_rate": 4.797599068860979e-05, + "loss": 0.3636, + "step": 2991000 + }, + { + "epoch": 20.24347661325249, + "grad_norm": 0.3465125858783722, + "learning_rate": 4.797565233867475e-05, + "loss": 0.3643, + "step": 2991500 + }, + { + "epoch": 20.246860112602857, + "grad_norm": 0.38007932901382446, + "learning_rate": 4.797531398873972e-05, + "loss": 0.3645, + "step": 2992000 + }, + { + "epoch": 20.250243611953227, + "grad_norm": 0.38943079113960266, + "learning_rate": 4.797497563880468e-05, + "loss": 0.3646, + "step": 2992500 + }, + { + "epoch": 20.253627111303594, + "grad_norm": 0.40387022495269775, + "learning_rate": 4.7974637288869644e-05, + "loss": 0.3644, + "step": 2993000 + }, + { + "epoch": 20.257010610653964, + "grad_norm": 0.35680386424064636, + "learning_rate": 4.7974298938934606e-05, + "loss": 0.3647, + "step": 2993500 + }, + { + "epoch": 20.26039411000433, + "grad_norm": 0.35992133617401123, + "learning_rate": 4.797396058899957e-05, + "loss": 0.3643, + "step": 2994000 + }, + { + "epoch": 20.263777609354698, + "grad_norm": 0.3910994529724121, + "learning_rate": 4.797362223906453e-05, + "loss": 0.3649, + "step": 2994500 + }, + { + "epoch": 20.267161108705068, + "grad_norm": 0.36142924427986145, + "learning_rate": 4.797328388912949e-05, + "loss": 0.3655, + "step": 2995000 + }, + { + "epoch": 20.270544608055435, + "grad_norm": 0.370726078748703, + "learning_rate": 4.797294553919446e-05, + "loss": 0.3663, + "step": 2995500 + }, + { + "epoch": 20.273928107405805, + "grad_norm": 0.39392247796058655, + "learning_rate": 4.7972607189259423e-05, + "loss": 0.3652, + "step": 2996000 + }, + { + "epoch": 20.27731160675617, + "grad_norm": 0.3471395969390869, + "learning_rate": 4.7972268839324386e-05, + "loss": 0.3641, + "step": 2996500 + }, + { + "epoch": 20.28069510610654, + "grad_norm": 0.3578757047653198, + "learning_rate": 4.797193048938935e-05, + "loss": 0.364, + "step": 2997000 + }, + { + "epoch": 20.28407860545691, + "grad_norm": 0.3582392930984497, + "learning_rate": 4.7971592139454317e-05, + "loss": 0.3661, + "step": 2997500 + }, + { + "epoch": 20.287462104807275, + "grad_norm": 0.3710322380065918, + "learning_rate": 4.797125378951928e-05, + "loss": 0.3645, + "step": 2998000 + }, + { + "epoch": 20.290845604157646, + "grad_norm": 0.3882392942905426, + "learning_rate": 4.7970915439584234e-05, + "loss": 0.3659, + "step": 2998500 + }, + { + "epoch": 20.294229103508012, + "grad_norm": 0.3817421793937683, + "learning_rate": 4.7970577089649196e-05, + "loss": 0.3646, + "step": 2999000 + }, + { + "epoch": 20.29761260285838, + "grad_norm": 0.361188679933548, + "learning_rate": 4.7970238739714165e-05, + "loss": 0.3632, + "step": 2999500 + }, + { + "epoch": 20.30099610220875, + "grad_norm": 0.4115106165409088, + "learning_rate": 4.796990038977913e-05, + "loss": 0.3651, + "step": 3000000 + }, + { + "epoch": 20.304379601559116, + "grad_norm": 0.37860366702079773, + "learning_rate": 4.796956203984409e-05, + "loss": 0.3653, + "step": 3000500 + }, + { + "epoch": 20.307763100909483, + "grad_norm": 0.36756300926208496, + "learning_rate": 4.796922368990905e-05, + "loss": 0.3647, + "step": 3001000 + }, + { + "epoch": 20.311146600259853, + "grad_norm": 0.39316004514694214, + "learning_rate": 4.796888533997402e-05, + "loss": 0.3646, + "step": 3001500 + }, + { + "epoch": 20.31453009961022, + "grad_norm": 0.39260104298591614, + "learning_rate": 4.796854699003898e-05, + "loss": 0.3649, + "step": 3002000 + }, + { + "epoch": 20.31791359896059, + "grad_norm": 0.38872653245925903, + "learning_rate": 4.7968208640103945e-05, + "loss": 0.3638, + "step": 3002500 + }, + { + "epoch": 20.321297098310957, + "grad_norm": 0.3718373477458954, + "learning_rate": 4.796787029016891e-05, + "loss": 0.3653, + "step": 3003000 + }, + { + "epoch": 20.324680597661324, + "grad_norm": 0.3872230648994446, + "learning_rate": 4.796753194023387e-05, + "loss": 0.366, + "step": 3003500 + }, + { + "epoch": 20.328064097011694, + "grad_norm": 0.3719272017478943, + "learning_rate": 4.796719359029883e-05, + "loss": 0.3657, + "step": 3004000 + }, + { + "epoch": 20.33144759636206, + "grad_norm": 0.39180269837379456, + "learning_rate": 4.796685524036379e-05, + "loss": 0.365, + "step": 3004500 + }, + { + "epoch": 20.33483109571243, + "grad_norm": 0.39898020029067993, + "learning_rate": 4.796651689042876e-05, + "loss": 0.3644, + "step": 3005000 + }, + { + "epoch": 20.338214595062798, + "grad_norm": 0.38650649785995483, + "learning_rate": 4.7966178540493724e-05, + "loss": 0.367, + "step": 3005500 + }, + { + "epoch": 20.341598094413165, + "grad_norm": 0.3660157024860382, + "learning_rate": 4.7965840190558686e-05, + "loss": 0.3646, + "step": 3006000 + }, + { + "epoch": 20.344981593763535, + "grad_norm": 0.32656699419021606, + "learning_rate": 4.796550184062365e-05, + "loss": 0.3655, + "step": 3006500 + }, + { + "epoch": 20.3483650931139, + "grad_norm": 0.3373340666294098, + "learning_rate": 4.796516349068861e-05, + "loss": 0.3666, + "step": 3007000 + }, + { + "epoch": 20.35174859246427, + "grad_norm": 0.3731749355792999, + "learning_rate": 4.796482514075358e-05, + "loss": 0.3655, + "step": 3007500 + }, + { + "epoch": 20.35513209181464, + "grad_norm": 0.33905479311943054, + "learning_rate": 4.7964486790818535e-05, + "loss": 0.3655, + "step": 3008000 + }, + { + "epoch": 20.358515591165006, + "grad_norm": 0.36236539483070374, + "learning_rate": 4.79641484408835e-05, + "loss": 0.3639, + "step": 3008500 + }, + { + "epoch": 20.361899090515376, + "grad_norm": 0.42300811409950256, + "learning_rate": 4.7963810090948466e-05, + "loss": 0.3653, + "step": 3009000 + }, + { + "epoch": 20.365282589865743, + "grad_norm": 0.33264434337615967, + "learning_rate": 4.796347174101343e-05, + "loss": 0.3645, + "step": 3009500 + }, + { + "epoch": 20.36866608921611, + "grad_norm": 0.33422377705574036, + "learning_rate": 4.796313339107839e-05, + "loss": 0.3645, + "step": 3010000 + }, + { + "epoch": 20.37204958856648, + "grad_norm": 0.390158474445343, + "learning_rate": 4.796279504114335e-05, + "loss": 0.3637, + "step": 3010500 + }, + { + "epoch": 20.375433087916846, + "grad_norm": 0.32751229405403137, + "learning_rate": 4.796245669120832e-05, + "loss": 0.3655, + "step": 3011000 + }, + { + "epoch": 20.378816587267217, + "grad_norm": 0.3563542366027832, + "learning_rate": 4.796211834127328e-05, + "loss": 0.3652, + "step": 3011500 + }, + { + "epoch": 20.382200086617583, + "grad_norm": 0.3355710506439209, + "learning_rate": 4.7961779991338245e-05, + "loss": 0.3645, + "step": 3012000 + }, + { + "epoch": 20.38558358596795, + "grad_norm": 0.3966407775878906, + "learning_rate": 4.796144164140321e-05, + "loss": 0.3646, + "step": 3012500 + }, + { + "epoch": 20.38896708531832, + "grad_norm": 0.36854368448257446, + "learning_rate": 4.796110329146817e-05, + "loss": 0.3645, + "step": 3013000 + }, + { + "epoch": 20.392350584668687, + "grad_norm": 0.3450147807598114, + "learning_rate": 4.796076494153313e-05, + "loss": 0.3639, + "step": 3013500 + }, + { + "epoch": 20.395734084019058, + "grad_norm": 0.35542258620262146, + "learning_rate": 4.7960426591598094e-05, + "loss": 0.3642, + "step": 3014000 + }, + { + "epoch": 20.399117583369424, + "grad_norm": 0.3457816541194916, + "learning_rate": 4.796008824166306e-05, + "loss": 0.3646, + "step": 3014500 + }, + { + "epoch": 20.40250108271979, + "grad_norm": 0.34304627776145935, + "learning_rate": 4.7959749891728025e-05, + "loss": 0.3653, + "step": 3015000 + }, + { + "epoch": 20.40588458207016, + "grad_norm": 0.3585280179977417, + "learning_rate": 4.795941154179299e-05, + "loss": 0.3641, + "step": 3015500 + }, + { + "epoch": 20.409268081420528, + "grad_norm": 0.3855977952480316, + "learning_rate": 4.795907319185795e-05, + "loss": 0.3657, + "step": 3016000 + }, + { + "epoch": 20.412651580770895, + "grad_norm": 0.39115121960639954, + "learning_rate": 4.795873484192291e-05, + "loss": 0.3647, + "step": 3016500 + }, + { + "epoch": 20.416035080121265, + "grad_norm": 0.3797233998775482, + "learning_rate": 4.795839649198788e-05, + "loss": 0.3648, + "step": 3017000 + }, + { + "epoch": 20.419418579471632, + "grad_norm": 0.4135083556175232, + "learning_rate": 4.7958058142052835e-05, + "loss": 0.3661, + "step": 3017500 + }, + { + "epoch": 20.422802078822002, + "grad_norm": 0.3774179518222809, + "learning_rate": 4.79577197921178e-05, + "loss": 0.3653, + "step": 3018000 + }, + { + "epoch": 20.42618557817237, + "grad_norm": 0.3676152527332306, + "learning_rate": 4.7957381442182766e-05, + "loss": 0.3652, + "step": 3018500 + }, + { + "epoch": 20.429569077522736, + "grad_norm": 0.3528066575527191, + "learning_rate": 4.795704309224773e-05, + "loss": 0.3651, + "step": 3019000 + }, + { + "epoch": 20.432952576873106, + "grad_norm": 0.37309351563453674, + "learning_rate": 4.795670474231269e-05, + "loss": 0.3662, + "step": 3019500 + }, + { + "epoch": 20.436336076223473, + "grad_norm": 0.3470800518989563, + "learning_rate": 4.795636639237765e-05, + "loss": 0.3638, + "step": 3020000 + }, + { + "epoch": 20.439719575573843, + "grad_norm": 0.36600929498672485, + "learning_rate": 4.795602804244262e-05, + "loss": 0.3652, + "step": 3020500 + }, + { + "epoch": 20.44310307492421, + "grad_norm": 0.38214531540870667, + "learning_rate": 4.7955689692507584e-05, + "loss": 0.365, + "step": 3021000 + }, + { + "epoch": 20.446486574274576, + "grad_norm": 0.3714052140712738, + "learning_rate": 4.7955351342572546e-05, + "loss": 0.3654, + "step": 3021500 + }, + { + "epoch": 20.449870073624947, + "grad_norm": 0.38815969228744507, + "learning_rate": 4.795501299263751e-05, + "loss": 0.367, + "step": 3022000 + }, + { + "epoch": 20.453253572975314, + "grad_norm": 0.3865346610546112, + "learning_rate": 4.795467464270247e-05, + "loss": 0.365, + "step": 3022500 + }, + { + "epoch": 20.456637072325684, + "grad_norm": 0.3771718740463257, + "learning_rate": 4.795433629276743e-05, + "loss": 0.3642, + "step": 3023000 + }, + { + "epoch": 20.46002057167605, + "grad_norm": 0.3145259916782379, + "learning_rate": 4.7953997942832394e-05, + "loss": 0.3649, + "step": 3023500 + }, + { + "epoch": 20.463404071026417, + "grad_norm": 0.3595036268234253, + "learning_rate": 4.7953659592897356e-05, + "loss": 0.3642, + "step": 3024000 + }, + { + "epoch": 20.466787570376788, + "grad_norm": 0.36791813373565674, + "learning_rate": 4.7953321242962325e-05, + "loss": 0.3657, + "step": 3024500 + }, + { + "epoch": 20.470171069727154, + "grad_norm": 0.3860068917274475, + "learning_rate": 4.795298289302729e-05, + "loss": 0.3649, + "step": 3025000 + }, + { + "epoch": 20.47355456907752, + "grad_norm": 0.379476398229599, + "learning_rate": 4.795264454309225e-05, + "loss": 0.3651, + "step": 3025500 + }, + { + "epoch": 20.47693806842789, + "grad_norm": 0.3719246983528137, + "learning_rate": 4.795230619315721e-05, + "loss": 0.3657, + "step": 3026000 + }, + { + "epoch": 20.480321567778258, + "grad_norm": 0.3715427815914154, + "learning_rate": 4.795196784322218e-05, + "loss": 0.3626, + "step": 3026500 + }, + { + "epoch": 20.48370506712863, + "grad_norm": 0.42802438139915466, + "learning_rate": 4.7951629493287136e-05, + "loss": 0.3644, + "step": 3027000 + }, + { + "epoch": 20.487088566478995, + "grad_norm": 0.37511390447616577, + "learning_rate": 4.79512911433521e-05, + "loss": 0.3648, + "step": 3027500 + }, + { + "epoch": 20.490472065829362, + "grad_norm": 0.40279248356819153, + "learning_rate": 4.795095279341707e-05, + "loss": 0.3646, + "step": 3028000 + }, + { + "epoch": 20.493855565179732, + "grad_norm": 0.37222951650619507, + "learning_rate": 4.795061444348203e-05, + "loss": 0.3656, + "step": 3028500 + }, + { + "epoch": 20.4972390645301, + "grad_norm": 0.36907824873924255, + "learning_rate": 4.795027609354699e-05, + "loss": 0.3651, + "step": 3029000 + }, + { + "epoch": 20.50062256388047, + "grad_norm": 0.38645726442337036, + "learning_rate": 4.794993774361195e-05, + "loss": 0.3649, + "step": 3029500 + }, + { + "epoch": 20.504006063230836, + "grad_norm": 0.34893524646759033, + "learning_rate": 4.794959939367692e-05, + "loss": 0.364, + "step": 3030000 + }, + { + "epoch": 20.507389562581203, + "grad_norm": 0.38672298192977905, + "learning_rate": 4.7949261043741884e-05, + "loss": 0.3656, + "step": 3030500 + }, + { + "epoch": 20.510773061931573, + "grad_norm": 0.35509294271469116, + "learning_rate": 4.7948922693806846e-05, + "loss": 0.3647, + "step": 3031000 + }, + { + "epoch": 20.51415656128194, + "grad_norm": 0.3459559679031372, + "learning_rate": 4.794858434387181e-05, + "loss": 0.3635, + "step": 3031500 + }, + { + "epoch": 20.517540060632307, + "grad_norm": 0.3502878248691559, + "learning_rate": 4.794824599393677e-05, + "loss": 0.3664, + "step": 3032000 + }, + { + "epoch": 20.520923559982677, + "grad_norm": 0.38622573018074036, + "learning_rate": 4.794790764400173e-05, + "loss": 0.3642, + "step": 3032500 + }, + { + "epoch": 20.524307059333044, + "grad_norm": 0.34671613574028015, + "learning_rate": 4.7947569294066695e-05, + "loss": 0.3654, + "step": 3033000 + }, + { + "epoch": 20.527690558683414, + "grad_norm": 0.39764460921287537, + "learning_rate": 4.794723094413166e-05, + "loss": 0.3648, + "step": 3033500 + }, + { + "epoch": 20.53107405803378, + "grad_norm": 0.36788100004196167, + "learning_rate": 4.7946892594196626e-05, + "loss": 0.3646, + "step": 3034000 + }, + { + "epoch": 20.534457557384147, + "grad_norm": 0.3505501449108124, + "learning_rate": 4.794655424426159e-05, + "loss": 0.3647, + "step": 3034500 + }, + { + "epoch": 20.537841056734518, + "grad_norm": 0.37173885107040405, + "learning_rate": 4.794621589432655e-05, + "loss": 0.3658, + "step": 3035000 + }, + { + "epoch": 20.541224556084885, + "grad_norm": 0.372952401638031, + "learning_rate": 4.794587754439151e-05, + "loss": 0.3641, + "step": 3035500 + }, + { + "epoch": 20.544608055435255, + "grad_norm": 0.36666375398635864, + "learning_rate": 4.794553919445648e-05, + "loss": 0.3642, + "step": 3036000 + }, + { + "epoch": 20.54799155478562, + "grad_norm": 0.3400515615940094, + "learning_rate": 4.794520084452144e-05, + "loss": 0.3661, + "step": 3036500 + }, + { + "epoch": 20.55137505413599, + "grad_norm": 0.3797872066497803, + "learning_rate": 4.79448624945864e-05, + "loss": 0.3638, + "step": 3037000 + }, + { + "epoch": 20.55475855348636, + "grad_norm": 0.3844873309135437, + "learning_rate": 4.794452414465137e-05, + "loss": 0.3652, + "step": 3037500 + }, + { + "epoch": 20.558142052836725, + "grad_norm": 0.36429187655448914, + "learning_rate": 4.794418579471633e-05, + "loss": 0.3661, + "step": 3038000 + }, + { + "epoch": 20.561525552187096, + "grad_norm": 0.3935987949371338, + "learning_rate": 4.794384744478129e-05, + "loss": 0.3646, + "step": 3038500 + }, + { + "epoch": 20.564909051537462, + "grad_norm": 0.3462426960468292, + "learning_rate": 4.7943509094846254e-05, + "loss": 0.3645, + "step": 3039000 + }, + { + "epoch": 20.56829255088783, + "grad_norm": 0.3837229013442993, + "learning_rate": 4.794317074491122e-05, + "loss": 0.3644, + "step": 3039500 + }, + { + "epoch": 20.5716760502382, + "grad_norm": 0.37439703941345215, + "learning_rate": 4.7942832394976185e-05, + "loss": 0.3663, + "step": 3040000 + }, + { + "epoch": 20.575059549588566, + "grad_norm": 0.37577924132347107, + "learning_rate": 4.794249404504115e-05, + "loss": 0.3652, + "step": 3040500 + }, + { + "epoch": 20.578443048938933, + "grad_norm": 0.35608455538749695, + "learning_rate": 4.794215569510611e-05, + "loss": 0.3632, + "step": 3041000 + }, + { + "epoch": 20.581826548289303, + "grad_norm": 0.3622041642665863, + "learning_rate": 4.794181734517107e-05, + "loss": 0.3674, + "step": 3041500 + }, + { + "epoch": 20.58521004763967, + "grad_norm": 0.3489411175251007, + "learning_rate": 4.7941478995236033e-05, + "loss": 0.3655, + "step": 3042000 + }, + { + "epoch": 20.58859354699004, + "grad_norm": 0.39626896381378174, + "learning_rate": 4.7941140645300996e-05, + "loss": 0.3648, + "step": 3042500 + }, + { + "epoch": 20.591977046340407, + "grad_norm": 0.3579977750778198, + "learning_rate": 4.794080229536596e-05, + "loss": 0.3628, + "step": 3043000 + }, + { + "epoch": 20.595360545690774, + "grad_norm": 0.3381921947002411, + "learning_rate": 4.794046394543093e-05, + "loss": 0.3649, + "step": 3043500 + }, + { + "epoch": 20.598744045041144, + "grad_norm": 0.35242900252342224, + "learning_rate": 4.794012559549589e-05, + "loss": 0.3641, + "step": 3044000 + }, + { + "epoch": 20.60212754439151, + "grad_norm": 0.36363518238067627, + "learning_rate": 4.793978724556085e-05, + "loss": 0.3639, + "step": 3044500 + }, + { + "epoch": 20.60551104374188, + "grad_norm": 0.40079447627067566, + "learning_rate": 4.793944889562581e-05, + "loss": 0.3652, + "step": 3045000 + }, + { + "epoch": 20.608894543092248, + "grad_norm": 0.3724885880947113, + "learning_rate": 4.793911054569078e-05, + "loss": 0.364, + "step": 3045500 + }, + { + "epoch": 20.612278042442615, + "grad_norm": 0.3565116226673126, + "learning_rate": 4.7938772195755744e-05, + "loss": 0.3655, + "step": 3046000 + }, + { + "epoch": 20.615661541792985, + "grad_norm": 0.329285591840744, + "learning_rate": 4.79384338458207e-05, + "loss": 0.3657, + "step": 3046500 + }, + { + "epoch": 20.61904504114335, + "grad_norm": 0.35131943225860596, + "learning_rate": 4.793809549588567e-05, + "loss": 0.3676, + "step": 3047000 + }, + { + "epoch": 20.622428540493722, + "grad_norm": 0.37161844968795776, + "learning_rate": 4.793775714595063e-05, + "loss": 0.364, + "step": 3047500 + }, + { + "epoch": 20.62581203984409, + "grad_norm": 0.3753008544445038, + "learning_rate": 4.793741879601559e-05, + "loss": 0.3657, + "step": 3048000 + }, + { + "epoch": 20.629195539194455, + "grad_norm": 0.3838074207305908, + "learning_rate": 4.7937080446080555e-05, + "loss": 0.3653, + "step": 3048500 + }, + { + "epoch": 20.632579038544826, + "grad_norm": 0.39390793442726135, + "learning_rate": 4.7936742096145524e-05, + "loss": 0.3653, + "step": 3049000 + }, + { + "epoch": 20.635962537895193, + "grad_norm": 0.38033226132392883, + "learning_rate": 4.7936403746210486e-05, + "loss": 0.3652, + "step": 3049500 + }, + { + "epoch": 20.63934603724556, + "grad_norm": 0.35507896542549133, + "learning_rate": 4.793606539627545e-05, + "loss": 0.3656, + "step": 3050000 + }, + { + "epoch": 20.64272953659593, + "grad_norm": 0.348247230052948, + "learning_rate": 4.793572704634041e-05, + "loss": 0.365, + "step": 3050500 + }, + { + "epoch": 20.646113035946296, + "grad_norm": 0.38723573088645935, + "learning_rate": 4.793538869640537e-05, + "loss": 0.3659, + "step": 3051000 + }, + { + "epoch": 20.649496535296667, + "grad_norm": 0.3930349349975586, + "learning_rate": 4.7935050346470334e-05, + "loss": 0.3634, + "step": 3051500 + }, + { + "epoch": 20.652880034647033, + "grad_norm": 0.3427135646343231, + "learning_rate": 4.7934711996535296e-05, + "loss": 0.3661, + "step": 3052000 + }, + { + "epoch": 20.6562635339974, + "grad_norm": 0.359590619802475, + "learning_rate": 4.793437364660026e-05, + "loss": 0.3658, + "step": 3052500 + }, + { + "epoch": 20.65964703334777, + "grad_norm": 0.35423997044563293, + "learning_rate": 4.793403529666523e-05, + "loss": 0.3663, + "step": 3053000 + }, + { + "epoch": 20.663030532698137, + "grad_norm": 0.33582478761672974, + "learning_rate": 4.793369694673019e-05, + "loss": 0.3656, + "step": 3053500 + }, + { + "epoch": 20.666414032048507, + "grad_norm": 0.3541942536830902, + "learning_rate": 4.793335859679515e-05, + "loss": 0.3646, + "step": 3054000 + }, + { + "epoch": 20.669797531398874, + "grad_norm": 0.368379145860672, + "learning_rate": 4.7933020246860114e-05, + "loss": 0.3657, + "step": 3054500 + }, + { + "epoch": 20.67318103074924, + "grad_norm": 0.37707754969596863, + "learning_rate": 4.793268189692508e-05, + "loss": 0.3646, + "step": 3055000 + }, + { + "epoch": 20.67656453009961, + "grad_norm": 0.37721332907676697, + "learning_rate": 4.7932343546990045e-05, + "loss": 0.3643, + "step": 3055500 + }, + { + "epoch": 20.679948029449978, + "grad_norm": 0.3578668236732483, + "learning_rate": 4.7932005197055e-05, + "loss": 0.3637, + "step": 3056000 + }, + { + "epoch": 20.683331528800345, + "grad_norm": 0.37248265743255615, + "learning_rate": 4.793166684711997e-05, + "loss": 0.3658, + "step": 3056500 + }, + { + "epoch": 20.686715028150715, + "grad_norm": 0.42079028487205505, + "learning_rate": 4.793132849718493e-05, + "loss": 0.3663, + "step": 3057000 + }, + { + "epoch": 20.690098527501082, + "grad_norm": 0.35661521553993225, + "learning_rate": 4.793099014724989e-05, + "loss": 0.3648, + "step": 3057500 + }, + { + "epoch": 20.693482026851452, + "grad_norm": 0.36780673265457153, + "learning_rate": 4.7930651797314855e-05, + "loss": 0.3653, + "step": 3058000 + }, + { + "epoch": 20.69686552620182, + "grad_norm": 0.3804129660129547, + "learning_rate": 4.7930313447379824e-05, + "loss": 0.3659, + "step": 3058500 + }, + { + "epoch": 20.700249025552186, + "grad_norm": 0.3678402900695801, + "learning_rate": 4.7929975097444786e-05, + "loss": 0.3648, + "step": 3059000 + }, + { + "epoch": 20.703632524902556, + "grad_norm": 0.34290778636932373, + "learning_rate": 4.792963674750975e-05, + "loss": 0.3639, + "step": 3059500 + }, + { + "epoch": 20.707016024252923, + "grad_norm": 0.32359635829925537, + "learning_rate": 4.792929839757471e-05, + "loss": 0.3663, + "step": 3060000 + }, + { + "epoch": 20.710399523603293, + "grad_norm": 0.3486013114452362, + "learning_rate": 4.792896004763967e-05, + "loss": 0.3656, + "step": 3060500 + }, + { + "epoch": 20.71378302295366, + "grad_norm": 0.3553240895271301, + "learning_rate": 4.7928621697704635e-05, + "loss": 0.364, + "step": 3061000 + }, + { + "epoch": 20.717166522304026, + "grad_norm": 0.3806537687778473, + "learning_rate": 4.79282833477696e-05, + "loss": 0.3658, + "step": 3061500 + }, + { + "epoch": 20.720550021654397, + "grad_norm": 0.43344950675964355, + "learning_rate": 4.792794499783456e-05, + "loss": 0.3642, + "step": 3062000 + }, + { + "epoch": 20.723933521004763, + "grad_norm": 0.3625841438770294, + "learning_rate": 4.792760664789953e-05, + "loss": 0.3652, + "step": 3062500 + }, + { + "epoch": 20.727317020355134, + "grad_norm": 0.4011187255382538, + "learning_rate": 4.792726829796449e-05, + "loss": 0.3649, + "step": 3063000 + }, + { + "epoch": 20.7307005197055, + "grad_norm": 0.38117992877960205, + "learning_rate": 4.792692994802945e-05, + "loss": 0.3653, + "step": 3063500 + }, + { + "epoch": 20.734084019055867, + "grad_norm": 0.4144648313522339, + "learning_rate": 4.7926591598094414e-05, + "loss": 0.3635, + "step": 3064000 + }, + { + "epoch": 20.737467518406238, + "grad_norm": 0.3855360746383667, + "learning_rate": 4.792625324815938e-05, + "loss": 0.3667, + "step": 3064500 + }, + { + "epoch": 20.740851017756604, + "grad_norm": 0.37165096402168274, + "learning_rate": 4.7925914898224345e-05, + "loss": 0.3652, + "step": 3065000 + }, + { + "epoch": 20.74423451710697, + "grad_norm": 0.3604426383972168, + "learning_rate": 4.79255765482893e-05, + "loss": 0.3662, + "step": 3065500 + }, + { + "epoch": 20.74761801645734, + "grad_norm": 0.3857273757457733, + "learning_rate": 4.792523819835427e-05, + "loss": 0.3654, + "step": 3066000 + }, + { + "epoch": 20.751001515807708, + "grad_norm": 0.3289198577404022, + "learning_rate": 4.792489984841923e-05, + "loss": 0.3633, + "step": 3066500 + }, + { + "epoch": 20.75438501515808, + "grad_norm": 0.3403366208076477, + "learning_rate": 4.7924561498484194e-05, + "loss": 0.3654, + "step": 3067000 + }, + { + "epoch": 20.757768514508445, + "grad_norm": 0.3487418293952942, + "learning_rate": 4.7924223148549156e-05, + "loss": 0.3638, + "step": 3067500 + }, + { + "epoch": 20.761152013858812, + "grad_norm": 0.3301853537559509, + "learning_rate": 4.7923884798614125e-05, + "loss": 0.3647, + "step": 3068000 + }, + { + "epoch": 20.764535513209182, + "grad_norm": 0.3602818250656128, + "learning_rate": 4.792354644867909e-05, + "loss": 0.366, + "step": 3068500 + }, + { + "epoch": 20.76791901255955, + "grad_norm": 0.3686750531196594, + "learning_rate": 4.792320809874405e-05, + "loss": 0.3658, + "step": 3069000 + }, + { + "epoch": 20.77130251190992, + "grad_norm": 0.36918964982032776, + "learning_rate": 4.792286974880901e-05, + "loss": 0.3648, + "step": 3069500 + }, + { + "epoch": 20.774686011260286, + "grad_norm": 0.3289744257926941, + "learning_rate": 4.792253139887397e-05, + "loss": 0.3661, + "step": 3070000 + }, + { + "epoch": 20.778069510610653, + "grad_norm": 0.36303091049194336, + "learning_rate": 4.7922193048938935e-05, + "loss": 0.3663, + "step": 3070500 + }, + { + "epoch": 20.781453009961023, + "grad_norm": 0.34788212180137634, + "learning_rate": 4.79218546990039e-05, + "loss": 0.3651, + "step": 3071000 + }, + { + "epoch": 20.78483650931139, + "grad_norm": 0.35184186697006226, + "learning_rate": 4.792151634906886e-05, + "loss": 0.3644, + "step": 3071500 + }, + { + "epoch": 20.78822000866176, + "grad_norm": 0.3535412549972534, + "learning_rate": 4.792117799913383e-05, + "loss": 0.3652, + "step": 3072000 + }, + { + "epoch": 20.791603508012127, + "grad_norm": 0.3471745252609253, + "learning_rate": 4.792083964919879e-05, + "loss": 0.3658, + "step": 3072500 + }, + { + "epoch": 20.794987007362494, + "grad_norm": 0.3394944965839386, + "learning_rate": 4.792050129926375e-05, + "loss": 0.3651, + "step": 3073000 + }, + { + "epoch": 20.798370506712864, + "grad_norm": 0.3789084851741791, + "learning_rate": 4.7920162949328715e-05, + "loss": 0.3655, + "step": 3073500 + }, + { + "epoch": 20.80175400606323, + "grad_norm": 0.34156209230422974, + "learning_rate": 4.7919824599393684e-05, + "loss": 0.3639, + "step": 3074000 + }, + { + "epoch": 20.805137505413597, + "grad_norm": 0.3876127302646637, + "learning_rate": 4.7919486249458646e-05, + "loss": 0.3637, + "step": 3074500 + }, + { + "epoch": 20.808521004763968, + "grad_norm": 0.3334454298019409, + "learning_rate": 4.79191478995236e-05, + "loss": 0.367, + "step": 3075000 + }, + { + "epoch": 20.811904504114334, + "grad_norm": 0.39533373713493347, + "learning_rate": 4.791880954958857e-05, + "loss": 0.366, + "step": 3075500 + }, + { + "epoch": 20.815288003464705, + "grad_norm": 0.35846906900405884, + "learning_rate": 4.791847119965353e-05, + "loss": 0.3665, + "step": 3076000 + }, + { + "epoch": 20.81867150281507, + "grad_norm": 0.3482748568058014, + "learning_rate": 4.7918132849718494e-05, + "loss": 0.364, + "step": 3076500 + }, + { + "epoch": 20.82205500216544, + "grad_norm": 0.37502777576446533, + "learning_rate": 4.7917794499783457e-05, + "loss": 0.3644, + "step": 3077000 + }, + { + "epoch": 20.82543850151581, + "grad_norm": 0.3282943069934845, + "learning_rate": 4.791745614984842e-05, + "loss": 0.3645, + "step": 3077500 + }, + { + "epoch": 20.828822000866175, + "grad_norm": 0.3873825669288635, + "learning_rate": 4.791711779991339e-05, + "loss": 0.3653, + "step": 3078000 + }, + { + "epoch": 20.832205500216546, + "grad_norm": 0.410609632730484, + "learning_rate": 4.791677944997835e-05, + "loss": 0.3653, + "step": 3078500 + }, + { + "epoch": 20.835588999566912, + "grad_norm": 0.413093626499176, + "learning_rate": 4.791644110004331e-05, + "loss": 0.3641, + "step": 3079000 + }, + { + "epoch": 20.83897249891728, + "grad_norm": 0.33067917823791504, + "learning_rate": 4.7916102750108274e-05, + "loss": 0.3659, + "step": 3079500 + }, + { + "epoch": 20.84235599826765, + "grad_norm": 0.36715462803840637, + "learning_rate": 4.7915764400173236e-05, + "loss": 0.3639, + "step": 3080000 + }, + { + "epoch": 20.845739497618016, + "grad_norm": 0.3519304692745209, + "learning_rate": 4.79154260502382e-05, + "loss": 0.364, + "step": 3080500 + }, + { + "epoch": 20.849122996968383, + "grad_norm": 0.39291125535964966, + "learning_rate": 4.791508770030316e-05, + "loss": 0.3658, + "step": 3081000 + }, + { + "epoch": 20.852506496318753, + "grad_norm": 0.40021994709968567, + "learning_rate": 4.791474935036813e-05, + "loss": 0.3662, + "step": 3081500 + }, + { + "epoch": 20.85588999566912, + "grad_norm": 0.36764124035835266, + "learning_rate": 4.791441100043309e-05, + "loss": 0.364, + "step": 3082000 + }, + { + "epoch": 20.85927349501949, + "grad_norm": 0.3566446304321289, + "learning_rate": 4.7914072650498053e-05, + "loss": 0.3659, + "step": 3082500 + }, + { + "epoch": 20.862656994369857, + "grad_norm": 0.3574284017086029, + "learning_rate": 4.7913734300563016e-05, + "loss": 0.3656, + "step": 3083000 + }, + { + "epoch": 20.866040493720224, + "grad_norm": 0.35197603702545166, + "learning_rate": 4.7913395950627984e-05, + "loss": 0.3657, + "step": 3083500 + }, + { + "epoch": 20.869423993070594, + "grad_norm": 0.34930962324142456, + "learning_rate": 4.7913057600692947e-05, + "loss": 0.3659, + "step": 3084000 + }, + { + "epoch": 20.87280749242096, + "grad_norm": 0.35749518871307373, + "learning_rate": 4.79127192507579e-05, + "loss": 0.3658, + "step": 3084500 + }, + { + "epoch": 20.87619099177133, + "grad_norm": 0.35430794954299927, + "learning_rate": 4.791238090082287e-05, + "loss": 0.365, + "step": 3085000 + }, + { + "epoch": 20.879574491121698, + "grad_norm": 0.37598565220832825, + "learning_rate": 4.791204255088783e-05, + "loss": 0.3649, + "step": 3085500 + }, + { + "epoch": 20.882957990472065, + "grad_norm": 0.3943885564804077, + "learning_rate": 4.7911704200952795e-05, + "loss": 0.3645, + "step": 3086000 + }, + { + "epoch": 20.886341489822435, + "grad_norm": 0.37297797203063965, + "learning_rate": 4.791136585101776e-05, + "loss": 0.3637, + "step": 3086500 + }, + { + "epoch": 20.8897249891728, + "grad_norm": 0.3791876435279846, + "learning_rate": 4.791102750108272e-05, + "loss": 0.3652, + "step": 3087000 + }, + { + "epoch": 20.89310848852317, + "grad_norm": 0.3426647484302521, + "learning_rate": 4.791068915114769e-05, + "loss": 0.3657, + "step": 3087500 + }, + { + "epoch": 20.89649198787354, + "grad_norm": 0.3604070842266083, + "learning_rate": 4.791035080121265e-05, + "loss": 0.3653, + "step": 3088000 + }, + { + "epoch": 20.899875487223905, + "grad_norm": 0.381246417760849, + "learning_rate": 4.791001245127761e-05, + "loss": 0.3649, + "step": 3088500 + }, + { + "epoch": 20.903258986574276, + "grad_norm": 0.39292630553245544, + "learning_rate": 4.7909674101342575e-05, + "loss": 0.3643, + "step": 3089000 + }, + { + "epoch": 20.906642485924642, + "grad_norm": 0.36294469237327576, + "learning_rate": 4.790933575140754e-05, + "loss": 0.3632, + "step": 3089500 + }, + { + "epoch": 20.91002598527501, + "grad_norm": 0.37999242544174194, + "learning_rate": 4.79089974014725e-05, + "loss": 0.3635, + "step": 3090000 + }, + { + "epoch": 20.91340948462538, + "grad_norm": 0.3372073769569397, + "learning_rate": 4.790865905153746e-05, + "loss": 0.3645, + "step": 3090500 + }, + { + "epoch": 20.916792983975746, + "grad_norm": 0.44187304377555847, + "learning_rate": 4.790832070160243e-05, + "loss": 0.3648, + "step": 3091000 + }, + { + "epoch": 20.920176483326117, + "grad_norm": 0.3418973982334137, + "learning_rate": 4.790798235166739e-05, + "loss": 0.3633, + "step": 3091500 + }, + { + "epoch": 20.923559982676483, + "grad_norm": 0.3711940050125122, + "learning_rate": 4.7907644001732354e-05, + "loss": 0.3639, + "step": 3092000 + }, + { + "epoch": 20.92694348202685, + "grad_norm": 0.33627617359161377, + "learning_rate": 4.7907305651797316e-05, + "loss": 0.3651, + "step": 3092500 + }, + { + "epoch": 20.93032698137722, + "grad_norm": 0.37105128169059753, + "learning_rate": 4.7906967301862285e-05, + "loss": 0.3656, + "step": 3093000 + }, + { + "epoch": 20.933710480727587, + "grad_norm": 0.3730045557022095, + "learning_rate": 4.790662895192725e-05, + "loss": 0.3647, + "step": 3093500 + }, + { + "epoch": 20.937093980077957, + "grad_norm": 0.39295563101768494, + "learning_rate": 4.79062906019922e-05, + "loss": 0.3654, + "step": 3094000 + }, + { + "epoch": 20.940477479428324, + "grad_norm": 0.341587096452713, + "learning_rate": 4.7905952252057165e-05, + "loss": 0.3658, + "step": 3094500 + }, + { + "epoch": 20.94386097877869, + "grad_norm": 0.3795209527015686, + "learning_rate": 4.7905613902122134e-05, + "loss": 0.3664, + "step": 3095000 + }, + { + "epoch": 20.94724447812906, + "grad_norm": 0.3408762812614441, + "learning_rate": 4.7905275552187096e-05, + "loss": 0.3647, + "step": 3095500 + }, + { + "epoch": 20.950627977479428, + "grad_norm": 0.34244951605796814, + "learning_rate": 4.790493720225206e-05, + "loss": 0.3669, + "step": 3096000 + }, + { + "epoch": 20.9540114768298, + "grad_norm": 0.41083306074142456, + "learning_rate": 4.790459885231702e-05, + "loss": 0.3661, + "step": 3096500 + }, + { + "epoch": 20.957394976180165, + "grad_norm": 0.36155903339385986, + "learning_rate": 4.790426050238199e-05, + "loss": 0.3632, + "step": 3097000 + }, + { + "epoch": 20.96077847553053, + "grad_norm": 0.36875125765800476, + "learning_rate": 4.790392215244695e-05, + "loss": 0.3651, + "step": 3097500 + }, + { + "epoch": 20.964161974880902, + "grad_norm": 0.350536972284317, + "learning_rate": 4.790358380251191e-05, + "loss": 0.366, + "step": 3098000 + }, + { + "epoch": 20.96754547423127, + "grad_norm": 0.36575838923454285, + "learning_rate": 4.7903245452576875e-05, + "loss": 0.3662, + "step": 3098500 + }, + { + "epoch": 20.970928973581636, + "grad_norm": 0.36057087779045105, + "learning_rate": 4.790290710264184e-05, + "loss": 0.3666, + "step": 3099000 + }, + { + "epoch": 20.974312472932006, + "grad_norm": 0.38979464769363403, + "learning_rate": 4.79025687527068e-05, + "loss": 0.3644, + "step": 3099500 + }, + { + "epoch": 20.977695972282373, + "grad_norm": 0.33669137954711914, + "learning_rate": 4.790223040277176e-05, + "loss": 0.367, + "step": 3100000 + }, + { + "epoch": 20.981079471632743, + "grad_norm": 0.380115270614624, + "learning_rate": 4.790189205283673e-05, + "loss": 0.3645, + "step": 3100500 + }, + { + "epoch": 20.98446297098311, + "grad_norm": 0.40111979842185974, + "learning_rate": 4.790155370290169e-05, + "loss": 0.3652, + "step": 3101000 + }, + { + "epoch": 20.987846470333476, + "grad_norm": 0.38543492555618286, + "learning_rate": 4.7901215352966655e-05, + "loss": 0.366, + "step": 3101500 + }, + { + "epoch": 20.991229969683847, + "grad_norm": 0.35383349657058716, + "learning_rate": 4.790087700303162e-05, + "loss": 0.3644, + "step": 3102000 + }, + { + "epoch": 20.994613469034213, + "grad_norm": 0.33337080478668213, + "learning_rate": 4.7900538653096586e-05, + "loss": 0.363, + "step": 3102500 + }, + { + "epoch": 20.997996968384584, + "grad_norm": 0.3798995614051819, + "learning_rate": 4.790020030316155e-05, + "loss": 0.3633, + "step": 3103000 + }, + { + "epoch": 21.0, + "eval_accuracy": 0.8607153645715448, + "eval_loss": 0.5653529763221741, + "eval_runtime": 3396.8155, + "eval_samples_per_second": 85.593, + "eval_steps_per_second": 5.35, + "step": 3103296 + }, + { + "epoch": 21.00138046773495, + "grad_norm": 0.3778703510761261, + "learning_rate": 4.78998619532265e-05, + "loss": 0.3644, + "step": 3103500 + }, + { + "epoch": 21.004763967085317, + "grad_norm": 0.369404137134552, + "learning_rate": 4.7899523603291465e-05, + "loss": 0.3625, + "step": 3104000 + }, + { + "epoch": 21.008147466435688, + "grad_norm": 0.36425143480300903, + "learning_rate": 4.7899185253356434e-05, + "loss": 0.3633, + "step": 3104500 + }, + { + "epoch": 21.011530965786054, + "grad_norm": 0.3674194812774658, + "learning_rate": 4.7898846903421396e-05, + "loss": 0.3634, + "step": 3105000 + }, + { + "epoch": 21.01491446513642, + "grad_norm": 0.3845115303993225, + "learning_rate": 4.789850855348636e-05, + "loss": 0.3642, + "step": 3105500 + }, + { + "epoch": 21.01829796448679, + "grad_norm": 0.38932108879089355, + "learning_rate": 4.789817020355132e-05, + "loss": 0.3611, + "step": 3106000 + }, + { + "epoch": 21.021681463837158, + "grad_norm": 0.39633065462112427, + "learning_rate": 4.789783185361629e-05, + "loss": 0.3667, + "step": 3106500 + }, + { + "epoch": 21.02506496318753, + "grad_norm": 0.39111411571502686, + "learning_rate": 4.789749350368125e-05, + "loss": 0.3632, + "step": 3107000 + }, + { + "epoch": 21.028448462537895, + "grad_norm": 0.38436001539230347, + "learning_rate": 4.7897155153746214e-05, + "loss": 0.3639, + "step": 3107500 + }, + { + "epoch": 21.031831961888262, + "grad_norm": 0.40379461646080017, + "learning_rate": 4.7896816803811176e-05, + "loss": 0.3638, + "step": 3108000 + }, + { + "epoch": 21.035215461238632, + "grad_norm": 0.41419756412506104, + "learning_rate": 4.789647845387614e-05, + "loss": 0.3638, + "step": 3108500 + }, + { + "epoch": 21.038598960589, + "grad_norm": 0.3712766766548157, + "learning_rate": 4.78961401039411e-05, + "loss": 0.3622, + "step": 3109000 + }, + { + "epoch": 21.04198245993937, + "grad_norm": 0.38608309626579285, + "learning_rate": 4.789580175400606e-05, + "loss": 0.3614, + "step": 3109500 + }, + { + "epoch": 21.045365959289736, + "grad_norm": 0.3591993749141693, + "learning_rate": 4.789546340407103e-05, + "loss": 0.3638, + "step": 3110000 + }, + { + "epoch": 21.048749458640103, + "grad_norm": 0.3584752678871155, + "learning_rate": 4.789512505413599e-05, + "loss": 0.3625, + "step": 3110500 + }, + { + "epoch": 21.052132957990473, + "grad_norm": 0.39413151144981384, + "learning_rate": 4.7894786704200955e-05, + "loss": 0.3618, + "step": 3111000 + }, + { + "epoch": 21.05551645734084, + "grad_norm": 0.3555285632610321, + "learning_rate": 4.789444835426592e-05, + "loss": 0.3626, + "step": 3111500 + }, + { + "epoch": 21.05889995669121, + "grad_norm": 0.37488609552383423, + "learning_rate": 4.7894110004330886e-05, + "loss": 0.3642, + "step": 3112000 + }, + { + "epoch": 21.062283456041577, + "grad_norm": 0.36124318838119507, + "learning_rate": 4.789377165439585e-05, + "loss": 0.3657, + "step": 3112500 + }, + { + "epoch": 21.065666955391944, + "grad_norm": 0.35256969928741455, + "learning_rate": 4.7893433304460804e-05, + "loss": 0.3627, + "step": 3113000 + }, + { + "epoch": 21.069050454742314, + "grad_norm": 0.3735707104206085, + "learning_rate": 4.7893094954525766e-05, + "loss": 0.3637, + "step": 3113500 + }, + { + "epoch": 21.07243395409268, + "grad_norm": 0.4027535021305084, + "learning_rate": 4.7892756604590735e-05, + "loss": 0.3633, + "step": 3114000 + }, + { + "epoch": 21.075817453443047, + "grad_norm": 0.3611033856868744, + "learning_rate": 4.78924182546557e-05, + "loss": 0.3631, + "step": 3114500 + }, + { + "epoch": 21.079200952793418, + "grad_norm": 0.3534304201602936, + "learning_rate": 4.789207990472066e-05, + "loss": 0.3635, + "step": 3115000 + }, + { + "epoch": 21.082584452143784, + "grad_norm": 0.32752498984336853, + "learning_rate": 4.789174155478562e-05, + "loss": 0.3637, + "step": 3115500 + }, + { + "epoch": 21.085967951494155, + "grad_norm": 0.34558671712875366, + "learning_rate": 4.789140320485059e-05, + "loss": 0.3635, + "step": 3116000 + }, + { + "epoch": 21.08935145084452, + "grad_norm": 0.31938326358795166, + "learning_rate": 4.789106485491555e-05, + "loss": 0.363, + "step": 3116500 + }, + { + "epoch": 21.092734950194888, + "grad_norm": 0.4072025716304779, + "learning_rate": 4.7890726504980514e-05, + "loss": 0.3634, + "step": 3117000 + }, + { + "epoch": 21.09611844954526, + "grad_norm": 0.33352988958358765, + "learning_rate": 4.7890388155045476e-05, + "loss": 0.3639, + "step": 3117500 + }, + { + "epoch": 21.099501948895625, + "grad_norm": 0.42230895161628723, + "learning_rate": 4.789004980511044e-05, + "loss": 0.3637, + "step": 3118000 + }, + { + "epoch": 21.102885448245996, + "grad_norm": 0.3735465109348297, + "learning_rate": 4.78897114551754e-05, + "loss": 0.3639, + "step": 3118500 + }, + { + "epoch": 21.106268947596362, + "grad_norm": 0.3831036686897278, + "learning_rate": 4.788937310524036e-05, + "loss": 0.3628, + "step": 3119000 + }, + { + "epoch": 21.10965244694673, + "grad_norm": 0.3576519191265106, + "learning_rate": 4.788903475530533e-05, + "loss": 0.3635, + "step": 3119500 + }, + { + "epoch": 21.1130359462971, + "grad_norm": 0.32627448439598083, + "learning_rate": 4.7888696405370294e-05, + "loss": 0.3633, + "step": 3120000 + }, + { + "epoch": 21.116419445647466, + "grad_norm": 0.35932162404060364, + "learning_rate": 4.7888358055435256e-05, + "loss": 0.3636, + "step": 3120500 + }, + { + "epoch": 21.119802944997833, + "grad_norm": 0.3643208146095276, + "learning_rate": 4.788801970550022e-05, + "loss": 0.3632, + "step": 3121000 + }, + { + "epoch": 21.123186444348203, + "grad_norm": 0.35325881838798523, + "learning_rate": 4.788768135556519e-05, + "loss": 0.3647, + "step": 3121500 + }, + { + "epoch": 21.12656994369857, + "grad_norm": 0.34074848890304565, + "learning_rate": 4.788734300563015e-05, + "loss": 0.3634, + "step": 3122000 + }, + { + "epoch": 21.12995344304894, + "grad_norm": 0.3535933792591095, + "learning_rate": 4.7887004655695104e-05, + "loss": 0.3638, + "step": 3122500 + }, + { + "epoch": 21.133336942399307, + "grad_norm": 0.34740936756134033, + "learning_rate": 4.7886666305760067e-05, + "loss": 0.3635, + "step": 3123000 + }, + { + "epoch": 21.136720441749674, + "grad_norm": 0.35919180512428284, + "learning_rate": 4.7886327955825035e-05, + "loss": 0.365, + "step": 3123500 + }, + { + "epoch": 21.140103941100044, + "grad_norm": 0.3536825180053711, + "learning_rate": 4.788598960589e-05, + "loss": 0.363, + "step": 3124000 + }, + { + "epoch": 21.14348744045041, + "grad_norm": 0.3748694956302643, + "learning_rate": 4.788565125595496e-05, + "loss": 0.3643, + "step": 3124500 + }, + { + "epoch": 21.14687093980078, + "grad_norm": 0.3851325511932373, + "learning_rate": 4.788531290601992e-05, + "loss": 0.3635, + "step": 3125000 + }, + { + "epoch": 21.150254439151148, + "grad_norm": 0.3734279274940491, + "learning_rate": 4.788497455608489e-05, + "loss": 0.3631, + "step": 3125500 + }, + { + "epoch": 21.153637938501515, + "grad_norm": 0.35346266627311707, + "learning_rate": 4.788463620614985e-05, + "loss": 0.3649, + "step": 3126000 + }, + { + "epoch": 21.157021437851885, + "grad_norm": 0.3762940764427185, + "learning_rate": 4.7884297856214815e-05, + "loss": 0.3625, + "step": 3126500 + }, + { + "epoch": 21.16040493720225, + "grad_norm": 0.34404096007347107, + "learning_rate": 4.788395950627978e-05, + "loss": 0.3636, + "step": 3127000 + }, + { + "epoch": 21.163788436552622, + "grad_norm": 0.36609381437301636, + "learning_rate": 4.788362115634474e-05, + "loss": 0.3631, + "step": 3127500 + }, + { + "epoch": 21.16717193590299, + "grad_norm": 0.34207746386528015, + "learning_rate": 4.78832828064097e-05, + "loss": 0.364, + "step": 3128000 + }, + { + "epoch": 21.170555435253355, + "grad_norm": 0.3563523292541504, + "learning_rate": 4.7882944456474663e-05, + "loss": 0.3656, + "step": 3128500 + }, + { + "epoch": 21.173938934603726, + "grad_norm": 0.3344648480415344, + "learning_rate": 4.788260610653963e-05, + "loss": 0.3625, + "step": 3129000 + }, + { + "epoch": 21.177322433954092, + "grad_norm": 0.35955458879470825, + "learning_rate": 4.7882267756604594e-05, + "loss": 0.3648, + "step": 3129500 + }, + { + "epoch": 21.18070593330446, + "grad_norm": 0.35110658407211304, + "learning_rate": 4.7881929406669557e-05, + "loss": 0.3645, + "step": 3130000 + }, + { + "epoch": 21.18408943265483, + "grad_norm": 0.3675309121608734, + "learning_rate": 4.788159105673452e-05, + "loss": 0.3643, + "step": 3130500 + }, + { + "epoch": 21.187472932005196, + "grad_norm": 0.36067256331443787, + "learning_rate": 4.788125270679949e-05, + "loss": 0.3647, + "step": 3131000 + }, + { + "epoch": 21.190856431355567, + "grad_norm": 0.35494762659072876, + "learning_rate": 4.788091435686445e-05, + "loss": 0.363, + "step": 3131500 + }, + { + "epoch": 21.194239930705933, + "grad_norm": 0.38366198539733887, + "learning_rate": 4.7880576006929405e-05, + "loss": 0.3663, + "step": 3132000 + }, + { + "epoch": 21.1976234300563, + "grad_norm": 0.3648875057697296, + "learning_rate": 4.788023765699437e-05, + "loss": 0.3632, + "step": 3132500 + }, + { + "epoch": 21.20100692940667, + "grad_norm": 0.3924325704574585, + "learning_rate": 4.7879899307059336e-05, + "loss": 0.3629, + "step": 3133000 + }, + { + "epoch": 21.204390428757037, + "grad_norm": 0.3698674440383911, + "learning_rate": 4.78795609571243e-05, + "loss": 0.3621, + "step": 3133500 + }, + { + "epoch": 21.207773928107407, + "grad_norm": 0.38015779852867126, + "learning_rate": 4.787922260718926e-05, + "loss": 0.3637, + "step": 3134000 + }, + { + "epoch": 21.211157427457774, + "grad_norm": 0.36998480558395386, + "learning_rate": 4.787888425725422e-05, + "loss": 0.3648, + "step": 3134500 + }, + { + "epoch": 21.21454092680814, + "grad_norm": 0.3666990101337433, + "learning_rate": 4.787854590731919e-05, + "loss": 0.3644, + "step": 3135000 + }, + { + "epoch": 21.21792442615851, + "grad_norm": 0.34037184715270996, + "learning_rate": 4.7878207557384153e-05, + "loss": 0.3651, + "step": 3135500 + }, + { + "epoch": 21.221307925508878, + "grad_norm": 0.393677294254303, + "learning_rate": 4.7877869207449116e-05, + "loss": 0.3634, + "step": 3136000 + }, + { + "epoch": 21.224691424859245, + "grad_norm": 0.37676140666007996, + "learning_rate": 4.787753085751408e-05, + "loss": 0.3644, + "step": 3136500 + }, + { + "epoch": 21.228074924209615, + "grad_norm": 0.3752439320087433, + "learning_rate": 4.787719250757904e-05, + "loss": 0.364, + "step": 3137000 + }, + { + "epoch": 21.23145842355998, + "grad_norm": 0.3684835433959961, + "learning_rate": 4.7876854157644e-05, + "loss": 0.3633, + "step": 3137500 + }, + { + "epoch": 21.234841922910352, + "grad_norm": 0.4008356034755707, + "learning_rate": 4.7876515807708964e-05, + "loss": 0.364, + "step": 3138000 + }, + { + "epoch": 21.23822542226072, + "grad_norm": 0.34454894065856934, + "learning_rate": 4.787617745777393e-05, + "loss": 0.363, + "step": 3138500 + }, + { + "epoch": 21.241608921611085, + "grad_norm": 0.367489755153656, + "learning_rate": 4.7875839107838895e-05, + "loss": 0.3652, + "step": 3139000 + }, + { + "epoch": 21.244992420961456, + "grad_norm": 0.3662697970867157, + "learning_rate": 4.787550075790386e-05, + "loss": 0.3639, + "step": 3139500 + }, + { + "epoch": 21.248375920311823, + "grad_norm": 0.3582600951194763, + "learning_rate": 4.787516240796882e-05, + "loss": 0.3629, + "step": 3140000 + }, + { + "epoch": 21.251759419662193, + "grad_norm": 0.3587487041950226, + "learning_rate": 4.787482405803378e-05, + "loss": 0.3629, + "step": 3140500 + }, + { + "epoch": 21.25514291901256, + "grad_norm": 0.34383293986320496, + "learning_rate": 4.787448570809875e-05, + "loss": 0.3635, + "step": 3141000 + }, + { + "epoch": 21.258526418362926, + "grad_norm": 0.34148308634757996, + "learning_rate": 4.7874147358163706e-05, + "loss": 0.3661, + "step": 3141500 + }, + { + "epoch": 21.261909917713297, + "grad_norm": 0.3754810690879822, + "learning_rate": 4.787380900822867e-05, + "loss": 0.3642, + "step": 3142000 + }, + { + "epoch": 21.265293417063663, + "grad_norm": 0.3658917546272278, + "learning_rate": 4.787347065829364e-05, + "loss": 0.3654, + "step": 3142500 + }, + { + "epoch": 21.268676916414034, + "grad_norm": 0.36671942472457886, + "learning_rate": 4.78731323083586e-05, + "loss": 0.3635, + "step": 3143000 + }, + { + "epoch": 21.2720604157644, + "grad_norm": 0.3823990821838379, + "learning_rate": 4.787279395842356e-05, + "loss": 0.3631, + "step": 3143500 + }, + { + "epoch": 21.275443915114767, + "grad_norm": 0.37779998779296875, + "learning_rate": 4.787245560848852e-05, + "loss": 0.3643, + "step": 3144000 + }, + { + "epoch": 21.278827414465137, + "grad_norm": 0.3578616976737976, + "learning_rate": 4.787211725855349e-05, + "loss": 0.3636, + "step": 3144500 + }, + { + "epoch": 21.282210913815504, + "grad_norm": 0.37870824337005615, + "learning_rate": 4.7871778908618454e-05, + "loss": 0.3642, + "step": 3145000 + }, + { + "epoch": 21.28559441316587, + "grad_norm": 0.3804149031639099, + "learning_rate": 4.7871440558683416e-05, + "loss": 0.366, + "step": 3145500 + }, + { + "epoch": 21.28897791251624, + "grad_norm": 0.3497847318649292, + "learning_rate": 4.787110220874838e-05, + "loss": 0.364, + "step": 3146000 + }, + { + "epoch": 21.292361411866608, + "grad_norm": 0.36826586723327637, + "learning_rate": 4.787076385881334e-05, + "loss": 0.3635, + "step": 3146500 + }, + { + "epoch": 21.29574491121698, + "grad_norm": 0.34804922342300415, + "learning_rate": 4.78704255088783e-05, + "loss": 0.3651, + "step": 3147000 + }, + { + "epoch": 21.299128410567345, + "grad_norm": 0.38317447900772095, + "learning_rate": 4.7870087158943265e-05, + "loss": 0.3643, + "step": 3147500 + }, + { + "epoch": 21.302511909917712, + "grad_norm": 0.37221434712409973, + "learning_rate": 4.7869748809008234e-05, + "loss": 0.3644, + "step": 3148000 + }, + { + "epoch": 21.305895409268082, + "grad_norm": 0.39198219776153564, + "learning_rate": 4.7869410459073196e-05, + "loss": 0.364, + "step": 3148500 + }, + { + "epoch": 21.30927890861845, + "grad_norm": 0.37624913454055786, + "learning_rate": 4.786907210913816e-05, + "loss": 0.3643, + "step": 3149000 + }, + { + "epoch": 21.31266240796882, + "grad_norm": 0.33953070640563965, + "learning_rate": 4.786873375920312e-05, + "loss": 0.3646, + "step": 3149500 + }, + { + "epoch": 21.316045907319186, + "grad_norm": 0.35391950607299805, + "learning_rate": 4.786839540926808e-05, + "loss": 0.3637, + "step": 3150000 + }, + { + "epoch": 21.319429406669553, + "grad_norm": 0.38748225569725037, + "learning_rate": 4.786805705933305e-05, + "loss": 0.3631, + "step": 3150500 + }, + { + "epoch": 21.322812906019923, + "grad_norm": 0.40156933665275574, + "learning_rate": 4.786771870939801e-05, + "loss": 0.3657, + "step": 3151000 + }, + { + "epoch": 21.32619640537029, + "grad_norm": 0.34377700090408325, + "learning_rate": 4.786738035946297e-05, + "loss": 0.3625, + "step": 3151500 + }, + { + "epoch": 21.32957990472066, + "grad_norm": 0.3886925280094147, + "learning_rate": 4.786704200952794e-05, + "loss": 0.3656, + "step": 3152000 + }, + { + "epoch": 21.332963404071027, + "grad_norm": 0.4017632305622101, + "learning_rate": 4.78667036595929e-05, + "loss": 0.3648, + "step": 3152500 + }, + { + "epoch": 21.336346903421394, + "grad_norm": 0.34350860118865967, + "learning_rate": 4.786636530965786e-05, + "loss": 0.3651, + "step": 3153000 + }, + { + "epoch": 21.339730402771764, + "grad_norm": 0.36725303530693054, + "learning_rate": 4.7866026959722824e-05, + "loss": 0.3634, + "step": 3153500 + }, + { + "epoch": 21.34311390212213, + "grad_norm": 0.354269802570343, + "learning_rate": 4.786568860978779e-05, + "loss": 0.3643, + "step": 3154000 + }, + { + "epoch": 21.346497401472497, + "grad_norm": 0.34944888949394226, + "learning_rate": 4.7865350259852755e-05, + "loss": 0.3641, + "step": 3154500 + }, + { + "epoch": 21.349880900822868, + "grad_norm": 0.41143280267715454, + "learning_rate": 4.786501190991772e-05, + "loss": 0.364, + "step": 3155000 + }, + { + "epoch": 21.353264400173234, + "grad_norm": 0.35492393374443054, + "learning_rate": 4.786467355998268e-05, + "loss": 0.3633, + "step": 3155500 + }, + { + "epoch": 21.356647899523605, + "grad_norm": 0.3725531995296478, + "learning_rate": 4.786433521004764e-05, + "loss": 0.3651, + "step": 3156000 + }, + { + "epoch": 21.36003139887397, + "grad_norm": 0.3667643070220947, + "learning_rate": 4.78639968601126e-05, + "loss": 0.3649, + "step": 3156500 + }, + { + "epoch": 21.363414898224338, + "grad_norm": 0.3814246356487274, + "learning_rate": 4.7863658510177565e-05, + "loss": 0.3645, + "step": 3157000 + }, + { + "epoch": 21.36679839757471, + "grad_norm": 0.3644694983959198, + "learning_rate": 4.786332016024253e-05, + "loss": 0.366, + "step": 3157500 + }, + { + "epoch": 21.370181896925075, + "grad_norm": 0.37675225734710693, + "learning_rate": 4.7862981810307496e-05, + "loss": 0.3652, + "step": 3158000 + }, + { + "epoch": 21.373565396275445, + "grad_norm": 0.36693552136421204, + "learning_rate": 4.786264346037246e-05, + "loss": 0.3653, + "step": 3158500 + }, + { + "epoch": 21.376948895625812, + "grad_norm": 0.3457273542881012, + "learning_rate": 4.786230511043742e-05, + "loss": 0.3639, + "step": 3159000 + }, + { + "epoch": 21.38033239497618, + "grad_norm": 0.37589481472969055, + "learning_rate": 4.786196676050238e-05, + "loss": 0.3633, + "step": 3159500 + }, + { + "epoch": 21.38371589432655, + "grad_norm": 0.3978182077407837, + "learning_rate": 4.786162841056735e-05, + "loss": 0.3636, + "step": 3160000 + }, + { + "epoch": 21.387099393676916, + "grad_norm": 0.3442537784576416, + "learning_rate": 4.7861290060632314e-05, + "loss": 0.3656, + "step": 3160500 + }, + { + "epoch": 21.390482893027283, + "grad_norm": 0.35906335711479187, + "learning_rate": 4.786095171069727e-05, + "loss": 0.365, + "step": 3161000 + }, + { + "epoch": 21.393866392377653, + "grad_norm": 0.37391018867492676, + "learning_rate": 4.786061336076224e-05, + "loss": 0.3631, + "step": 3161500 + }, + { + "epoch": 21.39724989172802, + "grad_norm": 0.38614532351493835, + "learning_rate": 4.78602750108272e-05, + "loss": 0.366, + "step": 3162000 + }, + { + "epoch": 21.40063339107839, + "grad_norm": 0.39576542377471924, + "learning_rate": 4.785993666089216e-05, + "loss": 0.3638, + "step": 3162500 + }, + { + "epoch": 21.404016890428757, + "grad_norm": 0.3507489562034607, + "learning_rate": 4.7859598310957124e-05, + "loss": 0.3655, + "step": 3163000 + }, + { + "epoch": 21.407400389779124, + "grad_norm": 0.35442155599594116, + "learning_rate": 4.785925996102209e-05, + "loss": 0.3659, + "step": 3163500 + }, + { + "epoch": 21.410783889129494, + "grad_norm": 0.33154821395874023, + "learning_rate": 4.7858921611087055e-05, + "loss": 0.3653, + "step": 3164000 + }, + { + "epoch": 21.41416738847986, + "grad_norm": 0.3507884740829468, + "learning_rate": 4.785858326115202e-05, + "loss": 0.366, + "step": 3164500 + }, + { + "epoch": 21.41755088783023, + "grad_norm": 0.31205856800079346, + "learning_rate": 4.785824491121698e-05, + "loss": 0.3643, + "step": 3165000 + }, + { + "epoch": 21.420934387180598, + "grad_norm": 0.39912527799606323, + "learning_rate": 4.785790656128194e-05, + "loss": 0.3634, + "step": 3165500 + }, + { + "epoch": 21.424317886530964, + "grad_norm": 0.37065696716308594, + "learning_rate": 4.7857568211346904e-05, + "loss": 0.3641, + "step": 3166000 + }, + { + "epoch": 21.427701385881335, + "grad_norm": 0.38233476877212524, + "learning_rate": 4.7857229861411866e-05, + "loss": 0.3626, + "step": 3166500 + }, + { + "epoch": 21.4310848852317, + "grad_norm": 0.34912359714508057, + "learning_rate": 4.785689151147683e-05, + "loss": 0.362, + "step": 3167000 + }, + { + "epoch": 21.434468384582072, + "grad_norm": 0.38831716775894165, + "learning_rate": 4.78565531615418e-05, + "loss": 0.3628, + "step": 3167500 + }, + { + "epoch": 21.43785188393244, + "grad_norm": 0.40890708565711975, + "learning_rate": 4.785621481160676e-05, + "loss": 0.3624, + "step": 3168000 + }, + { + "epoch": 21.441235383282805, + "grad_norm": 0.3991265296936035, + "learning_rate": 4.785587646167172e-05, + "loss": 0.3654, + "step": 3168500 + }, + { + "epoch": 21.444618882633176, + "grad_norm": 0.37633535265922546, + "learning_rate": 4.785553811173668e-05, + "loss": 0.3648, + "step": 3169000 + }, + { + "epoch": 21.448002381983542, + "grad_norm": 0.36139988899230957, + "learning_rate": 4.785519976180165e-05, + "loss": 0.3648, + "step": 3169500 + }, + { + "epoch": 21.45138588133391, + "grad_norm": 0.3636741042137146, + "learning_rate": 4.7854861411866614e-05, + "loss": 0.365, + "step": 3170000 + }, + { + "epoch": 21.45476938068428, + "grad_norm": 0.36973482370376587, + "learning_rate": 4.785452306193157e-05, + "loss": 0.3652, + "step": 3170500 + }, + { + "epoch": 21.458152880034646, + "grad_norm": 0.36788687109947205, + "learning_rate": 4.785418471199654e-05, + "loss": 0.3634, + "step": 3171000 + }, + { + "epoch": 21.461536379385016, + "grad_norm": 0.364843487739563, + "learning_rate": 4.78538463620615e-05, + "loss": 0.3633, + "step": 3171500 + }, + { + "epoch": 21.464919878735383, + "grad_norm": 0.3892847001552582, + "learning_rate": 4.785350801212646e-05, + "loss": 0.3645, + "step": 3172000 + }, + { + "epoch": 21.46830337808575, + "grad_norm": 0.35123181343078613, + "learning_rate": 4.7853169662191425e-05, + "loss": 0.3643, + "step": 3172500 + }, + { + "epoch": 21.47168687743612, + "grad_norm": 0.3675801753997803, + "learning_rate": 4.7852831312256394e-05, + "loss": 0.3651, + "step": 3173000 + }, + { + "epoch": 21.475070376786487, + "grad_norm": 0.3755906820297241, + "learning_rate": 4.7852492962321356e-05, + "loss": 0.3656, + "step": 3173500 + }, + { + "epoch": 21.478453876136857, + "grad_norm": 0.360592782497406, + "learning_rate": 4.785215461238632e-05, + "loss": 0.3633, + "step": 3174000 + }, + { + "epoch": 21.481837375487224, + "grad_norm": 0.3928108811378479, + "learning_rate": 4.785181626245128e-05, + "loss": 0.3632, + "step": 3174500 + }, + { + "epoch": 21.48522087483759, + "grad_norm": 0.371901273727417, + "learning_rate": 4.785147791251624e-05, + "loss": 0.3632, + "step": 3175000 + }, + { + "epoch": 21.48860437418796, + "grad_norm": 0.3364705443382263, + "learning_rate": 4.7851139562581204e-05, + "loss": 0.3656, + "step": 3175500 + }, + { + "epoch": 21.491987873538328, + "grad_norm": 0.3526456952095032, + "learning_rate": 4.785080121264617e-05, + "loss": 0.3648, + "step": 3176000 + }, + { + "epoch": 21.495371372888698, + "grad_norm": 0.3650068938732147, + "learning_rate": 4.785046286271113e-05, + "loss": 0.3643, + "step": 3176500 + }, + { + "epoch": 21.498754872239065, + "grad_norm": 0.3584924638271332, + "learning_rate": 4.78501245127761e-05, + "loss": 0.3661, + "step": 3177000 + }, + { + "epoch": 21.50213837158943, + "grad_norm": 0.36041101813316345, + "learning_rate": 4.784978616284106e-05, + "loss": 0.3642, + "step": 3177500 + }, + { + "epoch": 21.505521870939802, + "grad_norm": 0.3904370367527008, + "learning_rate": 4.784944781290602e-05, + "loss": 0.3657, + "step": 3178000 + }, + { + "epoch": 21.50890537029017, + "grad_norm": 0.3386889398097992, + "learning_rate": 4.7849109462970984e-05, + "loss": 0.3641, + "step": 3178500 + }, + { + "epoch": 21.512288869640535, + "grad_norm": 0.36542385816574097, + "learning_rate": 4.784877111303595e-05, + "loss": 0.3642, + "step": 3179000 + }, + { + "epoch": 21.515672368990906, + "grad_norm": 0.3513859212398529, + "learning_rate": 4.7848432763100915e-05, + "loss": 0.3624, + "step": 3179500 + }, + { + "epoch": 21.519055868341272, + "grad_norm": 0.39994296431541443, + "learning_rate": 4.784809441316587e-05, + "loss": 0.3638, + "step": 3180000 + }, + { + "epoch": 21.522439367691643, + "grad_norm": 0.3714151680469513, + "learning_rate": 4.784775606323084e-05, + "loss": 0.3641, + "step": 3180500 + }, + { + "epoch": 21.52582286704201, + "grad_norm": 0.3823555111885071, + "learning_rate": 4.78474177132958e-05, + "loss": 0.3646, + "step": 3181000 + }, + { + "epoch": 21.529206366392376, + "grad_norm": 0.37660470604896545, + "learning_rate": 4.7847079363360764e-05, + "loss": 0.3656, + "step": 3181500 + }, + { + "epoch": 21.532589865742747, + "grad_norm": 0.3868112862110138, + "learning_rate": 4.7846741013425726e-05, + "loss": 0.3659, + "step": 3182000 + }, + { + "epoch": 21.535973365093113, + "grad_norm": 0.37358933687210083, + "learning_rate": 4.7846402663490695e-05, + "loss": 0.3663, + "step": 3182500 + }, + { + "epoch": 21.539356864443484, + "grad_norm": 0.3735228180885315, + "learning_rate": 4.784606431355566e-05, + "loss": 0.3642, + "step": 3183000 + }, + { + "epoch": 21.54274036379385, + "grad_norm": 0.34915071725845337, + "learning_rate": 4.784572596362062e-05, + "loss": 0.3649, + "step": 3183500 + }, + { + "epoch": 21.546123863144217, + "grad_norm": 0.37599048018455505, + "learning_rate": 4.784538761368558e-05, + "loss": 0.3639, + "step": 3184000 + }, + { + "epoch": 21.549507362494587, + "grad_norm": 0.3553498387336731, + "learning_rate": 4.784504926375054e-05, + "loss": 0.3648, + "step": 3184500 + }, + { + "epoch": 21.552890861844954, + "grad_norm": 0.35894688963890076, + "learning_rate": 4.7844710913815505e-05, + "loss": 0.3651, + "step": 3185000 + }, + { + "epoch": 21.55627436119532, + "grad_norm": 0.35583484172821045, + "learning_rate": 4.784437256388047e-05, + "loss": 0.3632, + "step": 3185500 + }, + { + "epoch": 21.55965786054569, + "grad_norm": 0.3528789281845093, + "learning_rate": 4.784403421394543e-05, + "loss": 0.3648, + "step": 3186000 + }, + { + "epoch": 21.563041359896058, + "grad_norm": 0.36016106605529785, + "learning_rate": 4.78436958640104e-05, + "loss": 0.3634, + "step": 3186500 + }, + { + "epoch": 21.56642485924643, + "grad_norm": 0.389413982629776, + "learning_rate": 4.784335751407536e-05, + "loss": 0.3672, + "step": 3187000 + }, + { + "epoch": 21.569808358596795, + "grad_norm": 0.4205852150917053, + "learning_rate": 4.784301916414032e-05, + "loss": 0.3646, + "step": 3187500 + }, + { + "epoch": 21.57319185794716, + "grad_norm": 0.3701072931289673, + "learning_rate": 4.7842680814205285e-05, + "loss": 0.3655, + "step": 3188000 + }, + { + "epoch": 21.576575357297532, + "grad_norm": 0.32102009654045105, + "learning_rate": 4.7842342464270254e-05, + "loss": 0.3638, + "step": 3188500 + }, + { + "epoch": 21.5799588566479, + "grad_norm": 0.3219831585884094, + "learning_rate": 4.7842004114335216e-05, + "loss": 0.3644, + "step": 3189000 + }, + { + "epoch": 21.58334235599827, + "grad_norm": 0.33970320224761963, + "learning_rate": 4.784166576440017e-05, + "loss": 0.365, + "step": 3189500 + }, + { + "epoch": 21.586725855348636, + "grad_norm": 0.3333664834499359, + "learning_rate": 4.784132741446514e-05, + "loss": 0.3658, + "step": 3190000 + }, + { + "epoch": 21.590109354699003, + "grad_norm": 0.36616653203964233, + "learning_rate": 4.78409890645301e-05, + "loss": 0.3644, + "step": 3190500 + }, + { + "epoch": 21.593492854049373, + "grad_norm": 0.3060896098613739, + "learning_rate": 4.7840650714595064e-05, + "loss": 0.3631, + "step": 3191000 + }, + { + "epoch": 21.59687635339974, + "grad_norm": 0.4059158265590668, + "learning_rate": 4.7840312364660026e-05, + "loss": 0.3638, + "step": 3191500 + }, + { + "epoch": 21.60025985275011, + "grad_norm": 0.36123108863830566, + "learning_rate": 4.7839974014724995e-05, + "loss": 0.3646, + "step": 3192000 + }, + { + "epoch": 21.603643352100477, + "grad_norm": 0.36856579780578613, + "learning_rate": 4.783963566478996e-05, + "loss": 0.3658, + "step": 3192500 + }, + { + "epoch": 21.607026851450843, + "grad_norm": 0.35361194610595703, + "learning_rate": 4.783929731485492e-05, + "loss": 0.365, + "step": 3193000 + }, + { + "epoch": 21.610410350801214, + "grad_norm": 0.3588424026966095, + "learning_rate": 4.783895896491988e-05, + "loss": 0.3647, + "step": 3193500 + }, + { + "epoch": 21.61379385015158, + "grad_norm": 0.3856673836708069, + "learning_rate": 4.7838620614984844e-05, + "loss": 0.3629, + "step": 3194000 + }, + { + "epoch": 21.617177349501947, + "grad_norm": 0.3217855989933014, + "learning_rate": 4.7838282265049806e-05, + "loss": 0.3645, + "step": 3194500 + }, + { + "epoch": 21.620560848852318, + "grad_norm": 0.3752342462539673, + "learning_rate": 4.783794391511477e-05, + "loss": 0.364, + "step": 3195000 + }, + { + "epoch": 21.623944348202684, + "grad_norm": 0.3654584586620331, + "learning_rate": 4.783760556517973e-05, + "loss": 0.3639, + "step": 3195500 + }, + { + "epoch": 21.627327847553055, + "grad_norm": 0.3445321023464203, + "learning_rate": 4.78372672152447e-05, + "loss": 0.3654, + "step": 3196000 + }, + { + "epoch": 21.63071134690342, + "grad_norm": 0.3452378213405609, + "learning_rate": 4.783692886530966e-05, + "loss": 0.3656, + "step": 3196500 + }, + { + "epoch": 21.634094846253788, + "grad_norm": 0.37917622923851013, + "learning_rate": 4.783659051537462e-05, + "loss": 0.3662, + "step": 3197000 + }, + { + "epoch": 21.63747834560416, + "grad_norm": 0.35758304595947266, + "learning_rate": 4.7836252165439585e-05, + "loss": 0.3649, + "step": 3197500 + }, + { + "epoch": 21.640861844954525, + "grad_norm": 0.3921206593513489, + "learning_rate": 4.7835913815504554e-05, + "loss": 0.3657, + "step": 3198000 + }, + { + "epoch": 21.644245344304895, + "grad_norm": 0.34664463996887207, + "learning_rate": 4.7835575465569516e-05, + "loss": 0.3641, + "step": 3198500 + }, + { + "epoch": 21.647628843655262, + "grad_norm": 0.3591472804546356, + "learning_rate": 4.783523711563447e-05, + "loss": 0.3657, + "step": 3199000 + }, + { + "epoch": 21.65101234300563, + "grad_norm": 0.3513629734516144, + "learning_rate": 4.783489876569944e-05, + "loss": 0.3647, + "step": 3199500 + }, + { + "epoch": 21.654395842356, + "grad_norm": 0.37922269105911255, + "learning_rate": 4.78345604157644e-05, + "loss": 0.3638, + "step": 3200000 + }, + { + "epoch": 21.657779341706366, + "grad_norm": 0.41598668694496155, + "learning_rate": 4.7834222065829365e-05, + "loss": 0.366, + "step": 3200500 + }, + { + "epoch": 21.661162841056736, + "grad_norm": 0.37553656101226807, + "learning_rate": 4.783388371589433e-05, + "loss": 0.3647, + "step": 3201000 + }, + { + "epoch": 21.664546340407103, + "grad_norm": 0.31073540449142456, + "learning_rate": 4.7833545365959296e-05, + "loss": 0.3645, + "step": 3201500 + }, + { + "epoch": 21.66792983975747, + "grad_norm": 0.3254922926425934, + "learning_rate": 4.783320701602426e-05, + "loss": 0.363, + "step": 3202000 + }, + { + "epoch": 21.67131333910784, + "grad_norm": 0.3663785457611084, + "learning_rate": 4.783286866608922e-05, + "loss": 0.3643, + "step": 3202500 + }, + { + "epoch": 21.674696838458207, + "grad_norm": 0.35388389229774475, + "learning_rate": 4.783253031615418e-05, + "loss": 0.3628, + "step": 3203000 + }, + { + "epoch": 21.678080337808574, + "grad_norm": 0.3668496012687683, + "learning_rate": 4.7832191966219144e-05, + "loss": 0.3659, + "step": 3203500 + }, + { + "epoch": 21.681463837158944, + "grad_norm": 0.3565506339073181, + "learning_rate": 4.7831853616284106e-05, + "loss": 0.3634, + "step": 3204000 + }, + { + "epoch": 21.68484733650931, + "grad_norm": 0.35518208146095276, + "learning_rate": 4.783151526634907e-05, + "loss": 0.364, + "step": 3204500 + }, + { + "epoch": 21.68823083585968, + "grad_norm": 0.4002978801727295, + "learning_rate": 4.783117691641403e-05, + "loss": 0.3654, + "step": 3205000 + }, + { + "epoch": 21.691614335210048, + "grad_norm": 0.35866105556488037, + "learning_rate": 4.7830838566479e-05, + "loss": 0.3643, + "step": 3205500 + }, + { + "epoch": 21.694997834560414, + "grad_norm": 0.3549555242061615, + "learning_rate": 4.783050021654396e-05, + "loss": 0.3651, + "step": 3206000 + }, + { + "epoch": 21.698381333910785, + "grad_norm": 0.3505103588104248, + "learning_rate": 4.7830161866608924e-05, + "loss": 0.3643, + "step": 3206500 + }, + { + "epoch": 21.70176483326115, + "grad_norm": 0.373981773853302, + "learning_rate": 4.7829823516673886e-05, + "loss": 0.3645, + "step": 3207000 + }, + { + "epoch": 21.70514833261152, + "grad_norm": 0.3849928081035614, + "learning_rate": 4.7829485166738855e-05, + "loss": 0.3649, + "step": 3207500 + }, + { + "epoch": 21.70853183196189, + "grad_norm": 0.3909493684768677, + "learning_rate": 4.782914681680382e-05, + "loss": 0.3634, + "step": 3208000 + }, + { + "epoch": 21.711915331312255, + "grad_norm": 0.3365996181964874, + "learning_rate": 4.782880846686877e-05, + "loss": 0.3648, + "step": 3208500 + }, + { + "epoch": 21.715298830662626, + "grad_norm": 0.35930874943733215, + "learning_rate": 4.782847011693374e-05, + "loss": 0.3643, + "step": 3209000 + }, + { + "epoch": 21.718682330012992, + "grad_norm": 0.3305171728134155, + "learning_rate": 4.78281317669987e-05, + "loss": 0.3655, + "step": 3209500 + }, + { + "epoch": 21.72206582936336, + "grad_norm": 0.37265220284461975, + "learning_rate": 4.7827793417063665e-05, + "loss": 0.3665, + "step": 3210000 + }, + { + "epoch": 21.72544932871373, + "grad_norm": 0.327110230922699, + "learning_rate": 4.782745506712863e-05, + "loss": 0.3655, + "step": 3210500 + }, + { + "epoch": 21.728832828064096, + "grad_norm": 0.3678237497806549, + "learning_rate": 4.782711671719359e-05, + "loss": 0.3646, + "step": 3211000 + }, + { + "epoch": 21.732216327414466, + "grad_norm": 0.3403486907482147, + "learning_rate": 4.782677836725856e-05, + "loss": 0.3656, + "step": 3211500 + }, + { + "epoch": 21.735599826764833, + "grad_norm": 0.3431899845600128, + "learning_rate": 4.782644001732352e-05, + "loss": 0.3641, + "step": 3212000 + }, + { + "epoch": 21.7389833261152, + "grad_norm": 0.3963979184627533, + "learning_rate": 4.782610166738848e-05, + "loss": 0.3651, + "step": 3212500 + }, + { + "epoch": 21.74236682546557, + "grad_norm": 0.3516821265220642, + "learning_rate": 4.7825763317453445e-05, + "loss": 0.3636, + "step": 3213000 + }, + { + "epoch": 21.745750324815937, + "grad_norm": 0.36609870195388794, + "learning_rate": 4.782542496751841e-05, + "loss": 0.3646, + "step": 3213500 + }, + { + "epoch": 21.749133824166307, + "grad_norm": 0.3898099660873413, + "learning_rate": 4.782508661758337e-05, + "loss": 0.3622, + "step": 3214000 + }, + { + "epoch": 21.752517323516674, + "grad_norm": 0.3062995374202728, + "learning_rate": 4.782474826764833e-05, + "loss": 0.3646, + "step": 3214500 + }, + { + "epoch": 21.75590082286704, + "grad_norm": 0.37412673234939575, + "learning_rate": 4.78244099177133e-05, + "loss": 0.3647, + "step": 3215000 + }, + { + "epoch": 21.75928432221741, + "grad_norm": 0.38491693139076233, + "learning_rate": 4.782407156777826e-05, + "loss": 0.3633, + "step": 3215500 + }, + { + "epoch": 21.762667821567778, + "grad_norm": 0.3784734606742859, + "learning_rate": 4.7823733217843224e-05, + "loss": 0.3661, + "step": 3216000 + }, + { + "epoch": 21.766051320918148, + "grad_norm": 0.3548288345336914, + "learning_rate": 4.7823394867908187e-05, + "loss": 0.3639, + "step": 3216500 + }, + { + "epoch": 21.769434820268515, + "grad_norm": 0.3605089783668518, + "learning_rate": 4.7823056517973155e-05, + "loss": 0.3648, + "step": 3217000 + }, + { + "epoch": 21.77281831961888, + "grad_norm": 0.3627791702747345, + "learning_rate": 4.782271816803812e-05, + "loss": 0.3644, + "step": 3217500 + }, + { + "epoch": 21.776201818969252, + "grad_norm": 0.40289151668548584, + "learning_rate": 4.782237981810307e-05, + "loss": 0.3629, + "step": 3218000 + }, + { + "epoch": 21.77958531831962, + "grad_norm": 0.36663007736206055, + "learning_rate": 4.782204146816804e-05, + "loss": 0.3652, + "step": 3218500 + }, + { + "epoch": 21.782968817669985, + "grad_norm": 0.3509083688259125, + "learning_rate": 4.7821703118233004e-05, + "loss": 0.3652, + "step": 3219000 + }, + { + "epoch": 21.786352317020356, + "grad_norm": 0.3642936944961548, + "learning_rate": 4.7821364768297966e-05, + "loss": 0.3652, + "step": 3219500 + }, + { + "epoch": 21.789735816370722, + "grad_norm": 0.4193442761898041, + "learning_rate": 4.782102641836293e-05, + "loss": 0.3651, + "step": 3220000 + }, + { + "epoch": 21.793119315721093, + "grad_norm": 0.4046929180622101, + "learning_rate": 4.782068806842789e-05, + "loss": 0.3644, + "step": 3220500 + }, + { + "epoch": 21.79650281507146, + "grad_norm": 0.37898531556129456, + "learning_rate": 4.782034971849286e-05, + "loss": 0.366, + "step": 3221000 + }, + { + "epoch": 21.799886314421826, + "grad_norm": 0.35761505365371704, + "learning_rate": 4.782001136855782e-05, + "loss": 0.3654, + "step": 3221500 + }, + { + "epoch": 21.803269813772197, + "grad_norm": 0.3742005527019501, + "learning_rate": 4.7819673018622783e-05, + "loss": 0.3654, + "step": 3222000 + }, + { + "epoch": 21.806653313122563, + "grad_norm": 0.3698446452617645, + "learning_rate": 4.7819334668687746e-05, + "loss": 0.3647, + "step": 3222500 + }, + { + "epoch": 21.810036812472934, + "grad_norm": 0.35749921202659607, + "learning_rate": 4.781899631875271e-05, + "loss": 0.3636, + "step": 3223000 + }, + { + "epoch": 21.8134203118233, + "grad_norm": 0.34802117943763733, + "learning_rate": 4.781865796881767e-05, + "loss": 0.3662, + "step": 3223500 + }, + { + "epoch": 21.816803811173667, + "grad_norm": 0.34336793422698975, + "learning_rate": 4.781831961888263e-05, + "loss": 0.3651, + "step": 3224000 + }, + { + "epoch": 21.820187310524037, + "grad_norm": 0.38206735253334045, + "learning_rate": 4.78179812689476e-05, + "loss": 0.3656, + "step": 3224500 + }, + { + "epoch": 21.823570809874404, + "grad_norm": 0.38054636120796204, + "learning_rate": 4.781764291901256e-05, + "loss": 0.3644, + "step": 3225000 + }, + { + "epoch": 21.826954309224774, + "grad_norm": 0.37155601382255554, + "learning_rate": 4.7817304569077525e-05, + "loss": 0.3648, + "step": 3225500 + }, + { + "epoch": 21.83033780857514, + "grad_norm": 0.38225722312927246, + "learning_rate": 4.781696621914249e-05, + "loss": 0.3639, + "step": 3226000 + }, + { + "epoch": 21.833721307925508, + "grad_norm": 0.39849624037742615, + "learning_rate": 4.7816627869207456e-05, + "loss": 0.3639, + "step": 3226500 + }, + { + "epoch": 21.837104807275878, + "grad_norm": 0.3631201982498169, + "learning_rate": 4.781628951927242e-05, + "loss": 0.3644, + "step": 3227000 + }, + { + "epoch": 21.840488306626245, + "grad_norm": 0.3547350764274597, + "learning_rate": 4.7815951169337374e-05, + "loss": 0.3631, + "step": 3227500 + }, + { + "epoch": 21.84387180597661, + "grad_norm": 0.36984550952911377, + "learning_rate": 4.7815612819402336e-05, + "loss": 0.3652, + "step": 3228000 + }, + { + "epoch": 21.847255305326982, + "grad_norm": 0.34281057119369507, + "learning_rate": 4.7815274469467305e-05, + "loss": 0.3634, + "step": 3228500 + }, + { + "epoch": 21.85063880467735, + "grad_norm": 0.3575781285762787, + "learning_rate": 4.781493611953227e-05, + "loss": 0.3654, + "step": 3229000 + }, + { + "epoch": 21.85402230402772, + "grad_norm": 0.3543434143066406, + "learning_rate": 4.781459776959723e-05, + "loss": 0.363, + "step": 3229500 + }, + { + "epoch": 21.857405803378086, + "grad_norm": 0.3873865306377411, + "learning_rate": 4.781425941966219e-05, + "loss": 0.3636, + "step": 3230000 + }, + { + "epoch": 21.860789302728453, + "grad_norm": 0.3623385727405548, + "learning_rate": 4.781392106972716e-05, + "loss": 0.3635, + "step": 3230500 + }, + { + "epoch": 21.864172802078823, + "grad_norm": 0.35204654932022095, + "learning_rate": 4.781358271979212e-05, + "loss": 0.3673, + "step": 3231000 + }, + { + "epoch": 21.86755630142919, + "grad_norm": 0.3785359561443329, + "learning_rate": 4.7813244369857084e-05, + "loss": 0.3641, + "step": 3231500 + }, + { + "epoch": 21.87093980077956, + "grad_norm": 0.3396179974079132, + "learning_rate": 4.7812906019922046e-05, + "loss": 0.3644, + "step": 3232000 + }, + { + "epoch": 21.874323300129927, + "grad_norm": 0.35946404933929443, + "learning_rate": 4.781256766998701e-05, + "loss": 0.3657, + "step": 3232500 + }, + { + "epoch": 21.877706799480293, + "grad_norm": 0.40445196628570557, + "learning_rate": 4.781222932005197e-05, + "loss": 0.365, + "step": 3233000 + }, + { + "epoch": 21.881090298830664, + "grad_norm": 0.3674616813659668, + "learning_rate": 4.781189097011693e-05, + "loss": 0.3652, + "step": 3233500 + }, + { + "epoch": 21.88447379818103, + "grad_norm": 0.3473931849002838, + "learning_rate": 4.78115526201819e-05, + "loss": 0.3647, + "step": 3234000 + }, + { + "epoch": 21.887857297531397, + "grad_norm": 0.37436696887016296, + "learning_rate": 4.7811214270246864e-05, + "loss": 0.3639, + "step": 3234500 + }, + { + "epoch": 21.891240796881767, + "grad_norm": 0.3391352891921997, + "learning_rate": 4.7810875920311826e-05, + "loss": 0.3663, + "step": 3235000 + }, + { + "epoch": 21.894624296232134, + "grad_norm": 0.40358367562294006, + "learning_rate": 4.781053757037679e-05, + "loss": 0.3653, + "step": 3235500 + }, + { + "epoch": 21.898007795582505, + "grad_norm": 0.35122546553611755, + "learning_rate": 4.781019922044176e-05, + "loss": 0.3647, + "step": 3236000 + }, + { + "epoch": 21.90139129493287, + "grad_norm": 0.393868625164032, + "learning_rate": 4.780986087050672e-05, + "loss": 0.3628, + "step": 3236500 + }, + { + "epoch": 21.904774794283238, + "grad_norm": 0.33418118953704834, + "learning_rate": 4.7809522520571674e-05, + "loss": 0.3637, + "step": 3237000 + }, + { + "epoch": 21.90815829363361, + "grad_norm": 0.37154361605644226, + "learning_rate": 4.7809184170636636e-05, + "loss": 0.3647, + "step": 3237500 + }, + { + "epoch": 21.911541792983975, + "grad_norm": 0.3812848627567291, + "learning_rate": 4.7808845820701605e-05, + "loss": 0.3657, + "step": 3238000 + }, + { + "epoch": 21.914925292334345, + "grad_norm": 0.35401734709739685, + "learning_rate": 4.780850747076657e-05, + "loss": 0.3634, + "step": 3238500 + }, + { + "epoch": 21.918308791684712, + "grad_norm": 0.3435650169849396, + "learning_rate": 4.780816912083153e-05, + "loss": 0.3634, + "step": 3239000 + }, + { + "epoch": 21.92169229103508, + "grad_norm": 0.37529340386390686, + "learning_rate": 4.780783077089649e-05, + "loss": 0.3646, + "step": 3239500 + }, + { + "epoch": 21.92507579038545, + "grad_norm": 0.33938807249069214, + "learning_rate": 4.780749242096146e-05, + "loss": 0.3647, + "step": 3240000 + }, + { + "epoch": 21.928459289735816, + "grad_norm": 0.3155749440193176, + "learning_rate": 4.780715407102642e-05, + "loss": 0.364, + "step": 3240500 + }, + { + "epoch": 21.931842789086183, + "grad_norm": 0.37478557229042053, + "learning_rate": 4.7806815721091385e-05, + "loss": 0.3648, + "step": 3241000 + }, + { + "epoch": 21.935226288436553, + "grad_norm": 0.35621947050094604, + "learning_rate": 4.780647737115635e-05, + "loss": 0.3657, + "step": 3241500 + }, + { + "epoch": 21.93860978778692, + "grad_norm": 0.3681904375553131, + "learning_rate": 4.780613902122131e-05, + "loss": 0.3647, + "step": 3242000 + }, + { + "epoch": 21.94199328713729, + "grad_norm": 0.37497857213020325, + "learning_rate": 4.780580067128627e-05, + "loss": 0.3643, + "step": 3242500 + }, + { + "epoch": 21.945376786487657, + "grad_norm": 0.41352739930152893, + "learning_rate": 4.780546232135123e-05, + "loss": 0.365, + "step": 3243000 + }, + { + "epoch": 21.948760285838024, + "grad_norm": 0.386015385389328, + "learning_rate": 4.78051239714162e-05, + "loss": 0.3654, + "step": 3243500 + }, + { + "epoch": 21.952143785188394, + "grad_norm": 0.3766820430755615, + "learning_rate": 4.7804785621481164e-05, + "loss": 0.3657, + "step": 3244000 + }, + { + "epoch": 21.95552728453876, + "grad_norm": 0.3699408769607544, + "learning_rate": 4.7804447271546126e-05, + "loss": 0.3641, + "step": 3244500 + }, + { + "epoch": 21.95891078388913, + "grad_norm": 0.3297472894191742, + "learning_rate": 4.780410892161109e-05, + "loss": 0.3647, + "step": 3245000 + }, + { + "epoch": 21.962294283239498, + "grad_norm": 0.36351293325424194, + "learning_rate": 4.780377057167606e-05, + "loss": 0.3644, + "step": 3245500 + }, + { + "epoch": 21.965677782589864, + "grad_norm": 0.3939778506755829, + "learning_rate": 4.780343222174102e-05, + "loss": 0.3661, + "step": 3246000 + }, + { + "epoch": 21.969061281940235, + "grad_norm": 0.4079965353012085, + "learning_rate": 4.7803093871805975e-05, + "loss": 0.3646, + "step": 3246500 + }, + { + "epoch": 21.9724447812906, + "grad_norm": 0.3721138536930084, + "learning_rate": 4.780275552187094e-05, + "loss": 0.3652, + "step": 3247000 + }, + { + "epoch": 21.97582828064097, + "grad_norm": 0.3701770603656769, + "learning_rate": 4.7802417171935906e-05, + "loss": 0.365, + "step": 3247500 + }, + { + "epoch": 21.97921177999134, + "grad_norm": 0.3543812334537506, + "learning_rate": 4.780207882200087e-05, + "loss": 0.3631, + "step": 3248000 + }, + { + "epoch": 21.982595279341705, + "grad_norm": 0.3594103157520294, + "learning_rate": 4.780174047206583e-05, + "loss": 0.3647, + "step": 3248500 + }, + { + "epoch": 21.985978778692076, + "grad_norm": 0.3717160224914551, + "learning_rate": 4.780140212213079e-05, + "loss": 0.3654, + "step": 3249000 + }, + { + "epoch": 21.989362278042442, + "grad_norm": 0.3792131543159485, + "learning_rate": 4.780106377219576e-05, + "loss": 0.3639, + "step": 3249500 + }, + { + "epoch": 21.992745777392813, + "grad_norm": 0.34902265667915344, + "learning_rate": 4.780072542226072e-05, + "loss": 0.3638, + "step": 3250000 + }, + { + "epoch": 21.99612927674318, + "grad_norm": 0.3875604569911957, + "learning_rate": 4.7800387072325685e-05, + "loss": 0.3651, + "step": 3250500 + }, + { + "epoch": 21.999512776093546, + "grad_norm": 0.3711358308792114, + "learning_rate": 4.780004872239065e-05, + "loss": 0.3648, + "step": 3251000 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.8610757404558269, + "eval_loss": 0.5639563798904419, + "eval_runtime": 3381.8763, + "eval_samples_per_second": 85.971, + "eval_steps_per_second": 5.373, + "step": 3251072 + }, + { + "epoch": 22.002896275443916, + "grad_norm": 0.36901411414146423, + "learning_rate": 4.779971037245561e-05, + "loss": 0.3639, + "step": 3251500 + }, + { + "epoch": 22.006279774794283, + "grad_norm": 0.37363478541374207, + "learning_rate": 4.779937202252057e-05, + "loss": 0.3617, + "step": 3252000 + }, + { + "epoch": 22.00966327414465, + "grad_norm": 0.35945314168930054, + "learning_rate": 4.7799033672585534e-05, + "loss": 0.3616, + "step": 3252500 + }, + { + "epoch": 22.01304677349502, + "grad_norm": 0.3384290337562561, + "learning_rate": 4.77986953226505e-05, + "loss": 0.3638, + "step": 3253000 + }, + { + "epoch": 22.016430272845387, + "grad_norm": 0.3904561698436737, + "learning_rate": 4.7798356972715465e-05, + "loss": 0.3633, + "step": 3253500 + }, + { + "epoch": 22.019813772195757, + "grad_norm": 0.35363084077835083, + "learning_rate": 4.779801862278043e-05, + "loss": 0.3627, + "step": 3254000 + }, + { + "epoch": 22.023197271546124, + "grad_norm": 0.3530249297618866, + "learning_rate": 4.779768027284539e-05, + "loss": 0.3629, + "step": 3254500 + }, + { + "epoch": 22.02658077089649, + "grad_norm": 0.3707101047039032, + "learning_rate": 4.779734192291036e-05, + "loss": 0.3631, + "step": 3255000 + }, + { + "epoch": 22.02996427024686, + "grad_norm": 0.33745312690734863, + "learning_rate": 4.779700357297532e-05, + "loss": 0.3618, + "step": 3255500 + }, + { + "epoch": 22.033347769597228, + "grad_norm": 0.37105587124824524, + "learning_rate": 4.7796665223040275e-05, + "loss": 0.3604, + "step": 3256000 + }, + { + "epoch": 22.036731268947598, + "grad_norm": 0.36006683111190796, + "learning_rate": 4.779632687310524e-05, + "loss": 0.3625, + "step": 3256500 + }, + { + "epoch": 22.040114768297965, + "grad_norm": 0.37026074528694153, + "learning_rate": 4.7795988523170206e-05, + "loss": 0.363, + "step": 3257000 + }, + { + "epoch": 22.04349826764833, + "grad_norm": 0.3176872432231903, + "learning_rate": 4.779565017323517e-05, + "loss": 0.364, + "step": 3257500 + }, + { + "epoch": 22.046881766998702, + "grad_norm": 0.37873584032058716, + "learning_rate": 4.779531182330013e-05, + "loss": 0.3629, + "step": 3258000 + }, + { + "epoch": 22.05026526634907, + "grad_norm": 0.4034285247325897, + "learning_rate": 4.779497347336509e-05, + "loss": 0.3618, + "step": 3258500 + }, + { + "epoch": 22.053648765699435, + "grad_norm": 0.35463711619377136, + "learning_rate": 4.779463512343006e-05, + "loss": 0.3619, + "step": 3259000 + }, + { + "epoch": 22.057032265049806, + "grad_norm": 0.39048513770103455, + "learning_rate": 4.7794296773495024e-05, + "loss": 0.3639, + "step": 3259500 + }, + { + "epoch": 22.060415764400172, + "grad_norm": 0.38548508286476135, + "learning_rate": 4.7793958423559986e-05, + "loss": 0.3633, + "step": 3260000 + }, + { + "epoch": 22.063799263750543, + "grad_norm": 0.38834986090660095, + "learning_rate": 4.779362007362495e-05, + "loss": 0.3642, + "step": 3260500 + }, + { + "epoch": 22.06718276310091, + "grad_norm": 0.31702256202697754, + "learning_rate": 4.779328172368991e-05, + "loss": 0.3637, + "step": 3261000 + }, + { + "epoch": 22.070566262451276, + "grad_norm": 0.35815784335136414, + "learning_rate": 4.779294337375487e-05, + "loss": 0.3634, + "step": 3261500 + }, + { + "epoch": 22.073949761801646, + "grad_norm": 0.34262552857398987, + "learning_rate": 4.7792605023819834e-05, + "loss": 0.3629, + "step": 3262000 + }, + { + "epoch": 22.077333261152013, + "grad_norm": 0.37976783514022827, + "learning_rate": 4.77922666738848e-05, + "loss": 0.3625, + "step": 3262500 + }, + { + "epoch": 22.080716760502384, + "grad_norm": 0.41490501165390015, + "learning_rate": 4.7791928323949765e-05, + "loss": 0.3637, + "step": 3263000 + }, + { + "epoch": 22.08410025985275, + "grad_norm": 0.3685232698917389, + "learning_rate": 4.779158997401473e-05, + "loss": 0.3636, + "step": 3263500 + }, + { + "epoch": 22.087483759203117, + "grad_norm": 0.39337560534477234, + "learning_rate": 4.779125162407969e-05, + "loss": 0.3641, + "step": 3264000 + }, + { + "epoch": 22.090867258553487, + "grad_norm": 0.3464198410511017, + "learning_rate": 4.779091327414466e-05, + "loss": 0.3634, + "step": 3264500 + }, + { + "epoch": 22.094250757903854, + "grad_norm": 0.35609114170074463, + "learning_rate": 4.779057492420962e-05, + "loss": 0.3652, + "step": 3265000 + }, + { + "epoch": 22.097634257254224, + "grad_norm": 0.3641035854816437, + "learning_rate": 4.779023657427458e-05, + "loss": 0.3636, + "step": 3265500 + }, + { + "epoch": 22.10101775660459, + "grad_norm": 0.38921040296554565, + "learning_rate": 4.778989822433954e-05, + "loss": 0.3639, + "step": 3266000 + }, + { + "epoch": 22.104401255954958, + "grad_norm": 0.35882940888404846, + "learning_rate": 4.778955987440451e-05, + "loss": 0.3612, + "step": 3266500 + }, + { + "epoch": 22.107784755305328, + "grad_norm": 0.36842402815818787, + "learning_rate": 4.778922152446947e-05, + "loss": 0.3623, + "step": 3267000 + }, + { + "epoch": 22.111168254655695, + "grad_norm": 0.34856799244880676, + "learning_rate": 4.778888317453443e-05, + "loss": 0.3641, + "step": 3267500 + }, + { + "epoch": 22.11455175400606, + "grad_norm": 0.36763256788253784, + "learning_rate": 4.7788544824599393e-05, + "loss": 0.3627, + "step": 3268000 + }, + { + "epoch": 22.117935253356432, + "grad_norm": 0.345851331949234, + "learning_rate": 4.778820647466436e-05, + "loss": 0.3632, + "step": 3268500 + }, + { + "epoch": 22.1213187527068, + "grad_norm": 0.3884712755680084, + "learning_rate": 4.7787868124729324e-05, + "loss": 0.3629, + "step": 3269000 + }, + { + "epoch": 22.12470225205717, + "grad_norm": 0.3752146065235138, + "learning_rate": 4.7787529774794287e-05, + "loss": 0.3627, + "step": 3269500 + }, + { + "epoch": 22.128085751407536, + "grad_norm": 0.3530506491661072, + "learning_rate": 4.778719142485925e-05, + "loss": 0.364, + "step": 3270000 + }, + { + "epoch": 22.131469250757903, + "grad_norm": 0.3271232843399048, + "learning_rate": 4.778685307492421e-05, + "loss": 0.3641, + "step": 3270500 + }, + { + "epoch": 22.134852750108273, + "grad_norm": 0.3788377046585083, + "learning_rate": 4.778651472498917e-05, + "loss": 0.3629, + "step": 3271000 + }, + { + "epoch": 22.13823624945864, + "grad_norm": 0.35951292514801025, + "learning_rate": 4.7786176375054135e-05, + "loss": 0.3628, + "step": 3271500 + }, + { + "epoch": 22.14161974880901, + "grad_norm": 0.38089001178741455, + "learning_rate": 4.7785838025119104e-05, + "loss": 0.3641, + "step": 3272000 + }, + { + "epoch": 22.145003248159377, + "grad_norm": 0.3626417815685272, + "learning_rate": 4.7785499675184066e-05, + "loss": 0.362, + "step": 3272500 + }, + { + "epoch": 22.148386747509743, + "grad_norm": 0.34376761317253113, + "learning_rate": 4.778516132524903e-05, + "loss": 0.3631, + "step": 3273000 + }, + { + "epoch": 22.151770246860114, + "grad_norm": 0.3726509213447571, + "learning_rate": 4.778482297531399e-05, + "loss": 0.3635, + "step": 3273500 + }, + { + "epoch": 22.15515374621048, + "grad_norm": 0.33812475204467773, + "learning_rate": 4.778448462537895e-05, + "loss": 0.3634, + "step": 3274000 + }, + { + "epoch": 22.158537245560847, + "grad_norm": 0.40661919116973877, + "learning_rate": 4.778414627544392e-05, + "loss": 0.3616, + "step": 3274500 + }, + { + "epoch": 22.161920744911217, + "grad_norm": 0.4099995493888855, + "learning_rate": 4.7783807925508883e-05, + "loss": 0.3629, + "step": 3275000 + }, + { + "epoch": 22.165304244261584, + "grad_norm": 0.4026210904121399, + "learning_rate": 4.778346957557384e-05, + "loss": 0.3651, + "step": 3275500 + }, + { + "epoch": 22.168687743611954, + "grad_norm": 0.3911675214767456, + "learning_rate": 4.778313122563881e-05, + "loss": 0.3631, + "step": 3276000 + }, + { + "epoch": 22.17207124296232, + "grad_norm": 0.3900057375431061, + "learning_rate": 4.778279287570377e-05, + "loss": 0.3624, + "step": 3276500 + }, + { + "epoch": 22.175454742312688, + "grad_norm": 0.4155902862548828, + "learning_rate": 4.778245452576873e-05, + "loss": 0.3648, + "step": 3277000 + }, + { + "epoch": 22.17883824166306, + "grad_norm": 0.3615865111351013, + "learning_rate": 4.7782116175833694e-05, + "loss": 0.3642, + "step": 3277500 + }, + { + "epoch": 22.182221741013425, + "grad_norm": 0.3743878901004791, + "learning_rate": 4.778177782589866e-05, + "loss": 0.3625, + "step": 3278000 + }, + { + "epoch": 22.185605240363795, + "grad_norm": 0.3730500638484955, + "learning_rate": 4.7781439475963625e-05, + "loss": 0.3641, + "step": 3278500 + }, + { + "epoch": 22.188988739714162, + "grad_norm": 0.3749823272228241, + "learning_rate": 4.778110112602859e-05, + "loss": 0.3638, + "step": 3279000 + }, + { + "epoch": 22.19237223906453, + "grad_norm": 0.378461092710495, + "learning_rate": 4.778076277609355e-05, + "loss": 0.3625, + "step": 3279500 + }, + { + "epoch": 22.1957557384149, + "grad_norm": 0.3822938799858093, + "learning_rate": 4.778042442615851e-05, + "loss": 0.3627, + "step": 3280000 + }, + { + "epoch": 22.199139237765266, + "grad_norm": 0.3303757309913635, + "learning_rate": 4.7780086076223474e-05, + "loss": 0.3636, + "step": 3280500 + }, + { + "epoch": 22.202522737115636, + "grad_norm": 0.32777705788612366, + "learning_rate": 4.7779747726288436e-05, + "loss": 0.3649, + "step": 3281000 + }, + { + "epoch": 22.205906236466003, + "grad_norm": 0.3365825414657593, + "learning_rate": 4.77794093763534e-05, + "loss": 0.3657, + "step": 3281500 + }, + { + "epoch": 22.20928973581637, + "grad_norm": 0.3661050796508789, + "learning_rate": 4.777907102641837e-05, + "loss": 0.3627, + "step": 3282000 + }, + { + "epoch": 22.21267323516674, + "grad_norm": 0.3733326494693756, + "learning_rate": 4.777873267648333e-05, + "loss": 0.3626, + "step": 3282500 + }, + { + "epoch": 22.216056734517107, + "grad_norm": 0.3773648738861084, + "learning_rate": 4.777839432654829e-05, + "loss": 0.3632, + "step": 3283000 + }, + { + "epoch": 22.219440233867473, + "grad_norm": 0.38698533177375793, + "learning_rate": 4.777805597661325e-05, + "loss": 0.3638, + "step": 3283500 + }, + { + "epoch": 22.222823733217844, + "grad_norm": 0.357965350151062, + "learning_rate": 4.777771762667822e-05, + "loss": 0.3635, + "step": 3284000 + }, + { + "epoch": 22.22620723256821, + "grad_norm": 0.36651939153671265, + "learning_rate": 4.7777379276743184e-05, + "loss": 0.3635, + "step": 3284500 + }, + { + "epoch": 22.22959073191858, + "grad_norm": 0.40140554308891296, + "learning_rate": 4.777704092680814e-05, + "loss": 0.3626, + "step": 3285000 + }, + { + "epoch": 22.232974231268948, + "grad_norm": 0.3868422508239746, + "learning_rate": 4.777670257687311e-05, + "loss": 0.364, + "step": 3285500 + }, + { + "epoch": 22.236357730619314, + "grad_norm": 0.4074952304363251, + "learning_rate": 4.777636422693807e-05, + "loss": 0.3644, + "step": 3286000 + }, + { + "epoch": 22.239741229969685, + "grad_norm": 0.32810431718826294, + "learning_rate": 4.777602587700303e-05, + "loss": 0.3627, + "step": 3286500 + }, + { + "epoch": 22.24312472932005, + "grad_norm": 0.3723648488521576, + "learning_rate": 4.7775687527067995e-05, + "loss": 0.3641, + "step": 3287000 + }, + { + "epoch": 22.24650822867042, + "grad_norm": 0.41280627250671387, + "learning_rate": 4.7775349177132964e-05, + "loss": 0.3637, + "step": 3287500 + }, + { + "epoch": 22.24989172802079, + "grad_norm": 0.37163811922073364, + "learning_rate": 4.7775010827197926e-05, + "loss": 0.3646, + "step": 3288000 + }, + { + "epoch": 22.253275227371155, + "grad_norm": 0.36624860763549805, + "learning_rate": 4.777467247726289e-05, + "loss": 0.3624, + "step": 3288500 + }, + { + "epoch": 22.256658726721525, + "grad_norm": 0.36479902267456055, + "learning_rate": 4.777433412732785e-05, + "loss": 0.3638, + "step": 3289000 + }, + { + "epoch": 22.260042226071892, + "grad_norm": 0.3627351224422455, + "learning_rate": 4.777399577739281e-05, + "loss": 0.3631, + "step": 3289500 + }, + { + "epoch": 22.26342572542226, + "grad_norm": 0.3495413362979889, + "learning_rate": 4.7773657427457774e-05, + "loss": 0.3633, + "step": 3290000 + }, + { + "epoch": 22.26680922477263, + "grad_norm": 0.40336576104164124, + "learning_rate": 4.7773319077522736e-05, + "loss": 0.3621, + "step": 3290500 + }, + { + "epoch": 22.270192724122996, + "grad_norm": 0.3687015175819397, + "learning_rate": 4.77729807275877e-05, + "loss": 0.3638, + "step": 3291000 + }, + { + "epoch": 22.273576223473366, + "grad_norm": 0.43421855568885803, + "learning_rate": 4.777264237765267e-05, + "loss": 0.3616, + "step": 3291500 + }, + { + "epoch": 22.276959722823733, + "grad_norm": 0.37915298342704773, + "learning_rate": 4.777230402771763e-05, + "loss": 0.3647, + "step": 3292000 + }, + { + "epoch": 22.2803432221741, + "grad_norm": 0.3645276725292206, + "learning_rate": 4.777196567778259e-05, + "loss": 0.3623, + "step": 3292500 + }, + { + "epoch": 22.28372672152447, + "grad_norm": 0.36133942008018494, + "learning_rate": 4.7771627327847554e-05, + "loss": 0.3634, + "step": 3293000 + }, + { + "epoch": 22.287110220874837, + "grad_norm": 0.366842120885849, + "learning_rate": 4.777128897791252e-05, + "loss": 0.3637, + "step": 3293500 + }, + { + "epoch": 22.290493720225207, + "grad_norm": 0.3590872883796692, + "learning_rate": 4.7770950627977485e-05, + "loss": 0.3658, + "step": 3294000 + }, + { + "epoch": 22.293877219575574, + "grad_norm": 0.40072405338287354, + "learning_rate": 4.777061227804244e-05, + "loss": 0.3639, + "step": 3294500 + }, + { + "epoch": 22.29726071892594, + "grad_norm": 0.3492056429386139, + "learning_rate": 4.777027392810741e-05, + "loss": 0.3642, + "step": 3295000 + }, + { + "epoch": 22.30064421827631, + "grad_norm": 0.3781352639198303, + "learning_rate": 4.776993557817237e-05, + "loss": 0.3644, + "step": 3295500 + }, + { + "epoch": 22.304027717626678, + "grad_norm": 0.3554432988166809, + "learning_rate": 4.776959722823733e-05, + "loss": 0.3637, + "step": 3296000 + }, + { + "epoch": 22.307411216977048, + "grad_norm": 0.4013362526893616, + "learning_rate": 4.7769258878302295e-05, + "loss": 0.365, + "step": 3296500 + }, + { + "epoch": 22.310794716327415, + "grad_norm": 0.37104544043540955, + "learning_rate": 4.7768920528367264e-05, + "loss": 0.3642, + "step": 3297000 + }, + { + "epoch": 22.31417821567778, + "grad_norm": 0.37026894092559814, + "learning_rate": 4.7768582178432226e-05, + "loss": 0.3638, + "step": 3297500 + }, + { + "epoch": 22.317561715028152, + "grad_norm": 0.39977866411209106, + "learning_rate": 4.776824382849719e-05, + "loss": 0.3634, + "step": 3298000 + }, + { + "epoch": 22.32094521437852, + "grad_norm": 0.33193448185920715, + "learning_rate": 4.776790547856215e-05, + "loss": 0.3612, + "step": 3298500 + }, + { + "epoch": 22.324328713728885, + "grad_norm": 0.3862043023109436, + "learning_rate": 4.776756712862711e-05, + "loss": 0.3645, + "step": 3299000 + }, + { + "epoch": 22.327712213079256, + "grad_norm": 0.35771092772483826, + "learning_rate": 4.7767228778692075e-05, + "loss": 0.3637, + "step": 3299500 + }, + { + "epoch": 22.331095712429622, + "grad_norm": 0.39172911643981934, + "learning_rate": 4.776689042875704e-05, + "loss": 0.3646, + "step": 3300000 + }, + { + "epoch": 22.334479211779993, + "grad_norm": 0.3653565049171448, + "learning_rate": 4.7766552078822e-05, + "loss": 0.3642, + "step": 3300500 + }, + { + "epoch": 22.33786271113036, + "grad_norm": 0.4046486020088196, + "learning_rate": 4.776621372888697e-05, + "loss": 0.3632, + "step": 3301000 + }, + { + "epoch": 22.341246210480726, + "grad_norm": 0.381191611289978, + "learning_rate": 4.776587537895193e-05, + "loss": 0.362, + "step": 3301500 + }, + { + "epoch": 22.344629709831096, + "grad_norm": 0.3809015452861786, + "learning_rate": 4.776553702901689e-05, + "loss": 0.3628, + "step": 3302000 + }, + { + "epoch": 22.348013209181463, + "grad_norm": 0.42356231808662415, + "learning_rate": 4.7765198679081854e-05, + "loss": 0.3636, + "step": 3302500 + }, + { + "epoch": 22.351396708531833, + "grad_norm": 0.38534021377563477, + "learning_rate": 4.776486032914682e-05, + "loss": 0.3642, + "step": 3303000 + }, + { + "epoch": 22.3547802078822, + "grad_norm": 0.3867948353290558, + "learning_rate": 4.7764521979211785e-05, + "loss": 0.3636, + "step": 3303500 + }, + { + "epoch": 22.358163707232567, + "grad_norm": 0.38022705912590027, + "learning_rate": 4.776418362927674e-05, + "loss": 0.3641, + "step": 3304000 + }, + { + "epoch": 22.361547206582937, + "grad_norm": 0.38373327255249023, + "learning_rate": 4.776384527934171e-05, + "loss": 0.3633, + "step": 3304500 + }, + { + "epoch": 22.364930705933304, + "grad_norm": 0.33922332525253296, + "learning_rate": 4.776350692940667e-05, + "loss": 0.3626, + "step": 3305000 + }, + { + "epoch": 22.368314205283674, + "grad_norm": 0.39301684498786926, + "learning_rate": 4.7763168579471634e-05, + "loss": 0.3645, + "step": 3305500 + }, + { + "epoch": 22.37169770463404, + "grad_norm": 0.3709513247013092, + "learning_rate": 4.7762830229536596e-05, + "loss": 0.3639, + "step": 3306000 + }, + { + "epoch": 22.375081203984408, + "grad_norm": 0.3477535843849182, + "learning_rate": 4.7762491879601565e-05, + "loss": 0.3636, + "step": 3306500 + }, + { + "epoch": 22.378464703334778, + "grad_norm": 0.33180704712867737, + "learning_rate": 4.776215352966653e-05, + "loss": 0.3647, + "step": 3307000 + }, + { + "epoch": 22.381848202685145, + "grad_norm": 0.34438008069992065, + "learning_rate": 4.776181517973149e-05, + "loss": 0.3644, + "step": 3307500 + }, + { + "epoch": 22.38523170203551, + "grad_norm": 0.38011258840560913, + "learning_rate": 4.776147682979645e-05, + "loss": 0.3634, + "step": 3308000 + }, + { + "epoch": 22.388615201385882, + "grad_norm": 0.3358522355556488, + "learning_rate": 4.776113847986141e-05, + "loss": 0.3642, + "step": 3308500 + }, + { + "epoch": 22.39199870073625, + "grad_norm": 0.38324040174484253, + "learning_rate": 4.7760800129926375e-05, + "loss": 0.3635, + "step": 3309000 + }, + { + "epoch": 22.39538220008662, + "grad_norm": 0.3560822904109955, + "learning_rate": 4.776046177999134e-05, + "loss": 0.3654, + "step": 3309500 + }, + { + "epoch": 22.398765699436986, + "grad_norm": 0.39798104763031006, + "learning_rate": 4.77601234300563e-05, + "loss": 0.3626, + "step": 3310000 + }, + { + "epoch": 22.402149198787352, + "grad_norm": 0.3754647970199585, + "learning_rate": 4.775978508012127e-05, + "loss": 0.3648, + "step": 3310500 + }, + { + "epoch": 22.405532698137723, + "grad_norm": 0.3751375675201416, + "learning_rate": 4.775944673018623e-05, + "loss": 0.3624, + "step": 3311000 + }, + { + "epoch": 22.40891619748809, + "grad_norm": 0.36246442794799805, + "learning_rate": 4.775910838025119e-05, + "loss": 0.3642, + "step": 3311500 + }, + { + "epoch": 22.41229969683846, + "grad_norm": 0.38623011112213135, + "learning_rate": 4.7758770030316155e-05, + "loss": 0.3638, + "step": 3312000 + }, + { + "epoch": 22.415683196188827, + "grad_norm": 0.38859832286834717, + "learning_rate": 4.7758431680381124e-05, + "loss": 0.3633, + "step": 3312500 + }, + { + "epoch": 22.419066695539193, + "grad_norm": 0.36892005801200867, + "learning_rate": 4.7758093330446086e-05, + "loss": 0.3625, + "step": 3313000 + }, + { + "epoch": 22.422450194889564, + "grad_norm": 0.3597000539302826, + "learning_rate": 4.775775498051104e-05, + "loss": 0.3668, + "step": 3313500 + }, + { + "epoch": 22.42583369423993, + "grad_norm": 0.32845205068588257, + "learning_rate": 4.775741663057601e-05, + "loss": 0.3645, + "step": 3314000 + }, + { + "epoch": 22.429217193590297, + "grad_norm": 0.3746047019958496, + "learning_rate": 4.775707828064097e-05, + "loss": 0.3642, + "step": 3314500 + }, + { + "epoch": 22.432600692940667, + "grad_norm": 0.38173362612724304, + "learning_rate": 4.7756739930705935e-05, + "loss": 0.3638, + "step": 3315000 + }, + { + "epoch": 22.435984192291034, + "grad_norm": 0.3481561243534088, + "learning_rate": 4.77564015807709e-05, + "loss": 0.3628, + "step": 3315500 + }, + { + "epoch": 22.439367691641404, + "grad_norm": 0.3738810122013092, + "learning_rate": 4.7756063230835866e-05, + "loss": 0.3627, + "step": 3316000 + }, + { + "epoch": 22.44275119099177, + "grad_norm": 0.35183948278427124, + "learning_rate": 4.775572488090083e-05, + "loss": 0.3647, + "step": 3316500 + }, + { + "epoch": 22.446134690342138, + "grad_norm": 0.33670639991760254, + "learning_rate": 4.775538653096579e-05, + "loss": 0.3643, + "step": 3317000 + }, + { + "epoch": 22.44951818969251, + "grad_norm": 0.36027422547340393, + "learning_rate": 4.775504818103075e-05, + "loss": 0.3632, + "step": 3317500 + }, + { + "epoch": 22.452901689042875, + "grad_norm": 0.3198143541812897, + "learning_rate": 4.7754709831095714e-05, + "loss": 0.3624, + "step": 3318000 + }, + { + "epoch": 22.456285188393245, + "grad_norm": 0.3682264983654022, + "learning_rate": 4.7754371481160676e-05, + "loss": 0.3648, + "step": 3318500 + }, + { + "epoch": 22.459668687743612, + "grad_norm": 0.3861812353134155, + "learning_rate": 4.775403313122564e-05, + "loss": 0.3649, + "step": 3319000 + }, + { + "epoch": 22.46305218709398, + "grad_norm": 0.4011298418045044, + "learning_rate": 4.77536947812906e-05, + "loss": 0.3647, + "step": 3319500 + }, + { + "epoch": 22.46643568644435, + "grad_norm": 0.3753091096878052, + "learning_rate": 4.775335643135557e-05, + "loss": 0.3647, + "step": 3320000 + }, + { + "epoch": 22.469819185794716, + "grad_norm": 0.3396071791648865, + "learning_rate": 4.775301808142053e-05, + "loss": 0.3654, + "step": 3320500 + }, + { + "epoch": 22.473202685145086, + "grad_norm": 0.3864244818687439, + "learning_rate": 4.7752679731485494e-05, + "loss": 0.3634, + "step": 3321000 + }, + { + "epoch": 22.476586184495453, + "grad_norm": 0.34095582365989685, + "learning_rate": 4.7752341381550456e-05, + "loss": 0.3624, + "step": 3321500 + }, + { + "epoch": 22.47996968384582, + "grad_norm": 0.3696548640727997, + "learning_rate": 4.7752003031615425e-05, + "loss": 0.3648, + "step": 3322000 + }, + { + "epoch": 22.48335318319619, + "grad_norm": 0.35707706212997437, + "learning_rate": 4.775166468168039e-05, + "loss": 0.3626, + "step": 3322500 + }, + { + "epoch": 22.486736682546557, + "grad_norm": 0.3745119273662567, + "learning_rate": 4.775132633174534e-05, + "loss": 0.3648, + "step": 3323000 + }, + { + "epoch": 22.490120181896923, + "grad_norm": 0.3682263493537903, + "learning_rate": 4.775098798181031e-05, + "loss": 0.3621, + "step": 3323500 + }, + { + "epoch": 22.493503681247294, + "grad_norm": 0.38439232110977173, + "learning_rate": 4.775064963187527e-05, + "loss": 0.3629, + "step": 3324000 + }, + { + "epoch": 22.49688718059766, + "grad_norm": 0.39407601952552795, + "learning_rate": 4.7750311281940235e-05, + "loss": 0.3632, + "step": 3324500 + }, + { + "epoch": 22.50027067994803, + "grad_norm": 0.35311228036880493, + "learning_rate": 4.77499729320052e-05, + "loss": 0.3638, + "step": 3325000 + }, + { + "epoch": 22.503654179298398, + "grad_norm": 0.3494729995727539, + "learning_rate": 4.7749634582070166e-05, + "loss": 0.3637, + "step": 3325500 + }, + { + "epoch": 22.507037678648764, + "grad_norm": 0.335531622171402, + "learning_rate": 4.774929623213513e-05, + "loss": 0.3643, + "step": 3326000 + }, + { + "epoch": 22.510421177999135, + "grad_norm": 0.37210074067115784, + "learning_rate": 4.774895788220009e-05, + "loss": 0.3636, + "step": 3326500 + }, + { + "epoch": 22.5138046773495, + "grad_norm": 0.3789541721343994, + "learning_rate": 4.774861953226505e-05, + "loss": 0.3636, + "step": 3327000 + }, + { + "epoch": 22.51718817669987, + "grad_norm": 0.32330092787742615, + "learning_rate": 4.7748281182330015e-05, + "loss": 0.3636, + "step": 3327500 + }, + { + "epoch": 22.52057167605024, + "grad_norm": 0.38846564292907715, + "learning_rate": 4.774794283239498e-05, + "loss": 0.3647, + "step": 3328000 + }, + { + "epoch": 22.523955175400605, + "grad_norm": 0.3626689910888672, + "learning_rate": 4.774760448245994e-05, + "loss": 0.3637, + "step": 3328500 + }, + { + "epoch": 22.527338674750975, + "grad_norm": 0.3889565169811249, + "learning_rate": 4.77472661325249e-05, + "loss": 0.3629, + "step": 3329000 + }, + { + "epoch": 22.530722174101342, + "grad_norm": 0.35343024134635925, + "learning_rate": 4.774692778258987e-05, + "loss": 0.3635, + "step": 3329500 + }, + { + "epoch": 22.534105673451712, + "grad_norm": 0.3859838843345642, + "learning_rate": 4.774658943265483e-05, + "loss": 0.3641, + "step": 3330000 + }, + { + "epoch": 22.53748917280208, + "grad_norm": 0.38369202613830566, + "learning_rate": 4.7746251082719794e-05, + "loss": 0.3636, + "step": 3330500 + }, + { + "epoch": 22.540872672152446, + "grad_norm": 0.34914785623550415, + "learning_rate": 4.7745912732784756e-05, + "loss": 0.3624, + "step": 3331000 + }, + { + "epoch": 22.544256171502816, + "grad_norm": 0.41669896245002747, + "learning_rate": 4.7745574382849725e-05, + "loss": 0.3642, + "step": 3331500 + }, + { + "epoch": 22.547639670853183, + "grad_norm": 0.3618820309638977, + "learning_rate": 4.774523603291469e-05, + "loss": 0.364, + "step": 3332000 + }, + { + "epoch": 22.55102317020355, + "grad_norm": 0.43509700894355774, + "learning_rate": 4.774489768297964e-05, + "loss": 0.3644, + "step": 3332500 + }, + { + "epoch": 22.55440666955392, + "grad_norm": 0.37723085284233093, + "learning_rate": 4.774455933304461e-05, + "loss": 0.3643, + "step": 3333000 + }, + { + "epoch": 22.557790168904287, + "grad_norm": 0.3719700872898102, + "learning_rate": 4.7744220983109574e-05, + "loss": 0.3652, + "step": 3333500 + }, + { + "epoch": 22.561173668254657, + "grad_norm": 0.351824015378952, + "learning_rate": 4.7743882633174536e-05, + "loss": 0.3631, + "step": 3334000 + }, + { + "epoch": 22.564557167605024, + "grad_norm": 0.3875691294670105, + "learning_rate": 4.77435442832395e-05, + "loss": 0.3644, + "step": 3334500 + }, + { + "epoch": 22.56794066695539, + "grad_norm": 0.36765944957733154, + "learning_rate": 4.774320593330447e-05, + "loss": 0.3638, + "step": 3335000 + }, + { + "epoch": 22.57132416630576, + "grad_norm": 0.3525508642196655, + "learning_rate": 4.774286758336943e-05, + "loss": 0.3647, + "step": 3335500 + }, + { + "epoch": 22.574707665656128, + "grad_norm": 0.37704595923423767, + "learning_rate": 4.774252923343439e-05, + "loss": 0.3642, + "step": 3336000 + }, + { + "epoch": 22.578091165006498, + "grad_norm": 0.3496682345867157, + "learning_rate": 4.774219088349935e-05, + "loss": 0.3638, + "step": 3336500 + }, + { + "epoch": 22.581474664356865, + "grad_norm": 0.3411332964897156, + "learning_rate": 4.7741852533564315e-05, + "loss": 0.3641, + "step": 3337000 + }, + { + "epoch": 22.58485816370723, + "grad_norm": 0.3448581099510193, + "learning_rate": 4.774151418362928e-05, + "loss": 0.3637, + "step": 3337500 + }, + { + "epoch": 22.5882416630576, + "grad_norm": 0.33924153447151184, + "learning_rate": 4.774117583369424e-05, + "loss": 0.3642, + "step": 3338000 + }, + { + "epoch": 22.59162516240797, + "grad_norm": 0.3455522358417511, + "learning_rate": 4.77408374837592e-05, + "loss": 0.364, + "step": 3338500 + }, + { + "epoch": 22.595008661758335, + "grad_norm": 0.42671269178390503, + "learning_rate": 4.774049913382417e-05, + "loss": 0.3644, + "step": 3339000 + }, + { + "epoch": 22.598392161108706, + "grad_norm": 0.32911738753318787, + "learning_rate": 4.774016078388913e-05, + "loss": 0.364, + "step": 3339500 + }, + { + "epoch": 22.601775660459072, + "grad_norm": 0.3579391837120056, + "learning_rate": 4.7739822433954095e-05, + "loss": 0.3634, + "step": 3340000 + }, + { + "epoch": 22.605159159809443, + "grad_norm": 0.34895071387290955, + "learning_rate": 4.773948408401906e-05, + "loss": 0.3652, + "step": 3340500 + }, + { + "epoch": 22.60854265915981, + "grad_norm": 0.3935413956642151, + "learning_rate": 4.7739145734084026e-05, + "loss": 0.363, + "step": 3341000 + }, + { + "epoch": 22.611926158510176, + "grad_norm": 0.3797558844089508, + "learning_rate": 4.773880738414899e-05, + "loss": 0.364, + "step": 3341500 + }, + { + "epoch": 22.615309657860546, + "grad_norm": 0.35469311475753784, + "learning_rate": 4.773846903421394e-05, + "loss": 0.3638, + "step": 3342000 + }, + { + "epoch": 22.618693157210913, + "grad_norm": 0.36201012134552, + "learning_rate": 4.773813068427891e-05, + "loss": 0.3634, + "step": 3342500 + }, + { + "epoch": 22.622076656561283, + "grad_norm": 0.3611513078212738, + "learning_rate": 4.7737792334343874e-05, + "loss": 0.3654, + "step": 3343000 + }, + { + "epoch": 22.62546015591165, + "grad_norm": 0.4306865930557251, + "learning_rate": 4.7737453984408836e-05, + "loss": 0.3633, + "step": 3343500 + }, + { + "epoch": 22.628843655262017, + "grad_norm": 0.3794310390949249, + "learning_rate": 4.77371156344738e-05, + "loss": 0.3634, + "step": 3344000 + }, + { + "epoch": 22.632227154612387, + "grad_norm": 0.3547627031803131, + "learning_rate": 4.773677728453876e-05, + "loss": 0.3635, + "step": 3344500 + }, + { + "epoch": 22.635610653962754, + "grad_norm": 0.36016029119491577, + "learning_rate": 4.773643893460373e-05, + "loss": 0.3646, + "step": 3345000 + }, + { + "epoch": 22.638994153313124, + "grad_norm": 0.3823973536491394, + "learning_rate": 4.773610058466869e-05, + "loss": 0.3645, + "step": 3345500 + }, + { + "epoch": 22.64237765266349, + "grad_norm": 0.3654640316963196, + "learning_rate": 4.7735762234733654e-05, + "loss": 0.3638, + "step": 3346000 + }, + { + "epoch": 22.645761152013858, + "grad_norm": 0.3841468393802643, + "learning_rate": 4.7735423884798616e-05, + "loss": 0.3647, + "step": 3346500 + }, + { + "epoch": 22.649144651364228, + "grad_norm": 0.3642100691795349, + "learning_rate": 4.773508553486358e-05, + "loss": 0.3642, + "step": 3347000 + }, + { + "epoch": 22.652528150714595, + "grad_norm": 0.382438600063324, + "learning_rate": 4.773474718492854e-05, + "loss": 0.3639, + "step": 3347500 + }, + { + "epoch": 22.65591165006496, + "grad_norm": 0.35496222972869873, + "learning_rate": 4.77344088349935e-05, + "loss": 0.3635, + "step": 3348000 + }, + { + "epoch": 22.659295149415332, + "grad_norm": 0.3788909316062927, + "learning_rate": 4.773407048505847e-05, + "loss": 0.3624, + "step": 3348500 + }, + { + "epoch": 22.6626786487657, + "grad_norm": 0.3825910687446594, + "learning_rate": 4.773373213512343e-05, + "loss": 0.365, + "step": 3349000 + }, + { + "epoch": 22.66606214811607, + "grad_norm": 0.3525485396385193, + "learning_rate": 4.7733393785188395e-05, + "loss": 0.3657, + "step": 3349500 + }, + { + "epoch": 22.669445647466436, + "grad_norm": 0.36212408542633057, + "learning_rate": 4.773305543525336e-05, + "loss": 0.3631, + "step": 3350000 + }, + { + "epoch": 22.672829146816802, + "grad_norm": 0.34431231021881104, + "learning_rate": 4.7732717085318326e-05, + "loss": 0.3629, + "step": 3350500 + }, + { + "epoch": 22.676212646167173, + "grad_norm": 0.36336973309516907, + "learning_rate": 4.773237873538329e-05, + "loss": 0.3638, + "step": 3351000 + }, + { + "epoch": 22.67959614551754, + "grad_norm": 0.4125959575176239, + "learning_rate": 4.7732040385448244e-05, + "loss": 0.3649, + "step": 3351500 + }, + { + "epoch": 22.68297964486791, + "grad_norm": 0.3445337116718292, + "learning_rate": 4.7731702035513206e-05, + "loss": 0.3642, + "step": 3352000 + }, + { + "epoch": 22.686363144218276, + "grad_norm": 0.366926372051239, + "learning_rate": 4.7731363685578175e-05, + "loss": 0.3624, + "step": 3352500 + }, + { + "epoch": 22.689746643568643, + "grad_norm": 0.3692866861820221, + "learning_rate": 4.773102533564314e-05, + "loss": 0.3657, + "step": 3353000 + }, + { + "epoch": 22.693130142919014, + "grad_norm": 0.32625988125801086, + "learning_rate": 4.77306869857081e-05, + "loss": 0.3647, + "step": 3353500 + }, + { + "epoch": 22.69651364226938, + "grad_norm": 0.33408379554748535, + "learning_rate": 4.773034863577306e-05, + "loss": 0.363, + "step": 3354000 + }, + { + "epoch": 22.69989714161975, + "grad_norm": 0.39026281237602234, + "learning_rate": 4.773001028583803e-05, + "loss": 0.3642, + "step": 3354500 + }, + { + "epoch": 22.703280640970117, + "grad_norm": 0.3677361309528351, + "learning_rate": 4.772967193590299e-05, + "loss": 0.3653, + "step": 3355000 + }, + { + "epoch": 22.706664140320484, + "grad_norm": 0.3817291855812073, + "learning_rate": 4.7729333585967954e-05, + "loss": 0.3637, + "step": 3355500 + }, + { + "epoch": 22.710047639670854, + "grad_norm": 0.3769967257976532, + "learning_rate": 4.7728995236032917e-05, + "loss": 0.3641, + "step": 3356000 + }, + { + "epoch": 22.71343113902122, + "grad_norm": 0.36510559916496277, + "learning_rate": 4.772865688609788e-05, + "loss": 0.363, + "step": 3356500 + }, + { + "epoch": 22.716814638371588, + "grad_norm": 0.3357907235622406, + "learning_rate": 4.772831853616284e-05, + "loss": 0.3638, + "step": 3357000 + }, + { + "epoch": 22.720198137721958, + "grad_norm": 0.3899648189544678, + "learning_rate": 4.77279801862278e-05, + "loss": 0.3638, + "step": 3357500 + }, + { + "epoch": 22.723581637072325, + "grad_norm": 0.3724825084209442, + "learning_rate": 4.772764183629277e-05, + "loss": 0.3628, + "step": 3358000 + }, + { + "epoch": 22.726965136422695, + "grad_norm": 0.3447500467300415, + "learning_rate": 4.7727303486357734e-05, + "loss": 0.3642, + "step": 3358500 + }, + { + "epoch": 22.730348635773062, + "grad_norm": 0.3528880178928375, + "learning_rate": 4.7726965136422696e-05, + "loss": 0.364, + "step": 3359000 + }, + { + "epoch": 22.73373213512343, + "grad_norm": 0.3487587571144104, + "learning_rate": 4.772662678648766e-05, + "loss": 0.3636, + "step": 3359500 + }, + { + "epoch": 22.7371156344738, + "grad_norm": 0.3664812445640564, + "learning_rate": 4.772628843655263e-05, + "loss": 0.3642, + "step": 3360000 + }, + { + "epoch": 22.740499133824166, + "grad_norm": 0.3146126866340637, + "learning_rate": 4.772595008661759e-05, + "loss": 0.3632, + "step": 3360500 + }, + { + "epoch": 22.743882633174536, + "grad_norm": 0.35260069370269775, + "learning_rate": 4.7725611736682545e-05, + "loss": 0.3657, + "step": 3361000 + }, + { + "epoch": 22.747266132524903, + "grad_norm": 0.36339813470840454, + "learning_rate": 4.772527338674751e-05, + "loss": 0.3643, + "step": 3361500 + }, + { + "epoch": 22.75064963187527, + "grad_norm": 0.3912361264228821, + "learning_rate": 4.7724935036812476e-05, + "loss": 0.3628, + "step": 3362000 + }, + { + "epoch": 22.75403313122564, + "grad_norm": 0.34351855516433716, + "learning_rate": 4.772459668687744e-05, + "loss": 0.3644, + "step": 3362500 + }, + { + "epoch": 22.757416630576007, + "grad_norm": 0.41395920515060425, + "learning_rate": 4.77242583369424e-05, + "loss": 0.3644, + "step": 3363000 + }, + { + "epoch": 22.760800129926373, + "grad_norm": 0.3602873384952545, + "learning_rate": 4.772391998700736e-05, + "loss": 0.3648, + "step": 3363500 + }, + { + "epoch": 22.764183629276744, + "grad_norm": 0.36354541778564453, + "learning_rate": 4.772358163707233e-05, + "loss": 0.3626, + "step": 3364000 + }, + { + "epoch": 22.76756712862711, + "grad_norm": 0.37120670080184937, + "learning_rate": 4.772324328713729e-05, + "loss": 0.3644, + "step": 3364500 + }, + { + "epoch": 22.77095062797748, + "grad_norm": 0.33839091658592224, + "learning_rate": 4.7722904937202255e-05, + "loss": 0.3636, + "step": 3365000 + }, + { + "epoch": 22.774334127327847, + "grad_norm": 0.3728088140487671, + "learning_rate": 4.772256658726722e-05, + "loss": 0.3648, + "step": 3365500 + }, + { + "epoch": 22.777717626678214, + "grad_norm": 0.35419514775276184, + "learning_rate": 4.772222823733218e-05, + "loss": 0.3635, + "step": 3366000 + }, + { + "epoch": 22.781101126028585, + "grad_norm": 0.3702925741672516, + "learning_rate": 4.772188988739714e-05, + "loss": 0.3637, + "step": 3366500 + }, + { + "epoch": 22.78448462537895, + "grad_norm": 0.371176540851593, + "learning_rate": 4.7721551537462104e-05, + "loss": 0.3641, + "step": 3367000 + }, + { + "epoch": 22.78786812472932, + "grad_norm": 0.3687214255332947, + "learning_rate": 4.772121318752707e-05, + "loss": 0.3654, + "step": 3367500 + }, + { + "epoch": 22.79125162407969, + "grad_norm": 0.3298780024051666, + "learning_rate": 4.7720874837592035e-05, + "loss": 0.3641, + "step": 3368000 + }, + { + "epoch": 22.794635123430055, + "grad_norm": 0.35434913635253906, + "learning_rate": 4.7720536487657e-05, + "loss": 0.3653, + "step": 3368500 + }, + { + "epoch": 22.798018622780425, + "grad_norm": 0.34204936027526855, + "learning_rate": 4.772019813772196e-05, + "loss": 0.3652, + "step": 3369000 + }, + { + "epoch": 22.801402122130792, + "grad_norm": 0.3661845624446869, + "learning_rate": 4.771985978778693e-05, + "loss": 0.364, + "step": 3369500 + }, + { + "epoch": 22.804785621481162, + "grad_norm": 0.3500819206237793, + "learning_rate": 4.771952143785189e-05, + "loss": 0.3651, + "step": 3370000 + }, + { + "epoch": 22.80816912083153, + "grad_norm": 0.4077375829219818, + "learning_rate": 4.7719183087916845e-05, + "loss": 0.3646, + "step": 3370500 + }, + { + "epoch": 22.811552620181896, + "grad_norm": 0.38122543692588806, + "learning_rate": 4.771884473798181e-05, + "loss": 0.3634, + "step": 3371000 + }, + { + "epoch": 22.814936119532266, + "grad_norm": 0.3645778298377991, + "learning_rate": 4.7718506388046776e-05, + "loss": 0.3646, + "step": 3371500 + }, + { + "epoch": 22.818319618882633, + "grad_norm": 0.4079326093196869, + "learning_rate": 4.771816803811174e-05, + "loss": 0.3662, + "step": 3372000 + }, + { + "epoch": 22.821703118233, + "grad_norm": 0.34691333770751953, + "learning_rate": 4.77178296881767e-05, + "loss": 0.3661, + "step": 3372500 + }, + { + "epoch": 22.82508661758337, + "grad_norm": 0.3471761643886566, + "learning_rate": 4.771749133824166e-05, + "loss": 0.3644, + "step": 3373000 + }, + { + "epoch": 22.828470116933737, + "grad_norm": 0.4043777585029602, + "learning_rate": 4.771715298830663e-05, + "loss": 0.3651, + "step": 3373500 + }, + { + "epoch": 22.831853616284107, + "grad_norm": 0.41398492455482483, + "learning_rate": 4.7716814638371594e-05, + "loss": 0.3646, + "step": 3374000 + }, + { + "epoch": 22.835237115634474, + "grad_norm": 0.3764701783657074, + "learning_rate": 4.7716476288436556e-05, + "loss": 0.3656, + "step": 3374500 + }, + { + "epoch": 22.83862061498484, + "grad_norm": 0.3379947543144226, + "learning_rate": 4.771613793850152e-05, + "loss": 0.3642, + "step": 3375000 + }, + { + "epoch": 22.84200411433521, + "grad_norm": 0.39115238189697266, + "learning_rate": 4.771579958856648e-05, + "loss": 0.3634, + "step": 3375500 + }, + { + "epoch": 22.845387613685578, + "grad_norm": 0.37695080041885376, + "learning_rate": 4.771546123863144e-05, + "loss": 0.3625, + "step": 3376000 + }, + { + "epoch": 22.848771113035948, + "grad_norm": 0.3427785038948059, + "learning_rate": 4.7715122888696404e-05, + "loss": 0.3654, + "step": 3376500 + }, + { + "epoch": 22.852154612386315, + "grad_norm": 0.3660113215446472, + "learning_rate": 4.771478453876137e-05, + "loss": 0.3646, + "step": 3377000 + }, + { + "epoch": 22.85553811173668, + "grad_norm": 0.4021322429180145, + "learning_rate": 4.7714446188826335e-05, + "loss": 0.3644, + "step": 3377500 + }, + { + "epoch": 22.85892161108705, + "grad_norm": 0.3710779547691345, + "learning_rate": 4.77141078388913e-05, + "loss": 0.365, + "step": 3378000 + }, + { + "epoch": 22.86230511043742, + "grad_norm": 0.38344478607177734, + "learning_rate": 4.771376948895626e-05, + "loss": 0.3628, + "step": 3378500 + }, + { + "epoch": 22.86568860978779, + "grad_norm": 0.3572565019130707, + "learning_rate": 4.771343113902123e-05, + "loss": 0.3642, + "step": 3379000 + }, + { + "epoch": 22.869072109138155, + "grad_norm": 0.3629647493362427, + "learning_rate": 4.771309278908619e-05, + "loss": 0.3644, + "step": 3379500 + }, + { + "epoch": 22.872455608488522, + "grad_norm": 0.3336074948310852, + "learning_rate": 4.771275443915115e-05, + "loss": 0.3648, + "step": 3380000 + }, + { + "epoch": 22.875839107838893, + "grad_norm": 0.388048380613327, + "learning_rate": 4.771241608921611e-05, + "loss": 0.3651, + "step": 3380500 + }, + { + "epoch": 22.87922260718926, + "grad_norm": 0.3633332848548889, + "learning_rate": 4.771207773928108e-05, + "loss": 0.3647, + "step": 3381000 + }, + { + "epoch": 22.882606106539626, + "grad_norm": 0.29492759704589844, + "learning_rate": 4.771173938934604e-05, + "loss": 0.3648, + "step": 3381500 + }, + { + "epoch": 22.885989605889996, + "grad_norm": 0.364529013633728, + "learning_rate": 4.7711401039411e-05, + "loss": 0.3629, + "step": 3382000 + }, + { + "epoch": 22.889373105240363, + "grad_norm": 0.3595414459705353, + "learning_rate": 4.771106268947596e-05, + "loss": 0.3644, + "step": 3382500 + }, + { + "epoch": 22.892756604590733, + "grad_norm": 0.37099790573120117, + "learning_rate": 4.771072433954093e-05, + "loss": 0.3648, + "step": 3383000 + }, + { + "epoch": 22.8961401039411, + "grad_norm": 0.3630152940750122, + "learning_rate": 4.7710385989605894e-05, + "loss": 0.3655, + "step": 3383500 + }, + { + "epoch": 22.899523603291467, + "grad_norm": 0.33306482434272766, + "learning_rate": 4.7710047639670856e-05, + "loss": 0.3654, + "step": 3384000 + }, + { + "epoch": 22.902907102641837, + "grad_norm": 0.37848731875419617, + "learning_rate": 4.770970928973582e-05, + "loss": 0.3629, + "step": 3384500 + }, + { + "epoch": 22.906290601992204, + "grad_norm": 0.35071754455566406, + "learning_rate": 4.770937093980078e-05, + "loss": 0.3645, + "step": 3385000 + }, + { + "epoch": 22.909674101342574, + "grad_norm": 0.35632824897766113, + "learning_rate": 4.770903258986574e-05, + "loss": 0.3641, + "step": 3385500 + }, + { + "epoch": 22.91305760069294, + "grad_norm": 0.37024572491645813, + "learning_rate": 4.7708694239930705e-05, + "loss": 0.3652, + "step": 3386000 + }, + { + "epoch": 22.916441100043308, + "grad_norm": 0.3406110405921936, + "learning_rate": 4.7708355889995674e-05, + "loss": 0.3642, + "step": 3386500 + }, + { + "epoch": 22.919824599393678, + "grad_norm": 0.36239370703697205, + "learning_rate": 4.7708017540060636e-05, + "loss": 0.3643, + "step": 3387000 + }, + { + "epoch": 22.923208098744045, + "grad_norm": 0.35012954473495483, + "learning_rate": 4.77076791901256e-05, + "loss": 0.3647, + "step": 3387500 + }, + { + "epoch": 22.92659159809441, + "grad_norm": 0.44089409708976746, + "learning_rate": 4.770734084019056e-05, + "loss": 0.3651, + "step": 3388000 + }, + { + "epoch": 22.929975097444782, + "grad_norm": 0.3752581775188446, + "learning_rate": 4.770700249025553e-05, + "loss": 0.3644, + "step": 3388500 + }, + { + "epoch": 22.93335859679515, + "grad_norm": 0.3997404873371124, + "learning_rate": 4.770666414032049e-05, + "loss": 0.3656, + "step": 3389000 + }, + { + "epoch": 22.93674209614552, + "grad_norm": 0.3542211949825287, + "learning_rate": 4.770632579038545e-05, + "loss": 0.366, + "step": 3389500 + }, + { + "epoch": 22.940125595495886, + "grad_norm": 0.34117844700813293, + "learning_rate": 4.770598744045041e-05, + "loss": 0.3633, + "step": 3390000 + }, + { + "epoch": 22.943509094846252, + "grad_norm": 0.36265915632247925, + "learning_rate": 4.770564909051538e-05, + "loss": 0.3646, + "step": 3390500 + }, + { + "epoch": 22.946892594196623, + "grad_norm": 0.3219507932662964, + "learning_rate": 4.770531074058034e-05, + "loss": 0.3644, + "step": 3391000 + }, + { + "epoch": 22.95027609354699, + "grad_norm": 0.36303895711898804, + "learning_rate": 4.77049723906453e-05, + "loss": 0.3638, + "step": 3391500 + }, + { + "epoch": 22.95365959289736, + "grad_norm": 0.3494533896446228, + "learning_rate": 4.7704634040710264e-05, + "loss": 0.3628, + "step": 3392000 + }, + { + "epoch": 22.957043092247726, + "grad_norm": 0.39607173204421997, + "learning_rate": 4.770429569077523e-05, + "loss": 0.3634, + "step": 3392500 + }, + { + "epoch": 22.960426591598093, + "grad_norm": 0.38334110379219055, + "learning_rate": 4.7703957340840195e-05, + "loss": 0.3638, + "step": 3393000 + }, + { + "epoch": 22.963810090948463, + "grad_norm": 0.3406641185283661, + "learning_rate": 4.770361899090516e-05, + "loss": 0.3642, + "step": 3393500 + }, + { + "epoch": 22.96719359029883, + "grad_norm": 0.3994218707084656, + "learning_rate": 4.770328064097012e-05, + "loss": 0.3646, + "step": 3394000 + }, + { + "epoch": 22.970577089649197, + "grad_norm": 0.362558513879776, + "learning_rate": 4.770294229103508e-05, + "loss": 0.363, + "step": 3394500 + }, + { + "epoch": 22.973960588999567, + "grad_norm": 0.3812117874622345, + "learning_rate": 4.770260394110004e-05, + "loss": 0.3634, + "step": 3395000 + }, + { + "epoch": 22.977344088349934, + "grad_norm": 0.3542080223560333, + "learning_rate": 4.7702265591165005e-05, + "loss": 0.3635, + "step": 3395500 + }, + { + "epoch": 22.980727587700304, + "grad_norm": 0.3493613600730896, + "learning_rate": 4.7701927241229974e-05, + "loss": 0.3647, + "step": 3396000 + }, + { + "epoch": 22.98411108705067, + "grad_norm": 0.38243380188941956, + "learning_rate": 4.7701588891294936e-05, + "loss": 0.3648, + "step": 3396500 + }, + { + "epoch": 22.987494586401038, + "grad_norm": 0.366038054227829, + "learning_rate": 4.77012505413599e-05, + "loss": 0.3638, + "step": 3397000 + }, + { + "epoch": 22.990878085751408, + "grad_norm": 0.36321356892585754, + "learning_rate": 4.770091219142486e-05, + "loss": 0.3632, + "step": 3397500 + }, + { + "epoch": 22.994261585101775, + "grad_norm": 0.33257806301116943, + "learning_rate": 4.770057384148983e-05, + "loss": 0.3637, + "step": 3398000 + }, + { + "epoch": 22.997645084452145, + "grad_norm": 0.38097190856933594, + "learning_rate": 4.770023549155479e-05, + "loss": 0.3655, + "step": 3398500 + }, + { + "epoch": 23.0, + "eval_accuracy": 0.8612532908124285, + "eval_loss": 0.5633518099784851, + "eval_runtime": 3398.3459, + "eval_samples_per_second": 85.555, + "eval_steps_per_second": 5.347, + "step": 3398848 + }, + { + "epoch": 23.001028583802512, + "grad_norm": 0.37000569701194763, + "learning_rate": 4.7699897141619754e-05, + "loss": 0.3635, + "step": 3399000 + }, + { + "epoch": 23.00441208315288, + "grad_norm": 0.3592061698436737, + "learning_rate": 4.769955879168471e-05, + "loss": 0.36, + "step": 3399500 + }, + { + "epoch": 23.00779558250325, + "grad_norm": 0.40653616189956665, + "learning_rate": 4.769922044174968e-05, + "loss": 0.3636, + "step": 3400000 + }, + { + "epoch": 23.011179081853616, + "grad_norm": 0.3927628993988037, + "learning_rate": 4.769888209181464e-05, + "loss": 0.3624, + "step": 3400500 + }, + { + "epoch": 23.014562581203986, + "grad_norm": 0.35245293378829956, + "learning_rate": 4.76985437418796e-05, + "loss": 0.3619, + "step": 3401000 + }, + { + "epoch": 23.017946080554353, + "grad_norm": 0.42338356375694275, + "learning_rate": 4.7698205391944564e-05, + "loss": 0.3625, + "step": 3401500 + }, + { + "epoch": 23.02132957990472, + "grad_norm": 0.366515189409256, + "learning_rate": 4.769786704200953e-05, + "loss": 0.3615, + "step": 3402000 + }, + { + "epoch": 23.02471307925509, + "grad_norm": 0.3612120449542999, + "learning_rate": 4.7697528692074495e-05, + "loss": 0.3633, + "step": 3402500 + }, + { + "epoch": 23.028096578605457, + "grad_norm": 0.36426788568496704, + "learning_rate": 4.769719034213946e-05, + "loss": 0.3622, + "step": 3403000 + }, + { + "epoch": 23.031480077955823, + "grad_norm": 0.3689402937889099, + "learning_rate": 4.769685199220442e-05, + "loss": 0.3619, + "step": 3403500 + }, + { + "epoch": 23.034863577306194, + "grad_norm": 0.36793333292007446, + "learning_rate": 4.769651364226938e-05, + "loss": 0.3625, + "step": 3404000 + }, + { + "epoch": 23.03824707665656, + "grad_norm": 0.3756435811519623, + "learning_rate": 4.7696175292334344e-05, + "loss": 0.3626, + "step": 3404500 + }, + { + "epoch": 23.04163057600693, + "grad_norm": 0.404081791639328, + "learning_rate": 4.7695836942399306e-05, + "loss": 0.3626, + "step": 3405000 + }, + { + "epoch": 23.045014075357297, + "grad_norm": 0.3988695740699768, + "learning_rate": 4.7695498592464275e-05, + "loss": 0.3638, + "step": 3405500 + }, + { + "epoch": 23.048397574707664, + "grad_norm": 0.35908782482147217, + "learning_rate": 4.769516024252924e-05, + "loss": 0.3634, + "step": 3406000 + }, + { + "epoch": 23.051781074058034, + "grad_norm": 0.35598117113113403, + "learning_rate": 4.76948218925942e-05, + "loss": 0.3629, + "step": 3406500 + }, + { + "epoch": 23.0551645734084, + "grad_norm": 0.3767651319503784, + "learning_rate": 4.769448354265916e-05, + "loss": 0.3632, + "step": 3407000 + }, + { + "epoch": 23.05854807275877, + "grad_norm": 0.40991491079330444, + "learning_rate": 4.7694145192724123e-05, + "loss": 0.3629, + "step": 3407500 + }, + { + "epoch": 23.06193157210914, + "grad_norm": 0.379182904958725, + "learning_rate": 4.769380684278909e-05, + "loss": 0.3632, + "step": 3408000 + }, + { + "epoch": 23.065315071459505, + "grad_norm": 0.35139524936676025, + "learning_rate": 4.7693468492854054e-05, + "loss": 0.3632, + "step": 3408500 + }, + { + "epoch": 23.068698570809875, + "grad_norm": 0.36326470971107483, + "learning_rate": 4.769313014291901e-05, + "loss": 0.3631, + "step": 3409000 + }, + { + "epoch": 23.072082070160242, + "grad_norm": 0.36097511649131775, + "learning_rate": 4.769279179298398e-05, + "loss": 0.3641, + "step": 3409500 + }, + { + "epoch": 23.075465569510612, + "grad_norm": 0.3503509759902954, + "learning_rate": 4.769245344304894e-05, + "loss": 0.3628, + "step": 3410000 + }, + { + "epoch": 23.07884906886098, + "grad_norm": 0.40592440962791443, + "learning_rate": 4.76921150931139e-05, + "loss": 0.3622, + "step": 3410500 + }, + { + "epoch": 23.082232568211346, + "grad_norm": 0.3593161702156067, + "learning_rate": 4.7691776743178865e-05, + "loss": 0.3619, + "step": 3411000 + }, + { + "epoch": 23.085616067561716, + "grad_norm": 0.3567400276660919, + "learning_rate": 4.7691438393243834e-05, + "loss": 0.3609, + "step": 3411500 + }, + { + "epoch": 23.088999566912083, + "grad_norm": 0.3718048632144928, + "learning_rate": 4.7691100043308796e-05, + "loss": 0.3619, + "step": 3412000 + }, + { + "epoch": 23.09238306626245, + "grad_norm": 0.38009029626846313, + "learning_rate": 4.769076169337376e-05, + "loss": 0.362, + "step": 3412500 + }, + { + "epoch": 23.09576656561282, + "grad_norm": 0.35412073135375977, + "learning_rate": 4.769042334343872e-05, + "loss": 0.3632, + "step": 3413000 + }, + { + "epoch": 23.099150064963187, + "grad_norm": 0.37574461102485657, + "learning_rate": 4.769008499350368e-05, + "loss": 0.3618, + "step": 3413500 + }, + { + "epoch": 23.102533564313557, + "grad_norm": 0.3682039678096771, + "learning_rate": 4.7689746643568645e-05, + "loss": 0.3608, + "step": 3414000 + }, + { + "epoch": 23.105917063663924, + "grad_norm": 0.37198999524116516, + "learning_rate": 4.768940829363361e-05, + "loss": 0.3629, + "step": 3414500 + }, + { + "epoch": 23.10930056301429, + "grad_norm": 0.3804379999637604, + "learning_rate": 4.768906994369857e-05, + "loss": 0.3624, + "step": 3415000 + }, + { + "epoch": 23.11268406236466, + "grad_norm": 0.37129905819892883, + "learning_rate": 4.768873159376354e-05, + "loss": 0.3636, + "step": 3415500 + }, + { + "epoch": 23.116067561715028, + "grad_norm": 0.384331613779068, + "learning_rate": 4.76883932438285e-05, + "loss": 0.3618, + "step": 3416000 + }, + { + "epoch": 23.119451061065398, + "grad_norm": 0.36952680349349976, + "learning_rate": 4.768805489389346e-05, + "loss": 0.3618, + "step": 3416500 + }, + { + "epoch": 23.122834560415765, + "grad_norm": 0.3988458514213562, + "learning_rate": 4.7687716543958424e-05, + "loss": 0.3627, + "step": 3417000 + }, + { + "epoch": 23.12621805976613, + "grad_norm": 0.3945951759815216, + "learning_rate": 4.768737819402339e-05, + "loss": 0.3628, + "step": 3417500 + }, + { + "epoch": 23.1296015591165, + "grad_norm": 0.39166146516799927, + "learning_rate": 4.7687039844088355e-05, + "loss": 0.3632, + "step": 3418000 + }, + { + "epoch": 23.13298505846687, + "grad_norm": 0.3587402403354645, + "learning_rate": 4.768670149415331e-05, + "loss": 0.3623, + "step": 3418500 + }, + { + "epoch": 23.136368557817235, + "grad_norm": 0.3880244195461273, + "learning_rate": 4.768636314421828e-05, + "loss": 0.3624, + "step": 3419000 + }, + { + "epoch": 23.139752057167605, + "grad_norm": 0.3770838975906372, + "learning_rate": 4.768602479428324e-05, + "loss": 0.3629, + "step": 3419500 + }, + { + "epoch": 23.143135556517972, + "grad_norm": 0.35299721360206604, + "learning_rate": 4.7685686444348204e-05, + "loss": 0.3629, + "step": 3420000 + }, + { + "epoch": 23.146519055868342, + "grad_norm": 0.37510251998901367, + "learning_rate": 4.7685348094413166e-05, + "loss": 0.3639, + "step": 3420500 + }, + { + "epoch": 23.14990255521871, + "grad_norm": 0.3695957362651825, + "learning_rate": 4.7685009744478135e-05, + "loss": 0.3609, + "step": 3421000 + }, + { + "epoch": 23.153286054569076, + "grad_norm": 0.4037812650203705, + "learning_rate": 4.76846713945431e-05, + "loss": 0.3623, + "step": 3421500 + }, + { + "epoch": 23.156669553919446, + "grad_norm": 0.3722400963306427, + "learning_rate": 4.768433304460806e-05, + "loss": 0.3625, + "step": 3422000 + }, + { + "epoch": 23.160053053269813, + "grad_norm": 0.3467889428138733, + "learning_rate": 4.768399469467302e-05, + "loss": 0.3619, + "step": 3422500 + }, + { + "epoch": 23.163436552620183, + "grad_norm": 0.4032399356365204, + "learning_rate": 4.768365634473798e-05, + "loss": 0.3639, + "step": 3423000 + }, + { + "epoch": 23.16682005197055, + "grad_norm": 0.3691263794898987, + "learning_rate": 4.7683317994802945e-05, + "loss": 0.3634, + "step": 3423500 + }, + { + "epoch": 23.170203551320917, + "grad_norm": 0.3949434459209442, + "learning_rate": 4.768297964486791e-05, + "loss": 0.3638, + "step": 3424000 + }, + { + "epoch": 23.173587050671287, + "grad_norm": 0.3885261118412018, + "learning_rate": 4.768264129493287e-05, + "loss": 0.3636, + "step": 3424500 + }, + { + "epoch": 23.176970550021654, + "grad_norm": 0.3906368017196655, + "learning_rate": 4.768230294499784e-05, + "loss": 0.364, + "step": 3425000 + }, + { + "epoch": 23.180354049372024, + "grad_norm": 0.3325451910495758, + "learning_rate": 4.76819645950628e-05, + "loss": 0.3631, + "step": 3425500 + }, + { + "epoch": 23.18373754872239, + "grad_norm": 0.3635040819644928, + "learning_rate": 4.768162624512776e-05, + "loss": 0.362, + "step": 3426000 + }, + { + "epoch": 23.187121048072758, + "grad_norm": 0.35502421855926514, + "learning_rate": 4.7681287895192725e-05, + "loss": 0.362, + "step": 3426500 + }, + { + "epoch": 23.190504547423128, + "grad_norm": 0.384591281414032, + "learning_rate": 4.7680949545257694e-05, + "loss": 0.364, + "step": 3427000 + }, + { + "epoch": 23.193888046773495, + "grad_norm": 0.37069278955459595, + "learning_rate": 4.7680611195322656e-05, + "loss": 0.3633, + "step": 3427500 + }, + { + "epoch": 23.19727154612386, + "grad_norm": 0.37285947799682617, + "learning_rate": 4.768027284538761e-05, + "loss": 0.3645, + "step": 3428000 + }, + { + "epoch": 23.20065504547423, + "grad_norm": 0.37554123997688293, + "learning_rate": 4.767993449545258e-05, + "loss": 0.363, + "step": 3428500 + }, + { + "epoch": 23.2040385448246, + "grad_norm": 0.4136617183685303, + "learning_rate": 4.767959614551754e-05, + "loss": 0.3638, + "step": 3429000 + }, + { + "epoch": 23.20742204417497, + "grad_norm": 0.3720233738422394, + "learning_rate": 4.7679257795582504e-05, + "loss": 0.3635, + "step": 3429500 + }, + { + "epoch": 23.210805543525336, + "grad_norm": 0.393037885427475, + "learning_rate": 4.7678919445647466e-05, + "loss": 0.3631, + "step": 3430000 + }, + { + "epoch": 23.214189042875702, + "grad_norm": 0.3909498155117035, + "learning_rate": 4.7678581095712435e-05, + "loss": 0.363, + "step": 3430500 + }, + { + "epoch": 23.217572542226073, + "grad_norm": 0.4048287570476532, + "learning_rate": 4.76782427457774e-05, + "loss": 0.3637, + "step": 3431000 + }, + { + "epoch": 23.22095604157644, + "grad_norm": 0.33996883034706116, + "learning_rate": 4.767790439584236e-05, + "loss": 0.3618, + "step": 3431500 + }, + { + "epoch": 23.22433954092681, + "grad_norm": 0.33059725165367126, + "learning_rate": 4.767756604590732e-05, + "loss": 0.364, + "step": 3432000 + }, + { + "epoch": 23.227723040277176, + "grad_norm": 0.370561808347702, + "learning_rate": 4.7677227695972284e-05, + "loss": 0.362, + "step": 3432500 + }, + { + "epoch": 23.231106539627543, + "grad_norm": 0.35351186990737915, + "learning_rate": 4.7676889346037246e-05, + "loss": 0.3622, + "step": 3433000 + }, + { + "epoch": 23.234490038977913, + "grad_norm": 0.38210129737854004, + "learning_rate": 4.767655099610221e-05, + "loss": 0.3645, + "step": 3433500 + }, + { + "epoch": 23.23787353832828, + "grad_norm": 0.40298372507095337, + "learning_rate": 4.767621264616717e-05, + "loss": 0.3631, + "step": 3434000 + }, + { + "epoch": 23.24125703767865, + "grad_norm": 0.4220386743545532, + "learning_rate": 4.767587429623214e-05, + "loss": 0.3621, + "step": 3434500 + }, + { + "epoch": 23.244640537029017, + "grad_norm": 0.38947874307632446, + "learning_rate": 4.76755359462971e-05, + "loss": 0.3633, + "step": 3435000 + }, + { + "epoch": 23.248024036379384, + "grad_norm": 0.35837265849113464, + "learning_rate": 4.767519759636206e-05, + "loss": 0.3614, + "step": 3435500 + }, + { + "epoch": 23.251407535729754, + "grad_norm": 0.36074700951576233, + "learning_rate": 4.7674859246427025e-05, + "loss": 0.3643, + "step": 3436000 + }, + { + "epoch": 23.25479103508012, + "grad_norm": 0.31319719552993774, + "learning_rate": 4.7674520896491994e-05, + "loss": 0.3633, + "step": 3436500 + }, + { + "epoch": 23.258174534430488, + "grad_norm": 0.36703911423683167, + "learning_rate": 4.7674182546556956e-05, + "loss": 0.3628, + "step": 3437000 + }, + { + "epoch": 23.261558033780858, + "grad_norm": 0.3402256965637207, + "learning_rate": 4.767384419662191e-05, + "loss": 0.3626, + "step": 3437500 + }, + { + "epoch": 23.264941533131225, + "grad_norm": 0.3701879680156708, + "learning_rate": 4.767350584668688e-05, + "loss": 0.3615, + "step": 3438000 + }, + { + "epoch": 23.268325032481595, + "grad_norm": 0.4011252820491791, + "learning_rate": 4.767316749675184e-05, + "loss": 0.363, + "step": 3438500 + }, + { + "epoch": 23.271708531831962, + "grad_norm": 0.3824266791343689, + "learning_rate": 4.7672829146816805e-05, + "loss": 0.3635, + "step": 3439000 + }, + { + "epoch": 23.27509203118233, + "grad_norm": 0.4016554653644562, + "learning_rate": 4.767249079688177e-05, + "loss": 0.3646, + "step": 3439500 + }, + { + "epoch": 23.2784755305327, + "grad_norm": 0.3987918794155121, + "learning_rate": 4.7672152446946736e-05, + "loss": 0.3624, + "step": 3440000 + }, + { + "epoch": 23.281859029883066, + "grad_norm": 0.3824206590652466, + "learning_rate": 4.76718140970117e-05, + "loss": 0.363, + "step": 3440500 + }, + { + "epoch": 23.285242529233436, + "grad_norm": 0.33757832646369934, + "learning_rate": 4.767147574707666e-05, + "loss": 0.363, + "step": 3441000 + }, + { + "epoch": 23.288626028583803, + "grad_norm": 0.3646239936351776, + "learning_rate": 4.767113739714162e-05, + "loss": 0.3629, + "step": 3441500 + }, + { + "epoch": 23.29200952793417, + "grad_norm": 0.37561193108558655, + "learning_rate": 4.767079904720659e-05, + "loss": 0.3615, + "step": 3442000 + }, + { + "epoch": 23.29539302728454, + "grad_norm": 0.3657933473587036, + "learning_rate": 4.7670460697271546e-05, + "loss": 0.3649, + "step": 3442500 + }, + { + "epoch": 23.298776526634907, + "grad_norm": 0.33516716957092285, + "learning_rate": 4.767012234733651e-05, + "loss": 0.3641, + "step": 3443000 + }, + { + "epoch": 23.302160025985273, + "grad_norm": 0.35404688119888306, + "learning_rate": 4.766978399740147e-05, + "loss": 0.3638, + "step": 3443500 + }, + { + "epoch": 23.305543525335644, + "grad_norm": 0.3707813620567322, + "learning_rate": 4.766944564746644e-05, + "loss": 0.3618, + "step": 3444000 + }, + { + "epoch": 23.30892702468601, + "grad_norm": 0.3737035393714905, + "learning_rate": 4.76691072975314e-05, + "loss": 0.3622, + "step": 3444500 + }, + { + "epoch": 23.31231052403638, + "grad_norm": 0.37302887439727783, + "learning_rate": 4.7668768947596364e-05, + "loss": 0.3626, + "step": 3445000 + }, + { + "epoch": 23.315694023386747, + "grad_norm": 0.349115788936615, + "learning_rate": 4.7668430597661326e-05, + "loss": 0.3639, + "step": 3445500 + }, + { + "epoch": 23.319077522737114, + "grad_norm": 0.3590487837791443, + "learning_rate": 4.7668092247726295e-05, + "loss": 0.3629, + "step": 3446000 + }, + { + "epoch": 23.322461022087484, + "grad_norm": 0.35418424010276794, + "learning_rate": 4.766775389779126e-05, + "loss": 0.3617, + "step": 3446500 + }, + { + "epoch": 23.32584452143785, + "grad_norm": 0.3744847774505615, + "learning_rate": 4.766741554785621e-05, + "loss": 0.3629, + "step": 3447000 + }, + { + "epoch": 23.32922802078822, + "grad_norm": 0.3934074342250824, + "learning_rate": 4.766707719792118e-05, + "loss": 0.3632, + "step": 3447500 + }, + { + "epoch": 23.332611520138588, + "grad_norm": 0.3964332342147827, + "learning_rate": 4.766673884798614e-05, + "loss": 0.3636, + "step": 3448000 + }, + { + "epoch": 23.335995019488955, + "grad_norm": 0.37640121579170227, + "learning_rate": 4.7666400498051106e-05, + "loss": 0.3635, + "step": 3448500 + }, + { + "epoch": 23.339378518839325, + "grad_norm": 0.43194273114204407, + "learning_rate": 4.766606214811607e-05, + "loss": 0.3624, + "step": 3449000 + }, + { + "epoch": 23.342762018189692, + "grad_norm": 0.35745880007743835, + "learning_rate": 4.7665723798181037e-05, + "loss": 0.3646, + "step": 3449500 + }, + { + "epoch": 23.346145517540062, + "grad_norm": 0.4166637659072876, + "learning_rate": 4.7665385448246e-05, + "loss": 0.3635, + "step": 3450000 + }, + { + "epoch": 23.34952901689043, + "grad_norm": 0.3475643992424011, + "learning_rate": 4.766504709831096e-05, + "loss": 0.3634, + "step": 3450500 + }, + { + "epoch": 23.352912516240796, + "grad_norm": 0.4426576793193817, + "learning_rate": 4.766470874837592e-05, + "loss": 0.3645, + "step": 3451000 + }, + { + "epoch": 23.356296015591166, + "grad_norm": 0.3714113235473633, + "learning_rate": 4.766437039844089e-05, + "loss": 0.3638, + "step": 3451500 + }, + { + "epoch": 23.359679514941533, + "grad_norm": 0.37920984625816345, + "learning_rate": 4.766403204850585e-05, + "loss": 0.3631, + "step": 3452000 + }, + { + "epoch": 23.3630630142919, + "grad_norm": 0.35684266686439514, + "learning_rate": 4.766369369857081e-05, + "loss": 0.364, + "step": 3452500 + }, + { + "epoch": 23.36644651364227, + "grad_norm": 0.37371116876602173, + "learning_rate": 4.766335534863577e-05, + "loss": 0.3623, + "step": 3453000 + }, + { + "epoch": 23.369830012992637, + "grad_norm": 0.37027162313461304, + "learning_rate": 4.766301699870074e-05, + "loss": 0.3634, + "step": 3453500 + }, + { + "epoch": 23.373213512343007, + "grad_norm": 0.3794066309928894, + "learning_rate": 4.76626786487657e-05, + "loss": 0.3634, + "step": 3454000 + }, + { + "epoch": 23.376597011693374, + "grad_norm": 0.3557523787021637, + "learning_rate": 4.7662340298830665e-05, + "loss": 0.3638, + "step": 3454500 + }, + { + "epoch": 23.37998051104374, + "grad_norm": 0.371554970741272, + "learning_rate": 4.766200194889563e-05, + "loss": 0.3638, + "step": 3455000 + }, + { + "epoch": 23.38336401039411, + "grad_norm": 0.35758981108665466, + "learning_rate": 4.7661663598960596e-05, + "loss": 0.3615, + "step": 3455500 + }, + { + "epoch": 23.386747509744477, + "grad_norm": 0.37202003598213196, + "learning_rate": 4.766132524902556e-05, + "loss": 0.3643, + "step": 3456000 + }, + { + "epoch": 23.390131009094848, + "grad_norm": 0.37054094672203064, + "learning_rate": 4.766098689909051e-05, + "loss": 0.365, + "step": 3456500 + }, + { + "epoch": 23.393514508445215, + "grad_norm": 0.35302260518074036, + "learning_rate": 4.766064854915548e-05, + "loss": 0.3629, + "step": 3457000 + }, + { + "epoch": 23.39689800779558, + "grad_norm": 0.3512248396873474, + "learning_rate": 4.7660310199220444e-05, + "loss": 0.364, + "step": 3457500 + }, + { + "epoch": 23.40028150714595, + "grad_norm": 0.377338707447052, + "learning_rate": 4.7659971849285406e-05, + "loss": 0.3658, + "step": 3458000 + }, + { + "epoch": 23.40366500649632, + "grad_norm": 0.36699292063713074, + "learning_rate": 4.765963349935037e-05, + "loss": 0.3649, + "step": 3458500 + }, + { + "epoch": 23.40704850584669, + "grad_norm": 0.4169010519981384, + "learning_rate": 4.765929514941534e-05, + "loss": 0.3634, + "step": 3459000 + }, + { + "epoch": 23.410432005197055, + "grad_norm": 0.35843947529792786, + "learning_rate": 4.76589567994803e-05, + "loss": 0.3622, + "step": 3459500 + }, + { + "epoch": 23.413815504547422, + "grad_norm": 0.3619913160800934, + "learning_rate": 4.765861844954526e-05, + "loss": 0.3638, + "step": 3460000 + }, + { + "epoch": 23.417199003897792, + "grad_norm": 0.3665013611316681, + "learning_rate": 4.7658280099610224e-05, + "loss": 0.3611, + "step": 3460500 + }, + { + "epoch": 23.42058250324816, + "grad_norm": 0.3594473600387573, + "learning_rate": 4.7657941749675186e-05, + "loss": 0.3648, + "step": 3461000 + }, + { + "epoch": 23.423966002598526, + "grad_norm": 0.3695002496242523, + "learning_rate": 4.765760339974015e-05, + "loss": 0.3643, + "step": 3461500 + }, + { + "epoch": 23.427349501948896, + "grad_norm": 0.4056394398212433, + "learning_rate": 4.765726504980511e-05, + "loss": 0.3632, + "step": 3462000 + }, + { + "epoch": 23.430733001299263, + "grad_norm": 0.3681444823741913, + "learning_rate": 4.765692669987007e-05, + "loss": 0.364, + "step": 3462500 + }, + { + "epoch": 23.434116500649633, + "grad_norm": 0.3586125671863556, + "learning_rate": 4.765658834993504e-05, + "loss": 0.363, + "step": 3463000 + }, + { + "epoch": 23.4375, + "grad_norm": 0.39215049147605896, + "learning_rate": 4.765625e-05, + "loss": 0.3635, + "step": 3463500 + }, + { + "epoch": 23.440883499350367, + "grad_norm": 0.32525262236595154, + "learning_rate": 4.7655911650064965e-05, + "loss": 0.3625, + "step": 3464000 + }, + { + "epoch": 23.444266998700737, + "grad_norm": 0.4453783333301544, + "learning_rate": 4.765557330012993e-05, + "loss": 0.363, + "step": 3464500 + }, + { + "epoch": 23.447650498051104, + "grad_norm": 0.39107051491737366, + "learning_rate": 4.7655234950194896e-05, + "loss": 0.3653, + "step": 3465000 + }, + { + "epoch": 23.451033997401474, + "grad_norm": 0.36332428455352783, + "learning_rate": 4.765489660025986e-05, + "loss": 0.3638, + "step": 3465500 + }, + { + "epoch": 23.45441749675184, + "grad_norm": 0.3637053966522217, + "learning_rate": 4.7654558250324814e-05, + "loss": 0.3627, + "step": 3466000 + }, + { + "epoch": 23.457800996102208, + "grad_norm": 0.3457685708999634, + "learning_rate": 4.765421990038978e-05, + "loss": 0.3635, + "step": 3466500 + }, + { + "epoch": 23.461184495452578, + "grad_norm": 0.38191792368888855, + "learning_rate": 4.7653881550454745e-05, + "loss": 0.3634, + "step": 3467000 + }, + { + "epoch": 23.464567994802945, + "grad_norm": 0.39992186427116394, + "learning_rate": 4.765354320051971e-05, + "loss": 0.3645, + "step": 3467500 + }, + { + "epoch": 23.46795149415331, + "grad_norm": 0.4218098223209381, + "learning_rate": 4.765320485058467e-05, + "loss": 0.3644, + "step": 3468000 + }, + { + "epoch": 23.47133499350368, + "grad_norm": 0.3750041425228119, + "learning_rate": 4.765286650064964e-05, + "loss": 0.3642, + "step": 3468500 + }, + { + "epoch": 23.47471849285405, + "grad_norm": 0.39663100242614746, + "learning_rate": 4.76525281507146e-05, + "loss": 0.3635, + "step": 3469000 + }, + { + "epoch": 23.47810199220442, + "grad_norm": 0.42526987195014954, + "learning_rate": 4.765218980077956e-05, + "loss": 0.3639, + "step": 3469500 + }, + { + "epoch": 23.481485491554785, + "grad_norm": 0.353274941444397, + "learning_rate": 4.7651851450844524e-05, + "loss": 0.3641, + "step": 3470000 + }, + { + "epoch": 23.484868990905152, + "grad_norm": 0.3648282289505005, + "learning_rate": 4.7651513100909486e-05, + "loss": 0.3644, + "step": 3470500 + }, + { + "epoch": 23.488252490255523, + "grad_norm": 0.40002763271331787, + "learning_rate": 4.765117475097445e-05, + "loss": 0.364, + "step": 3471000 + }, + { + "epoch": 23.49163598960589, + "grad_norm": 0.36751842498779297, + "learning_rate": 4.765083640103941e-05, + "loss": 0.3628, + "step": 3471500 + }, + { + "epoch": 23.49501948895626, + "grad_norm": 0.3347572684288025, + "learning_rate": 4.765049805110437e-05, + "loss": 0.3622, + "step": 3472000 + }, + { + "epoch": 23.498402988306626, + "grad_norm": 0.35500243306159973, + "learning_rate": 4.765015970116934e-05, + "loss": 0.3639, + "step": 3472500 + }, + { + "epoch": 23.501786487656993, + "grad_norm": 0.3748621344566345, + "learning_rate": 4.7649821351234304e-05, + "loss": 0.3635, + "step": 3473000 + }, + { + "epoch": 23.505169987007363, + "grad_norm": 0.3916022777557373, + "learning_rate": 4.7649483001299266e-05, + "loss": 0.3642, + "step": 3473500 + }, + { + "epoch": 23.50855348635773, + "grad_norm": 0.3466847240924835, + "learning_rate": 4.764914465136423e-05, + "loss": 0.3636, + "step": 3474000 + }, + { + "epoch": 23.5119369857081, + "grad_norm": 0.3665750026702881, + "learning_rate": 4.76488063014292e-05, + "loss": 0.3639, + "step": 3474500 + }, + { + "epoch": 23.515320485058467, + "grad_norm": 0.4211607277393341, + "learning_rate": 4.764846795149416e-05, + "loss": 0.3632, + "step": 3475000 + }, + { + "epoch": 23.518703984408834, + "grad_norm": 0.3728439509868622, + "learning_rate": 4.7648129601559114e-05, + "loss": 0.3639, + "step": 3475500 + }, + { + "epoch": 23.522087483759204, + "grad_norm": 0.3351035416126251, + "learning_rate": 4.764779125162408e-05, + "loss": 0.3628, + "step": 3476000 + }, + { + "epoch": 23.52547098310957, + "grad_norm": 0.34060367941856384, + "learning_rate": 4.7647452901689045e-05, + "loss": 0.364, + "step": 3476500 + }, + { + "epoch": 23.528854482459938, + "grad_norm": 0.3944180905818939, + "learning_rate": 4.764711455175401e-05, + "loss": 0.363, + "step": 3477000 + }, + { + "epoch": 23.532237981810308, + "grad_norm": 0.32537418603897095, + "learning_rate": 4.764677620181897e-05, + "loss": 0.3633, + "step": 3477500 + }, + { + "epoch": 23.535621481160675, + "grad_norm": 0.3785341680049896, + "learning_rate": 4.764643785188393e-05, + "loss": 0.3631, + "step": 3478000 + }, + { + "epoch": 23.539004980511045, + "grad_norm": 0.36599990725517273, + "learning_rate": 4.76460995019489e-05, + "loss": 0.3644, + "step": 3478500 + }, + { + "epoch": 23.542388479861412, + "grad_norm": 0.40464311838150024, + "learning_rate": 4.764576115201386e-05, + "loss": 0.363, + "step": 3479000 + }, + { + "epoch": 23.54577197921178, + "grad_norm": 0.34878531098365784, + "learning_rate": 4.7645422802078825e-05, + "loss": 0.3629, + "step": 3479500 + }, + { + "epoch": 23.54915547856215, + "grad_norm": 0.36536720395088196, + "learning_rate": 4.764508445214379e-05, + "loss": 0.3631, + "step": 3480000 + }, + { + "epoch": 23.552538977912516, + "grad_norm": 0.3720216155052185, + "learning_rate": 4.764474610220875e-05, + "loss": 0.3631, + "step": 3480500 + }, + { + "epoch": 23.555922477262886, + "grad_norm": 0.3621642291545868, + "learning_rate": 4.764440775227371e-05, + "loss": 0.3637, + "step": 3481000 + }, + { + "epoch": 23.559305976613253, + "grad_norm": 0.35155996680259705, + "learning_rate": 4.764406940233867e-05, + "loss": 0.363, + "step": 3481500 + }, + { + "epoch": 23.56268947596362, + "grad_norm": 0.3805035650730133, + "learning_rate": 4.764373105240364e-05, + "loss": 0.3629, + "step": 3482000 + }, + { + "epoch": 23.56607297531399, + "grad_norm": 0.4354890286922455, + "learning_rate": 4.7643392702468604e-05, + "loss": 0.364, + "step": 3482500 + }, + { + "epoch": 23.569456474664356, + "grad_norm": 0.35975298285484314, + "learning_rate": 4.7643054352533566e-05, + "loss": 0.3621, + "step": 3483000 + }, + { + "epoch": 23.572839974014727, + "grad_norm": 0.3824651539325714, + "learning_rate": 4.764271600259853e-05, + "loss": 0.3645, + "step": 3483500 + }, + { + "epoch": 23.576223473365093, + "grad_norm": 0.4012623429298401, + "learning_rate": 4.76423776526635e-05, + "loss": 0.3641, + "step": 3484000 + }, + { + "epoch": 23.57960697271546, + "grad_norm": 0.36130985617637634, + "learning_rate": 4.764203930272846e-05, + "loss": 0.3632, + "step": 3484500 + }, + { + "epoch": 23.58299047206583, + "grad_norm": 0.31537777185440063, + "learning_rate": 4.7641700952793415e-05, + "loss": 0.3628, + "step": 3485000 + }, + { + "epoch": 23.586373971416197, + "grad_norm": 0.3515828251838684, + "learning_rate": 4.764136260285838e-05, + "loss": 0.3649, + "step": 3485500 + }, + { + "epoch": 23.589757470766564, + "grad_norm": 0.39275023341178894, + "learning_rate": 4.7641024252923346e-05, + "loss": 0.3639, + "step": 3486000 + }, + { + "epoch": 23.593140970116934, + "grad_norm": 0.39433401823043823, + "learning_rate": 4.764068590298831e-05, + "loss": 0.3638, + "step": 3486500 + }, + { + "epoch": 23.5965244694673, + "grad_norm": 0.3990587294101715, + "learning_rate": 4.764034755305327e-05, + "loss": 0.3638, + "step": 3487000 + }, + { + "epoch": 23.59990796881767, + "grad_norm": 0.3309509754180908, + "learning_rate": 4.764000920311823e-05, + "loss": 0.363, + "step": 3487500 + }, + { + "epoch": 23.603291468168038, + "grad_norm": 0.3852672576904297, + "learning_rate": 4.76396708531832e-05, + "loss": 0.3641, + "step": 3488000 + }, + { + "epoch": 23.606674967518405, + "grad_norm": 0.39036333560943604, + "learning_rate": 4.763933250324816e-05, + "loss": 0.3637, + "step": 3488500 + }, + { + "epoch": 23.610058466868775, + "grad_norm": 0.38685497641563416, + "learning_rate": 4.7638994153313125e-05, + "loss": 0.3635, + "step": 3489000 + }, + { + "epoch": 23.613441966219142, + "grad_norm": 0.39183852076530457, + "learning_rate": 4.763865580337809e-05, + "loss": 0.364, + "step": 3489500 + }, + { + "epoch": 23.616825465569512, + "grad_norm": 0.37505850195884705, + "learning_rate": 4.763831745344305e-05, + "loss": 0.3618, + "step": 3490000 + }, + { + "epoch": 23.62020896491988, + "grad_norm": 0.39759066700935364, + "learning_rate": 4.763797910350801e-05, + "loss": 0.3636, + "step": 3490500 + }, + { + "epoch": 23.623592464270246, + "grad_norm": 0.38261231780052185, + "learning_rate": 4.7637640753572974e-05, + "loss": 0.3632, + "step": 3491000 + }, + { + "epoch": 23.626975963620616, + "grad_norm": 0.3759249448776245, + "learning_rate": 4.763730240363794e-05, + "loss": 0.3641, + "step": 3491500 + }, + { + "epoch": 23.630359462970983, + "grad_norm": 0.34230777621269226, + "learning_rate": 4.7636964053702905e-05, + "loss": 0.364, + "step": 3492000 + }, + { + "epoch": 23.63374296232135, + "grad_norm": 0.35262158513069153, + "learning_rate": 4.763662570376787e-05, + "loss": 0.3653, + "step": 3492500 + }, + { + "epoch": 23.63712646167172, + "grad_norm": 0.3902489244937897, + "learning_rate": 4.763628735383283e-05, + "loss": 0.3651, + "step": 3493000 + }, + { + "epoch": 23.640509961022087, + "grad_norm": 0.4088767468929291, + "learning_rate": 4.76359490038978e-05, + "loss": 0.3634, + "step": 3493500 + }, + { + "epoch": 23.643893460372457, + "grad_norm": 0.3867037296295166, + "learning_rate": 4.763561065396276e-05, + "loss": 0.3646, + "step": 3494000 + }, + { + "epoch": 23.647276959722824, + "grad_norm": 0.39906758069992065, + "learning_rate": 4.763527230402772e-05, + "loss": 0.3651, + "step": 3494500 + }, + { + "epoch": 23.65066045907319, + "grad_norm": 0.3372052311897278, + "learning_rate": 4.763493395409268e-05, + "loss": 0.3641, + "step": 3495000 + }, + { + "epoch": 23.65404395842356, + "grad_norm": 0.3461311459541321, + "learning_rate": 4.7634595604157647e-05, + "loss": 0.3648, + "step": 3495500 + }, + { + "epoch": 23.657427457773927, + "grad_norm": 0.38681378960609436, + "learning_rate": 4.763425725422261e-05, + "loss": 0.3633, + "step": 3496000 + }, + { + "epoch": 23.660810957124298, + "grad_norm": 0.3958388566970825, + "learning_rate": 4.763391890428757e-05, + "loss": 0.3635, + "step": 3496500 + }, + { + "epoch": 23.664194456474664, + "grad_norm": 0.3645811676979065, + "learning_rate": 4.763358055435253e-05, + "loss": 0.3639, + "step": 3497000 + }, + { + "epoch": 23.66757795582503, + "grad_norm": 0.4053094685077667, + "learning_rate": 4.76332422044175e-05, + "loss": 0.3641, + "step": 3497500 + }, + { + "epoch": 23.6709614551754, + "grad_norm": 0.3625944256782532, + "learning_rate": 4.7632903854482464e-05, + "loss": 0.3642, + "step": 3498000 + }, + { + "epoch": 23.67434495452577, + "grad_norm": 0.38809558749198914, + "learning_rate": 4.7632565504547426e-05, + "loss": 0.364, + "step": 3498500 + }, + { + "epoch": 23.67772845387614, + "grad_norm": 0.35745155811309814, + "learning_rate": 4.763222715461239e-05, + "loss": 0.3631, + "step": 3499000 + }, + { + "epoch": 23.681111953226505, + "grad_norm": 0.3489069938659668, + "learning_rate": 4.763188880467735e-05, + "loss": 0.3652, + "step": 3499500 + }, + { + "epoch": 23.684495452576872, + "grad_norm": 0.3746088445186615, + "learning_rate": 4.763155045474231e-05, + "loss": 0.3644, + "step": 3500000 + }, + { + "epoch": 23.687878951927242, + "grad_norm": 0.3664596378803253, + "learning_rate": 4.7631212104807275e-05, + "loss": 0.364, + "step": 3500500 + }, + { + "epoch": 23.69126245127761, + "grad_norm": 0.3510581851005554, + "learning_rate": 4.7630873754872243e-05, + "loss": 0.3638, + "step": 3501000 + }, + { + "epoch": 23.694645950627976, + "grad_norm": 0.3787493109703064, + "learning_rate": 4.7630535404937206e-05, + "loss": 0.3664, + "step": 3501500 + }, + { + "epoch": 23.698029449978346, + "grad_norm": 0.34525835514068604, + "learning_rate": 4.763019705500217e-05, + "loss": 0.3636, + "step": 3502000 + }, + { + "epoch": 23.701412949328713, + "grad_norm": 0.39369526505470276, + "learning_rate": 4.762985870506713e-05, + "loss": 0.3631, + "step": 3502500 + }, + { + "epoch": 23.704796448679083, + "grad_norm": 0.3621573746204376, + "learning_rate": 4.76295203551321e-05, + "loss": 0.3648, + "step": 3503000 + }, + { + "epoch": 23.70817994802945, + "grad_norm": 0.34888139367103577, + "learning_rate": 4.762918200519706e-05, + "loss": 0.3622, + "step": 3503500 + }, + { + "epoch": 23.711563447379817, + "grad_norm": 0.36744803190231323, + "learning_rate": 4.762884365526202e-05, + "loss": 0.363, + "step": 3504000 + }, + { + "epoch": 23.714946946730187, + "grad_norm": 0.3537336587905884, + "learning_rate": 4.762850530532698e-05, + "loss": 0.3631, + "step": 3504500 + }, + { + "epoch": 23.718330446080554, + "grad_norm": 0.35082361102104187, + "learning_rate": 4.762816695539195e-05, + "loss": 0.3629, + "step": 3505000 + }, + { + "epoch": 23.721713945430924, + "grad_norm": 0.3593525290489197, + "learning_rate": 4.762782860545691e-05, + "loss": 0.3627, + "step": 3505500 + }, + { + "epoch": 23.72509744478129, + "grad_norm": 0.3956252634525299, + "learning_rate": 4.762749025552187e-05, + "loss": 0.3635, + "step": 3506000 + }, + { + "epoch": 23.728480944131658, + "grad_norm": 0.3581351637840271, + "learning_rate": 4.7627151905586834e-05, + "loss": 0.3631, + "step": 3506500 + }, + { + "epoch": 23.731864443482028, + "grad_norm": 0.3484318256378174, + "learning_rate": 4.76268135556518e-05, + "loss": 0.3638, + "step": 3507000 + }, + { + "epoch": 23.735247942832395, + "grad_norm": 0.40121597051620483, + "learning_rate": 4.7626475205716765e-05, + "loss": 0.3635, + "step": 3507500 + }, + { + "epoch": 23.738631442182765, + "grad_norm": 0.3527781069278717, + "learning_rate": 4.762613685578173e-05, + "loss": 0.3634, + "step": 3508000 + }, + { + "epoch": 23.74201494153313, + "grad_norm": 0.3523523807525635, + "learning_rate": 4.762579850584669e-05, + "loss": 0.3621, + "step": 3508500 + }, + { + "epoch": 23.7453984408835, + "grad_norm": 0.37593403458595276, + "learning_rate": 4.762546015591165e-05, + "loss": 0.3638, + "step": 3509000 + }, + { + "epoch": 23.74878194023387, + "grad_norm": 0.3803982436656952, + "learning_rate": 4.762512180597661e-05, + "loss": 0.3639, + "step": 3509500 + }, + { + "epoch": 23.752165439584235, + "grad_norm": 0.3712470531463623, + "learning_rate": 4.7624783456041575e-05, + "loss": 0.3644, + "step": 3510000 + }, + { + "epoch": 23.755548938934602, + "grad_norm": 0.37614697217941284, + "learning_rate": 4.7624445106106544e-05, + "loss": 0.3648, + "step": 3510500 + }, + { + "epoch": 23.758932438284972, + "grad_norm": 0.3237117826938629, + "learning_rate": 4.7624106756171506e-05, + "loss": 0.3642, + "step": 3511000 + }, + { + "epoch": 23.76231593763534, + "grad_norm": 0.415750652551651, + "learning_rate": 4.762376840623647e-05, + "loss": 0.3627, + "step": 3511500 + }, + { + "epoch": 23.76569943698571, + "grad_norm": 0.3945716917514801, + "learning_rate": 4.762343005630143e-05, + "loss": 0.3625, + "step": 3512000 + }, + { + "epoch": 23.769082936336076, + "grad_norm": 0.3449915051460266, + "learning_rate": 4.76230917063664e-05, + "loss": 0.3644, + "step": 3512500 + }, + { + "epoch": 23.772466435686443, + "grad_norm": 0.38573160767555237, + "learning_rate": 4.762275335643136e-05, + "loss": 0.3633, + "step": 3513000 + }, + { + "epoch": 23.775849935036813, + "grad_norm": 0.37552815675735474, + "learning_rate": 4.7622415006496324e-05, + "loss": 0.3635, + "step": 3513500 + }, + { + "epoch": 23.77923343438718, + "grad_norm": 0.363090455532074, + "learning_rate": 4.762207665656128e-05, + "loss": 0.3643, + "step": 3514000 + }, + { + "epoch": 23.78261693373755, + "grad_norm": 0.3741694390773773, + "learning_rate": 4.762173830662625e-05, + "loss": 0.3649, + "step": 3514500 + }, + { + "epoch": 23.786000433087917, + "grad_norm": 0.35938596725463867, + "learning_rate": 4.762139995669121e-05, + "loss": 0.3634, + "step": 3515000 + }, + { + "epoch": 23.789383932438284, + "grad_norm": 0.40430352091789246, + "learning_rate": 4.762106160675617e-05, + "loss": 0.3623, + "step": 3515500 + }, + { + "epoch": 23.792767431788654, + "grad_norm": 0.3747963309288025, + "learning_rate": 4.7620723256821134e-05, + "loss": 0.3641, + "step": 3516000 + }, + { + "epoch": 23.79615093113902, + "grad_norm": 0.35648542642593384, + "learning_rate": 4.76203849068861e-05, + "loss": 0.3624, + "step": 3516500 + }, + { + "epoch": 23.799534430489388, + "grad_norm": 0.3804064691066742, + "learning_rate": 4.7620046556951065e-05, + "loss": 0.3635, + "step": 3517000 + }, + { + "epoch": 23.802917929839758, + "grad_norm": 0.38011279702186584, + "learning_rate": 4.761970820701603e-05, + "loss": 0.3644, + "step": 3517500 + }, + { + "epoch": 23.806301429190125, + "grad_norm": 0.3717729449272156, + "learning_rate": 4.761936985708099e-05, + "loss": 0.3642, + "step": 3518000 + }, + { + "epoch": 23.809684928540495, + "grad_norm": 0.3428543508052826, + "learning_rate": 4.761903150714595e-05, + "loss": 0.3631, + "step": 3518500 + }, + { + "epoch": 23.81306842789086, + "grad_norm": 0.362511545419693, + "learning_rate": 4.7618693157210914e-05, + "loss": 0.363, + "step": 3519000 + }, + { + "epoch": 23.81645192724123, + "grad_norm": 0.4208146929740906, + "learning_rate": 4.7618354807275876e-05, + "loss": 0.3628, + "step": 3519500 + }, + { + "epoch": 23.8198354265916, + "grad_norm": 0.37463951110839844, + "learning_rate": 4.7618016457340845e-05, + "loss": 0.3632, + "step": 3520000 + }, + { + "epoch": 23.823218925941966, + "grad_norm": 0.3562808334827423, + "learning_rate": 4.761767810740581e-05, + "loss": 0.3642, + "step": 3520500 + }, + { + "epoch": 23.826602425292336, + "grad_norm": 0.36782774329185486, + "learning_rate": 4.761733975747077e-05, + "loss": 0.3649, + "step": 3521000 + }, + { + "epoch": 23.829985924642703, + "grad_norm": 0.3616153597831726, + "learning_rate": 4.761700140753573e-05, + "loss": 0.3632, + "step": 3521500 + }, + { + "epoch": 23.83336942399307, + "grad_norm": 0.36399659514427185, + "learning_rate": 4.76166630576007e-05, + "loss": 0.3626, + "step": 3522000 + }, + { + "epoch": 23.83675292334344, + "grad_norm": 0.3577078878879547, + "learning_rate": 4.761632470766566e-05, + "loss": 0.3651, + "step": 3522500 + }, + { + "epoch": 23.840136422693806, + "grad_norm": 0.3801499307155609, + "learning_rate": 4.7615986357730624e-05, + "loss": 0.3629, + "step": 3523000 + }, + { + "epoch": 23.843519922044173, + "grad_norm": 0.34691569209098816, + "learning_rate": 4.761564800779558e-05, + "loss": 0.3637, + "step": 3523500 + }, + { + "epoch": 23.846903421394543, + "grad_norm": 0.38196486234664917, + "learning_rate": 4.761530965786055e-05, + "loss": 0.3651, + "step": 3524000 + }, + { + "epoch": 23.85028692074491, + "grad_norm": 0.3814425468444824, + "learning_rate": 4.761497130792551e-05, + "loss": 0.3634, + "step": 3524500 + }, + { + "epoch": 23.85367042009528, + "grad_norm": 0.34999626874923706, + "learning_rate": 4.761463295799047e-05, + "loss": 0.3627, + "step": 3525000 + }, + { + "epoch": 23.857053919445647, + "grad_norm": 0.3775683045387268, + "learning_rate": 4.7614294608055435e-05, + "loss": 0.3645, + "step": 3525500 + }, + { + "epoch": 23.860437418796014, + "grad_norm": 0.38793089985847473, + "learning_rate": 4.7613956258120404e-05, + "loss": 0.3643, + "step": 3526000 + }, + { + "epoch": 23.863820918146384, + "grad_norm": 0.33791637420654297, + "learning_rate": 4.7613617908185366e-05, + "loss": 0.3624, + "step": 3526500 + }, + { + "epoch": 23.86720441749675, + "grad_norm": 0.3349616527557373, + "learning_rate": 4.761327955825033e-05, + "loss": 0.3634, + "step": 3527000 + }, + { + "epoch": 23.87058791684712, + "grad_norm": 0.3802795112133026, + "learning_rate": 4.761294120831529e-05, + "loss": 0.3637, + "step": 3527500 + }, + { + "epoch": 23.873971416197488, + "grad_norm": 0.3808208405971527, + "learning_rate": 4.761260285838025e-05, + "loss": 0.3633, + "step": 3528000 + }, + { + "epoch": 23.877354915547855, + "grad_norm": 0.3281201720237732, + "learning_rate": 4.7612264508445214e-05, + "loss": 0.3629, + "step": 3528500 + }, + { + "epoch": 23.880738414898225, + "grad_norm": 0.3574613928794861, + "learning_rate": 4.7611926158510176e-05, + "loss": 0.3634, + "step": 3529000 + }, + { + "epoch": 23.884121914248592, + "grad_norm": 0.32629862427711487, + "learning_rate": 4.7611587808575145e-05, + "loss": 0.3632, + "step": 3529500 + }, + { + "epoch": 23.887505413598962, + "grad_norm": 0.39311572909355164, + "learning_rate": 4.761124945864011e-05, + "loss": 0.3634, + "step": 3530000 + }, + { + "epoch": 23.89088891294933, + "grad_norm": 0.3443436622619629, + "learning_rate": 4.761091110870507e-05, + "loss": 0.3637, + "step": 3530500 + }, + { + "epoch": 23.894272412299696, + "grad_norm": 0.3927950859069824, + "learning_rate": 4.761057275877003e-05, + "loss": 0.3633, + "step": 3531000 + }, + { + "epoch": 23.897655911650066, + "grad_norm": 0.3710768520832062, + "learning_rate": 4.7610234408834994e-05, + "loss": 0.3632, + "step": 3531500 + }, + { + "epoch": 23.901039411000433, + "grad_norm": 0.3737480342388153, + "learning_rate": 4.760989605889996e-05, + "loss": 0.3652, + "step": 3532000 + }, + { + "epoch": 23.904422910350803, + "grad_norm": 0.35131508111953735, + "learning_rate": 4.7609557708964925e-05, + "loss": 0.3622, + "step": 3532500 + }, + { + "epoch": 23.90780640970117, + "grad_norm": 0.39177268743515015, + "learning_rate": 4.760921935902988e-05, + "loss": 0.3645, + "step": 3533000 + }, + { + "epoch": 23.911189909051537, + "grad_norm": 0.36162006855010986, + "learning_rate": 4.760888100909485e-05, + "loss": 0.3632, + "step": 3533500 + }, + { + "epoch": 23.914573408401907, + "grad_norm": 0.36541253328323364, + "learning_rate": 4.760854265915981e-05, + "loss": 0.3632, + "step": 3534000 + }, + { + "epoch": 23.917956907752274, + "grad_norm": 0.3805837035179138, + "learning_rate": 4.760820430922477e-05, + "loss": 0.3646, + "step": 3534500 + }, + { + "epoch": 23.92134040710264, + "grad_norm": 0.3992142677307129, + "learning_rate": 4.7607865959289735e-05, + "loss": 0.3631, + "step": 3535000 + }, + { + "epoch": 23.92472390645301, + "grad_norm": 0.37672194838523865, + "learning_rate": 4.7607527609354704e-05, + "loss": 0.3636, + "step": 3535500 + }, + { + "epoch": 23.928107405803377, + "grad_norm": 0.3364548683166504, + "learning_rate": 4.7607189259419666e-05, + "loss": 0.3641, + "step": 3536000 + }, + { + "epoch": 23.931490905153748, + "grad_norm": 0.3823467493057251, + "learning_rate": 4.760685090948463e-05, + "loss": 0.3652, + "step": 3536500 + }, + { + "epoch": 23.934874404504114, + "grad_norm": 0.41631531715393066, + "learning_rate": 4.760651255954959e-05, + "loss": 0.3626, + "step": 3537000 + }, + { + "epoch": 23.93825790385448, + "grad_norm": 0.41096195578575134, + "learning_rate": 4.760617420961455e-05, + "loss": 0.3637, + "step": 3537500 + }, + { + "epoch": 23.94164140320485, + "grad_norm": 0.39381927251815796, + "learning_rate": 4.7605835859679515e-05, + "loss": 0.3639, + "step": 3538000 + }, + { + "epoch": 23.945024902555218, + "grad_norm": 0.4025476574897766, + "learning_rate": 4.760549750974448e-05, + "loss": 0.3635, + "step": 3538500 + }, + { + "epoch": 23.94840840190559, + "grad_norm": 0.33142849802970886, + "learning_rate": 4.7605159159809446e-05, + "loss": 0.3634, + "step": 3539000 + }, + { + "epoch": 23.951791901255955, + "grad_norm": 0.35124173760414124, + "learning_rate": 4.760482080987441e-05, + "loss": 0.3637, + "step": 3539500 + }, + { + "epoch": 23.955175400606322, + "grad_norm": 0.4197080433368683, + "learning_rate": 4.760448245993937e-05, + "loss": 0.3638, + "step": 3540000 + }, + { + "epoch": 23.958558899956692, + "grad_norm": 0.36347100138664246, + "learning_rate": 4.760414411000433e-05, + "loss": 0.3625, + "step": 3540500 + }, + { + "epoch": 23.96194239930706, + "grad_norm": 0.36141666769981384, + "learning_rate": 4.7603805760069294e-05, + "loss": 0.3633, + "step": 3541000 + }, + { + "epoch": 23.965325898657426, + "grad_norm": 0.35059964656829834, + "learning_rate": 4.760346741013426e-05, + "loss": 0.3627, + "step": 3541500 + }, + { + "epoch": 23.968709398007796, + "grad_norm": 0.38797518610954285, + "learning_rate": 4.7603129060199225e-05, + "loss": 0.3631, + "step": 3542000 + }, + { + "epoch": 23.972092897358163, + "grad_norm": 0.3420599699020386, + "learning_rate": 4.760279071026418e-05, + "loss": 0.3648, + "step": 3542500 + }, + { + "epoch": 23.975476396708533, + "grad_norm": 0.3919691741466522, + "learning_rate": 4.760245236032915e-05, + "loss": 0.3632, + "step": 3543000 + }, + { + "epoch": 23.9788598960589, + "grad_norm": 0.44462597370147705, + "learning_rate": 4.760211401039411e-05, + "loss": 0.3633, + "step": 3543500 + }, + { + "epoch": 23.982243395409267, + "grad_norm": 0.37594836950302124, + "learning_rate": 4.7601775660459074e-05, + "loss": 0.3642, + "step": 3544000 + }, + { + "epoch": 23.985626894759637, + "grad_norm": 0.3407946825027466, + "learning_rate": 4.7601437310524036e-05, + "loss": 0.3648, + "step": 3544500 + }, + { + "epoch": 23.989010394110004, + "grad_norm": 0.402271568775177, + "learning_rate": 4.7601098960589005e-05, + "loss": 0.3626, + "step": 3545000 + }, + { + "epoch": 23.992393893460374, + "grad_norm": 0.3821162283420563, + "learning_rate": 4.760076061065397e-05, + "loss": 0.3638, + "step": 3545500 + }, + { + "epoch": 23.99577739281074, + "grad_norm": 0.36141517758369446, + "learning_rate": 4.760042226071893e-05, + "loss": 0.364, + "step": 3546000 + }, + { + "epoch": 23.999160892161107, + "grad_norm": 0.4005153477191925, + "learning_rate": 4.760008391078389e-05, + "loss": 0.3628, + "step": 3546500 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.8615310324361763, + "eval_loss": 0.5625105500221252, + "eval_runtime": 3398.82, + "eval_samples_per_second": 85.543, + "eval_steps_per_second": 5.347, + "step": 3546624 + }, + { + "epoch": 24.002544391511478, + "grad_norm": 0.3851083815097809, + "learning_rate": 4.7599745560848853e-05, + "loss": 0.3627, + "step": 3547000 + }, + { + "epoch": 24.005927890861845, + "grad_norm": 0.3546355366706848, + "learning_rate": 4.7599407210913816e-05, + "loss": 0.3611, + "step": 3547500 + }, + { + "epoch": 24.009311390212215, + "grad_norm": 0.3528681695461273, + "learning_rate": 4.759906886097878e-05, + "loss": 0.3626, + "step": 3548000 + }, + { + "epoch": 24.01269488956258, + "grad_norm": 0.3592744767665863, + "learning_rate": 4.759873051104374e-05, + "loss": 0.3612, + "step": 3548500 + }, + { + "epoch": 24.01607838891295, + "grad_norm": 0.3767262399196625, + "learning_rate": 4.759839216110871e-05, + "loss": 0.3613, + "step": 3549000 + }, + { + "epoch": 24.01946188826332, + "grad_norm": 0.31025248765945435, + "learning_rate": 4.759805381117367e-05, + "loss": 0.3603, + "step": 3549500 + }, + { + "epoch": 24.022845387613685, + "grad_norm": 0.3891949951648712, + "learning_rate": 4.759771546123863e-05, + "loss": 0.3615, + "step": 3550000 + }, + { + "epoch": 24.026228886964052, + "grad_norm": 0.3731558620929718, + "learning_rate": 4.7597377111303595e-05, + "loss": 0.361, + "step": 3550500 + }, + { + "epoch": 24.029612386314422, + "grad_norm": 0.3438579738140106, + "learning_rate": 4.7597038761368564e-05, + "loss": 0.3603, + "step": 3551000 + }, + { + "epoch": 24.03299588566479, + "grad_norm": 0.34653955698013306, + "learning_rate": 4.7596700411433526e-05, + "loss": 0.3618, + "step": 3551500 + }, + { + "epoch": 24.03637938501516, + "grad_norm": 0.35605496168136597, + "learning_rate": 4.759636206149848e-05, + "loss": 0.3622, + "step": 3552000 + }, + { + "epoch": 24.039762884365526, + "grad_norm": 0.3901239335536957, + "learning_rate": 4.759602371156345e-05, + "loss": 0.3623, + "step": 3552500 + }, + { + "epoch": 24.043146383715893, + "grad_norm": 0.3623374402523041, + "learning_rate": 4.759568536162841e-05, + "loss": 0.3605, + "step": 3553000 + }, + { + "epoch": 24.046529883066263, + "grad_norm": 0.38516634702682495, + "learning_rate": 4.7595347011693375e-05, + "loss": 0.3612, + "step": 3553500 + }, + { + "epoch": 24.04991338241663, + "grad_norm": 0.39640727639198303, + "learning_rate": 4.759500866175834e-05, + "loss": 0.3628, + "step": 3554000 + }, + { + "epoch": 24.053296881767, + "grad_norm": 0.35433048009872437, + "learning_rate": 4.7594670311823306e-05, + "loss": 0.3606, + "step": 3554500 + }, + { + "epoch": 24.056680381117367, + "grad_norm": 0.38493824005126953, + "learning_rate": 4.759433196188827e-05, + "loss": 0.3618, + "step": 3555000 + }, + { + "epoch": 24.060063880467734, + "grad_norm": 0.36116093397140503, + "learning_rate": 4.759399361195323e-05, + "loss": 0.3622, + "step": 3555500 + }, + { + "epoch": 24.063447379818104, + "grad_norm": 0.34238407015800476, + "learning_rate": 4.759365526201819e-05, + "loss": 0.3605, + "step": 3556000 + }, + { + "epoch": 24.06683087916847, + "grad_norm": 0.35663530230522156, + "learning_rate": 4.759331691208316e-05, + "loss": 0.3628, + "step": 3556500 + }, + { + "epoch": 24.070214378518838, + "grad_norm": 0.3875925540924072, + "learning_rate": 4.7592978562148116e-05, + "loss": 0.3624, + "step": 3557000 + }, + { + "epoch": 24.073597877869208, + "grad_norm": 0.4210719168186188, + "learning_rate": 4.759264021221308e-05, + "loss": 0.3614, + "step": 3557500 + }, + { + "epoch": 24.076981377219575, + "grad_norm": 0.3640529215335846, + "learning_rate": 4.759230186227804e-05, + "loss": 0.3617, + "step": 3558000 + }, + { + "epoch": 24.080364876569945, + "grad_norm": 0.39767134189605713, + "learning_rate": 4.759196351234301e-05, + "loss": 0.3632, + "step": 3558500 + }, + { + "epoch": 24.08374837592031, + "grad_norm": 0.38562124967575073, + "learning_rate": 4.759162516240797e-05, + "loss": 0.3627, + "step": 3559000 + }, + { + "epoch": 24.08713187527068, + "grad_norm": 0.34722045063972473, + "learning_rate": 4.7591286812472934e-05, + "loss": 0.3631, + "step": 3559500 + }, + { + "epoch": 24.09051537462105, + "grad_norm": 0.35584744811058044, + "learning_rate": 4.7590948462537896e-05, + "loss": 0.3618, + "step": 3560000 + }, + { + "epoch": 24.093898873971415, + "grad_norm": 0.36499258875846863, + "learning_rate": 4.7590610112602865e-05, + "loss": 0.3636, + "step": 3560500 + }, + { + "epoch": 24.097282373321786, + "grad_norm": 0.3745898902416229, + "learning_rate": 4.759027176266783e-05, + "loss": 0.3625, + "step": 3561000 + }, + { + "epoch": 24.100665872672153, + "grad_norm": 0.3334830403327942, + "learning_rate": 4.758993341273278e-05, + "loss": 0.3618, + "step": 3561500 + }, + { + "epoch": 24.10404937202252, + "grad_norm": 0.3804187774658203, + "learning_rate": 4.758959506279775e-05, + "loss": 0.3629, + "step": 3562000 + }, + { + "epoch": 24.10743287137289, + "grad_norm": 0.37679699063301086, + "learning_rate": 4.758925671286271e-05, + "loss": 0.3634, + "step": 3562500 + }, + { + "epoch": 24.110816370723256, + "grad_norm": 0.38330692052841187, + "learning_rate": 4.7588918362927675e-05, + "loss": 0.3633, + "step": 3563000 + }, + { + "epoch": 24.114199870073627, + "grad_norm": 0.387416273355484, + "learning_rate": 4.758858001299264e-05, + "loss": 0.3635, + "step": 3563500 + }, + { + "epoch": 24.117583369423993, + "grad_norm": 0.3465549349784851, + "learning_rate": 4.7588241663057606e-05, + "loss": 0.3612, + "step": 3564000 + }, + { + "epoch": 24.12096686877436, + "grad_norm": 0.34641847014427185, + "learning_rate": 4.758790331312257e-05, + "loss": 0.3609, + "step": 3564500 + }, + { + "epoch": 24.12435036812473, + "grad_norm": 0.36127665638923645, + "learning_rate": 4.758756496318753e-05, + "loss": 0.3612, + "step": 3565000 + }, + { + "epoch": 24.127733867475097, + "grad_norm": 0.3640176057815552, + "learning_rate": 4.758722661325249e-05, + "loss": 0.3628, + "step": 3565500 + }, + { + "epoch": 24.131117366825464, + "grad_norm": 0.35370656847953796, + "learning_rate": 4.758688826331746e-05, + "loss": 0.3629, + "step": 3566000 + }, + { + "epoch": 24.134500866175834, + "grad_norm": 0.3724617063999176, + "learning_rate": 4.758654991338242e-05, + "loss": 0.3617, + "step": 3566500 + }, + { + "epoch": 24.1378843655262, + "grad_norm": 0.37611472606658936, + "learning_rate": 4.758621156344738e-05, + "loss": 0.3627, + "step": 3567000 + }, + { + "epoch": 24.14126786487657, + "grad_norm": 0.40569251775741577, + "learning_rate": 4.758587321351234e-05, + "loss": 0.3633, + "step": 3567500 + }, + { + "epoch": 24.144651364226938, + "grad_norm": 0.3774605393409729, + "learning_rate": 4.758553486357731e-05, + "loss": 0.3617, + "step": 3568000 + }, + { + "epoch": 24.148034863577305, + "grad_norm": 0.37588009238243103, + "learning_rate": 4.758519651364227e-05, + "loss": 0.363, + "step": 3568500 + }, + { + "epoch": 24.151418362927675, + "grad_norm": 0.34325888752937317, + "learning_rate": 4.7584858163707234e-05, + "loss": 0.3617, + "step": 3569000 + }, + { + "epoch": 24.154801862278042, + "grad_norm": 0.3552990257740021, + "learning_rate": 4.7584519813772196e-05, + "loss": 0.3618, + "step": 3569500 + }, + { + "epoch": 24.158185361628412, + "grad_norm": 0.3599201440811157, + "learning_rate": 4.7584181463837165e-05, + "loss": 0.363, + "step": 3570000 + }, + { + "epoch": 24.16156886097878, + "grad_norm": 0.3919810652732849, + "learning_rate": 4.758384311390213e-05, + "loss": 0.3638, + "step": 3570500 + }, + { + "epoch": 24.164952360329146, + "grad_norm": 0.3540429174900055, + "learning_rate": 4.758350476396708e-05, + "loss": 0.3619, + "step": 3571000 + }, + { + "epoch": 24.168335859679516, + "grad_norm": 0.3905651569366455, + "learning_rate": 4.758316641403205e-05, + "loss": 0.3621, + "step": 3571500 + }, + { + "epoch": 24.171719359029883, + "grad_norm": 0.3780975043773651, + "learning_rate": 4.7582828064097014e-05, + "loss": 0.3621, + "step": 3572000 + }, + { + "epoch": 24.17510285838025, + "grad_norm": 0.37841475009918213, + "learning_rate": 4.7582489714161976e-05, + "loss": 0.3609, + "step": 3572500 + }, + { + "epoch": 24.17848635773062, + "grad_norm": 0.3821147382259369, + "learning_rate": 4.758215136422694e-05, + "loss": 0.3631, + "step": 3573000 + }, + { + "epoch": 24.181869857080986, + "grad_norm": 0.39197006821632385, + "learning_rate": 4.758181301429191e-05, + "loss": 0.3635, + "step": 3573500 + }, + { + "epoch": 24.185253356431357, + "grad_norm": 0.37433409690856934, + "learning_rate": 4.758147466435687e-05, + "loss": 0.3625, + "step": 3574000 + }, + { + "epoch": 24.188636855781724, + "grad_norm": 0.36877191066741943, + "learning_rate": 4.758113631442183e-05, + "loss": 0.3629, + "step": 3574500 + }, + { + "epoch": 24.19202035513209, + "grad_norm": 0.38596317172050476, + "learning_rate": 4.758079796448679e-05, + "loss": 0.3617, + "step": 3575000 + }, + { + "epoch": 24.19540385448246, + "grad_norm": 0.37200498580932617, + "learning_rate": 4.758045961455176e-05, + "loss": 0.3621, + "step": 3575500 + }, + { + "epoch": 24.198787353832827, + "grad_norm": 0.3859347999095917, + "learning_rate": 4.758012126461672e-05, + "loss": 0.3618, + "step": 3576000 + }, + { + "epoch": 24.202170853183198, + "grad_norm": 0.3903367817401886, + "learning_rate": 4.757978291468168e-05, + "loss": 0.3635, + "step": 3576500 + }, + { + "epoch": 24.205554352533564, + "grad_norm": 0.350730299949646, + "learning_rate": 4.757944456474664e-05, + "loss": 0.3621, + "step": 3577000 + }, + { + "epoch": 24.20893785188393, + "grad_norm": 0.360914409160614, + "learning_rate": 4.757910621481161e-05, + "loss": 0.3629, + "step": 3577500 + }, + { + "epoch": 24.2123213512343, + "grad_norm": 0.3702397644519806, + "learning_rate": 4.757876786487657e-05, + "loss": 0.363, + "step": 3578000 + }, + { + "epoch": 24.215704850584668, + "grad_norm": 0.35700830817222595, + "learning_rate": 4.7578429514941535e-05, + "loss": 0.3641, + "step": 3578500 + }, + { + "epoch": 24.21908834993504, + "grad_norm": 0.39207759499549866, + "learning_rate": 4.75780911650065e-05, + "loss": 0.3635, + "step": 3579000 + }, + { + "epoch": 24.222471849285405, + "grad_norm": 0.31646469235420227, + "learning_rate": 4.7577752815071466e-05, + "loss": 0.3627, + "step": 3579500 + }, + { + "epoch": 24.225855348635772, + "grad_norm": 0.39063090085983276, + "learning_rate": 4.757741446513643e-05, + "loss": 0.3628, + "step": 3580000 + }, + { + "epoch": 24.229238847986142, + "grad_norm": 0.3953896760940552, + "learning_rate": 4.757707611520138e-05, + "loss": 0.3647, + "step": 3580500 + }, + { + "epoch": 24.23262234733651, + "grad_norm": 0.39057600498199463, + "learning_rate": 4.757673776526635e-05, + "loss": 0.3627, + "step": 3581000 + }, + { + "epoch": 24.236005846686876, + "grad_norm": 0.3909989595413208, + "learning_rate": 4.7576399415331314e-05, + "loss": 0.3639, + "step": 3581500 + }, + { + "epoch": 24.239389346037246, + "grad_norm": 0.347472220659256, + "learning_rate": 4.7576061065396277e-05, + "loss": 0.363, + "step": 3582000 + }, + { + "epoch": 24.242772845387613, + "grad_norm": 0.34311747550964355, + "learning_rate": 4.757572271546124e-05, + "loss": 0.3619, + "step": 3582500 + }, + { + "epoch": 24.246156344737983, + "grad_norm": 0.4090639352798462, + "learning_rate": 4.757538436552621e-05, + "loss": 0.3627, + "step": 3583000 + }, + { + "epoch": 24.24953984408835, + "grad_norm": 0.37628045678138733, + "learning_rate": 4.757504601559117e-05, + "loss": 0.3618, + "step": 3583500 + }, + { + "epoch": 24.252923343438717, + "grad_norm": 0.4039421081542969, + "learning_rate": 4.757470766565613e-05, + "loss": 0.3619, + "step": 3584000 + }, + { + "epoch": 24.256306842789087, + "grad_norm": 0.4032013416290283, + "learning_rate": 4.7574369315721094e-05, + "loss": 0.3629, + "step": 3584500 + }, + { + "epoch": 24.259690342139454, + "grad_norm": 0.36380964517593384, + "learning_rate": 4.757403096578606e-05, + "loss": 0.3622, + "step": 3585000 + }, + { + "epoch": 24.263073841489824, + "grad_norm": 0.34602972865104675, + "learning_rate": 4.757369261585102e-05, + "loss": 0.3621, + "step": 3585500 + }, + { + "epoch": 24.26645734084019, + "grad_norm": 0.38954514265060425, + "learning_rate": 4.757335426591598e-05, + "loss": 0.3634, + "step": 3586000 + }, + { + "epoch": 24.269840840190557, + "grad_norm": 0.39841580390930176, + "learning_rate": 4.757301591598094e-05, + "loss": 0.3636, + "step": 3586500 + }, + { + "epoch": 24.273224339540928, + "grad_norm": 0.3810007572174072, + "learning_rate": 4.757267756604591e-05, + "loss": 0.3632, + "step": 3587000 + }, + { + "epoch": 24.276607838891294, + "grad_norm": 0.3691217601299286, + "learning_rate": 4.7572339216110873e-05, + "loss": 0.3622, + "step": 3587500 + }, + { + "epoch": 24.279991338241665, + "grad_norm": 0.37574613094329834, + "learning_rate": 4.7572000866175836e-05, + "loss": 0.3652, + "step": 3588000 + }, + { + "epoch": 24.28337483759203, + "grad_norm": 0.38883501291275024, + "learning_rate": 4.75716625162408e-05, + "loss": 0.3625, + "step": 3588500 + }, + { + "epoch": 24.2867583369424, + "grad_norm": 0.3991695046424866, + "learning_rate": 4.7571324166305767e-05, + "loss": 0.3633, + "step": 3589000 + }, + { + "epoch": 24.29014183629277, + "grad_norm": 0.3845086097717285, + "learning_rate": 4.757098581637073e-05, + "loss": 0.3636, + "step": 3589500 + }, + { + "epoch": 24.293525335643135, + "grad_norm": 0.35532400012016296, + "learning_rate": 4.7570647466435684e-05, + "loss": 0.3618, + "step": 3590000 + }, + { + "epoch": 24.296908834993502, + "grad_norm": 0.3768151104450226, + "learning_rate": 4.757030911650065e-05, + "loss": 0.3633, + "step": 3590500 + }, + { + "epoch": 24.300292334343872, + "grad_norm": 0.3618510365486145, + "learning_rate": 4.7569970766565615e-05, + "loss": 0.3644, + "step": 3591000 + }, + { + "epoch": 24.30367583369424, + "grad_norm": 0.3861792981624603, + "learning_rate": 4.756963241663058e-05, + "loss": 0.3624, + "step": 3591500 + }, + { + "epoch": 24.30705933304461, + "grad_norm": 0.3386296331882477, + "learning_rate": 4.756929406669554e-05, + "loss": 0.3622, + "step": 3592000 + }, + { + "epoch": 24.310442832394976, + "grad_norm": 0.38560476899147034, + "learning_rate": 4.756895571676051e-05, + "loss": 0.3625, + "step": 3592500 + }, + { + "epoch": 24.313826331745343, + "grad_norm": 0.37961357831954956, + "learning_rate": 4.756861736682547e-05, + "loss": 0.3629, + "step": 3593000 + }, + { + "epoch": 24.317209831095713, + "grad_norm": 0.3790128827095032, + "learning_rate": 4.756827901689043e-05, + "loss": 0.3615, + "step": 3593500 + }, + { + "epoch": 24.32059333044608, + "grad_norm": 0.4152431786060333, + "learning_rate": 4.7567940666955395e-05, + "loss": 0.3647, + "step": 3594000 + }, + { + "epoch": 24.32397682979645, + "grad_norm": 0.3837610185146332, + "learning_rate": 4.756760231702036e-05, + "loss": 0.3626, + "step": 3594500 + }, + { + "epoch": 24.327360329146817, + "grad_norm": 0.32097288966178894, + "learning_rate": 4.756726396708532e-05, + "loss": 0.3634, + "step": 3595000 + }, + { + "epoch": 24.330743828497184, + "grad_norm": 0.3750414252281189, + "learning_rate": 4.756692561715028e-05, + "loss": 0.3626, + "step": 3595500 + }, + { + "epoch": 24.334127327847554, + "grad_norm": 0.41777998208999634, + "learning_rate": 4.756658726721524e-05, + "loss": 0.3628, + "step": 3596000 + }, + { + "epoch": 24.33751082719792, + "grad_norm": 0.37119758129119873, + "learning_rate": 4.756624891728021e-05, + "loss": 0.3626, + "step": 3596500 + }, + { + "epoch": 24.340894326548288, + "grad_norm": 0.3998216688632965, + "learning_rate": 4.7565910567345174e-05, + "loss": 0.3625, + "step": 3597000 + }, + { + "epoch": 24.344277825898658, + "grad_norm": 0.3342391848564148, + "learning_rate": 4.7565572217410136e-05, + "loss": 0.3621, + "step": 3597500 + }, + { + "epoch": 24.347661325249025, + "grad_norm": 0.3821890950202942, + "learning_rate": 4.75652338674751e-05, + "loss": 0.3615, + "step": 3598000 + }, + { + "epoch": 24.351044824599395, + "grad_norm": 0.3546050190925598, + "learning_rate": 4.756489551754007e-05, + "loss": 0.3623, + "step": 3598500 + }, + { + "epoch": 24.35442832394976, + "grad_norm": 0.38674062490463257, + "learning_rate": 4.756455716760503e-05, + "loss": 0.3619, + "step": 3599000 + }, + { + "epoch": 24.35781182330013, + "grad_norm": 0.35297295451164246, + "learning_rate": 4.7564218817669985e-05, + "loss": 0.3631, + "step": 3599500 + }, + { + "epoch": 24.3611953226505, + "grad_norm": 0.3537425398826599, + "learning_rate": 4.7563880467734954e-05, + "loss": 0.3637, + "step": 3600000 + }, + { + "epoch": 24.364578822000865, + "grad_norm": 0.35156282782554626, + "learning_rate": 4.7563542117799916e-05, + "loss": 0.3631, + "step": 3600500 + }, + { + "epoch": 24.367962321351236, + "grad_norm": 0.37335723638534546, + "learning_rate": 4.756320376786488e-05, + "loss": 0.3633, + "step": 3601000 + }, + { + "epoch": 24.371345820701602, + "grad_norm": 0.3556972146034241, + "learning_rate": 4.756286541792984e-05, + "loss": 0.3626, + "step": 3601500 + }, + { + "epoch": 24.37472932005197, + "grad_norm": 0.3641450107097626, + "learning_rate": 4.75625270679948e-05, + "loss": 0.3645, + "step": 3602000 + }, + { + "epoch": 24.37811281940234, + "grad_norm": 0.3725520372390747, + "learning_rate": 4.756218871805977e-05, + "loss": 0.363, + "step": 3602500 + }, + { + "epoch": 24.381496318752706, + "grad_norm": 0.40755024552345276, + "learning_rate": 4.756185036812473e-05, + "loss": 0.3632, + "step": 3603000 + }, + { + "epoch": 24.384879818103077, + "grad_norm": 0.4027111828327179, + "learning_rate": 4.7561512018189695e-05, + "loss": 0.3647, + "step": 3603500 + }, + { + "epoch": 24.388263317453443, + "grad_norm": 0.38163647055625916, + "learning_rate": 4.756117366825466e-05, + "loss": 0.3634, + "step": 3604000 + }, + { + "epoch": 24.39164681680381, + "grad_norm": 0.36534103751182556, + "learning_rate": 4.756083531831962e-05, + "loss": 0.3622, + "step": 3604500 + }, + { + "epoch": 24.39503031615418, + "grad_norm": 0.3960428237915039, + "learning_rate": 4.756049696838458e-05, + "loss": 0.3636, + "step": 3605000 + }, + { + "epoch": 24.398413815504547, + "grad_norm": 0.38842570781707764, + "learning_rate": 4.7560158618449544e-05, + "loss": 0.364, + "step": 3605500 + }, + { + "epoch": 24.401797314854914, + "grad_norm": 0.39695754647254944, + "learning_rate": 4.755982026851451e-05, + "loss": 0.363, + "step": 3606000 + }, + { + "epoch": 24.405180814205284, + "grad_norm": 0.37467947602272034, + "learning_rate": 4.7559481918579475e-05, + "loss": 0.3621, + "step": 3606500 + }, + { + "epoch": 24.40856431355565, + "grad_norm": 0.36468884348869324, + "learning_rate": 4.755914356864444e-05, + "loss": 0.3615, + "step": 3607000 + }, + { + "epoch": 24.41194781290602, + "grad_norm": 0.3378750681877136, + "learning_rate": 4.75588052187094e-05, + "loss": 0.3626, + "step": 3607500 + }, + { + "epoch": 24.415331312256388, + "grad_norm": 0.37069934606552124, + "learning_rate": 4.755846686877437e-05, + "loss": 0.3634, + "step": 3608000 + }, + { + "epoch": 24.418714811606755, + "grad_norm": 0.32639679312705994, + "learning_rate": 4.755812851883933e-05, + "loss": 0.3629, + "step": 3608500 + }, + { + "epoch": 24.422098310957125, + "grad_norm": 0.3813059628009796, + "learning_rate": 4.755779016890429e-05, + "loss": 0.3626, + "step": 3609000 + }, + { + "epoch": 24.42548181030749, + "grad_norm": 0.33321505784988403, + "learning_rate": 4.7557451818969254e-05, + "loss": 0.363, + "step": 3609500 + }, + { + "epoch": 24.428865309657862, + "grad_norm": 0.37366393208503723, + "learning_rate": 4.7557113469034216e-05, + "loss": 0.3641, + "step": 3610000 + }, + { + "epoch": 24.43224880900823, + "grad_norm": 0.36428675055503845, + "learning_rate": 4.755677511909918e-05, + "loss": 0.3627, + "step": 3610500 + }, + { + "epoch": 24.435632308358596, + "grad_norm": 0.3707229793071747, + "learning_rate": 4.755643676916414e-05, + "loss": 0.3625, + "step": 3611000 + }, + { + "epoch": 24.439015807708966, + "grad_norm": 0.36802589893341064, + "learning_rate": 4.75560984192291e-05, + "loss": 0.3638, + "step": 3611500 + }, + { + "epoch": 24.442399307059333, + "grad_norm": 0.3834385275840759, + "learning_rate": 4.755576006929407e-05, + "loss": 0.3628, + "step": 3612000 + }, + { + "epoch": 24.445782806409703, + "grad_norm": 0.36757373809814453, + "learning_rate": 4.7555421719359034e-05, + "loss": 0.3641, + "step": 3612500 + }, + { + "epoch": 24.44916630576007, + "grad_norm": 0.3710903823375702, + "learning_rate": 4.7555083369423996e-05, + "loss": 0.3621, + "step": 3613000 + }, + { + "epoch": 24.452549805110436, + "grad_norm": 0.37583836913108826, + "learning_rate": 4.755474501948896e-05, + "loss": 0.363, + "step": 3613500 + }, + { + "epoch": 24.455933304460807, + "grad_norm": 0.3556283414363861, + "learning_rate": 4.755440666955392e-05, + "loss": 0.363, + "step": 3614000 + }, + { + "epoch": 24.459316803811173, + "grad_norm": 0.38675931096076965, + "learning_rate": 4.755406831961888e-05, + "loss": 0.3636, + "step": 3614500 + }, + { + "epoch": 24.46270030316154, + "grad_norm": 0.35959991812705994, + "learning_rate": 4.7553729969683844e-05, + "loss": 0.3632, + "step": 3615000 + }, + { + "epoch": 24.46608380251191, + "grad_norm": 0.36478689312934875, + "learning_rate": 4.755339161974881e-05, + "loss": 0.3644, + "step": 3615500 + }, + { + "epoch": 24.469467301862277, + "grad_norm": 0.37179750204086304, + "learning_rate": 4.7553053269813775e-05, + "loss": 0.363, + "step": 3616000 + }, + { + "epoch": 24.472850801212648, + "grad_norm": 0.38893115520477295, + "learning_rate": 4.755271491987874e-05, + "loss": 0.3631, + "step": 3616500 + }, + { + "epoch": 24.476234300563014, + "grad_norm": 0.4172806143760681, + "learning_rate": 4.75523765699437e-05, + "loss": 0.3618, + "step": 3617000 + }, + { + "epoch": 24.47961779991338, + "grad_norm": 0.38247597217559814, + "learning_rate": 4.755203822000867e-05, + "loss": 0.3633, + "step": 3617500 + }, + { + "epoch": 24.48300129926375, + "grad_norm": 0.398297518491745, + "learning_rate": 4.755169987007363e-05, + "loss": 0.3633, + "step": 3618000 + }, + { + "epoch": 24.486384798614118, + "grad_norm": 0.37525323033332825, + "learning_rate": 4.755136152013859e-05, + "loss": 0.3622, + "step": 3618500 + }, + { + "epoch": 24.48976829796449, + "grad_norm": 0.35158777236938477, + "learning_rate": 4.755102317020355e-05, + "loss": 0.3628, + "step": 3619000 + }, + { + "epoch": 24.493151797314855, + "grad_norm": 0.34074023365974426, + "learning_rate": 4.755068482026852e-05, + "loss": 0.363, + "step": 3619500 + }, + { + "epoch": 24.496535296665222, + "grad_norm": 0.3309043347835541, + "learning_rate": 4.755034647033348e-05, + "loss": 0.3624, + "step": 3620000 + }, + { + "epoch": 24.499918796015592, + "grad_norm": 0.3668690323829651, + "learning_rate": 4.755000812039844e-05, + "loss": 0.3621, + "step": 3620500 + }, + { + "epoch": 24.50330229536596, + "grad_norm": 0.3494468033313751, + "learning_rate": 4.75496697704634e-05, + "loss": 0.364, + "step": 3621000 + }, + { + "epoch": 24.506685794716326, + "grad_norm": 0.3519150912761688, + "learning_rate": 4.754933142052837e-05, + "loss": 0.3615, + "step": 3621500 + }, + { + "epoch": 24.510069294066696, + "grad_norm": 0.4194455146789551, + "learning_rate": 4.7548993070593334e-05, + "loss": 0.3628, + "step": 3622000 + }, + { + "epoch": 24.513452793417063, + "grad_norm": 0.3814716935157776, + "learning_rate": 4.7548654720658296e-05, + "loss": 0.3622, + "step": 3622500 + }, + { + "epoch": 24.516836292767433, + "grad_norm": 0.392856627702713, + "learning_rate": 4.754831637072326e-05, + "loss": 0.3624, + "step": 3623000 + }, + { + "epoch": 24.5202197921178, + "grad_norm": 0.35863417387008667, + "learning_rate": 4.754797802078822e-05, + "loss": 0.3623, + "step": 3623500 + }, + { + "epoch": 24.523603291468167, + "grad_norm": 0.38365086913108826, + "learning_rate": 4.754763967085318e-05, + "loss": 0.3629, + "step": 3624000 + }, + { + "epoch": 24.526986790818537, + "grad_norm": 0.3537156581878662, + "learning_rate": 4.7547301320918145e-05, + "loss": 0.3643, + "step": 3624500 + }, + { + "epoch": 24.530370290168904, + "grad_norm": 0.35854488611221313, + "learning_rate": 4.7546962970983114e-05, + "loss": 0.364, + "step": 3625000 + }, + { + "epoch": 24.533753789519274, + "grad_norm": 0.38134366273880005, + "learning_rate": 4.7546624621048076e-05, + "loss": 0.3637, + "step": 3625500 + }, + { + "epoch": 24.53713728886964, + "grad_norm": 0.37400737404823303, + "learning_rate": 4.754628627111304e-05, + "loss": 0.3644, + "step": 3626000 + }, + { + "epoch": 24.540520788220007, + "grad_norm": 0.38586708903312683, + "learning_rate": 4.7545947921178e-05, + "loss": 0.3631, + "step": 3626500 + }, + { + "epoch": 24.543904287570378, + "grad_norm": 0.38755014538764954, + "learning_rate": 4.754560957124297e-05, + "loss": 0.3633, + "step": 3627000 + }, + { + "epoch": 24.547287786920744, + "grad_norm": 0.3673875629901886, + "learning_rate": 4.754527122130793e-05, + "loss": 0.3634, + "step": 3627500 + }, + { + "epoch": 24.550671286271115, + "grad_norm": 0.38896000385284424, + "learning_rate": 4.754493287137289e-05, + "loss": 0.3625, + "step": 3628000 + }, + { + "epoch": 24.55405478562148, + "grad_norm": 0.36808526515960693, + "learning_rate": 4.754459452143785e-05, + "loss": 0.3627, + "step": 3628500 + }, + { + "epoch": 24.557438284971848, + "grad_norm": 0.34860363602638245, + "learning_rate": 4.754425617150282e-05, + "loss": 0.3634, + "step": 3629000 + }, + { + "epoch": 24.56082178432222, + "grad_norm": 0.3394933044910431, + "learning_rate": 4.754391782156778e-05, + "loss": 0.3637, + "step": 3629500 + }, + { + "epoch": 24.564205283672585, + "grad_norm": 0.35248643159866333, + "learning_rate": 4.754357947163274e-05, + "loss": 0.363, + "step": 3630000 + }, + { + "epoch": 24.567588783022952, + "grad_norm": 0.35058140754699707, + "learning_rate": 4.7543241121697704e-05, + "loss": 0.3631, + "step": 3630500 + }, + { + "epoch": 24.570972282373322, + "grad_norm": 0.3432460427284241, + "learning_rate": 4.754290277176267e-05, + "loss": 0.3644, + "step": 3631000 + }, + { + "epoch": 24.57435578172369, + "grad_norm": 0.4002135396003723, + "learning_rate": 4.7542564421827635e-05, + "loss": 0.3628, + "step": 3631500 + }, + { + "epoch": 24.57773928107406, + "grad_norm": 0.3974725008010864, + "learning_rate": 4.75422260718926e-05, + "loss": 0.3623, + "step": 3632000 + }, + { + "epoch": 24.581122780424426, + "grad_norm": 0.3433004915714264, + "learning_rate": 4.754188772195756e-05, + "loss": 0.3647, + "step": 3632500 + }, + { + "epoch": 24.584506279774793, + "grad_norm": 0.4006336033344269, + "learning_rate": 4.754154937202252e-05, + "loss": 0.3622, + "step": 3633000 + }, + { + "epoch": 24.587889779125163, + "grad_norm": 0.38487058877944946, + "learning_rate": 4.7541211022087483e-05, + "loss": 0.3634, + "step": 3633500 + }, + { + "epoch": 24.59127327847553, + "grad_norm": 0.3639155328273773, + "learning_rate": 4.7540872672152446e-05, + "loss": 0.3643, + "step": 3634000 + }, + { + "epoch": 24.5946567778259, + "grad_norm": 0.36571940779685974, + "learning_rate": 4.7540534322217414e-05, + "loss": 0.3641, + "step": 3634500 + }, + { + "epoch": 24.598040277176267, + "grad_norm": 0.42414596676826477, + "learning_rate": 4.7540195972282377e-05, + "loss": 0.3646, + "step": 3635000 + }, + { + "epoch": 24.601423776526634, + "grad_norm": 0.34833043813705444, + "learning_rate": 4.753985762234734e-05, + "loss": 0.3624, + "step": 3635500 + }, + { + "epoch": 24.604807275877004, + "grad_norm": 0.32802829146385193, + "learning_rate": 4.75395192724123e-05, + "loss": 0.3642, + "step": 3636000 + }, + { + "epoch": 24.60819077522737, + "grad_norm": 0.36172571778297424, + "learning_rate": 4.753918092247727e-05, + "loss": 0.3632, + "step": 3636500 + }, + { + "epoch": 24.61157427457774, + "grad_norm": 0.39942455291748047, + "learning_rate": 4.753884257254223e-05, + "loss": 0.3637, + "step": 3637000 + }, + { + "epoch": 24.614957773928108, + "grad_norm": 0.3636648952960968, + "learning_rate": 4.7538504222607194e-05, + "loss": 0.3632, + "step": 3637500 + }, + { + "epoch": 24.618341273278475, + "grad_norm": 0.35619986057281494, + "learning_rate": 4.753816587267215e-05, + "loss": 0.3649, + "step": 3638000 + }, + { + "epoch": 24.621724772628845, + "grad_norm": 0.36945921182632446, + "learning_rate": 4.753782752273712e-05, + "loss": 0.363, + "step": 3638500 + }, + { + "epoch": 24.62510827197921, + "grad_norm": 0.3725719153881073, + "learning_rate": 4.753748917280208e-05, + "loss": 0.3627, + "step": 3639000 + }, + { + "epoch": 24.62849177132958, + "grad_norm": 0.3723055422306061, + "learning_rate": 4.753715082286704e-05, + "loss": 0.3627, + "step": 3639500 + }, + { + "epoch": 24.63187527067995, + "grad_norm": 0.33416539430618286, + "learning_rate": 4.7536812472932005e-05, + "loss": 0.3624, + "step": 3640000 + }, + { + "epoch": 24.635258770030315, + "grad_norm": 0.37129876017570496, + "learning_rate": 4.7536474122996973e-05, + "loss": 0.3638, + "step": 3640500 + }, + { + "epoch": 24.638642269380686, + "grad_norm": 0.3299279510974884, + "learning_rate": 4.7536135773061936e-05, + "loss": 0.3639, + "step": 3641000 + }, + { + "epoch": 24.642025768731052, + "grad_norm": 0.40114957094192505, + "learning_rate": 4.75357974231269e-05, + "loss": 0.3633, + "step": 3641500 + }, + { + "epoch": 24.64540926808142, + "grad_norm": 0.42098936438560486, + "learning_rate": 4.753545907319186e-05, + "loss": 0.3631, + "step": 3642000 + }, + { + "epoch": 24.64879276743179, + "grad_norm": 0.38485991954803467, + "learning_rate": 4.753512072325682e-05, + "loss": 0.3624, + "step": 3642500 + }, + { + "epoch": 24.652176266782156, + "grad_norm": 0.35971710085868835, + "learning_rate": 4.7534782373321784e-05, + "loss": 0.3618, + "step": 3643000 + }, + { + "epoch": 24.655559766132527, + "grad_norm": 0.39405959844589233, + "learning_rate": 4.7534444023386746e-05, + "loss": 0.3629, + "step": 3643500 + }, + { + "epoch": 24.658943265482893, + "grad_norm": 0.39238572120666504, + "learning_rate": 4.7534105673451715e-05, + "loss": 0.3632, + "step": 3644000 + }, + { + "epoch": 24.66232676483326, + "grad_norm": 0.36738380789756775, + "learning_rate": 4.753376732351668e-05, + "loss": 0.3645, + "step": 3644500 + }, + { + "epoch": 24.66571026418363, + "grad_norm": 0.3584558963775635, + "learning_rate": 4.753342897358164e-05, + "loss": 0.3632, + "step": 3645000 + }, + { + "epoch": 24.669093763533997, + "grad_norm": 0.3708287477493286, + "learning_rate": 4.75330906236466e-05, + "loss": 0.3636, + "step": 3645500 + }, + { + "epoch": 24.672477262884364, + "grad_norm": 0.3938468396663666, + "learning_rate": 4.753275227371157e-05, + "loss": 0.3624, + "step": 3646000 + }, + { + "epoch": 24.675860762234734, + "grad_norm": 0.38310766220092773, + "learning_rate": 4.753241392377653e-05, + "loss": 0.3642, + "step": 3646500 + }, + { + "epoch": 24.6792442615851, + "grad_norm": 0.3726683259010315, + "learning_rate": 4.7532075573841495e-05, + "loss": 0.3626, + "step": 3647000 + }, + { + "epoch": 24.68262776093547, + "grad_norm": 0.35090330243110657, + "learning_rate": 4.753173722390645e-05, + "loss": 0.3616, + "step": 3647500 + }, + { + "epoch": 24.686011260285838, + "grad_norm": 0.37750542163848877, + "learning_rate": 4.753139887397142e-05, + "loss": 0.3625, + "step": 3648000 + }, + { + "epoch": 24.689394759636205, + "grad_norm": 0.3820444941520691, + "learning_rate": 4.753106052403638e-05, + "loss": 0.3625, + "step": 3648500 + }, + { + "epoch": 24.692778258986575, + "grad_norm": 0.3139392137527466, + "learning_rate": 4.753072217410134e-05, + "loss": 0.3639, + "step": 3649000 + }, + { + "epoch": 24.69616175833694, + "grad_norm": 0.3789820671081543, + "learning_rate": 4.7530383824166305e-05, + "loss": 0.3633, + "step": 3649500 + }, + { + "epoch": 24.699545257687312, + "grad_norm": 0.36827102303504944, + "learning_rate": 4.7530045474231274e-05, + "loss": 0.3635, + "step": 3650000 + }, + { + "epoch": 24.70292875703768, + "grad_norm": 0.40081366896629333, + "learning_rate": 4.7529707124296236e-05, + "loss": 0.3635, + "step": 3650500 + }, + { + "epoch": 24.706312256388046, + "grad_norm": 0.38157737255096436, + "learning_rate": 4.75293687743612e-05, + "loss": 0.3642, + "step": 3651000 + }, + { + "epoch": 24.709695755738416, + "grad_norm": 0.3455936312675476, + "learning_rate": 4.752903042442616e-05, + "loss": 0.3631, + "step": 3651500 + }, + { + "epoch": 24.713079255088783, + "grad_norm": 0.4064480662345886, + "learning_rate": 4.752869207449112e-05, + "loss": 0.3626, + "step": 3652000 + }, + { + "epoch": 24.716462754439153, + "grad_norm": 0.3554478883743286, + "learning_rate": 4.7528353724556085e-05, + "loss": 0.3635, + "step": 3652500 + }, + { + "epoch": 24.71984625378952, + "grad_norm": 0.3989277184009552, + "learning_rate": 4.752801537462105e-05, + "loss": 0.3642, + "step": 3653000 + }, + { + "epoch": 24.723229753139886, + "grad_norm": 0.3907890021800995, + "learning_rate": 4.7527677024686016e-05, + "loss": 0.3617, + "step": 3653500 + }, + { + "epoch": 24.726613252490257, + "grad_norm": 0.3725045621395111, + "learning_rate": 4.752733867475098e-05, + "loss": 0.3633, + "step": 3654000 + }, + { + "epoch": 24.729996751840623, + "grad_norm": 0.3770049512386322, + "learning_rate": 4.752700032481594e-05, + "loss": 0.3603, + "step": 3654500 + }, + { + "epoch": 24.73338025119099, + "grad_norm": 0.35805559158325195, + "learning_rate": 4.75266619748809e-05, + "loss": 0.3629, + "step": 3655000 + }, + { + "epoch": 24.73676375054136, + "grad_norm": 0.3792979419231415, + "learning_rate": 4.752632362494587e-05, + "loss": 0.3636, + "step": 3655500 + }, + { + "epoch": 24.740147249891727, + "grad_norm": 0.40678563714027405, + "learning_rate": 4.752598527501083e-05, + "loss": 0.3636, + "step": 3656000 + }, + { + "epoch": 24.743530749242097, + "grad_norm": 0.3942176103591919, + "learning_rate": 4.7525646925075795e-05, + "loss": 0.3626, + "step": 3656500 + }, + { + "epoch": 24.746914248592464, + "grad_norm": 0.3723395764827728, + "learning_rate": 4.752530857514075e-05, + "loss": 0.3611, + "step": 3657000 + }, + { + "epoch": 24.75029774794283, + "grad_norm": 0.3674887716770172, + "learning_rate": 4.752497022520572e-05, + "loss": 0.3624, + "step": 3657500 + }, + { + "epoch": 24.7536812472932, + "grad_norm": 0.37841156125068665, + "learning_rate": 4.752463187527068e-05, + "loss": 0.3634, + "step": 3658000 + }, + { + "epoch": 24.757064746643568, + "grad_norm": 0.423721581697464, + "learning_rate": 4.7524293525335644e-05, + "loss": 0.362, + "step": 3658500 + }, + { + "epoch": 24.76044824599394, + "grad_norm": 0.37729448080062866, + "learning_rate": 4.7523955175400606e-05, + "loss": 0.3619, + "step": 3659000 + }, + { + "epoch": 24.763831745344305, + "grad_norm": 0.3563602566719055, + "learning_rate": 4.7523616825465575e-05, + "loss": 0.3637, + "step": 3659500 + }, + { + "epoch": 24.767215244694672, + "grad_norm": 0.3742561638355255, + "learning_rate": 4.752327847553054e-05, + "loss": 0.364, + "step": 3660000 + }, + { + "epoch": 24.770598744045042, + "grad_norm": 0.41626352071762085, + "learning_rate": 4.75229401255955e-05, + "loss": 0.3628, + "step": 3660500 + }, + { + "epoch": 24.77398224339541, + "grad_norm": 0.3524860143661499, + "learning_rate": 4.752260177566046e-05, + "loss": 0.3639, + "step": 3661000 + }, + { + "epoch": 24.77736574274578, + "grad_norm": 0.4287998378276825, + "learning_rate": 4.752226342572542e-05, + "loss": 0.3628, + "step": 3661500 + }, + { + "epoch": 24.780749242096146, + "grad_norm": 0.3763847351074219, + "learning_rate": 4.7521925075790385e-05, + "loss": 0.3627, + "step": 3662000 + }, + { + "epoch": 24.784132741446513, + "grad_norm": 0.3679342269897461, + "learning_rate": 4.752158672585535e-05, + "loss": 0.3638, + "step": 3662500 + }, + { + "epoch": 24.787516240796883, + "grad_norm": 0.365048885345459, + "learning_rate": 4.7521248375920316e-05, + "loss": 0.3633, + "step": 3663000 + }, + { + "epoch": 24.79089974014725, + "grad_norm": 0.3907424509525299, + "learning_rate": 4.752091002598528e-05, + "loss": 0.3643, + "step": 3663500 + }, + { + "epoch": 24.794283239497616, + "grad_norm": 0.37695226073265076, + "learning_rate": 4.752057167605024e-05, + "loss": 0.3623, + "step": 3664000 + }, + { + "epoch": 24.797666738847987, + "grad_norm": 0.40741339325904846, + "learning_rate": 4.75202333261152e-05, + "loss": 0.3657, + "step": 3664500 + }, + { + "epoch": 24.801050238198354, + "grad_norm": 0.35198110342025757, + "learning_rate": 4.7519894976180165e-05, + "loss": 0.3629, + "step": 3665000 + }, + { + "epoch": 24.804433737548724, + "grad_norm": 0.36765557527542114, + "learning_rate": 4.7519556626245134e-05, + "loss": 0.3635, + "step": 3665500 + }, + { + "epoch": 24.80781723689909, + "grad_norm": 0.3759390711784363, + "learning_rate": 4.7519218276310096e-05, + "loss": 0.3638, + "step": 3666000 + }, + { + "epoch": 24.811200736249457, + "grad_norm": 0.3992515802383423, + "learning_rate": 4.751887992637505e-05, + "loss": 0.3633, + "step": 3666500 + }, + { + "epoch": 24.814584235599828, + "grad_norm": 0.3276924192905426, + "learning_rate": 4.751854157644002e-05, + "loss": 0.3622, + "step": 3667000 + }, + { + "epoch": 24.817967734950194, + "grad_norm": 0.3511314392089844, + "learning_rate": 4.751820322650498e-05, + "loss": 0.3615, + "step": 3667500 + }, + { + "epoch": 24.821351234300565, + "grad_norm": 0.36177822947502136, + "learning_rate": 4.7517864876569944e-05, + "loss": 0.3636, + "step": 3668000 + }, + { + "epoch": 24.82473473365093, + "grad_norm": 0.32816770672798157, + "learning_rate": 4.7517526526634906e-05, + "loss": 0.3637, + "step": 3668500 + }, + { + "epoch": 24.828118233001298, + "grad_norm": 0.31602737307548523, + "learning_rate": 4.7517188176699875e-05, + "loss": 0.3631, + "step": 3669000 + }, + { + "epoch": 24.83150173235167, + "grad_norm": 0.3635820746421814, + "learning_rate": 4.751684982676484e-05, + "loss": 0.3645, + "step": 3669500 + }, + { + "epoch": 24.834885231702035, + "grad_norm": 0.3586866855621338, + "learning_rate": 4.75165114768298e-05, + "loss": 0.362, + "step": 3670000 + }, + { + "epoch": 24.838268731052402, + "grad_norm": 0.31392955780029297, + "learning_rate": 4.751617312689476e-05, + "loss": 0.3641, + "step": 3670500 + }, + { + "epoch": 24.841652230402772, + "grad_norm": 0.37633511424064636, + "learning_rate": 4.751583477695973e-05, + "loss": 0.3626, + "step": 3671000 + }, + { + "epoch": 24.84503572975314, + "grad_norm": 0.3376986086368561, + "learning_rate": 4.7515496427024686e-05, + "loss": 0.3638, + "step": 3671500 + }, + { + "epoch": 24.84841922910351, + "grad_norm": 0.3483399450778961, + "learning_rate": 4.751515807708965e-05, + "loss": 0.3639, + "step": 3672000 + }, + { + "epoch": 24.851802728453876, + "grad_norm": 0.36665934324264526, + "learning_rate": 4.751481972715462e-05, + "loss": 0.3636, + "step": 3672500 + }, + { + "epoch": 24.855186227804243, + "grad_norm": 0.3893853425979614, + "learning_rate": 4.751448137721958e-05, + "loss": 0.3639, + "step": 3673000 + }, + { + "epoch": 24.858569727154613, + "grad_norm": 0.3913221061229706, + "learning_rate": 4.751414302728454e-05, + "loss": 0.362, + "step": 3673500 + }, + { + "epoch": 24.86195322650498, + "grad_norm": 0.36556679010391235, + "learning_rate": 4.75138046773495e-05, + "loss": 0.3625, + "step": 3674000 + }, + { + "epoch": 24.86533672585535, + "grad_norm": 0.40248703956604004, + "learning_rate": 4.7513466327414465e-05, + "loss": 0.3632, + "step": 3674500 + }, + { + "epoch": 24.868720225205717, + "grad_norm": 0.35256707668304443, + "learning_rate": 4.7513127977479434e-05, + "loss": 0.3623, + "step": 3675000 + }, + { + "epoch": 24.872103724556084, + "grad_norm": 0.3386058807373047, + "learning_rate": 4.7512789627544396e-05, + "loss": 0.3627, + "step": 3675500 + }, + { + "epoch": 24.875487223906454, + "grad_norm": 0.36424487829208374, + "learning_rate": 4.751245127760935e-05, + "loss": 0.365, + "step": 3676000 + }, + { + "epoch": 24.87887072325682, + "grad_norm": 0.38003039360046387, + "learning_rate": 4.751211292767432e-05, + "loss": 0.3641, + "step": 3676500 + }, + { + "epoch": 24.882254222607187, + "grad_norm": 0.40823283791542053, + "learning_rate": 4.751177457773928e-05, + "loss": 0.3634, + "step": 3677000 + }, + { + "epoch": 24.885637721957558, + "grad_norm": 0.40488094091415405, + "learning_rate": 4.7511436227804245e-05, + "loss": 0.3642, + "step": 3677500 + }, + { + "epoch": 24.889021221307924, + "grad_norm": 0.37072402238845825, + "learning_rate": 4.751109787786921e-05, + "loss": 0.3629, + "step": 3678000 + }, + { + "epoch": 24.892404720658295, + "grad_norm": 0.3462226092815399, + "learning_rate": 4.7510759527934176e-05, + "loss": 0.3617, + "step": 3678500 + }, + { + "epoch": 24.89578822000866, + "grad_norm": 0.3488467037677765, + "learning_rate": 4.751042117799914e-05, + "loss": 0.364, + "step": 3679000 + }, + { + "epoch": 24.89917171935903, + "grad_norm": 0.3277910053730011, + "learning_rate": 4.75100828280641e-05, + "loss": 0.3638, + "step": 3679500 + }, + { + "epoch": 24.9025552187094, + "grad_norm": 0.40324801206588745, + "learning_rate": 4.750974447812906e-05, + "loss": 0.3618, + "step": 3680000 + }, + { + "epoch": 24.905938718059765, + "grad_norm": 0.34906333684921265, + "learning_rate": 4.750940612819403e-05, + "loss": 0.3637, + "step": 3680500 + }, + { + "epoch": 24.909322217410136, + "grad_norm": 0.37106651067733765, + "learning_rate": 4.7509067778258987e-05, + "loss": 0.3628, + "step": 3681000 + }, + { + "epoch": 24.912705716760502, + "grad_norm": 0.37618666887283325, + "learning_rate": 4.750872942832395e-05, + "loss": 0.3655, + "step": 3681500 + }, + { + "epoch": 24.91608921611087, + "grad_norm": 0.3458239436149597, + "learning_rate": 4.750839107838891e-05, + "loss": 0.3624, + "step": 3682000 + }, + { + "epoch": 24.91947271546124, + "grad_norm": 0.43893659114837646, + "learning_rate": 4.750805272845388e-05, + "loss": 0.3626, + "step": 3682500 + }, + { + "epoch": 24.922856214811606, + "grad_norm": 0.3353835642337799, + "learning_rate": 4.750771437851884e-05, + "loss": 0.3653, + "step": 3683000 + }, + { + "epoch": 24.926239714161976, + "grad_norm": 0.3734031319618225, + "learning_rate": 4.7507376028583804e-05, + "loss": 0.3636, + "step": 3683500 + }, + { + "epoch": 24.929623213512343, + "grad_norm": 0.3506411910057068, + "learning_rate": 4.7507037678648766e-05, + "loss": 0.3631, + "step": 3684000 + }, + { + "epoch": 24.93300671286271, + "grad_norm": 0.35105693340301514, + "learning_rate": 4.7506699328713735e-05, + "loss": 0.3631, + "step": 3684500 + }, + { + "epoch": 24.93639021221308, + "grad_norm": 0.3812028169631958, + "learning_rate": 4.75063609787787e-05, + "loss": 0.3631, + "step": 3685000 + }, + { + "epoch": 24.939773711563447, + "grad_norm": 0.38500118255615234, + "learning_rate": 4.750602262884365e-05, + "loss": 0.363, + "step": 3685500 + }, + { + "epoch": 24.943157210913817, + "grad_norm": 0.40273165702819824, + "learning_rate": 4.750568427890862e-05, + "loss": 0.3631, + "step": 3686000 + }, + { + "epoch": 24.946540710264184, + "grad_norm": 0.34937480092048645, + "learning_rate": 4.7505345928973583e-05, + "loss": 0.3623, + "step": 3686500 + }, + { + "epoch": 24.94992420961455, + "grad_norm": 0.3561623990535736, + "learning_rate": 4.7505007579038546e-05, + "loss": 0.3631, + "step": 3687000 + }, + { + "epoch": 24.95330770896492, + "grad_norm": 0.3699971139431, + "learning_rate": 4.750466922910351e-05, + "loss": 0.365, + "step": 3687500 + }, + { + "epoch": 24.956691208315288, + "grad_norm": 0.36315932869911194, + "learning_rate": 4.750433087916848e-05, + "loss": 0.3631, + "step": 3688000 + }, + { + "epoch": 24.960074707665655, + "grad_norm": 0.4144958555698395, + "learning_rate": 4.750399252923344e-05, + "loss": 0.3622, + "step": 3688500 + }, + { + "epoch": 24.963458207016025, + "grad_norm": 0.3754120171070099, + "learning_rate": 4.75036541792984e-05, + "loss": 0.3649, + "step": 3689000 + }, + { + "epoch": 24.96684170636639, + "grad_norm": 0.3640006184577942, + "learning_rate": 4.750331582936336e-05, + "loss": 0.3623, + "step": 3689500 + }, + { + "epoch": 24.970225205716762, + "grad_norm": 0.35096555948257446, + "learning_rate": 4.750297747942833e-05, + "loss": 0.3619, + "step": 3690000 + }, + { + "epoch": 24.97360870506713, + "grad_norm": 0.33679062128067017, + "learning_rate": 4.750263912949329e-05, + "loss": 0.3629, + "step": 3690500 + }, + { + "epoch": 24.976992204417495, + "grad_norm": 0.35485759377479553, + "learning_rate": 4.750230077955825e-05, + "loss": 0.3622, + "step": 3691000 + }, + { + "epoch": 24.980375703767866, + "grad_norm": 0.3367948830127716, + "learning_rate": 4.750196242962321e-05, + "loss": 0.3641, + "step": 3691500 + }, + { + "epoch": 24.983759203118233, + "grad_norm": 0.3389408588409424, + "learning_rate": 4.750162407968818e-05, + "loss": 0.3633, + "step": 3692000 + }, + { + "epoch": 24.987142702468603, + "grad_norm": 0.37847328186035156, + "learning_rate": 4.750128572975314e-05, + "loss": 0.363, + "step": 3692500 + }, + { + "epoch": 24.99052620181897, + "grad_norm": 0.3536604344844818, + "learning_rate": 4.7500947379818105e-05, + "loss": 0.3629, + "step": 3693000 + }, + { + "epoch": 24.993909701169336, + "grad_norm": 0.36198484897613525, + "learning_rate": 4.750060902988307e-05, + "loss": 0.3624, + "step": 3693500 + }, + { + "epoch": 24.997293200519707, + "grad_norm": 0.3825433850288391, + "learning_rate": 4.7500270679948036e-05, + "loss": 0.3643, + "step": 3694000 + }, + { + "epoch": 25.0, + "eval_accuracy": 0.8616033147198132, + "eval_loss": 0.5629200339317322, + "eval_runtime": 3407.061, + "eval_samples_per_second": 85.336, + "eval_steps_per_second": 5.334, + "step": 3694400 + }, + { + "epoch": 25.000676699870073, + "grad_norm": 0.3510105013847351, + "learning_rate": 4.7499932330013e-05, + "loss": 0.3628, + "step": 3694500 + }, + { + "epoch": 25.00406019922044, + "grad_norm": 0.40690895915031433, + "learning_rate": 4.749959398007795e-05, + "loss": 0.3626, + "step": 3695000 + }, + { + "epoch": 25.00744369857081, + "grad_norm": 0.3398245573043823, + "learning_rate": 4.749925563014292e-05, + "loss": 0.3613, + "step": 3695500 + }, + { + "epoch": 25.010827197921177, + "grad_norm": 0.3666698634624481, + "learning_rate": 4.7498917280207884e-05, + "loss": 0.3625, + "step": 3696000 + }, + { + "epoch": 25.014210697271547, + "grad_norm": 0.37911367416381836, + "learning_rate": 4.7498578930272846e-05, + "loss": 0.3618, + "step": 3696500 + }, + { + "epoch": 25.017594196621914, + "grad_norm": 0.32052409648895264, + "learning_rate": 4.749824058033781e-05, + "loss": 0.3606, + "step": 3697000 + }, + { + "epoch": 25.02097769597228, + "grad_norm": 0.31819668412208557, + "learning_rate": 4.749790223040278e-05, + "loss": 0.3599, + "step": 3697500 + }, + { + "epoch": 25.02436119532265, + "grad_norm": 0.3609914481639862, + "learning_rate": 4.749756388046774e-05, + "loss": 0.3605, + "step": 3698000 + }, + { + "epoch": 25.027744694673018, + "grad_norm": 0.37735220789909363, + "learning_rate": 4.74972255305327e-05, + "loss": 0.3614, + "step": 3698500 + }, + { + "epoch": 25.03112819402339, + "grad_norm": 0.38208866119384766, + "learning_rate": 4.7496887180597664e-05, + "loss": 0.3617, + "step": 3699000 + }, + { + "epoch": 25.034511693373755, + "grad_norm": 0.352157860994339, + "learning_rate": 4.749654883066263e-05, + "loss": 0.3613, + "step": 3699500 + }, + { + "epoch": 25.037895192724122, + "grad_norm": 0.37447309494018555, + "learning_rate": 4.749621048072759e-05, + "loss": 0.3605, + "step": 3700000 + }, + { + "epoch": 25.041278692074492, + "grad_norm": 0.3469589650630951, + "learning_rate": 4.749587213079255e-05, + "loss": 0.361, + "step": 3700500 + }, + { + "epoch": 25.04466219142486, + "grad_norm": 0.38122642040252686, + "learning_rate": 4.749553378085751e-05, + "loss": 0.3619, + "step": 3701000 + }, + { + "epoch": 25.04804569077523, + "grad_norm": 0.3568724989891052, + "learning_rate": 4.749519543092248e-05, + "loss": 0.3606, + "step": 3701500 + }, + { + "epoch": 25.051429190125596, + "grad_norm": 0.3496883809566498, + "learning_rate": 4.749485708098744e-05, + "loss": 0.3612, + "step": 3702000 + }, + { + "epoch": 25.054812689475963, + "grad_norm": 0.3566105365753174, + "learning_rate": 4.7494518731052405e-05, + "loss": 0.3612, + "step": 3702500 + }, + { + "epoch": 25.058196188826333, + "grad_norm": 0.37609434127807617, + "learning_rate": 4.749418038111737e-05, + "loss": 0.36, + "step": 3703000 + }, + { + "epoch": 25.0615796881767, + "grad_norm": 0.35694074630737305, + "learning_rate": 4.7493842031182336e-05, + "loss": 0.3622, + "step": 3703500 + }, + { + "epoch": 25.064963187527066, + "grad_norm": 0.4110996425151825, + "learning_rate": 4.74935036812473e-05, + "loss": 0.3622, + "step": 3704000 + }, + { + "epoch": 25.068346686877437, + "grad_norm": 0.37473252415657043, + "learning_rate": 4.7493165331312254e-05, + "loss": 0.3622, + "step": 3704500 + }, + { + "epoch": 25.071730186227803, + "grad_norm": 0.357221782207489, + "learning_rate": 4.749282698137722e-05, + "loss": 0.3619, + "step": 3705000 + }, + { + "epoch": 25.075113685578174, + "grad_norm": 0.3608001172542572, + "learning_rate": 4.7492488631442185e-05, + "loss": 0.3615, + "step": 3705500 + }, + { + "epoch": 25.07849718492854, + "grad_norm": 0.38230764865875244, + "learning_rate": 4.749215028150715e-05, + "loss": 0.3616, + "step": 3706000 + }, + { + "epoch": 25.081880684278907, + "grad_norm": 0.36422863602638245, + "learning_rate": 4.749181193157211e-05, + "loss": 0.3626, + "step": 3706500 + }, + { + "epoch": 25.085264183629278, + "grad_norm": 0.3531195819377899, + "learning_rate": 4.749147358163708e-05, + "loss": 0.3614, + "step": 3707000 + }, + { + "epoch": 25.088647682979644, + "grad_norm": 0.3735826909542084, + "learning_rate": 4.749113523170204e-05, + "loss": 0.3614, + "step": 3707500 + }, + { + "epoch": 25.092031182330015, + "grad_norm": 0.3754367232322693, + "learning_rate": 4.7490796881767e-05, + "loss": 0.3613, + "step": 3708000 + }, + { + "epoch": 25.09541468168038, + "grad_norm": 0.38606202602386475, + "learning_rate": 4.7490458531831964e-05, + "loss": 0.3621, + "step": 3708500 + }, + { + "epoch": 25.098798181030748, + "grad_norm": 0.39081791043281555, + "learning_rate": 4.749012018189693e-05, + "loss": 0.3629, + "step": 3709000 + }, + { + "epoch": 25.10218168038112, + "grad_norm": 0.3802073299884796, + "learning_rate": 4.748978183196189e-05, + "loss": 0.3625, + "step": 3709500 + }, + { + "epoch": 25.105565179731485, + "grad_norm": 0.3735586702823639, + "learning_rate": 4.748944348202685e-05, + "loss": 0.3624, + "step": 3710000 + }, + { + "epoch": 25.108948679081852, + "grad_norm": 0.3707588315010071, + "learning_rate": 4.748910513209181e-05, + "loss": 0.3624, + "step": 3710500 + }, + { + "epoch": 25.112332178432222, + "grad_norm": 0.39641883969306946, + "learning_rate": 4.748876678215678e-05, + "loss": 0.362, + "step": 3711000 + }, + { + "epoch": 25.11571567778259, + "grad_norm": 0.4227748513221741, + "learning_rate": 4.7488428432221744e-05, + "loss": 0.3631, + "step": 3711500 + }, + { + "epoch": 25.11909917713296, + "grad_norm": 0.37438488006591797, + "learning_rate": 4.7488090082286706e-05, + "loss": 0.3631, + "step": 3712000 + }, + { + "epoch": 25.122482676483326, + "grad_norm": 0.36240556836128235, + "learning_rate": 4.748775173235167e-05, + "loss": 0.3612, + "step": 3712500 + }, + { + "epoch": 25.125866175833693, + "grad_norm": 0.3844728469848633, + "learning_rate": 4.748741338241664e-05, + "loss": 0.3611, + "step": 3713000 + }, + { + "epoch": 25.129249675184063, + "grad_norm": 0.38626983761787415, + "learning_rate": 4.74870750324816e-05, + "loss": 0.3607, + "step": 3713500 + }, + { + "epoch": 25.13263317453443, + "grad_norm": 0.3698998987674713, + "learning_rate": 4.7486736682546554e-05, + "loss": 0.3622, + "step": 3714000 + }, + { + "epoch": 25.1360166738848, + "grad_norm": 0.3693174421787262, + "learning_rate": 4.748639833261152e-05, + "loss": 0.3612, + "step": 3714500 + }, + { + "epoch": 25.139400173235167, + "grad_norm": 0.3790559768676758, + "learning_rate": 4.7486059982676485e-05, + "loss": 0.3618, + "step": 3715000 + }, + { + "epoch": 25.142783672585534, + "grad_norm": 0.3717188537120819, + "learning_rate": 4.748572163274145e-05, + "loss": 0.3639, + "step": 3715500 + }, + { + "epoch": 25.146167171935904, + "grad_norm": 0.4066247045993805, + "learning_rate": 4.748538328280641e-05, + "loss": 0.3638, + "step": 3716000 + }, + { + "epoch": 25.14955067128627, + "grad_norm": 0.3499060869216919, + "learning_rate": 4.748504493287138e-05, + "loss": 0.3617, + "step": 3716500 + }, + { + "epoch": 25.15293417063664, + "grad_norm": 0.35585179924964905, + "learning_rate": 4.748470658293634e-05, + "loss": 0.3623, + "step": 3717000 + }, + { + "epoch": 25.156317669987008, + "grad_norm": 0.36418354511260986, + "learning_rate": 4.74843682330013e-05, + "loss": 0.3623, + "step": 3717500 + }, + { + "epoch": 25.159701169337374, + "grad_norm": 0.3552171587944031, + "learning_rate": 4.7484029883066265e-05, + "loss": 0.3615, + "step": 3718000 + }, + { + "epoch": 25.163084668687745, + "grad_norm": 0.3636505901813507, + "learning_rate": 4.7483691533131234e-05, + "loss": 0.3634, + "step": 3718500 + }, + { + "epoch": 25.16646816803811, + "grad_norm": 0.3401271104812622, + "learning_rate": 4.748335318319619e-05, + "loss": 0.3618, + "step": 3719000 + }, + { + "epoch": 25.16985166738848, + "grad_norm": 0.4107031524181366, + "learning_rate": 4.748301483326115e-05, + "loss": 0.3637, + "step": 3719500 + }, + { + "epoch": 25.17323516673885, + "grad_norm": 0.3740893304347992, + "learning_rate": 4.748267648332611e-05, + "loss": 0.3619, + "step": 3720000 + }, + { + "epoch": 25.176618666089215, + "grad_norm": 0.3845370411872864, + "learning_rate": 4.748233813339108e-05, + "loss": 0.3616, + "step": 3720500 + }, + { + "epoch": 25.180002165439586, + "grad_norm": 0.3863700330257416, + "learning_rate": 4.7481999783456044e-05, + "loss": 0.3602, + "step": 3721000 + }, + { + "epoch": 25.183385664789952, + "grad_norm": 0.43995434045791626, + "learning_rate": 4.7481661433521007e-05, + "loss": 0.3612, + "step": 3721500 + }, + { + "epoch": 25.18676916414032, + "grad_norm": 0.3989493250846863, + "learning_rate": 4.748132308358597e-05, + "loss": 0.3626, + "step": 3722000 + }, + { + "epoch": 25.19015266349069, + "grad_norm": 0.369140625, + "learning_rate": 4.748098473365094e-05, + "loss": 0.3626, + "step": 3722500 + }, + { + "epoch": 25.193536162841056, + "grad_norm": 0.38794782757759094, + "learning_rate": 4.74806463837159e-05, + "loss": 0.3617, + "step": 3723000 + }, + { + "epoch": 25.196919662191426, + "grad_norm": 0.32923948764801025, + "learning_rate": 4.748030803378086e-05, + "loss": 0.3638, + "step": 3723500 + }, + { + "epoch": 25.200303161541793, + "grad_norm": 0.34737542271614075, + "learning_rate": 4.7479969683845824e-05, + "loss": 0.3632, + "step": 3724000 + }, + { + "epoch": 25.20368666089216, + "grad_norm": 0.37374576926231384, + "learning_rate": 4.7479631333910786e-05, + "loss": 0.3632, + "step": 3724500 + }, + { + "epoch": 25.20707016024253, + "grad_norm": 0.39663198590278625, + "learning_rate": 4.747929298397575e-05, + "loss": 0.3616, + "step": 3725000 + }, + { + "epoch": 25.210453659592897, + "grad_norm": 0.38434311747550964, + "learning_rate": 4.747895463404071e-05, + "loss": 0.3597, + "step": 3725500 + }, + { + "epoch": 25.213837158943264, + "grad_norm": 0.3597494661808014, + "learning_rate": 4.747861628410568e-05, + "loss": 0.3623, + "step": 3726000 + }, + { + "epoch": 25.217220658293634, + "grad_norm": 0.4105066955089569, + "learning_rate": 4.747827793417064e-05, + "loss": 0.3622, + "step": 3726500 + }, + { + "epoch": 25.220604157644, + "grad_norm": 0.37283262610435486, + "learning_rate": 4.7477939584235603e-05, + "loss": 0.3634, + "step": 3727000 + }, + { + "epoch": 25.22398765699437, + "grad_norm": 0.3647730350494385, + "learning_rate": 4.7477601234300566e-05, + "loss": 0.3622, + "step": 3727500 + }, + { + "epoch": 25.227371156344738, + "grad_norm": 0.388548344373703, + "learning_rate": 4.747726288436553e-05, + "loss": 0.3635, + "step": 3728000 + }, + { + "epoch": 25.230754655695105, + "grad_norm": 0.4077318012714386, + "learning_rate": 4.747692453443049e-05, + "loss": 0.362, + "step": 3728500 + }, + { + "epoch": 25.234138155045475, + "grad_norm": 0.3589150011539459, + "learning_rate": 4.747658618449545e-05, + "loss": 0.3627, + "step": 3729000 + }, + { + "epoch": 25.23752165439584, + "grad_norm": 0.38778451085090637, + "learning_rate": 4.7476247834560414e-05, + "loss": 0.3633, + "step": 3729500 + }, + { + "epoch": 25.240905153746212, + "grad_norm": 0.408589631319046, + "learning_rate": 4.747590948462538e-05, + "loss": 0.3628, + "step": 3730000 + }, + { + "epoch": 25.24428865309658, + "grad_norm": 0.3778936564922333, + "learning_rate": 4.7475571134690345e-05, + "loss": 0.3631, + "step": 3730500 + }, + { + "epoch": 25.247672152446945, + "grad_norm": 0.41418513655662537, + "learning_rate": 4.747523278475531e-05, + "loss": 0.3638, + "step": 3731000 + }, + { + "epoch": 25.251055651797316, + "grad_norm": 0.36329400539398193, + "learning_rate": 4.747489443482027e-05, + "loss": 0.3615, + "step": 3731500 + }, + { + "epoch": 25.254439151147682, + "grad_norm": 0.369795560836792, + "learning_rate": 4.747455608488524e-05, + "loss": 0.3616, + "step": 3732000 + }, + { + "epoch": 25.257822650498053, + "grad_norm": 0.3831300437450409, + "learning_rate": 4.74742177349502e-05, + "loss": 0.3607, + "step": 3732500 + }, + { + "epoch": 25.26120614984842, + "grad_norm": 0.344378799200058, + "learning_rate": 4.747387938501516e-05, + "loss": 0.3626, + "step": 3733000 + }, + { + "epoch": 25.264589649198786, + "grad_norm": 0.3777843415737152, + "learning_rate": 4.7473541035080125e-05, + "loss": 0.3616, + "step": 3733500 + }, + { + "epoch": 25.267973148549157, + "grad_norm": 0.3878310024738312, + "learning_rate": 4.747320268514509e-05, + "loss": 0.3627, + "step": 3734000 + }, + { + "epoch": 25.271356647899523, + "grad_norm": 0.4041731655597687, + "learning_rate": 4.747286433521005e-05, + "loss": 0.3617, + "step": 3734500 + }, + { + "epoch": 25.27474014724989, + "grad_norm": 0.36002233624458313, + "learning_rate": 4.747252598527501e-05, + "loss": 0.3615, + "step": 3735000 + }, + { + "epoch": 25.27812364660026, + "grad_norm": 0.3519437611103058, + "learning_rate": 4.747218763533997e-05, + "loss": 0.3633, + "step": 3735500 + }, + { + "epoch": 25.281507145950627, + "grad_norm": 0.3840735852718353, + "learning_rate": 4.747184928540494e-05, + "loss": 0.3627, + "step": 3736000 + }, + { + "epoch": 25.284890645300997, + "grad_norm": 0.33825719356536865, + "learning_rate": 4.7471510935469904e-05, + "loss": 0.3605, + "step": 3736500 + }, + { + "epoch": 25.288274144651364, + "grad_norm": 0.3834848403930664, + "learning_rate": 4.7471172585534866e-05, + "loss": 0.3636, + "step": 3737000 + }, + { + "epoch": 25.29165764400173, + "grad_norm": 0.35997170209884644, + "learning_rate": 4.747083423559983e-05, + "loss": 0.364, + "step": 3737500 + }, + { + "epoch": 25.2950411433521, + "grad_norm": 0.385812908411026, + "learning_rate": 4.747049588566479e-05, + "loss": 0.3637, + "step": 3738000 + }, + { + "epoch": 25.298424642702468, + "grad_norm": 0.34480977058410645, + "learning_rate": 4.747015753572975e-05, + "loss": 0.3615, + "step": 3738500 + }, + { + "epoch": 25.30180814205284, + "grad_norm": 0.3738287389278412, + "learning_rate": 4.7469819185794715e-05, + "loss": 0.3637, + "step": 3739000 + }, + { + "epoch": 25.305191641403205, + "grad_norm": 0.34847918152809143, + "learning_rate": 4.7469480835859684e-05, + "loss": 0.3607, + "step": 3739500 + }, + { + "epoch": 25.30857514075357, + "grad_norm": 0.3540794849395752, + "learning_rate": 4.7469142485924646e-05, + "loss": 0.3604, + "step": 3740000 + }, + { + "epoch": 25.311958640103942, + "grad_norm": 0.39001625776290894, + "learning_rate": 4.746880413598961e-05, + "loss": 0.3628, + "step": 3740500 + }, + { + "epoch": 25.31534213945431, + "grad_norm": 0.3212912082672119, + "learning_rate": 4.746846578605457e-05, + "loss": 0.362, + "step": 3741000 + }, + { + "epoch": 25.31872563880468, + "grad_norm": 0.36917877197265625, + "learning_rate": 4.746812743611954e-05, + "loss": 0.3616, + "step": 3741500 + }, + { + "epoch": 25.322109138155046, + "grad_norm": 0.36249247193336487, + "learning_rate": 4.74677890861845e-05, + "loss": 0.3632, + "step": 3742000 + }, + { + "epoch": 25.325492637505413, + "grad_norm": 0.30697494745254517, + "learning_rate": 4.746745073624946e-05, + "loss": 0.3636, + "step": 3742500 + }, + { + "epoch": 25.328876136855783, + "grad_norm": 0.37793347239494324, + "learning_rate": 4.7467112386314425e-05, + "loss": 0.3636, + "step": 3743000 + }, + { + "epoch": 25.33225963620615, + "grad_norm": 0.3809393346309662, + "learning_rate": 4.746677403637939e-05, + "loss": 0.3618, + "step": 3743500 + }, + { + "epoch": 25.335643135556516, + "grad_norm": 0.36163750290870667, + "learning_rate": 4.746643568644435e-05, + "loss": 0.3619, + "step": 3744000 + }, + { + "epoch": 25.339026634906887, + "grad_norm": 0.4327174723148346, + "learning_rate": 4.746609733650931e-05, + "loss": 0.3622, + "step": 3744500 + }, + { + "epoch": 25.342410134257253, + "grad_norm": 0.3379616439342499, + "learning_rate": 4.7465758986574274e-05, + "loss": 0.3622, + "step": 3745000 + }, + { + "epoch": 25.345793633607624, + "grad_norm": 0.3480340242385864, + "learning_rate": 4.746542063663924e-05, + "loss": 0.3651, + "step": 3745500 + }, + { + "epoch": 25.34917713295799, + "grad_norm": 0.3799471855163574, + "learning_rate": 4.7465082286704205e-05, + "loss": 0.3615, + "step": 3746000 + }, + { + "epoch": 25.352560632308357, + "grad_norm": 0.40400072932243347, + "learning_rate": 4.746474393676917e-05, + "loss": 0.3636, + "step": 3746500 + }, + { + "epoch": 25.355944131658728, + "grad_norm": 0.3719613552093506, + "learning_rate": 4.746440558683413e-05, + "loss": 0.3624, + "step": 3747000 + }, + { + "epoch": 25.359327631009094, + "grad_norm": 0.3671671152114868, + "learning_rate": 4.746406723689909e-05, + "loss": 0.3623, + "step": 3747500 + }, + { + "epoch": 25.362711130359465, + "grad_norm": 0.3434518873691559, + "learning_rate": 4.746372888696405e-05, + "loss": 0.3624, + "step": 3748000 + }, + { + "epoch": 25.36609462970983, + "grad_norm": 0.36297357082366943, + "learning_rate": 4.7463390537029015e-05, + "loss": 0.3636, + "step": 3748500 + }, + { + "epoch": 25.369478129060198, + "grad_norm": 0.3704264760017395, + "learning_rate": 4.7463052187093984e-05, + "loss": 0.3621, + "step": 3749000 + }, + { + "epoch": 25.37286162841057, + "grad_norm": 0.36693716049194336, + "learning_rate": 4.7462713837158946e-05, + "loss": 0.362, + "step": 3749500 + }, + { + "epoch": 25.376245127760935, + "grad_norm": 0.40354418754577637, + "learning_rate": 4.746237548722391e-05, + "loss": 0.3613, + "step": 3750000 + }, + { + "epoch": 25.379628627111302, + "grad_norm": 0.3502896726131439, + "learning_rate": 4.746203713728887e-05, + "loss": 0.3618, + "step": 3750500 + }, + { + "epoch": 25.383012126461672, + "grad_norm": 0.381521999835968, + "learning_rate": 4.746169878735384e-05, + "loss": 0.3638, + "step": 3751000 + }, + { + "epoch": 25.38639562581204, + "grad_norm": 0.41425198316574097, + "learning_rate": 4.74613604374188e-05, + "loss": 0.3631, + "step": 3751500 + }, + { + "epoch": 25.38977912516241, + "grad_norm": 0.3814198076725006, + "learning_rate": 4.7461022087483764e-05, + "loss": 0.3639, + "step": 3752000 + }, + { + "epoch": 25.393162624512776, + "grad_norm": 0.3682596683502197, + "learning_rate": 4.746068373754872e-05, + "loss": 0.3631, + "step": 3752500 + }, + { + "epoch": 25.396546123863143, + "grad_norm": 0.3883252441883087, + "learning_rate": 4.746034538761369e-05, + "loss": 0.3621, + "step": 3753000 + }, + { + "epoch": 25.399929623213513, + "grad_norm": 0.3718891739845276, + "learning_rate": 4.746000703767865e-05, + "loss": 0.3634, + "step": 3753500 + }, + { + "epoch": 25.40331312256388, + "grad_norm": 0.37141090631484985, + "learning_rate": 4.745966868774361e-05, + "loss": 0.3623, + "step": 3754000 + }, + { + "epoch": 25.40669662191425, + "grad_norm": 0.3447796702384949, + "learning_rate": 4.7459330337808574e-05, + "loss": 0.3627, + "step": 3754500 + }, + { + "epoch": 25.410080121264617, + "grad_norm": 0.3654313087463379, + "learning_rate": 4.745899198787354e-05, + "loss": 0.3621, + "step": 3755000 + }, + { + "epoch": 25.413463620614984, + "grad_norm": 0.3879503011703491, + "learning_rate": 4.7458653637938505e-05, + "loss": 0.3623, + "step": 3755500 + }, + { + "epoch": 25.416847119965354, + "grad_norm": 0.34639325737953186, + "learning_rate": 4.745831528800347e-05, + "loss": 0.3627, + "step": 3756000 + }, + { + "epoch": 25.42023061931572, + "grad_norm": 0.3954552710056305, + "learning_rate": 4.745797693806843e-05, + "loss": 0.3609, + "step": 3756500 + }, + { + "epoch": 25.42361411866609, + "grad_norm": 0.3754726052284241, + "learning_rate": 4.745763858813339e-05, + "loss": 0.363, + "step": 3757000 + }, + { + "epoch": 25.426997618016458, + "grad_norm": 0.39806005358695984, + "learning_rate": 4.7457300238198354e-05, + "loss": 0.3633, + "step": 3757500 + }, + { + "epoch": 25.430381117366824, + "grad_norm": 0.3469841480255127, + "learning_rate": 4.7456961888263316e-05, + "loss": 0.3633, + "step": 3758000 + }, + { + "epoch": 25.433764616717195, + "grad_norm": 0.3493543863296509, + "learning_rate": 4.7456623538328285e-05, + "loss": 0.3622, + "step": 3758500 + }, + { + "epoch": 25.43714811606756, + "grad_norm": 0.37127816677093506, + "learning_rate": 4.745628518839325e-05, + "loss": 0.3649, + "step": 3759000 + }, + { + "epoch": 25.440531615417928, + "grad_norm": 0.36104732751846313, + "learning_rate": 4.745594683845821e-05, + "loss": 0.3634, + "step": 3759500 + }, + { + "epoch": 25.4439151147683, + "grad_norm": 0.3581537902355194, + "learning_rate": 4.745560848852317e-05, + "loss": 0.3615, + "step": 3760000 + }, + { + "epoch": 25.447298614118665, + "grad_norm": 0.34989815950393677, + "learning_rate": 4.745527013858814e-05, + "loss": 0.3638, + "step": 3760500 + }, + { + "epoch": 25.450682113469036, + "grad_norm": 0.4142208993434906, + "learning_rate": 4.74549317886531e-05, + "loss": 0.3622, + "step": 3761000 + }, + { + "epoch": 25.454065612819402, + "grad_norm": 0.4234132766723633, + "learning_rate": 4.7454593438718064e-05, + "loss": 0.3631, + "step": 3761500 + }, + { + "epoch": 25.45744911216977, + "grad_norm": 0.4301721751689911, + "learning_rate": 4.745425508878302e-05, + "loss": 0.3651, + "step": 3762000 + }, + { + "epoch": 25.46083261152014, + "grad_norm": 0.38137391209602356, + "learning_rate": 4.745391673884799e-05, + "loss": 0.3622, + "step": 3762500 + }, + { + "epoch": 25.464216110870506, + "grad_norm": 0.4020451307296753, + "learning_rate": 4.745357838891295e-05, + "loss": 0.3631, + "step": 3763000 + }, + { + "epoch": 25.467599610220876, + "grad_norm": 0.3583117425441742, + "learning_rate": 4.745324003897791e-05, + "loss": 0.3633, + "step": 3763500 + }, + { + "epoch": 25.470983109571243, + "grad_norm": 0.34035882353782654, + "learning_rate": 4.7452901689042875e-05, + "loss": 0.3637, + "step": 3764000 + }, + { + "epoch": 25.47436660892161, + "grad_norm": 0.3805950880050659, + "learning_rate": 4.7452563339107844e-05, + "loss": 0.3634, + "step": 3764500 + }, + { + "epoch": 25.47775010827198, + "grad_norm": 0.36006009578704834, + "learning_rate": 4.7452224989172806e-05, + "loss": 0.3626, + "step": 3765000 + }, + { + "epoch": 25.481133607622347, + "grad_norm": 0.3865433931350708, + "learning_rate": 4.745188663923777e-05, + "loss": 0.3619, + "step": 3765500 + }, + { + "epoch": 25.484517106972717, + "grad_norm": 0.3628615736961365, + "learning_rate": 4.745154828930273e-05, + "loss": 0.3615, + "step": 3766000 + }, + { + "epoch": 25.487900606323084, + "grad_norm": 0.37006327509880066, + "learning_rate": 4.745120993936769e-05, + "loss": 0.3619, + "step": 3766500 + }, + { + "epoch": 25.49128410567345, + "grad_norm": 0.36280131340026855, + "learning_rate": 4.7450871589432654e-05, + "loss": 0.3625, + "step": 3767000 + }, + { + "epoch": 25.49466760502382, + "grad_norm": 0.363631010055542, + "learning_rate": 4.7450533239497617e-05, + "loss": 0.3619, + "step": 3767500 + }, + { + "epoch": 25.498051104374188, + "grad_norm": 0.36467093229293823, + "learning_rate": 4.7450194889562585e-05, + "loss": 0.362, + "step": 3768000 + }, + { + "epoch": 25.501434603724555, + "grad_norm": 0.34792622923851013, + "learning_rate": 4.744985653962755e-05, + "loss": 0.3626, + "step": 3768500 + }, + { + "epoch": 25.504818103074925, + "grad_norm": 0.35685989260673523, + "learning_rate": 4.744951818969251e-05, + "loss": 0.3625, + "step": 3769000 + }, + { + "epoch": 25.50820160242529, + "grad_norm": 0.40018942952156067, + "learning_rate": 4.744917983975747e-05, + "loss": 0.3626, + "step": 3769500 + }, + { + "epoch": 25.511585101775662, + "grad_norm": 0.3791816830635071, + "learning_rate": 4.744884148982244e-05, + "loss": 0.3629, + "step": 3770000 + }, + { + "epoch": 25.51496860112603, + "grad_norm": 0.3799859881401062, + "learning_rate": 4.74485031398874e-05, + "loss": 0.3628, + "step": 3770500 + }, + { + "epoch": 25.518352100476395, + "grad_norm": 0.3531995117664337, + "learning_rate": 4.7448164789952365e-05, + "loss": 0.3646, + "step": 3771000 + }, + { + "epoch": 25.521735599826766, + "grad_norm": 0.3656378984451294, + "learning_rate": 4.744782644001732e-05, + "loss": 0.3633, + "step": 3771500 + }, + { + "epoch": 25.525119099177132, + "grad_norm": 0.35773876309394836, + "learning_rate": 4.744748809008229e-05, + "loss": 0.3636, + "step": 3772000 + }, + { + "epoch": 25.528502598527503, + "grad_norm": 0.40995362401008606, + "learning_rate": 4.744714974014725e-05, + "loss": 0.3627, + "step": 3772500 + }, + { + "epoch": 25.53188609787787, + "grad_norm": 0.3314554691314697, + "learning_rate": 4.7446811390212213e-05, + "loss": 0.3636, + "step": 3773000 + }, + { + "epoch": 25.535269597228236, + "grad_norm": 0.34793007373809814, + "learning_rate": 4.7446473040277176e-05, + "loss": 0.3628, + "step": 3773500 + }, + { + "epoch": 25.538653096578606, + "grad_norm": 0.4217507839202881, + "learning_rate": 4.7446134690342144e-05, + "loss": 0.362, + "step": 3774000 + }, + { + "epoch": 25.542036595928973, + "grad_norm": 0.4134977459907532, + "learning_rate": 4.7445796340407107e-05, + "loss": 0.3629, + "step": 3774500 + }, + { + "epoch": 25.54542009527934, + "grad_norm": 0.35792210698127747, + "learning_rate": 4.744545799047207e-05, + "loss": 0.3607, + "step": 3775000 + }, + { + "epoch": 25.54880359462971, + "grad_norm": 0.4191746711730957, + "learning_rate": 4.744511964053703e-05, + "loss": 0.3628, + "step": 3775500 + }, + { + "epoch": 25.552187093980077, + "grad_norm": 0.35209158062934875, + "learning_rate": 4.744478129060199e-05, + "loss": 0.3615, + "step": 3776000 + }, + { + "epoch": 25.555570593330447, + "grad_norm": 0.4061746299266815, + "learning_rate": 4.7444442940666955e-05, + "loss": 0.3636, + "step": 3776500 + }, + { + "epoch": 25.558954092680814, + "grad_norm": 0.34914374351501465, + "learning_rate": 4.744410459073192e-05, + "loss": 0.3633, + "step": 3777000 + }, + { + "epoch": 25.56233759203118, + "grad_norm": 0.38358649611473083, + "learning_rate": 4.7443766240796886e-05, + "loss": 0.3645, + "step": 3777500 + }, + { + "epoch": 25.56572109138155, + "grad_norm": 0.4001278877258301, + "learning_rate": 4.744342789086185e-05, + "loss": 0.3618, + "step": 3778000 + }, + { + "epoch": 25.569104590731918, + "grad_norm": 0.3768290877342224, + "learning_rate": 4.744308954092681e-05, + "loss": 0.3636, + "step": 3778500 + }, + { + "epoch": 25.572488090082288, + "grad_norm": 0.36954954266548157, + "learning_rate": 4.744275119099177e-05, + "loss": 0.3622, + "step": 3779000 + }, + { + "epoch": 25.575871589432655, + "grad_norm": 0.38198724389076233, + "learning_rate": 4.744241284105674e-05, + "loss": 0.3632, + "step": 3779500 + }, + { + "epoch": 25.57925508878302, + "grad_norm": 0.3659634590148926, + "learning_rate": 4.7442074491121703e-05, + "loss": 0.362, + "step": 3780000 + }, + { + "epoch": 25.582638588133392, + "grad_norm": 0.3782658576965332, + "learning_rate": 4.7441736141186666e-05, + "loss": 0.3611, + "step": 3780500 + }, + { + "epoch": 25.58602208748376, + "grad_norm": 0.35388150811195374, + "learning_rate": 4.744139779125162e-05, + "loss": 0.3611, + "step": 3781000 + }, + { + "epoch": 25.58940558683413, + "grad_norm": 0.3796742558479309, + "learning_rate": 4.744105944131659e-05, + "loss": 0.3626, + "step": 3781500 + }, + { + "epoch": 25.592789086184496, + "grad_norm": 0.36054444313049316, + "learning_rate": 4.744072109138155e-05, + "loss": 0.3612, + "step": 3782000 + }, + { + "epoch": 25.596172585534863, + "grad_norm": 0.342253714799881, + "learning_rate": 4.7440382741446514e-05, + "loss": 0.3623, + "step": 3782500 + }, + { + "epoch": 25.599556084885233, + "grad_norm": 0.35453668236732483, + "learning_rate": 4.7440044391511476e-05, + "loss": 0.3619, + "step": 3783000 + }, + { + "epoch": 25.6029395842356, + "grad_norm": 0.37217822670936584, + "learning_rate": 4.7439706041576445e-05, + "loss": 0.3622, + "step": 3783500 + }, + { + "epoch": 25.606323083585966, + "grad_norm": 0.3663980960845947, + "learning_rate": 4.743936769164141e-05, + "loss": 0.3633, + "step": 3784000 + }, + { + "epoch": 25.609706582936337, + "grad_norm": 0.4085804522037506, + "learning_rate": 4.743902934170637e-05, + "loss": 0.3623, + "step": 3784500 + }, + { + "epoch": 25.613090082286703, + "grad_norm": 0.37254148721694946, + "learning_rate": 4.743869099177133e-05, + "loss": 0.3623, + "step": 3785000 + }, + { + "epoch": 25.616473581637074, + "grad_norm": 0.370465487241745, + "learning_rate": 4.74383526418363e-05, + "loss": 0.3619, + "step": 3785500 + }, + { + "epoch": 25.61985708098744, + "grad_norm": 0.3487185835838318, + "learning_rate": 4.7438014291901256e-05, + "loss": 0.3619, + "step": 3786000 + }, + { + "epoch": 25.623240580337807, + "grad_norm": 0.3992827534675598, + "learning_rate": 4.743767594196622e-05, + "loss": 0.3615, + "step": 3786500 + }, + { + "epoch": 25.626624079688177, + "grad_norm": 0.35768917202949524, + "learning_rate": 4.743733759203119e-05, + "loss": 0.362, + "step": 3787000 + }, + { + "epoch": 25.630007579038544, + "grad_norm": 0.3796229660511017, + "learning_rate": 4.743699924209615e-05, + "loss": 0.3625, + "step": 3787500 + }, + { + "epoch": 25.633391078388915, + "grad_norm": 0.38146036863327026, + "learning_rate": 4.743666089216111e-05, + "loss": 0.3638, + "step": 3788000 + }, + { + "epoch": 25.63677457773928, + "grad_norm": 0.4110412001609802, + "learning_rate": 4.743632254222607e-05, + "loss": 0.3623, + "step": 3788500 + }, + { + "epoch": 25.640158077089648, + "grad_norm": 0.33680954575538635, + "learning_rate": 4.743598419229104e-05, + "loss": 0.3647, + "step": 3789000 + }, + { + "epoch": 25.64354157644002, + "grad_norm": 0.3926984369754791, + "learning_rate": 4.7435645842356004e-05, + "loss": 0.3615, + "step": 3789500 + }, + { + "epoch": 25.646925075790385, + "grad_norm": 0.3979891836643219, + "learning_rate": 4.7435307492420966e-05, + "loss": 0.3634, + "step": 3790000 + }, + { + "epoch": 25.650308575140755, + "grad_norm": 0.4162346124649048, + "learning_rate": 4.743496914248592e-05, + "loss": 0.3639, + "step": 3790500 + }, + { + "epoch": 25.653692074491122, + "grad_norm": 0.36416104435920715, + "learning_rate": 4.743463079255089e-05, + "loss": 0.3641, + "step": 3791000 + }, + { + "epoch": 25.65707557384149, + "grad_norm": 0.43271806836128235, + "learning_rate": 4.743429244261585e-05, + "loss": 0.3639, + "step": 3791500 + }, + { + "epoch": 25.66045907319186, + "grad_norm": 0.3622170686721802, + "learning_rate": 4.7433954092680815e-05, + "loss": 0.3622, + "step": 3792000 + }, + { + "epoch": 25.663842572542226, + "grad_norm": 0.4002637565135956, + "learning_rate": 4.743361574274578e-05, + "loss": 0.3626, + "step": 3792500 + }, + { + "epoch": 25.667226071892593, + "grad_norm": 0.3458804190158844, + "learning_rate": 4.7433277392810746e-05, + "loss": 0.3599, + "step": 3793000 + }, + { + "epoch": 25.670609571242963, + "grad_norm": 0.3337368071079254, + "learning_rate": 4.743293904287571e-05, + "loss": 0.3619, + "step": 3793500 + }, + { + "epoch": 25.67399307059333, + "grad_norm": 0.3929619789123535, + "learning_rate": 4.743260069294067e-05, + "loss": 0.3635, + "step": 3794000 + }, + { + "epoch": 25.6773765699437, + "grad_norm": 0.3558287024497986, + "learning_rate": 4.743226234300563e-05, + "loss": 0.363, + "step": 3794500 + }, + { + "epoch": 25.680760069294067, + "grad_norm": 0.3677956461906433, + "learning_rate": 4.74319239930706e-05, + "loss": 0.3627, + "step": 3795000 + }, + { + "epoch": 25.684143568644433, + "grad_norm": 0.3794611394405365, + "learning_rate": 4.7431585643135556e-05, + "loss": 0.3629, + "step": 3795500 + }, + { + "epoch": 25.687527067994804, + "grad_norm": 0.35885268449783325, + "learning_rate": 4.743124729320052e-05, + "loss": 0.3621, + "step": 3796000 + }, + { + "epoch": 25.69091056734517, + "grad_norm": 0.3457956910133362, + "learning_rate": 4.743090894326549e-05, + "loss": 0.363, + "step": 3796500 + }, + { + "epoch": 25.69429406669554, + "grad_norm": 0.37334170937538147, + "learning_rate": 4.743057059333045e-05, + "loss": 0.3636, + "step": 3797000 + }, + { + "epoch": 25.697677566045908, + "grad_norm": 0.36617588996887207, + "learning_rate": 4.743023224339541e-05, + "loss": 0.3635, + "step": 3797500 + }, + { + "epoch": 25.701061065396274, + "grad_norm": 0.3778825104236603, + "learning_rate": 4.7429893893460374e-05, + "loss": 0.3626, + "step": 3798000 + }, + { + "epoch": 25.704444564746645, + "grad_norm": 0.37063291668891907, + "learning_rate": 4.7429555543525336e-05, + "loss": 0.3632, + "step": 3798500 + }, + { + "epoch": 25.70782806409701, + "grad_norm": 0.3541485369205475, + "learning_rate": 4.7429217193590305e-05, + "loss": 0.3629, + "step": 3799000 + }, + { + "epoch": 25.711211563447378, + "grad_norm": 0.36199647188186646, + "learning_rate": 4.742887884365527e-05, + "loss": 0.3644, + "step": 3799500 + }, + { + "epoch": 25.71459506279775, + "grad_norm": 0.36414459347724915, + "learning_rate": 4.742854049372022e-05, + "loss": 0.3637, + "step": 3800000 + }, + { + "epoch": 25.717978562148115, + "grad_norm": 0.38215571641921997, + "learning_rate": 4.742820214378519e-05, + "loss": 0.3628, + "step": 3800500 + }, + { + "epoch": 25.721362061498485, + "grad_norm": 0.3797191381454468, + "learning_rate": 4.742786379385015e-05, + "loss": 0.3637, + "step": 3801000 + }, + { + "epoch": 25.724745560848852, + "grad_norm": 0.3910582959651947, + "learning_rate": 4.7427525443915115e-05, + "loss": 0.3635, + "step": 3801500 + }, + { + "epoch": 25.72812906019922, + "grad_norm": 0.3790856599807739, + "learning_rate": 4.742718709398008e-05, + "loss": 0.3624, + "step": 3802000 + }, + { + "epoch": 25.73151255954959, + "grad_norm": 0.36054325103759766, + "learning_rate": 4.7426848744045046e-05, + "loss": 0.3629, + "step": 3802500 + }, + { + "epoch": 25.734896058899956, + "grad_norm": 0.354313462972641, + "learning_rate": 4.742651039411001e-05, + "loss": 0.3631, + "step": 3803000 + }, + { + "epoch": 25.738279558250326, + "grad_norm": 0.37741386890411377, + "learning_rate": 4.742617204417497e-05, + "loss": 0.3633, + "step": 3803500 + }, + { + "epoch": 25.741663057600693, + "grad_norm": 0.37754544615745544, + "learning_rate": 4.742583369423993e-05, + "loss": 0.3631, + "step": 3804000 + }, + { + "epoch": 25.74504655695106, + "grad_norm": 0.34346863627433777, + "learning_rate": 4.74254953443049e-05, + "loss": 0.3625, + "step": 3804500 + }, + { + "epoch": 25.74843005630143, + "grad_norm": 0.31208792328834534, + "learning_rate": 4.742515699436986e-05, + "loss": 0.3619, + "step": 3805000 + }, + { + "epoch": 25.751813555651797, + "grad_norm": 0.380153089761734, + "learning_rate": 4.742481864443482e-05, + "loss": 0.3611, + "step": 3805500 + }, + { + "epoch": 25.755197055002164, + "grad_norm": 0.3855472803115845, + "learning_rate": 4.742448029449978e-05, + "loss": 0.3629, + "step": 3806000 + }, + { + "epoch": 25.758580554352534, + "grad_norm": 0.3726450502872467, + "learning_rate": 4.742414194456475e-05, + "loss": 0.3638, + "step": 3806500 + }, + { + "epoch": 25.7619640537029, + "grad_norm": 0.3527815341949463, + "learning_rate": 4.742380359462971e-05, + "loss": 0.3629, + "step": 3807000 + }, + { + "epoch": 25.76534755305327, + "grad_norm": 0.35626131296157837, + "learning_rate": 4.7423465244694674e-05, + "loss": 0.3646, + "step": 3807500 + }, + { + "epoch": 25.768731052403638, + "grad_norm": 0.36378493905067444, + "learning_rate": 4.7423126894759636e-05, + "loss": 0.3626, + "step": 3808000 + }, + { + "epoch": 25.772114551754004, + "grad_norm": 0.41029688715934753, + "learning_rate": 4.7422788544824605e-05, + "loss": 0.3629, + "step": 3808500 + }, + { + "epoch": 25.775498051104375, + "grad_norm": 0.3284132778644562, + "learning_rate": 4.742245019488957e-05, + "loss": 0.3627, + "step": 3809000 + }, + { + "epoch": 25.77888155045474, + "grad_norm": 0.32602161169052124, + "learning_rate": 4.742211184495452e-05, + "loss": 0.3626, + "step": 3809500 + }, + { + "epoch": 25.782265049805112, + "grad_norm": 0.3718428611755371, + "learning_rate": 4.742177349501949e-05, + "loss": 0.3606, + "step": 3810000 + }, + { + "epoch": 25.78564854915548, + "grad_norm": 0.35943469405174255, + "learning_rate": 4.7421435145084454e-05, + "loss": 0.3615, + "step": 3810500 + }, + { + "epoch": 25.789032048505845, + "grad_norm": 0.38358232378959656, + "learning_rate": 4.7421096795149416e-05, + "loss": 0.3624, + "step": 3811000 + }, + { + "epoch": 25.792415547856216, + "grad_norm": 0.37543123960494995, + "learning_rate": 4.742075844521438e-05, + "loss": 0.3608, + "step": 3811500 + }, + { + "epoch": 25.795799047206582, + "grad_norm": 0.35307949781417847, + "learning_rate": 4.742042009527935e-05, + "loss": 0.3625, + "step": 3812000 + }, + { + "epoch": 25.799182546556953, + "grad_norm": 0.3657030761241913, + "learning_rate": 4.742008174534431e-05, + "loss": 0.3624, + "step": 3812500 + }, + { + "epoch": 25.80256604590732, + "grad_norm": 0.3477466404438019, + "learning_rate": 4.741974339540927e-05, + "loss": 0.3618, + "step": 3813000 + }, + { + "epoch": 25.805949545257686, + "grad_norm": 0.4210405945777893, + "learning_rate": 4.741940504547423e-05, + "loss": 0.3631, + "step": 3813500 + }, + { + "epoch": 25.809333044608056, + "grad_norm": 0.37172731757164, + "learning_rate": 4.74190666955392e-05, + "loss": 0.3628, + "step": 3814000 + }, + { + "epoch": 25.812716543958423, + "grad_norm": 0.3678707480430603, + "learning_rate": 4.741872834560416e-05, + "loss": 0.3634, + "step": 3814500 + }, + { + "epoch": 25.816100043308793, + "grad_norm": 0.317853182554245, + "learning_rate": 4.741838999566912e-05, + "loss": 0.3635, + "step": 3815000 + }, + { + "epoch": 25.81948354265916, + "grad_norm": 0.34395352005958557, + "learning_rate": 4.741805164573408e-05, + "loss": 0.3632, + "step": 3815500 + }, + { + "epoch": 25.822867042009527, + "grad_norm": 0.38165464997291565, + "learning_rate": 4.741771329579905e-05, + "loss": 0.3627, + "step": 3816000 + }, + { + "epoch": 25.826250541359897, + "grad_norm": 0.35092389583587646, + "learning_rate": 4.741737494586401e-05, + "loss": 0.3602, + "step": 3816500 + }, + { + "epoch": 25.829634040710264, + "grad_norm": 0.3503788411617279, + "learning_rate": 4.7417036595928975e-05, + "loss": 0.3627, + "step": 3817000 + }, + { + "epoch": 25.83301754006063, + "grad_norm": 0.3763909637928009, + "learning_rate": 4.741669824599394e-05, + "loss": 0.3617, + "step": 3817500 + }, + { + "epoch": 25.836401039411, + "grad_norm": 0.3493872284889221, + "learning_rate": 4.7416359896058906e-05, + "loss": 0.362, + "step": 3818000 + }, + { + "epoch": 25.839784538761368, + "grad_norm": 0.351111501455307, + "learning_rate": 4.741602154612387e-05, + "loss": 0.3618, + "step": 3818500 + }, + { + "epoch": 25.843168038111738, + "grad_norm": 0.35869088768959045, + "learning_rate": 4.7415683196188823e-05, + "loss": 0.3618, + "step": 3819000 + }, + { + "epoch": 25.846551537462105, + "grad_norm": 0.3645501136779785, + "learning_rate": 4.741534484625379e-05, + "loss": 0.3639, + "step": 3819500 + }, + { + "epoch": 25.84993503681247, + "grad_norm": 0.352455198764801, + "learning_rate": 4.7415006496318754e-05, + "loss": 0.3605, + "step": 3820000 + }, + { + "epoch": 25.853318536162842, + "grad_norm": 0.36805835366249084, + "learning_rate": 4.7414668146383717e-05, + "loss": 0.3611, + "step": 3820500 + }, + { + "epoch": 25.85670203551321, + "grad_norm": 0.376402348279953, + "learning_rate": 4.741432979644868e-05, + "loss": 0.3631, + "step": 3821000 + }, + { + "epoch": 25.86008553486358, + "grad_norm": 0.3744443953037262, + "learning_rate": 4.741399144651365e-05, + "loss": 0.3627, + "step": 3821500 + }, + { + "epoch": 25.863469034213946, + "grad_norm": 0.3914426267147064, + "learning_rate": 4.741365309657861e-05, + "loss": 0.3617, + "step": 3822000 + }, + { + "epoch": 25.866852533564312, + "grad_norm": 0.3761354684829712, + "learning_rate": 4.741331474664357e-05, + "loss": 0.3613, + "step": 3822500 + }, + { + "epoch": 25.870236032914683, + "grad_norm": 0.3577970266342163, + "learning_rate": 4.7412976396708534e-05, + "loss": 0.3637, + "step": 3823000 + }, + { + "epoch": 25.87361953226505, + "grad_norm": 0.37655699253082275, + "learning_rate": 4.74126380467735e-05, + "loss": 0.3649, + "step": 3823500 + }, + { + "epoch": 25.877003031615416, + "grad_norm": 0.4279420077800751, + "learning_rate": 4.741229969683846e-05, + "loss": 0.3618, + "step": 3824000 + }, + { + "epoch": 25.880386530965787, + "grad_norm": 0.36734089255332947, + "learning_rate": 4.741196134690342e-05, + "loss": 0.3634, + "step": 3824500 + }, + { + "epoch": 25.883770030316153, + "grad_norm": 0.3726929724216461, + "learning_rate": 4.741162299696838e-05, + "loss": 0.3638, + "step": 3825000 + }, + { + "epoch": 25.887153529666524, + "grad_norm": 0.36211109161376953, + "learning_rate": 4.741128464703335e-05, + "loss": 0.3636, + "step": 3825500 + }, + { + "epoch": 25.89053702901689, + "grad_norm": 0.3322620689868927, + "learning_rate": 4.7410946297098313e-05, + "loss": 0.3619, + "step": 3826000 + }, + { + "epoch": 25.893920528367257, + "grad_norm": 0.39214855432510376, + "learning_rate": 4.7410607947163276e-05, + "loss": 0.3617, + "step": 3826500 + }, + { + "epoch": 25.897304027717627, + "grad_norm": 0.3484244644641876, + "learning_rate": 4.741026959722824e-05, + "loss": 0.3616, + "step": 3827000 + }, + { + "epoch": 25.900687527067994, + "grad_norm": 0.37596139311790466, + "learning_rate": 4.740993124729321e-05, + "loss": 0.3646, + "step": 3827500 + }, + { + "epoch": 25.904071026418364, + "grad_norm": 0.4313521683216095, + "learning_rate": 4.740959289735817e-05, + "loss": 0.3627, + "step": 3828000 + }, + { + "epoch": 25.90745452576873, + "grad_norm": 0.37457355856895447, + "learning_rate": 4.7409254547423124e-05, + "loss": 0.3625, + "step": 3828500 + }, + { + "epoch": 25.910838025119098, + "grad_norm": 0.33323827385902405, + "learning_rate": 4.740891619748809e-05, + "loss": 0.3628, + "step": 3829000 + }, + { + "epoch": 25.91422152446947, + "grad_norm": 0.3745093047618866, + "learning_rate": 4.7408577847553055e-05, + "loss": 0.362, + "step": 3829500 + }, + { + "epoch": 25.917605023819835, + "grad_norm": 0.35997912287712097, + "learning_rate": 4.740823949761802e-05, + "loss": 0.3629, + "step": 3830000 + }, + { + "epoch": 25.9209885231702, + "grad_norm": 0.34573546051979065, + "learning_rate": 4.740790114768298e-05, + "loss": 0.3627, + "step": 3830500 + }, + { + "epoch": 25.924372022520572, + "grad_norm": 0.3673235774040222, + "learning_rate": 4.740756279774795e-05, + "loss": 0.363, + "step": 3831000 + }, + { + "epoch": 25.92775552187094, + "grad_norm": 0.39050784707069397, + "learning_rate": 4.740722444781291e-05, + "loss": 0.3618, + "step": 3831500 + }, + { + "epoch": 25.93113902122131, + "grad_norm": 0.3835177719593048, + "learning_rate": 4.740688609787787e-05, + "loss": 0.364, + "step": 3832000 + }, + { + "epoch": 25.934522520571676, + "grad_norm": 0.39260244369506836, + "learning_rate": 4.7406547747942835e-05, + "loss": 0.3645, + "step": 3832500 + }, + { + "epoch": 25.937906019922043, + "grad_norm": 0.38124096393585205, + "learning_rate": 4.7406209398007804e-05, + "loss": 0.3639, + "step": 3833000 + }, + { + "epoch": 25.941289519272413, + "grad_norm": 0.3928065598011017, + "learning_rate": 4.740587104807276e-05, + "loss": 0.3615, + "step": 3833500 + }, + { + "epoch": 25.94467301862278, + "grad_norm": 0.3781130611896515, + "learning_rate": 4.740553269813772e-05, + "loss": 0.362, + "step": 3834000 + }, + { + "epoch": 25.94805651797315, + "grad_norm": 0.34431740641593933, + "learning_rate": 4.740519434820268e-05, + "loss": 0.3618, + "step": 3834500 + }, + { + "epoch": 25.951440017323517, + "grad_norm": 0.3865680694580078, + "learning_rate": 4.740485599826765e-05, + "loss": 0.3628, + "step": 3835000 + }, + { + "epoch": 25.954823516673883, + "grad_norm": 0.3760943114757538, + "learning_rate": 4.7404517648332614e-05, + "loss": 0.3633, + "step": 3835500 + }, + { + "epoch": 25.958207016024254, + "grad_norm": 0.33652690052986145, + "learning_rate": 4.7404179298397576e-05, + "loss": 0.3612, + "step": 3836000 + }, + { + "epoch": 25.96159051537462, + "grad_norm": 0.3407208323478699, + "learning_rate": 4.740384094846254e-05, + "loss": 0.3611, + "step": 3836500 + }, + { + "epoch": 25.96497401472499, + "grad_norm": 0.3594238758087158, + "learning_rate": 4.740350259852751e-05, + "loss": 0.3638, + "step": 3837000 + }, + { + "epoch": 25.968357514075358, + "grad_norm": 0.3876775801181793, + "learning_rate": 4.740316424859247e-05, + "loss": 0.3644, + "step": 3837500 + }, + { + "epoch": 25.971741013425724, + "grad_norm": 0.37705734372138977, + "learning_rate": 4.7402825898657425e-05, + "loss": 0.3629, + "step": 3838000 + }, + { + "epoch": 25.975124512776095, + "grad_norm": 0.3409097194671631, + "learning_rate": 4.7402487548722394e-05, + "loss": 0.3624, + "step": 3838500 + }, + { + "epoch": 25.97850801212646, + "grad_norm": 0.37340086698532104, + "learning_rate": 4.7402149198787356e-05, + "loss": 0.3636, + "step": 3839000 + }, + { + "epoch": 25.98189151147683, + "grad_norm": 0.382447749376297, + "learning_rate": 4.740181084885232e-05, + "loss": 0.3641, + "step": 3839500 + }, + { + "epoch": 25.9852750108272, + "grad_norm": 0.38271307945251465, + "learning_rate": 4.740147249891728e-05, + "loss": 0.3629, + "step": 3840000 + }, + { + "epoch": 25.988658510177565, + "grad_norm": 0.3467562198638916, + "learning_rate": 4.740113414898225e-05, + "loss": 0.3643, + "step": 3840500 + }, + { + "epoch": 25.992042009527935, + "grad_norm": 0.3526530861854553, + "learning_rate": 4.740079579904721e-05, + "loss": 0.3609, + "step": 3841000 + }, + { + "epoch": 25.995425508878302, + "grad_norm": 0.37061673402786255, + "learning_rate": 4.740045744911217e-05, + "loss": 0.3629, + "step": 3841500 + }, + { + "epoch": 25.99880900822867, + "grad_norm": 0.3723934590816498, + "learning_rate": 4.7400119099177135e-05, + "loss": 0.3633, + "step": 3842000 + }, + { + "epoch": 26.0, + "eval_accuracy": 0.8616330661467587, + "eval_loss": 0.5619011521339417, + "eval_runtime": 3404.9323, + "eval_samples_per_second": 85.389, + "eval_steps_per_second": 5.337, + "step": 3842176 + }, + { + "epoch": 26.00219250757904, + "grad_norm": 0.370853066444397, + "learning_rate": 4.7399780749242104e-05, + "loss": 0.3604, + "step": 3842500 + }, + { + "epoch": 26.005576006929406, + "grad_norm": 0.4079779088497162, + "learning_rate": 4.739944239930706e-05, + "loss": 0.3601, + "step": 3843000 + }, + { + "epoch": 26.008959506279776, + "grad_norm": 0.3787406086921692, + "learning_rate": 4.739910404937202e-05, + "loss": 0.3595, + "step": 3843500 + }, + { + "epoch": 26.012343005630143, + "grad_norm": 0.37509703636169434, + "learning_rate": 4.7398765699436984e-05, + "loss": 0.3616, + "step": 3844000 + }, + { + "epoch": 26.01572650498051, + "grad_norm": 0.4073735773563385, + "learning_rate": 4.739842734950195e-05, + "loss": 0.3601, + "step": 3844500 + }, + { + "epoch": 26.01911000433088, + "grad_norm": 0.3762528598308563, + "learning_rate": 4.7398088999566915e-05, + "loss": 0.3603, + "step": 3845000 + }, + { + "epoch": 26.022493503681247, + "grad_norm": 0.3731355667114258, + "learning_rate": 4.739775064963188e-05, + "loss": 0.3625, + "step": 3845500 + }, + { + "epoch": 26.025877003031617, + "grad_norm": 0.37788233160972595, + "learning_rate": 4.739741229969684e-05, + "loss": 0.3634, + "step": 3846000 + }, + { + "epoch": 26.029260502381984, + "grad_norm": 0.38412338495254517, + "learning_rate": 4.739707394976181e-05, + "loss": 0.3608, + "step": 3846500 + }, + { + "epoch": 26.03264400173235, + "grad_norm": 0.3652576208114624, + "learning_rate": 4.739673559982677e-05, + "loss": 0.3609, + "step": 3847000 + }, + { + "epoch": 26.03602750108272, + "grad_norm": 0.35933735966682434, + "learning_rate": 4.739639724989173e-05, + "loss": 0.3595, + "step": 3847500 + }, + { + "epoch": 26.039411000433088, + "grad_norm": 0.3820779025554657, + "learning_rate": 4.7396058899956694e-05, + "loss": 0.3598, + "step": 3848000 + }, + { + "epoch": 26.042794499783454, + "grad_norm": 0.4067803621292114, + "learning_rate": 4.7395720550021656e-05, + "loss": 0.3618, + "step": 3848500 + }, + { + "epoch": 26.046177999133825, + "grad_norm": 0.359842449426651, + "learning_rate": 4.739538220008662e-05, + "loss": 0.3614, + "step": 3849000 + }, + { + "epoch": 26.04956149848419, + "grad_norm": 0.36242035031318665, + "learning_rate": 4.739504385015158e-05, + "loss": 0.3624, + "step": 3849500 + }, + { + "epoch": 26.05294499783456, + "grad_norm": 0.39534276723861694, + "learning_rate": 4.739470550021655e-05, + "loss": 0.3602, + "step": 3850000 + }, + { + "epoch": 26.05632849718493, + "grad_norm": 0.34640955924987793, + "learning_rate": 4.739436715028151e-05, + "loss": 0.3612, + "step": 3850500 + }, + { + "epoch": 26.059711996535295, + "grad_norm": 0.4061083495616913, + "learning_rate": 4.7394028800346474e-05, + "loss": 0.3606, + "step": 3851000 + }, + { + "epoch": 26.063095495885666, + "grad_norm": 0.38183504343032837, + "learning_rate": 4.7393690450411436e-05, + "loss": 0.3616, + "step": 3851500 + }, + { + "epoch": 26.066478995236032, + "grad_norm": 0.37067651748657227, + "learning_rate": 4.73933521004764e-05, + "loss": 0.3617, + "step": 3852000 + }, + { + "epoch": 26.069862494586403, + "grad_norm": 0.34321513772010803, + "learning_rate": 4.739301375054136e-05, + "loss": 0.3604, + "step": 3852500 + }, + { + "epoch": 26.07324599393677, + "grad_norm": 0.38992804288864136, + "learning_rate": 4.739267540060632e-05, + "loss": 0.3613, + "step": 3853000 + }, + { + "epoch": 26.076629493287136, + "grad_norm": 0.3853125274181366, + "learning_rate": 4.7392337050671284e-05, + "loss": 0.3617, + "step": 3853500 + }, + { + "epoch": 26.080012992637506, + "grad_norm": 0.35063523054122925, + "learning_rate": 4.739199870073625e-05, + "loss": 0.3632, + "step": 3854000 + }, + { + "epoch": 26.083396491987873, + "grad_norm": 0.3972536623477936, + "learning_rate": 4.7391660350801215e-05, + "loss": 0.3593, + "step": 3854500 + }, + { + "epoch": 26.08677999133824, + "grad_norm": 0.3626726269721985, + "learning_rate": 4.739132200086618e-05, + "loss": 0.3624, + "step": 3855000 + }, + { + "epoch": 26.09016349068861, + "grad_norm": 0.3531181514263153, + "learning_rate": 4.739098365093114e-05, + "loss": 0.36, + "step": 3855500 + }, + { + "epoch": 26.093546990038977, + "grad_norm": 0.3698676824569702, + "learning_rate": 4.739064530099611e-05, + "loss": 0.3635, + "step": 3856000 + }, + { + "epoch": 26.096930489389347, + "grad_norm": 0.38867586851119995, + "learning_rate": 4.739030695106107e-05, + "loss": 0.3624, + "step": 3856500 + }, + { + "epoch": 26.100313988739714, + "grad_norm": 0.36070555448532104, + "learning_rate": 4.738996860112603e-05, + "loss": 0.361, + "step": 3857000 + }, + { + "epoch": 26.10369748809008, + "grad_norm": 0.3562554121017456, + "learning_rate": 4.7389630251190995e-05, + "loss": 0.3611, + "step": 3857500 + }, + { + "epoch": 26.10708098744045, + "grad_norm": 0.3982466757297516, + "learning_rate": 4.738929190125596e-05, + "loss": 0.3601, + "step": 3858000 + }, + { + "epoch": 26.110464486790818, + "grad_norm": 0.4041410982608795, + "learning_rate": 4.738895355132092e-05, + "loss": 0.362, + "step": 3858500 + }, + { + "epoch": 26.113847986141188, + "grad_norm": 0.39795050024986267, + "learning_rate": 4.738861520138588e-05, + "loss": 0.3618, + "step": 3859000 + }, + { + "epoch": 26.117231485491555, + "grad_norm": 0.3832455575466156, + "learning_rate": 4.738827685145085e-05, + "loss": 0.3621, + "step": 3859500 + }, + { + "epoch": 26.12061498484192, + "grad_norm": 0.3504578173160553, + "learning_rate": 4.738793850151581e-05, + "loss": 0.3623, + "step": 3860000 + }, + { + "epoch": 26.123998484192292, + "grad_norm": 0.37978294491767883, + "learning_rate": 4.7387600151580774e-05, + "loss": 0.3608, + "step": 3860500 + }, + { + "epoch": 26.12738198354266, + "grad_norm": 0.34524473547935486, + "learning_rate": 4.7387261801645737e-05, + "loss": 0.3595, + "step": 3861000 + }, + { + "epoch": 26.13076548289303, + "grad_norm": 0.33990371227264404, + "learning_rate": 4.73869234517107e-05, + "loss": 0.361, + "step": 3861500 + }, + { + "epoch": 26.134148982243396, + "grad_norm": 0.41926467418670654, + "learning_rate": 4.738658510177566e-05, + "loss": 0.3609, + "step": 3862000 + }, + { + "epoch": 26.137532481593762, + "grad_norm": 0.3623351752758026, + "learning_rate": 4.738624675184062e-05, + "loss": 0.362, + "step": 3862500 + }, + { + "epoch": 26.140915980944133, + "grad_norm": 0.36266013979911804, + "learning_rate": 4.7385908401905585e-05, + "loss": 0.3615, + "step": 3863000 + }, + { + "epoch": 26.1442994802945, + "grad_norm": 0.3743663430213928, + "learning_rate": 4.7385570051970554e-05, + "loss": 0.3624, + "step": 3863500 + }, + { + "epoch": 26.147682979644866, + "grad_norm": 0.3882286846637726, + "learning_rate": 4.7385231702035516e-05, + "loss": 0.3625, + "step": 3864000 + }, + { + "epoch": 26.151066478995237, + "grad_norm": 0.3620988130569458, + "learning_rate": 4.738489335210048e-05, + "loss": 0.3612, + "step": 3864500 + }, + { + "epoch": 26.154449978345603, + "grad_norm": 0.4201053977012634, + "learning_rate": 4.738455500216544e-05, + "loss": 0.3618, + "step": 3865000 + }, + { + "epoch": 26.157833477695974, + "grad_norm": 0.3706298768520355, + "learning_rate": 4.738421665223041e-05, + "loss": 0.3606, + "step": 3865500 + }, + { + "epoch": 26.16121697704634, + "grad_norm": 0.36147424578666687, + "learning_rate": 4.738387830229537e-05, + "loss": 0.3631, + "step": 3866000 + }, + { + "epoch": 26.164600476396707, + "grad_norm": 0.3444823920726776, + "learning_rate": 4.7383539952360333e-05, + "loss": 0.3614, + "step": 3866500 + }, + { + "epoch": 26.167983975747077, + "grad_norm": 0.38151130080223083, + "learning_rate": 4.7383201602425296e-05, + "loss": 0.3617, + "step": 3867000 + }, + { + "epoch": 26.171367475097444, + "grad_norm": 0.3470594584941864, + "learning_rate": 4.738286325249026e-05, + "loss": 0.3624, + "step": 3867500 + }, + { + "epoch": 26.174750974447814, + "grad_norm": 0.3673975467681885, + "learning_rate": 4.738252490255522e-05, + "loss": 0.362, + "step": 3868000 + }, + { + "epoch": 26.17813447379818, + "grad_norm": 0.34868329763412476, + "learning_rate": 4.738218655262018e-05, + "loss": 0.3622, + "step": 3868500 + }, + { + "epoch": 26.181517973148548, + "grad_norm": 0.3584482967853546, + "learning_rate": 4.7381848202685144e-05, + "loss": 0.3614, + "step": 3869000 + }, + { + "epoch": 26.184901472498918, + "grad_norm": 0.3847140073776245, + "learning_rate": 4.738150985275011e-05, + "loss": 0.3619, + "step": 3869500 + }, + { + "epoch": 26.188284971849285, + "grad_norm": 0.36739906668663025, + "learning_rate": 4.7381171502815075e-05, + "loss": 0.3624, + "step": 3870000 + }, + { + "epoch": 26.191668471199655, + "grad_norm": 0.36381658911705017, + "learning_rate": 4.738083315288004e-05, + "loss": 0.3618, + "step": 3870500 + }, + { + "epoch": 26.195051970550022, + "grad_norm": 0.37314146757125854, + "learning_rate": 4.7380494802945e-05, + "loss": 0.3605, + "step": 3871000 + }, + { + "epoch": 26.19843546990039, + "grad_norm": 0.4022788405418396, + "learning_rate": 4.738015645300996e-05, + "loss": 0.362, + "step": 3871500 + }, + { + "epoch": 26.20181896925076, + "grad_norm": 0.3680167496204376, + "learning_rate": 4.7379818103074924e-05, + "loss": 0.3626, + "step": 3872000 + }, + { + "epoch": 26.205202468601126, + "grad_norm": 0.35963067412376404, + "learning_rate": 4.7379479753139886e-05, + "loss": 0.3628, + "step": 3872500 + }, + { + "epoch": 26.208585967951493, + "grad_norm": 0.36652347445487976, + "learning_rate": 4.7379141403204855e-05, + "loss": 0.3615, + "step": 3873000 + }, + { + "epoch": 26.211969467301863, + "grad_norm": 0.3470577001571655, + "learning_rate": 4.737880305326982e-05, + "loss": 0.362, + "step": 3873500 + }, + { + "epoch": 26.21535296665223, + "grad_norm": 0.37560033798217773, + "learning_rate": 4.737846470333478e-05, + "loss": 0.3623, + "step": 3874000 + }, + { + "epoch": 26.2187364660026, + "grad_norm": 0.3894418179988861, + "learning_rate": 4.737812635339974e-05, + "loss": 0.36, + "step": 3874500 + }, + { + "epoch": 26.222119965352967, + "grad_norm": 0.3860340118408203, + "learning_rate": 4.737778800346471e-05, + "loss": 0.3614, + "step": 3875000 + }, + { + "epoch": 26.225503464703333, + "grad_norm": 0.44323745369911194, + "learning_rate": 4.737744965352967e-05, + "loss": 0.361, + "step": 3875500 + }, + { + "epoch": 26.228886964053704, + "grad_norm": 0.3986993432044983, + "learning_rate": 4.7377111303594634e-05, + "loss": 0.3612, + "step": 3876000 + }, + { + "epoch": 26.23227046340407, + "grad_norm": 0.3293212056159973, + "learning_rate": 4.7376772953659596e-05, + "loss": 0.362, + "step": 3876500 + }, + { + "epoch": 26.23565396275444, + "grad_norm": 0.4050101935863495, + "learning_rate": 4.737643460372456e-05, + "loss": 0.3604, + "step": 3877000 + }, + { + "epoch": 26.239037462104807, + "grad_norm": 0.3691443204879761, + "learning_rate": 4.737609625378952e-05, + "loss": 0.3614, + "step": 3877500 + }, + { + "epoch": 26.242420961455174, + "grad_norm": 0.3770415186882019, + "learning_rate": 4.737575790385448e-05, + "loss": 0.3616, + "step": 3878000 + }, + { + "epoch": 26.245804460805545, + "grad_norm": 0.3994768559932709, + "learning_rate": 4.7375419553919445e-05, + "loss": 0.3615, + "step": 3878500 + }, + { + "epoch": 26.24918796015591, + "grad_norm": 0.35937219858169556, + "learning_rate": 4.7375081203984414e-05, + "loss": 0.3621, + "step": 3879000 + }, + { + "epoch": 26.252571459506278, + "grad_norm": 0.3573669195175171, + "learning_rate": 4.7374742854049376e-05, + "loss": 0.3617, + "step": 3879500 + }, + { + "epoch": 26.25595495885665, + "grad_norm": 0.34345391392707825, + "learning_rate": 4.737440450411434e-05, + "loss": 0.362, + "step": 3880000 + }, + { + "epoch": 26.259338458207015, + "grad_norm": 0.36527591943740845, + "learning_rate": 4.73740661541793e-05, + "loss": 0.3612, + "step": 3880500 + }, + { + "epoch": 26.262721957557385, + "grad_norm": 0.34206730127334595, + "learning_rate": 4.737372780424426e-05, + "loss": 0.3603, + "step": 3881000 + }, + { + "epoch": 26.266105456907752, + "grad_norm": 0.3721388578414917, + "learning_rate": 4.7373389454309224e-05, + "loss": 0.3597, + "step": 3881500 + }, + { + "epoch": 26.26948895625812, + "grad_norm": 0.36627134680747986, + "learning_rate": 4.7373051104374186e-05, + "loss": 0.3606, + "step": 3882000 + }, + { + "epoch": 26.27287245560849, + "grad_norm": 0.39107099175453186, + "learning_rate": 4.7372712754439155e-05, + "loss": 0.3623, + "step": 3882500 + }, + { + "epoch": 26.276255954958856, + "grad_norm": 0.3856317400932312, + "learning_rate": 4.737237440450412e-05, + "loss": 0.3627, + "step": 3883000 + }, + { + "epoch": 26.279639454309226, + "grad_norm": 0.36741721630096436, + "learning_rate": 4.737203605456908e-05, + "loss": 0.362, + "step": 3883500 + }, + { + "epoch": 26.283022953659593, + "grad_norm": 0.387513130903244, + "learning_rate": 4.737169770463404e-05, + "loss": 0.363, + "step": 3884000 + }, + { + "epoch": 26.28640645300996, + "grad_norm": 0.3745093047618866, + "learning_rate": 4.737135935469901e-05, + "loss": 0.3615, + "step": 3884500 + }, + { + "epoch": 26.28978995236033, + "grad_norm": 0.3810684382915497, + "learning_rate": 4.737102100476397e-05, + "loss": 0.3617, + "step": 3885000 + }, + { + "epoch": 26.293173451710697, + "grad_norm": 0.3833572566509247, + "learning_rate": 4.7370682654828935e-05, + "loss": 0.3615, + "step": 3885500 + }, + { + "epoch": 26.296556951061067, + "grad_norm": 0.3960288465023041, + "learning_rate": 4.737034430489389e-05, + "loss": 0.3624, + "step": 3886000 + }, + { + "epoch": 26.299940450411434, + "grad_norm": 0.3764936625957489, + "learning_rate": 4.737000595495886e-05, + "loss": 0.3623, + "step": 3886500 + }, + { + "epoch": 26.3033239497618, + "grad_norm": 0.388413667678833, + "learning_rate": 4.736966760502382e-05, + "loss": 0.3618, + "step": 3887000 + }, + { + "epoch": 26.30670744911217, + "grad_norm": 0.3840676546096802, + "learning_rate": 4.736932925508878e-05, + "loss": 0.361, + "step": 3887500 + }, + { + "epoch": 26.310090948462538, + "grad_norm": 0.39145728945732117, + "learning_rate": 4.7368990905153745e-05, + "loss": 0.3622, + "step": 3888000 + }, + { + "epoch": 26.313474447812904, + "grad_norm": 0.37844544649124146, + "learning_rate": 4.7368652555218714e-05, + "loss": 0.363, + "step": 3888500 + }, + { + "epoch": 26.316857947163275, + "grad_norm": 0.41900765895843506, + "learning_rate": 4.7368314205283676e-05, + "loss": 0.3615, + "step": 3889000 + }, + { + "epoch": 26.32024144651364, + "grad_norm": 0.362606406211853, + "learning_rate": 4.736797585534864e-05, + "loss": 0.3619, + "step": 3889500 + }, + { + "epoch": 26.32362494586401, + "grad_norm": 0.3716253340244293, + "learning_rate": 4.73676375054136e-05, + "loss": 0.3608, + "step": 3890000 + }, + { + "epoch": 26.32700844521438, + "grad_norm": 0.38629764318466187, + "learning_rate": 4.736729915547856e-05, + "loss": 0.3622, + "step": 3890500 + }, + { + "epoch": 26.330391944564745, + "grad_norm": 0.3564845323562622, + "learning_rate": 4.7366960805543525e-05, + "loss": 0.3631, + "step": 3891000 + }, + { + "epoch": 26.333775443915115, + "grad_norm": 0.3664126992225647, + "learning_rate": 4.736662245560849e-05, + "loss": 0.362, + "step": 3891500 + }, + { + "epoch": 26.337158943265482, + "grad_norm": 0.3530741333961487, + "learning_rate": 4.7366284105673456e-05, + "loss": 0.3626, + "step": 3892000 + }, + { + "epoch": 26.340542442615853, + "grad_norm": 0.3932848274707794, + "learning_rate": 4.736594575573842e-05, + "loss": 0.3619, + "step": 3892500 + }, + { + "epoch": 26.34392594196622, + "grad_norm": 0.35414615273475647, + "learning_rate": 4.736560740580338e-05, + "loss": 0.3613, + "step": 3893000 + }, + { + "epoch": 26.347309441316586, + "grad_norm": 0.38742774724960327, + "learning_rate": 4.736526905586834e-05, + "loss": 0.364, + "step": 3893500 + }, + { + "epoch": 26.350692940666956, + "grad_norm": 0.42660969495773315, + "learning_rate": 4.736493070593331e-05, + "loss": 0.3606, + "step": 3894000 + }, + { + "epoch": 26.354076440017323, + "grad_norm": 0.37752971053123474, + "learning_rate": 4.736459235599827e-05, + "loss": 0.3618, + "step": 3894500 + }, + { + "epoch": 26.357459939367693, + "grad_norm": 0.34589022397994995, + "learning_rate": 4.7364254006063235e-05, + "loss": 0.3612, + "step": 3895000 + }, + { + "epoch": 26.36084343871806, + "grad_norm": 0.3234190344810486, + "learning_rate": 4.736391565612819e-05, + "loss": 0.3629, + "step": 3895500 + }, + { + "epoch": 26.364226938068427, + "grad_norm": 0.36566781997680664, + "learning_rate": 4.736357730619316e-05, + "loss": 0.3615, + "step": 3896000 + }, + { + "epoch": 26.367610437418797, + "grad_norm": 0.3908241093158722, + "learning_rate": 4.736323895625812e-05, + "loss": 0.3616, + "step": 3896500 + }, + { + "epoch": 26.370993936769164, + "grad_norm": 0.3684042692184448, + "learning_rate": 4.7362900606323084e-05, + "loss": 0.3626, + "step": 3897000 + }, + { + "epoch": 26.37437743611953, + "grad_norm": 0.35040467977523804, + "learning_rate": 4.7362562256388046e-05, + "loss": 0.3622, + "step": 3897500 + }, + { + "epoch": 26.3777609354699, + "grad_norm": 0.3635048568248749, + "learning_rate": 4.7362223906453015e-05, + "loss": 0.3613, + "step": 3898000 + }, + { + "epoch": 26.381144434820268, + "grad_norm": 0.38577190041542053, + "learning_rate": 4.736188555651798e-05, + "loss": 0.3629, + "step": 3898500 + }, + { + "epoch": 26.384527934170638, + "grad_norm": 0.34817418456077576, + "learning_rate": 4.736154720658294e-05, + "loss": 0.362, + "step": 3899000 + }, + { + "epoch": 26.387911433521005, + "grad_norm": 0.3552338778972626, + "learning_rate": 4.73612088566479e-05, + "loss": 0.3625, + "step": 3899500 + }, + { + "epoch": 26.39129493287137, + "grad_norm": 0.38207247853279114, + "learning_rate": 4.736087050671287e-05, + "loss": 0.3622, + "step": 3900000 + }, + { + "epoch": 26.394678432221742, + "grad_norm": 0.39675194025039673, + "learning_rate": 4.7360532156777825e-05, + "loss": 0.3628, + "step": 3900500 + }, + { + "epoch": 26.39806193157211, + "grad_norm": 0.41346973180770874, + "learning_rate": 4.736019380684279e-05, + "loss": 0.3614, + "step": 3901000 + }, + { + "epoch": 26.40144543092248, + "grad_norm": 0.36755767464637756, + "learning_rate": 4.7359855456907756e-05, + "loss": 0.3626, + "step": 3901500 + }, + { + "epoch": 26.404828930272846, + "grad_norm": 0.39664730429649353, + "learning_rate": 4.735951710697272e-05, + "loss": 0.3623, + "step": 3902000 + }, + { + "epoch": 26.408212429623212, + "grad_norm": 0.37295064330101013, + "learning_rate": 4.735917875703768e-05, + "loss": 0.3609, + "step": 3902500 + }, + { + "epoch": 26.411595928973583, + "grad_norm": 0.37683048844337463, + "learning_rate": 4.735884040710264e-05, + "loss": 0.3619, + "step": 3903000 + }, + { + "epoch": 26.41497942832395, + "grad_norm": 0.3602394461631775, + "learning_rate": 4.735850205716761e-05, + "loss": 0.3618, + "step": 3903500 + }, + { + "epoch": 26.418362927674316, + "grad_norm": 0.4101329743862152, + "learning_rate": 4.7358163707232574e-05, + "loss": 0.362, + "step": 3904000 + }, + { + "epoch": 26.421746427024686, + "grad_norm": 0.3093413710594177, + "learning_rate": 4.7357825357297536e-05, + "loss": 0.3616, + "step": 3904500 + }, + { + "epoch": 26.425129926375053, + "grad_norm": 0.35834816098213196, + "learning_rate": 4.735748700736249e-05, + "loss": 0.3614, + "step": 3905000 + }, + { + "epoch": 26.428513425725424, + "grad_norm": 0.36701199412345886, + "learning_rate": 4.735714865742746e-05, + "loss": 0.3628, + "step": 3905500 + }, + { + "epoch": 26.43189692507579, + "grad_norm": 0.3996990919113159, + "learning_rate": 4.735681030749242e-05, + "loss": 0.3626, + "step": 3906000 + }, + { + "epoch": 26.435280424426157, + "grad_norm": 0.34733450412750244, + "learning_rate": 4.7356471957557384e-05, + "loss": 0.3622, + "step": 3906500 + }, + { + "epoch": 26.438663923776527, + "grad_norm": 0.37848150730133057, + "learning_rate": 4.7356133607622347e-05, + "loss": 0.3602, + "step": 3907000 + }, + { + "epoch": 26.442047423126894, + "grad_norm": 0.35177531838417053, + "learning_rate": 4.7355795257687315e-05, + "loss": 0.3616, + "step": 3907500 + }, + { + "epoch": 26.445430922477264, + "grad_norm": 0.3411475121974945, + "learning_rate": 4.735545690775228e-05, + "loss": 0.3629, + "step": 3908000 + }, + { + "epoch": 26.44881442182763, + "grad_norm": 0.3760984539985657, + "learning_rate": 4.735511855781724e-05, + "loss": 0.3608, + "step": 3908500 + }, + { + "epoch": 26.452197921177998, + "grad_norm": 0.3662256598472595, + "learning_rate": 4.73547802078822e-05, + "loss": 0.3627, + "step": 3909000 + }, + { + "epoch": 26.455581420528368, + "grad_norm": 0.3671654462814331, + "learning_rate": 4.735444185794717e-05, + "loss": 0.3629, + "step": 3909500 + }, + { + "epoch": 26.458964919878735, + "grad_norm": 0.38121622800827026, + "learning_rate": 4.7354103508012126e-05, + "loss": 0.3633, + "step": 3910000 + }, + { + "epoch": 26.462348419229105, + "grad_norm": 0.3634631931781769, + "learning_rate": 4.735376515807709e-05, + "loss": 0.3627, + "step": 3910500 + }, + { + "epoch": 26.465731918579472, + "grad_norm": 0.38152438402175903, + "learning_rate": 4.735342680814206e-05, + "loss": 0.3629, + "step": 3911000 + }, + { + "epoch": 26.46911541792984, + "grad_norm": 0.36148032546043396, + "learning_rate": 4.735308845820702e-05, + "loss": 0.3616, + "step": 3911500 + }, + { + "epoch": 26.47249891728021, + "grad_norm": 0.37832167744636536, + "learning_rate": 4.735275010827198e-05, + "loss": 0.3627, + "step": 3912000 + }, + { + "epoch": 26.475882416630576, + "grad_norm": 0.3521499037742615, + "learning_rate": 4.7352411758336943e-05, + "loss": 0.3594, + "step": 3912500 + }, + { + "epoch": 26.479265915980942, + "grad_norm": 0.32856032252311707, + "learning_rate": 4.735207340840191e-05, + "loss": 0.3638, + "step": 3913000 + }, + { + "epoch": 26.482649415331313, + "grad_norm": 0.3490496873855591, + "learning_rate": 4.7351735058466874e-05, + "loss": 0.3626, + "step": 3913500 + }, + { + "epoch": 26.48603291468168, + "grad_norm": 0.34898892045021057, + "learning_rate": 4.7351396708531837e-05, + "loss": 0.3616, + "step": 3914000 + }, + { + "epoch": 26.48941641403205, + "grad_norm": 0.3551773428916931, + "learning_rate": 4.735105835859679e-05, + "loss": 0.3631, + "step": 3914500 + }, + { + "epoch": 26.492799913382417, + "grad_norm": 0.3575814366340637, + "learning_rate": 4.735072000866176e-05, + "loss": 0.3616, + "step": 3915000 + }, + { + "epoch": 26.496183412732783, + "grad_norm": 0.3804273009300232, + "learning_rate": 4.735038165872672e-05, + "loss": 0.3607, + "step": 3915500 + }, + { + "epoch": 26.499566912083154, + "grad_norm": 0.42856428027153015, + "learning_rate": 4.7350043308791685e-05, + "loss": 0.3622, + "step": 3916000 + }, + { + "epoch": 26.50295041143352, + "grad_norm": 0.40299901366233826, + "learning_rate": 4.734970495885665e-05, + "loss": 0.3621, + "step": 3916500 + }, + { + "epoch": 26.50633391078389, + "grad_norm": 0.389115571975708, + "learning_rate": 4.7349366608921616e-05, + "loss": 0.3615, + "step": 3917000 + }, + { + "epoch": 26.509717410134257, + "grad_norm": 0.3799566626548767, + "learning_rate": 4.734902825898658e-05, + "loss": 0.3627, + "step": 3917500 + }, + { + "epoch": 26.513100909484624, + "grad_norm": 0.3813866376876831, + "learning_rate": 4.734868990905154e-05, + "loss": 0.3633, + "step": 3918000 + }, + { + "epoch": 26.516484408834994, + "grad_norm": 0.36013519763946533, + "learning_rate": 4.73483515591165e-05, + "loss": 0.3629, + "step": 3918500 + }, + { + "epoch": 26.51986790818536, + "grad_norm": 0.370895117521286, + "learning_rate": 4.734801320918147e-05, + "loss": 0.3617, + "step": 3919000 + }, + { + "epoch": 26.52325140753573, + "grad_norm": 0.3173573315143585, + "learning_rate": 4.734767485924643e-05, + "loss": 0.3621, + "step": 3919500 + }, + { + "epoch": 26.5266349068861, + "grad_norm": 0.3925241529941559, + "learning_rate": 4.734733650931139e-05, + "loss": 0.363, + "step": 3920000 + }, + { + "epoch": 26.530018406236465, + "grad_norm": 0.386522114276886, + "learning_rate": 4.734699815937636e-05, + "loss": 0.3634, + "step": 3920500 + }, + { + "epoch": 26.533401905586835, + "grad_norm": 0.37653228640556335, + "learning_rate": 4.734665980944132e-05, + "loss": 0.3632, + "step": 3921000 + }, + { + "epoch": 26.536785404937202, + "grad_norm": 0.3695538640022278, + "learning_rate": 4.734632145950628e-05, + "loss": 0.3627, + "step": 3921500 + }, + { + "epoch": 26.54016890428757, + "grad_norm": 0.34452754259109497, + "learning_rate": 4.7345983109571244e-05, + "loss": 0.3629, + "step": 3922000 + }, + { + "epoch": 26.54355240363794, + "grad_norm": 0.4069274365901947, + "learning_rate": 4.734564475963621e-05, + "loss": 0.3617, + "step": 3922500 + }, + { + "epoch": 26.546935902988306, + "grad_norm": 0.39067214727401733, + "learning_rate": 4.7345306409701175e-05, + "loss": 0.3609, + "step": 3923000 + }, + { + "epoch": 26.550319402338676, + "grad_norm": 0.367840051651001, + "learning_rate": 4.734496805976614e-05, + "loss": 0.3628, + "step": 3923500 + }, + { + "epoch": 26.553702901689043, + "grad_norm": 0.3256097137928009, + "learning_rate": 4.734462970983109e-05, + "loss": 0.3637, + "step": 3924000 + }, + { + "epoch": 26.55708640103941, + "grad_norm": 0.33790019154548645, + "learning_rate": 4.734429135989606e-05, + "loss": 0.36, + "step": 3924500 + }, + { + "epoch": 26.56046990038978, + "grad_norm": 0.38533130288124084, + "learning_rate": 4.7343953009961024e-05, + "loss": 0.3638, + "step": 3925000 + }, + { + "epoch": 26.563853399740147, + "grad_norm": 0.36499324440956116, + "learning_rate": 4.7343614660025986e-05, + "loss": 0.3632, + "step": 3925500 + }, + { + "epoch": 26.567236899090517, + "grad_norm": 0.3344828188419342, + "learning_rate": 4.734327631009095e-05, + "loss": 0.3609, + "step": 3926000 + }, + { + "epoch": 26.570620398440884, + "grad_norm": 0.3918331265449524, + "learning_rate": 4.734293796015592e-05, + "loss": 0.3623, + "step": 3926500 + }, + { + "epoch": 26.57400389779125, + "grad_norm": 0.35491883754730225, + "learning_rate": 4.734259961022088e-05, + "loss": 0.3611, + "step": 3927000 + }, + { + "epoch": 26.57738739714162, + "grad_norm": 0.363515704870224, + "learning_rate": 4.734226126028584e-05, + "loss": 0.3621, + "step": 3927500 + }, + { + "epoch": 26.580770896491988, + "grad_norm": 0.36305171251296997, + "learning_rate": 4.73419229103508e-05, + "loss": 0.362, + "step": 3928000 + }, + { + "epoch": 26.584154395842354, + "grad_norm": 0.3938886821269989, + "learning_rate": 4.734158456041577e-05, + "loss": 0.363, + "step": 3928500 + }, + { + "epoch": 26.587537895192725, + "grad_norm": 0.3645011782646179, + "learning_rate": 4.734124621048073e-05, + "loss": 0.363, + "step": 3929000 + }, + { + "epoch": 26.59092139454309, + "grad_norm": 0.3929731845855713, + "learning_rate": 4.734090786054569e-05, + "loss": 0.3625, + "step": 3929500 + }, + { + "epoch": 26.59430489389346, + "grad_norm": 0.3837154507637024, + "learning_rate": 4.734056951061066e-05, + "loss": 0.3621, + "step": 3930000 + }, + { + "epoch": 26.59768839324383, + "grad_norm": 0.3836233615875244, + "learning_rate": 4.734023116067562e-05, + "loss": 0.3619, + "step": 3930500 + }, + { + "epoch": 26.601071892594195, + "grad_norm": 0.41164737939834595, + "learning_rate": 4.733989281074058e-05, + "loss": 0.3627, + "step": 3931000 + }, + { + "epoch": 26.604455391944565, + "grad_norm": 0.4338706135749817, + "learning_rate": 4.7339554460805545e-05, + "loss": 0.3629, + "step": 3931500 + }, + { + "epoch": 26.607838891294932, + "grad_norm": 0.3807239234447479, + "learning_rate": 4.733921611087051e-05, + "loss": 0.3629, + "step": 3932000 + }, + { + "epoch": 26.611222390645302, + "grad_norm": 0.36886560916900635, + "learning_rate": 4.7338877760935476e-05, + "loss": 0.3609, + "step": 3932500 + }, + { + "epoch": 26.61460588999567, + "grad_norm": 0.35011523962020874, + "learning_rate": 4.733853941100044e-05, + "loss": 0.3626, + "step": 3933000 + }, + { + "epoch": 26.617989389346036, + "grad_norm": 0.3590085208415985, + "learning_rate": 4.733820106106539e-05, + "loss": 0.3631, + "step": 3933500 + }, + { + "epoch": 26.621372888696406, + "grad_norm": 0.39787015318870544, + "learning_rate": 4.733786271113036e-05, + "loss": 0.3612, + "step": 3934000 + }, + { + "epoch": 26.624756388046773, + "grad_norm": 0.4101943075656891, + "learning_rate": 4.7337524361195324e-05, + "loss": 0.3616, + "step": 3934500 + }, + { + "epoch": 26.628139887397143, + "grad_norm": 0.35002443194389343, + "learning_rate": 4.7337186011260286e-05, + "loss": 0.3624, + "step": 3935000 + }, + { + "epoch": 26.63152338674751, + "grad_norm": 0.32612723112106323, + "learning_rate": 4.733684766132525e-05, + "loss": 0.3603, + "step": 3935500 + }, + { + "epoch": 26.634906886097877, + "grad_norm": 0.39648929238319397, + "learning_rate": 4.733650931139022e-05, + "loss": 0.3626, + "step": 3936000 + }, + { + "epoch": 26.638290385448247, + "grad_norm": 0.3798840045928955, + "learning_rate": 4.733617096145518e-05, + "loss": 0.3619, + "step": 3936500 + }, + { + "epoch": 26.641673884798614, + "grad_norm": 0.3551783859729767, + "learning_rate": 4.733583261152014e-05, + "loss": 0.3609, + "step": 3937000 + }, + { + "epoch": 26.64505738414898, + "grad_norm": 0.3756074011325836, + "learning_rate": 4.7335494261585104e-05, + "loss": 0.3626, + "step": 3937500 + }, + { + "epoch": 26.64844088349935, + "grad_norm": 0.37941914796829224, + "learning_rate": 4.733515591165007e-05, + "loss": 0.3632, + "step": 3938000 + }, + { + "epoch": 26.651824382849718, + "grad_norm": 0.3654654324054718, + "learning_rate": 4.733481756171503e-05, + "loss": 0.3623, + "step": 3938500 + }, + { + "epoch": 26.655207882200088, + "grad_norm": 0.38069528341293335, + "learning_rate": 4.733447921177999e-05, + "loss": 0.3621, + "step": 3939000 + }, + { + "epoch": 26.658591381550455, + "grad_norm": 0.3884636461734772, + "learning_rate": 4.733414086184495e-05, + "loss": 0.3628, + "step": 3939500 + }, + { + "epoch": 26.66197488090082, + "grad_norm": 0.3615627586841583, + "learning_rate": 4.733380251190992e-05, + "loss": 0.3615, + "step": 3940000 + }, + { + "epoch": 26.66535838025119, + "grad_norm": 0.37071868777275085, + "learning_rate": 4.733346416197488e-05, + "loss": 0.3627, + "step": 3940500 + }, + { + "epoch": 26.66874187960156, + "grad_norm": 0.3769175410270691, + "learning_rate": 4.7333125812039845e-05, + "loss": 0.3634, + "step": 3941000 + }, + { + "epoch": 26.67212537895193, + "grad_norm": 0.396505206823349, + "learning_rate": 4.733278746210481e-05, + "loss": 0.3629, + "step": 3941500 + }, + { + "epoch": 26.675508878302296, + "grad_norm": 0.3847061097621918, + "learning_rate": 4.7332449112169776e-05, + "loss": 0.3616, + "step": 3942000 + }, + { + "epoch": 26.678892377652662, + "grad_norm": 0.3411423861980438, + "learning_rate": 4.733211076223474e-05, + "loss": 0.3627, + "step": 3942500 + }, + { + "epoch": 26.682275877003033, + "grad_norm": 0.36055833101272583, + "learning_rate": 4.7331772412299694e-05, + "loss": 0.3618, + "step": 3943000 + }, + { + "epoch": 26.6856593763534, + "grad_norm": 0.31502601504325867, + "learning_rate": 4.733143406236466e-05, + "loss": 0.3628, + "step": 3943500 + }, + { + "epoch": 26.68904287570377, + "grad_norm": 0.3988841772079468, + "learning_rate": 4.7331095712429625e-05, + "loss": 0.3632, + "step": 3944000 + }, + { + "epoch": 26.692426375054136, + "grad_norm": 0.3657224476337433, + "learning_rate": 4.733075736249459e-05, + "loss": 0.362, + "step": 3944500 + }, + { + "epoch": 26.695809874404503, + "grad_norm": 0.37842491269111633, + "learning_rate": 4.733041901255955e-05, + "loss": 0.3619, + "step": 3945000 + }, + { + "epoch": 26.699193373754873, + "grad_norm": 0.321781188249588, + "learning_rate": 4.733008066262452e-05, + "loss": 0.3605, + "step": 3945500 + }, + { + "epoch": 26.70257687310524, + "grad_norm": 0.4276789426803589, + "learning_rate": 4.732974231268948e-05, + "loss": 0.3617, + "step": 3946000 + }, + { + "epoch": 26.705960372455607, + "grad_norm": 0.3677583336830139, + "learning_rate": 4.732940396275444e-05, + "loss": 0.362, + "step": 3946500 + }, + { + "epoch": 26.709343871805977, + "grad_norm": 0.3639909625053406, + "learning_rate": 4.7329065612819404e-05, + "loss": 0.3617, + "step": 3947000 + }, + { + "epoch": 26.712727371156344, + "grad_norm": 0.3415442407131195, + "learning_rate": 4.732872726288437e-05, + "loss": 0.3626, + "step": 3947500 + }, + { + "epoch": 26.716110870506714, + "grad_norm": 0.3350616693496704, + "learning_rate": 4.732838891294933e-05, + "loss": 0.3624, + "step": 3948000 + }, + { + "epoch": 26.71949436985708, + "grad_norm": 0.3643999695777893, + "learning_rate": 4.732805056301429e-05, + "loss": 0.362, + "step": 3948500 + }, + { + "epoch": 26.722877869207448, + "grad_norm": 0.33907368779182434, + "learning_rate": 4.732771221307925e-05, + "loss": 0.3627, + "step": 3949000 + }, + { + "epoch": 26.726261368557818, + "grad_norm": 0.41405385732650757, + "learning_rate": 4.732737386314422e-05, + "loss": 0.3626, + "step": 3949500 + }, + { + "epoch": 26.729644867908185, + "grad_norm": 0.366876482963562, + "learning_rate": 4.7327035513209184e-05, + "loss": 0.3615, + "step": 3950000 + }, + { + "epoch": 26.733028367258555, + "grad_norm": 0.3724476993083954, + "learning_rate": 4.7326697163274146e-05, + "loss": 0.3631, + "step": 3950500 + }, + { + "epoch": 26.736411866608922, + "grad_norm": 0.37764909863471985, + "learning_rate": 4.732635881333911e-05, + "loss": 0.3625, + "step": 3951000 + }, + { + "epoch": 26.73979536595929, + "grad_norm": 0.37439340353012085, + "learning_rate": 4.732602046340408e-05, + "loss": 0.3633, + "step": 3951500 + }, + { + "epoch": 26.74317886530966, + "grad_norm": 0.36886876821517944, + "learning_rate": 4.732568211346904e-05, + "loss": 0.3635, + "step": 3952000 + }, + { + "epoch": 26.746562364660026, + "grad_norm": 0.37583333253860474, + "learning_rate": 4.7325343763533994e-05, + "loss": 0.3625, + "step": 3952500 + }, + { + "epoch": 26.749945864010392, + "grad_norm": 0.33810582756996155, + "learning_rate": 4.732500541359896e-05, + "loss": 0.3635, + "step": 3953000 + }, + { + "epoch": 26.753329363360763, + "grad_norm": 0.35692256689071655, + "learning_rate": 4.7324667063663925e-05, + "loss": 0.3624, + "step": 3953500 + }, + { + "epoch": 26.75671286271113, + "grad_norm": 0.33927008509635925, + "learning_rate": 4.732432871372889e-05, + "loss": 0.3632, + "step": 3954000 + }, + { + "epoch": 26.7600963620615, + "grad_norm": 0.3944675028324127, + "learning_rate": 4.732399036379385e-05, + "loss": 0.3656, + "step": 3954500 + }, + { + "epoch": 26.763479861411867, + "grad_norm": 0.35236796736717224, + "learning_rate": 4.732365201385882e-05, + "loss": 0.3611, + "step": 3955000 + }, + { + "epoch": 26.766863360762233, + "grad_norm": 0.43203791975975037, + "learning_rate": 4.732331366392378e-05, + "loss": 0.362, + "step": 3955500 + }, + { + "epoch": 26.770246860112604, + "grad_norm": 0.4389057457447052, + "learning_rate": 4.732297531398874e-05, + "loss": 0.362, + "step": 3956000 + }, + { + "epoch": 26.77363035946297, + "grad_norm": 0.37879079580307007, + "learning_rate": 4.7322636964053705e-05, + "loss": 0.363, + "step": 3956500 + }, + { + "epoch": 26.77701385881334, + "grad_norm": 0.35850194096565247, + "learning_rate": 4.7322298614118674e-05, + "loss": 0.3616, + "step": 3957000 + }, + { + "epoch": 26.780397358163707, + "grad_norm": 0.35824859142303467, + "learning_rate": 4.732196026418363e-05, + "loss": 0.3617, + "step": 3957500 + }, + { + "epoch": 26.783780857514074, + "grad_norm": 0.3858749270439148, + "learning_rate": 4.732162191424859e-05, + "loss": 0.3629, + "step": 3958000 + }, + { + "epoch": 26.787164356864444, + "grad_norm": 0.3488672375679016, + "learning_rate": 4.7321283564313553e-05, + "loss": 0.3626, + "step": 3958500 + }, + { + "epoch": 26.79054785621481, + "grad_norm": 0.39496272802352905, + "learning_rate": 4.732094521437852e-05, + "loss": 0.3633, + "step": 3959000 + }, + { + "epoch": 26.793931355565178, + "grad_norm": 0.40459078550338745, + "learning_rate": 4.7320606864443484e-05, + "loss": 0.3614, + "step": 3959500 + }, + { + "epoch": 26.797314854915548, + "grad_norm": 0.3908441364765167, + "learning_rate": 4.7320268514508447e-05, + "loss": 0.3618, + "step": 3960000 + }, + { + "epoch": 26.800698354265915, + "grad_norm": 0.3586594760417938, + "learning_rate": 4.731993016457341e-05, + "loss": 0.3628, + "step": 3960500 + }, + { + "epoch": 26.804081853616285, + "grad_norm": 0.3698421120643616, + "learning_rate": 4.731959181463838e-05, + "loss": 0.363, + "step": 3961000 + }, + { + "epoch": 26.807465352966652, + "grad_norm": 0.3757856488227844, + "learning_rate": 4.731925346470334e-05, + "loss": 0.3625, + "step": 3961500 + }, + { + "epoch": 26.81084885231702, + "grad_norm": 0.39045828580856323, + "learning_rate": 4.73189151147683e-05, + "loss": 0.361, + "step": 3962000 + }, + { + "epoch": 26.81423235166739, + "grad_norm": 0.3710618019104004, + "learning_rate": 4.7318576764833264e-05, + "loss": 0.3612, + "step": 3962500 + }, + { + "epoch": 26.817615851017756, + "grad_norm": 0.39672067761421204, + "learning_rate": 4.7318238414898226e-05, + "loss": 0.3633, + "step": 3963000 + }, + { + "epoch": 26.820999350368126, + "grad_norm": 0.3813120424747467, + "learning_rate": 4.731790006496319e-05, + "loss": 0.3627, + "step": 3963500 + }, + { + "epoch": 26.824382849718493, + "grad_norm": 0.3530002236366272, + "learning_rate": 4.731756171502815e-05, + "loss": 0.3638, + "step": 3964000 + }, + { + "epoch": 26.82776634906886, + "grad_norm": 0.4095313847064972, + "learning_rate": 4.731722336509312e-05, + "loss": 0.3614, + "step": 3964500 + }, + { + "epoch": 26.83114984841923, + "grad_norm": 0.3469836413860321, + "learning_rate": 4.731688501515808e-05, + "loss": 0.3623, + "step": 3965000 + }, + { + "epoch": 26.834533347769597, + "grad_norm": 0.36245352029800415, + "learning_rate": 4.7316546665223043e-05, + "loss": 0.3616, + "step": 3965500 + }, + { + "epoch": 26.837916847119967, + "grad_norm": 0.3662642240524292, + "learning_rate": 4.7316208315288006e-05, + "loss": 0.363, + "step": 3966000 + }, + { + "epoch": 26.841300346470334, + "grad_norm": 0.37218987941741943, + "learning_rate": 4.7315869965352975e-05, + "loss": 0.3615, + "step": 3966500 + }, + { + "epoch": 26.8446838458207, + "grad_norm": 0.3281077444553375, + "learning_rate": 4.731553161541793e-05, + "loss": 0.3622, + "step": 3967000 + }, + { + "epoch": 26.84806734517107, + "grad_norm": 0.3949171006679535, + "learning_rate": 4.731519326548289e-05, + "loss": 0.3628, + "step": 3967500 + }, + { + "epoch": 26.851450844521437, + "grad_norm": 0.4136095941066742, + "learning_rate": 4.7314854915547854e-05, + "loss": 0.3641, + "step": 3968000 + }, + { + "epoch": 26.854834343871808, + "grad_norm": 0.33874234557151794, + "learning_rate": 4.731451656561282e-05, + "loss": 0.3622, + "step": 3968500 + }, + { + "epoch": 26.858217843222175, + "grad_norm": 0.338762491941452, + "learning_rate": 4.7314178215677785e-05, + "loss": 0.3619, + "step": 3969000 + }, + { + "epoch": 26.86160134257254, + "grad_norm": 0.3444060683250427, + "learning_rate": 4.731383986574275e-05, + "loss": 0.3635, + "step": 3969500 + }, + { + "epoch": 26.86498484192291, + "grad_norm": 0.3588503301143646, + "learning_rate": 4.731350151580771e-05, + "loss": 0.3612, + "step": 3970000 + }, + { + "epoch": 26.86836834127328, + "grad_norm": 0.37239402532577515, + "learning_rate": 4.731316316587268e-05, + "loss": 0.3631, + "step": 3970500 + }, + { + "epoch": 26.871751840623645, + "grad_norm": 0.3820563852787018, + "learning_rate": 4.731282481593764e-05, + "loss": 0.364, + "step": 3971000 + }, + { + "epoch": 26.875135339974015, + "grad_norm": 0.35125070810317993, + "learning_rate": 4.73124864660026e-05, + "loss": 0.3616, + "step": 3971500 + }, + { + "epoch": 26.878518839324382, + "grad_norm": 0.36873704195022583, + "learning_rate": 4.7312148116067565e-05, + "loss": 0.364, + "step": 3972000 + }, + { + "epoch": 26.881902338674752, + "grad_norm": 0.3618520498275757, + "learning_rate": 4.731180976613253e-05, + "loss": 0.3629, + "step": 3972500 + }, + { + "epoch": 26.88528583802512, + "grad_norm": 0.3631375730037689, + "learning_rate": 4.731147141619749e-05, + "loss": 0.3624, + "step": 3973000 + }, + { + "epoch": 26.888669337375486, + "grad_norm": 0.3646116256713867, + "learning_rate": 4.731113306626245e-05, + "loss": 0.3641, + "step": 3973500 + }, + { + "epoch": 26.892052836725856, + "grad_norm": 0.35803481936454773, + "learning_rate": 4.731079471632742e-05, + "loss": 0.3615, + "step": 3974000 + }, + { + "epoch": 26.895436336076223, + "grad_norm": 0.3725024461746216, + "learning_rate": 4.731045636639238e-05, + "loss": 0.3619, + "step": 3974500 + }, + { + "epoch": 26.898819835426593, + "grad_norm": 0.36321380734443665, + "learning_rate": 4.7310118016457344e-05, + "loss": 0.3624, + "step": 3975000 + }, + { + "epoch": 26.90220333477696, + "grad_norm": 0.33477428555488586, + "learning_rate": 4.7309779666522306e-05, + "loss": 0.3628, + "step": 3975500 + }, + { + "epoch": 26.905586834127327, + "grad_norm": 0.3610873818397522, + "learning_rate": 4.7309441316587275e-05, + "loss": 0.3623, + "step": 3976000 + }, + { + "epoch": 26.908970333477697, + "grad_norm": 0.3352227210998535, + "learning_rate": 4.730910296665223e-05, + "loss": 0.3631, + "step": 3976500 + }, + { + "epoch": 26.912353832828064, + "grad_norm": 0.3139859437942505, + "learning_rate": 4.730876461671719e-05, + "loss": 0.3632, + "step": 3977000 + }, + { + "epoch": 26.91573733217843, + "grad_norm": 0.36809584498405457, + "learning_rate": 4.7308426266782155e-05, + "loss": 0.3613, + "step": 3977500 + }, + { + "epoch": 26.9191208315288, + "grad_norm": 0.3616427481174469, + "learning_rate": 4.7308087916847124e-05, + "loss": 0.3639, + "step": 3978000 + }, + { + "epoch": 26.922504330879168, + "grad_norm": 0.35086822509765625, + "learning_rate": 4.7307749566912086e-05, + "loss": 0.3615, + "step": 3978500 + }, + { + "epoch": 26.925887830229538, + "grad_norm": 0.3581278622150421, + "learning_rate": 4.730741121697705e-05, + "loss": 0.3626, + "step": 3979000 + }, + { + "epoch": 26.929271329579905, + "grad_norm": 0.3818640410900116, + "learning_rate": 4.730707286704201e-05, + "loss": 0.3625, + "step": 3979500 + }, + { + "epoch": 26.93265482893027, + "grad_norm": 0.3971289396286011, + "learning_rate": 4.730673451710698e-05, + "loss": 0.3623, + "step": 3980000 + }, + { + "epoch": 26.93603832828064, + "grad_norm": 0.39175015687942505, + "learning_rate": 4.730639616717194e-05, + "loss": 0.3627, + "step": 3980500 + }, + { + "epoch": 26.93942182763101, + "grad_norm": 0.40893542766571045, + "learning_rate": 4.73060578172369e-05, + "loss": 0.3633, + "step": 3981000 + }, + { + "epoch": 26.94280532698138, + "grad_norm": 0.35594314336776733, + "learning_rate": 4.7305719467301865e-05, + "loss": 0.3627, + "step": 3981500 + }, + { + "epoch": 26.946188826331746, + "grad_norm": 0.36786365509033203, + "learning_rate": 4.730538111736683e-05, + "loss": 0.3612, + "step": 3982000 + }, + { + "epoch": 26.949572325682112, + "grad_norm": 0.3444536328315735, + "learning_rate": 4.730504276743179e-05, + "loss": 0.3622, + "step": 3982500 + }, + { + "epoch": 26.952955825032483, + "grad_norm": 0.35013580322265625, + "learning_rate": 4.730470441749675e-05, + "loss": 0.3631, + "step": 3983000 + }, + { + "epoch": 26.95633932438285, + "grad_norm": 0.36221638321876526, + "learning_rate": 4.730436606756172e-05, + "loss": 0.3624, + "step": 3983500 + }, + { + "epoch": 26.959722823733216, + "grad_norm": 0.36968088150024414, + "learning_rate": 4.730402771762668e-05, + "loss": 0.3625, + "step": 3984000 + }, + { + "epoch": 26.963106323083586, + "grad_norm": 0.3494577705860138, + "learning_rate": 4.7303689367691645e-05, + "loss": 0.3627, + "step": 3984500 + }, + { + "epoch": 26.966489822433953, + "grad_norm": 0.3767252266407013, + "learning_rate": 4.730335101775661e-05, + "loss": 0.3632, + "step": 3985000 + }, + { + "epoch": 26.969873321784323, + "grad_norm": 0.36130470037460327, + "learning_rate": 4.730301266782157e-05, + "loss": 0.3628, + "step": 3985500 + }, + { + "epoch": 26.97325682113469, + "grad_norm": 0.3665076792240143, + "learning_rate": 4.730267431788653e-05, + "loss": 0.3647, + "step": 3986000 + }, + { + "epoch": 26.976640320485057, + "grad_norm": 0.37495723366737366, + "learning_rate": 4.730233596795149e-05, + "loss": 0.3631, + "step": 3986500 + }, + { + "epoch": 26.980023819835427, + "grad_norm": 0.3862490952014923, + "learning_rate": 4.7301997618016455e-05, + "loss": 0.3622, + "step": 3987000 + }, + { + "epoch": 26.983407319185794, + "grad_norm": 0.3855757415294647, + "learning_rate": 4.7301659268081424e-05, + "loss": 0.3602, + "step": 3987500 + }, + { + "epoch": 26.986790818536164, + "grad_norm": 0.3551534116268158, + "learning_rate": 4.7301320918146386e-05, + "loss": 0.3624, + "step": 3988000 + }, + { + "epoch": 26.99017431788653, + "grad_norm": 0.3630467653274536, + "learning_rate": 4.730098256821135e-05, + "loss": 0.3615, + "step": 3988500 + }, + { + "epoch": 26.993557817236898, + "grad_norm": 0.3774111866950989, + "learning_rate": 4.730064421827631e-05, + "loss": 0.3648, + "step": 3989000 + }, + { + "epoch": 26.996941316587268, + "grad_norm": 0.35956814885139465, + "learning_rate": 4.730030586834128e-05, + "loss": 0.3621, + "step": 3989500 + }, + { + "epoch": 27.0, + "eval_accuracy": 0.8617120390552822, + "eval_loss": 0.5606268644332886, + "eval_runtime": 3363.4255, + "eval_samples_per_second": 86.443, + "eval_steps_per_second": 5.403, + "step": 3989952 + }, + { + "epoch": 27.000324815937635, + "grad_norm": 0.36992666125297546, + "learning_rate": 4.729996751840624e-05, + "loss": 0.362, + "step": 3990000 + }, + { + "epoch": 27.003708315288005, + "grad_norm": 0.4187975227832794, + "learning_rate": 4.7299629168471204e-05, + "loss": 0.3592, + "step": 3990500 + }, + { + "epoch": 27.007091814638372, + "grad_norm": 0.35702478885650635, + "learning_rate": 4.7299290818536166e-05, + "loss": 0.3611, + "step": 3991000 + }, + { + "epoch": 27.01047531398874, + "grad_norm": 0.40064552426338196, + "learning_rate": 4.729895246860113e-05, + "loss": 0.3586, + "step": 3991500 + }, + { + "epoch": 27.01385881333911, + "grad_norm": 0.39525461196899414, + "learning_rate": 4.729861411866609e-05, + "loss": 0.3607, + "step": 3992000 + }, + { + "epoch": 27.017242312689476, + "grad_norm": 0.3214837312698364, + "learning_rate": 4.729827576873105e-05, + "loss": 0.3591, + "step": 3992500 + }, + { + "epoch": 27.020625812039842, + "grad_norm": 0.362924724817276, + "learning_rate": 4.729793741879602e-05, + "loss": 0.3579, + "step": 3993000 + }, + { + "epoch": 27.024009311390213, + "grad_norm": 0.418075293302536, + "learning_rate": 4.729759906886098e-05, + "loss": 0.359, + "step": 3993500 + }, + { + "epoch": 27.02739281074058, + "grad_norm": 0.39205074310302734, + "learning_rate": 4.7297260718925945e-05, + "loss": 0.3611, + "step": 3994000 + }, + { + "epoch": 27.03077631009095, + "grad_norm": 0.341508150100708, + "learning_rate": 4.729692236899091e-05, + "loss": 0.3607, + "step": 3994500 + }, + { + "epoch": 27.034159809441316, + "grad_norm": 0.3604142963886261, + "learning_rate": 4.729658401905587e-05, + "loss": 0.3597, + "step": 3995000 + }, + { + "epoch": 27.037543308791683, + "grad_norm": 0.39036643505096436, + "learning_rate": 4.729624566912083e-05, + "loss": 0.3611, + "step": 3995500 + }, + { + "epoch": 27.040926808142054, + "grad_norm": 0.3812994360923767, + "learning_rate": 4.7295907319185794e-05, + "loss": 0.3609, + "step": 3996000 + }, + { + "epoch": 27.04431030749242, + "grad_norm": 0.3937513530254364, + "learning_rate": 4.7295568969250756e-05, + "loss": 0.3617, + "step": 3996500 + }, + { + "epoch": 27.04769380684279, + "grad_norm": 0.3918488323688507, + "learning_rate": 4.7295230619315725e-05, + "loss": 0.3582, + "step": 3997000 + }, + { + "epoch": 27.051077306193157, + "grad_norm": 0.36170023679733276, + "learning_rate": 4.729489226938069e-05, + "loss": 0.3612, + "step": 3997500 + }, + { + "epoch": 27.054460805543524, + "grad_norm": 0.3825434446334839, + "learning_rate": 4.729455391944565e-05, + "loss": 0.3616, + "step": 3998000 + }, + { + "epoch": 27.057844304893894, + "grad_norm": 0.37038761377334595, + "learning_rate": 4.729421556951061e-05, + "loss": 0.361, + "step": 3998500 + }, + { + "epoch": 27.06122780424426, + "grad_norm": 0.33910655975341797, + "learning_rate": 4.729387721957558e-05, + "loss": 0.362, + "step": 3999000 + }, + { + "epoch": 27.06461130359463, + "grad_norm": 0.37690266966819763, + "learning_rate": 4.729353886964054e-05, + "loss": 0.3625, + "step": 3999500 + }, + { + "epoch": 27.067994802944998, + "grad_norm": 0.3696511685848236, + "learning_rate": 4.7293200519705504e-05, + "loss": 0.3615, + "step": 4000000 + }, + { + "epoch": 27.071378302295365, + "grad_norm": 0.35271111130714417, + "learning_rate": 4.7292862169770467e-05, + "loss": 0.3619, + "step": 4000500 + }, + { + "epoch": 27.074761801645735, + "grad_norm": 0.3805581033229828, + "learning_rate": 4.729252381983543e-05, + "loss": 0.361, + "step": 4001000 + }, + { + "epoch": 27.078145300996102, + "grad_norm": 0.39928561449050903, + "learning_rate": 4.729218546990039e-05, + "loss": 0.3592, + "step": 4001500 + }, + { + "epoch": 27.08152880034647, + "grad_norm": 0.367096871137619, + "learning_rate": 4.729184711996535e-05, + "loss": 0.3585, + "step": 4002000 + }, + { + "epoch": 27.08491229969684, + "grad_norm": 0.34236449003219604, + "learning_rate": 4.7291508770030315e-05, + "loss": 0.3611, + "step": 4002500 + }, + { + "epoch": 27.088295799047206, + "grad_norm": 0.34770265221595764, + "learning_rate": 4.7291170420095284e-05, + "loss": 0.3614, + "step": 4003000 + }, + { + "epoch": 27.091679298397576, + "grad_norm": 0.37283119559288025, + "learning_rate": 4.7290832070160246e-05, + "loss": 0.3587, + "step": 4003500 + }, + { + "epoch": 27.095062797747943, + "grad_norm": 0.4142138957977295, + "learning_rate": 4.729049372022521e-05, + "loss": 0.3606, + "step": 4004000 + }, + { + "epoch": 27.09844629709831, + "grad_norm": 0.3636082112789154, + "learning_rate": 4.729015537029017e-05, + "loss": 0.3617, + "step": 4004500 + }, + { + "epoch": 27.10182979644868, + "grad_norm": 0.34307849407196045, + "learning_rate": 4.728981702035513e-05, + "loss": 0.3608, + "step": 4005000 + }, + { + "epoch": 27.105213295799047, + "grad_norm": 0.3708007037639618, + "learning_rate": 4.7289478670420095e-05, + "loss": 0.361, + "step": 4005500 + }, + { + "epoch": 27.108596795149417, + "grad_norm": 0.39478373527526855, + "learning_rate": 4.728914032048506e-05, + "loss": 0.3617, + "step": 4006000 + }, + { + "epoch": 27.111980294499784, + "grad_norm": 0.3448827564716339, + "learning_rate": 4.7288801970550026e-05, + "loss": 0.3614, + "step": 4006500 + }, + { + "epoch": 27.11536379385015, + "grad_norm": 0.35732850432395935, + "learning_rate": 4.728846362061499e-05, + "loss": 0.3624, + "step": 4007000 + }, + { + "epoch": 27.11874729320052, + "grad_norm": 0.39854931831359863, + "learning_rate": 4.728812527067995e-05, + "loss": 0.3613, + "step": 4007500 + }, + { + "epoch": 27.122130792550887, + "grad_norm": 0.39971092343330383, + "learning_rate": 4.728778692074491e-05, + "loss": 0.3604, + "step": 4008000 + }, + { + "epoch": 27.125514291901254, + "grad_norm": 0.36446234583854675, + "learning_rate": 4.728744857080988e-05, + "loss": 0.3601, + "step": 4008500 + }, + { + "epoch": 27.128897791251624, + "grad_norm": 0.36135321855545044, + "learning_rate": 4.728711022087484e-05, + "loss": 0.3626, + "step": 4009000 + }, + { + "epoch": 27.13228129060199, + "grad_norm": 0.390614777803421, + "learning_rate": 4.7286771870939805e-05, + "loss": 0.3599, + "step": 4009500 + }, + { + "epoch": 27.13566478995236, + "grad_norm": 0.40647169947624207, + "learning_rate": 4.728643352100476e-05, + "loss": 0.3619, + "step": 4010000 + }, + { + "epoch": 27.13904828930273, + "grad_norm": 0.3296395242214203, + "learning_rate": 4.728609517106973e-05, + "loss": 0.3619, + "step": 4010500 + }, + { + "epoch": 27.142431788653095, + "grad_norm": 0.37486401200294495, + "learning_rate": 4.728575682113469e-05, + "loss": 0.3605, + "step": 4011000 + }, + { + "epoch": 27.145815288003465, + "grad_norm": 0.3922019898891449, + "learning_rate": 4.7285418471199654e-05, + "loss": 0.3608, + "step": 4011500 + }, + { + "epoch": 27.149198787353832, + "grad_norm": 0.32637348771095276, + "learning_rate": 4.7285080121264616e-05, + "loss": 0.3599, + "step": 4012000 + }, + { + "epoch": 27.152582286704202, + "grad_norm": 0.3791635036468506, + "learning_rate": 4.7284741771329585e-05, + "loss": 0.3609, + "step": 4012500 + }, + { + "epoch": 27.15596578605457, + "grad_norm": 0.36705470085144043, + "learning_rate": 4.728440342139455e-05, + "loss": 0.3614, + "step": 4013000 + }, + { + "epoch": 27.159349285404936, + "grad_norm": 0.39378514885902405, + "learning_rate": 4.728406507145951e-05, + "loss": 0.362, + "step": 4013500 + }, + { + "epoch": 27.162732784755306, + "grad_norm": 0.35343000292778015, + "learning_rate": 4.728372672152447e-05, + "loss": 0.3617, + "step": 4014000 + }, + { + "epoch": 27.166116284105673, + "grad_norm": 0.3905445337295532, + "learning_rate": 4.728338837158944e-05, + "loss": 0.3611, + "step": 4014500 + }, + { + "epoch": 27.169499783456043, + "grad_norm": 0.36750927567481995, + "learning_rate": 4.7283050021654395e-05, + "loss": 0.3583, + "step": 4015000 + }, + { + "epoch": 27.17288328280641, + "grad_norm": 0.3575628399848938, + "learning_rate": 4.728271167171936e-05, + "loss": 0.3627, + "step": 4015500 + }, + { + "epoch": 27.176266782156777, + "grad_norm": 0.35029515624046326, + "learning_rate": 4.7282373321784326e-05, + "loss": 0.3629, + "step": 4016000 + }, + { + "epoch": 27.179650281507147, + "grad_norm": 0.4039035439491272, + "learning_rate": 4.728203497184929e-05, + "loss": 0.3619, + "step": 4016500 + }, + { + "epoch": 27.183033780857514, + "grad_norm": 0.37368181347846985, + "learning_rate": 4.728169662191425e-05, + "loss": 0.362, + "step": 4017000 + }, + { + "epoch": 27.18641728020788, + "grad_norm": 0.38231322169303894, + "learning_rate": 4.728135827197921e-05, + "loss": 0.3599, + "step": 4017500 + }, + { + "epoch": 27.18980077955825, + "grad_norm": 0.38161662220954895, + "learning_rate": 4.728101992204418e-05, + "loss": 0.3628, + "step": 4018000 + }, + { + "epoch": 27.193184278908618, + "grad_norm": 0.4048933684825897, + "learning_rate": 4.7280681572109144e-05, + "loss": 0.3604, + "step": 4018500 + }, + { + "epoch": 27.196567778258988, + "grad_norm": 0.4047812521457672, + "learning_rate": 4.7280343222174106e-05, + "loss": 0.3602, + "step": 4019000 + }, + { + "epoch": 27.199951277609355, + "grad_norm": 0.39230266213417053, + "learning_rate": 4.728000487223906e-05, + "loss": 0.3609, + "step": 4019500 + }, + { + "epoch": 27.20333477695972, + "grad_norm": 0.3826599717140198, + "learning_rate": 4.727966652230403e-05, + "loss": 0.3623, + "step": 4020000 + }, + { + "epoch": 27.20671827631009, + "grad_norm": 0.3624647557735443, + "learning_rate": 4.727932817236899e-05, + "loss": 0.361, + "step": 4020500 + }, + { + "epoch": 27.21010177566046, + "grad_norm": 0.3965359330177307, + "learning_rate": 4.7278989822433954e-05, + "loss": 0.3619, + "step": 4021000 + }, + { + "epoch": 27.21348527501083, + "grad_norm": 0.3927077353000641, + "learning_rate": 4.7278651472498916e-05, + "loss": 0.3597, + "step": 4021500 + }, + { + "epoch": 27.216868774361195, + "grad_norm": 0.3874165117740631, + "learning_rate": 4.7278313122563885e-05, + "loss": 0.3618, + "step": 4022000 + }, + { + "epoch": 27.220252273711562, + "grad_norm": 0.36228522658348083, + "learning_rate": 4.727797477262885e-05, + "loss": 0.3608, + "step": 4022500 + }, + { + "epoch": 27.223635773061932, + "grad_norm": 0.3947308361530304, + "learning_rate": 4.727763642269381e-05, + "loss": 0.3606, + "step": 4023000 + }, + { + "epoch": 27.2270192724123, + "grad_norm": 0.353760302066803, + "learning_rate": 4.727729807275877e-05, + "loss": 0.3622, + "step": 4023500 + }, + { + "epoch": 27.23040277176267, + "grad_norm": 0.34535297751426697, + "learning_rate": 4.727695972282374e-05, + "loss": 0.3618, + "step": 4024000 + }, + { + "epoch": 27.233786271113036, + "grad_norm": 0.3804129362106323, + "learning_rate": 4.7276621372888696e-05, + "loss": 0.3609, + "step": 4024500 + }, + { + "epoch": 27.237169770463403, + "grad_norm": 0.42950165271759033, + "learning_rate": 4.727628302295366e-05, + "loss": 0.3612, + "step": 4025000 + }, + { + "epoch": 27.240553269813773, + "grad_norm": 0.3900119662284851, + "learning_rate": 4.727594467301863e-05, + "loss": 0.3625, + "step": 4025500 + }, + { + "epoch": 27.24393676916414, + "grad_norm": 0.38278695940971375, + "learning_rate": 4.727560632308359e-05, + "loss": 0.3621, + "step": 4026000 + }, + { + "epoch": 27.247320268514507, + "grad_norm": 0.3604544401168823, + "learning_rate": 4.727526797314855e-05, + "loss": 0.3603, + "step": 4026500 + }, + { + "epoch": 27.250703767864877, + "grad_norm": 0.30103299021720886, + "learning_rate": 4.727492962321351e-05, + "loss": 0.3614, + "step": 4027000 + }, + { + "epoch": 27.254087267215244, + "grad_norm": 0.3802935779094696, + "learning_rate": 4.727459127327848e-05, + "loss": 0.3635, + "step": 4027500 + }, + { + "epoch": 27.257470766565614, + "grad_norm": 0.3827607035636902, + "learning_rate": 4.7274252923343444e-05, + "loss": 0.3614, + "step": 4028000 + }, + { + "epoch": 27.26085426591598, + "grad_norm": 0.3646654188632965, + "learning_rate": 4.7273914573408406e-05, + "loss": 0.3617, + "step": 4028500 + }, + { + "epoch": 27.264237765266348, + "grad_norm": 0.3567125201225281, + "learning_rate": 4.727357622347336e-05, + "loss": 0.3623, + "step": 4029000 + }, + { + "epoch": 27.267621264616718, + "grad_norm": 0.3684942424297333, + "learning_rate": 4.727323787353833e-05, + "loss": 0.3606, + "step": 4029500 + }, + { + "epoch": 27.271004763967085, + "grad_norm": 0.36184918880462646, + "learning_rate": 4.727289952360329e-05, + "loss": 0.3615, + "step": 4030000 + }, + { + "epoch": 27.274388263317455, + "grad_norm": 0.3627530038356781, + "learning_rate": 4.7272561173668255e-05, + "loss": 0.3611, + "step": 4030500 + }, + { + "epoch": 27.27777176266782, + "grad_norm": 0.41672393679618835, + "learning_rate": 4.727222282373322e-05, + "loss": 0.362, + "step": 4031000 + }, + { + "epoch": 27.28115526201819, + "grad_norm": 0.3685767948627472, + "learning_rate": 4.7271884473798186e-05, + "loss": 0.3637, + "step": 4031500 + }, + { + "epoch": 27.28453876136856, + "grad_norm": 0.3790867030620575, + "learning_rate": 4.727154612386315e-05, + "loss": 0.3612, + "step": 4032000 + }, + { + "epoch": 27.287922260718926, + "grad_norm": 0.3841542601585388, + "learning_rate": 4.727120777392811e-05, + "loss": 0.36, + "step": 4032500 + }, + { + "epoch": 27.291305760069292, + "grad_norm": 0.3759045898914337, + "learning_rate": 4.727086942399307e-05, + "loss": 0.3607, + "step": 4033000 + }, + { + "epoch": 27.294689259419663, + "grad_norm": 0.42025455832481384, + "learning_rate": 4.727053107405804e-05, + "loss": 0.3624, + "step": 4033500 + }, + { + "epoch": 27.29807275877003, + "grad_norm": 0.4027925729751587, + "learning_rate": 4.7270192724122996e-05, + "loss": 0.3613, + "step": 4034000 + }, + { + "epoch": 27.3014562581204, + "grad_norm": 0.4137814939022064, + "learning_rate": 4.726985437418796e-05, + "loss": 0.3614, + "step": 4034500 + }, + { + "epoch": 27.304839757470766, + "grad_norm": 0.390674352645874, + "learning_rate": 4.726951602425293e-05, + "loss": 0.3622, + "step": 4035000 + }, + { + "epoch": 27.308223256821133, + "grad_norm": 0.39336270093917847, + "learning_rate": 4.726917767431789e-05, + "loss": 0.3618, + "step": 4035500 + }, + { + "epoch": 27.311606756171503, + "grad_norm": 0.3716800808906555, + "learning_rate": 4.726883932438285e-05, + "loss": 0.3613, + "step": 4036000 + }, + { + "epoch": 27.31499025552187, + "grad_norm": 0.3723650276660919, + "learning_rate": 4.7268500974447814e-05, + "loss": 0.3622, + "step": 4036500 + }, + { + "epoch": 27.31837375487224, + "grad_norm": 0.41383472084999084, + "learning_rate": 4.726816262451278e-05, + "loss": 0.3609, + "step": 4037000 + }, + { + "epoch": 27.321757254222607, + "grad_norm": 0.38070666790008545, + "learning_rate": 4.7267824274577745e-05, + "loss": 0.3619, + "step": 4037500 + }, + { + "epoch": 27.325140753572974, + "grad_norm": 0.3477083444595337, + "learning_rate": 4.726748592464271e-05, + "loss": 0.3614, + "step": 4038000 + }, + { + "epoch": 27.328524252923344, + "grad_norm": 0.39822229743003845, + "learning_rate": 4.726714757470766e-05, + "loss": 0.3622, + "step": 4038500 + }, + { + "epoch": 27.33190775227371, + "grad_norm": 0.38165462017059326, + "learning_rate": 4.726680922477263e-05, + "loss": 0.3613, + "step": 4039000 + }, + { + "epoch": 27.33529125162408, + "grad_norm": 0.37544435262680054, + "learning_rate": 4.726647087483759e-05, + "loss": 0.3611, + "step": 4039500 + }, + { + "epoch": 27.338674750974448, + "grad_norm": 0.3443518877029419, + "learning_rate": 4.7266132524902555e-05, + "loss": 0.3625, + "step": 4040000 + }, + { + "epoch": 27.342058250324815, + "grad_norm": 0.36943838000297546, + "learning_rate": 4.726579417496752e-05, + "loss": 0.3617, + "step": 4040500 + }, + { + "epoch": 27.345441749675185, + "grad_norm": 0.3410223722457886, + "learning_rate": 4.7265455825032486e-05, + "loss": 0.3622, + "step": 4041000 + }, + { + "epoch": 27.348825249025552, + "grad_norm": 0.3751671612262726, + "learning_rate": 4.726511747509745e-05, + "loss": 0.3603, + "step": 4041500 + }, + { + "epoch": 27.35220874837592, + "grad_norm": 0.41506850719451904, + "learning_rate": 4.726477912516241e-05, + "loss": 0.3613, + "step": 4042000 + }, + { + "epoch": 27.35559224772629, + "grad_norm": 0.40068018436431885, + "learning_rate": 4.726444077522737e-05, + "loss": 0.3627, + "step": 4042500 + }, + { + "epoch": 27.358975747076656, + "grad_norm": 0.34891965985298157, + "learning_rate": 4.726410242529234e-05, + "loss": 0.3619, + "step": 4043000 + }, + { + "epoch": 27.362359246427026, + "grad_norm": 0.38653042912483215, + "learning_rate": 4.72637640753573e-05, + "loss": 0.3641, + "step": 4043500 + }, + { + "epoch": 27.365742745777393, + "grad_norm": 0.4033982753753662, + "learning_rate": 4.726342572542226e-05, + "loss": 0.3619, + "step": 4044000 + }, + { + "epoch": 27.36912624512776, + "grad_norm": 0.40694281458854675, + "learning_rate": 4.726308737548723e-05, + "loss": 0.3625, + "step": 4044500 + }, + { + "epoch": 27.37250974447813, + "grad_norm": 0.37466728687286377, + "learning_rate": 4.726274902555219e-05, + "loss": 0.3603, + "step": 4045000 + }, + { + "epoch": 27.375893243828497, + "grad_norm": 0.3674294650554657, + "learning_rate": 4.726241067561715e-05, + "loss": 0.3609, + "step": 4045500 + }, + { + "epoch": 27.379276743178867, + "grad_norm": 0.3904021680355072, + "learning_rate": 4.7262072325682114e-05, + "loss": 0.3618, + "step": 4046000 + }, + { + "epoch": 27.382660242529234, + "grad_norm": 0.40246257185935974, + "learning_rate": 4.726173397574708e-05, + "loss": 0.3619, + "step": 4046500 + }, + { + "epoch": 27.3860437418796, + "grad_norm": 0.3744506239891052, + "learning_rate": 4.7261395625812045e-05, + "loss": 0.3631, + "step": 4047000 + }, + { + "epoch": 27.38942724122997, + "grad_norm": 0.33874863386154175, + "learning_rate": 4.726105727587701e-05, + "loss": 0.3635, + "step": 4047500 + }, + { + "epoch": 27.392810740580337, + "grad_norm": 0.35687437653541565, + "learning_rate": 4.726071892594196e-05, + "loss": 0.36, + "step": 4048000 + }, + { + "epoch": 27.396194239930708, + "grad_norm": 0.38321176171302795, + "learning_rate": 4.726038057600693e-05, + "loss": 0.3614, + "step": 4048500 + }, + { + "epoch": 27.399577739281074, + "grad_norm": 0.36645007133483887, + "learning_rate": 4.7260042226071894e-05, + "loss": 0.362, + "step": 4049000 + }, + { + "epoch": 27.40296123863144, + "grad_norm": 0.3766949474811554, + "learning_rate": 4.7259703876136856e-05, + "loss": 0.3615, + "step": 4049500 + }, + { + "epoch": 27.40634473798181, + "grad_norm": 0.41977280378341675, + "learning_rate": 4.725936552620182e-05, + "loss": 0.3621, + "step": 4050000 + }, + { + "epoch": 27.40972823733218, + "grad_norm": 0.380024790763855, + "learning_rate": 4.725902717626679e-05, + "loss": 0.3623, + "step": 4050500 + }, + { + "epoch": 27.413111736682545, + "grad_norm": 0.3690486252307892, + "learning_rate": 4.725868882633175e-05, + "loss": 0.3637, + "step": 4051000 + }, + { + "epoch": 27.416495236032915, + "grad_norm": 0.4049592614173889, + "learning_rate": 4.725835047639671e-05, + "loss": 0.3628, + "step": 4051500 + }, + { + "epoch": 27.419878735383282, + "grad_norm": 0.4195794463157654, + "learning_rate": 4.7258012126461673e-05, + "loss": 0.3629, + "step": 4052000 + }, + { + "epoch": 27.423262234733652, + "grad_norm": 0.40863946080207825, + "learning_rate": 4.725767377652664e-05, + "loss": 0.3619, + "step": 4052500 + }, + { + "epoch": 27.42664573408402, + "grad_norm": 0.35777702927589417, + "learning_rate": 4.72573354265916e-05, + "loss": 0.3618, + "step": 4053000 + }, + { + "epoch": 27.430029233434386, + "grad_norm": 0.3972858786582947, + "learning_rate": 4.725699707665656e-05, + "loss": 0.3627, + "step": 4053500 + }, + { + "epoch": 27.433412732784756, + "grad_norm": 0.3517029881477356, + "learning_rate": 4.725665872672153e-05, + "loss": 0.3622, + "step": 4054000 + }, + { + "epoch": 27.436796232135123, + "grad_norm": 0.3701731860637665, + "learning_rate": 4.725632037678649e-05, + "loss": 0.3607, + "step": 4054500 + }, + { + "epoch": 27.440179731485493, + "grad_norm": 0.3667539954185486, + "learning_rate": 4.725598202685145e-05, + "loss": 0.362, + "step": 4055000 + }, + { + "epoch": 27.44356323083586, + "grad_norm": 0.3279666602611542, + "learning_rate": 4.7255643676916415e-05, + "loss": 0.3612, + "step": 4055500 + }, + { + "epoch": 27.446946730186227, + "grad_norm": 0.396075040102005, + "learning_rate": 4.725530532698138e-05, + "loss": 0.3615, + "step": 4056000 + }, + { + "epoch": 27.450330229536597, + "grad_norm": 0.3535042405128479, + "learning_rate": 4.7254966977046346e-05, + "loss": 0.3591, + "step": 4056500 + }, + { + "epoch": 27.453713728886964, + "grad_norm": 0.4029294550418854, + "learning_rate": 4.725462862711131e-05, + "loss": 0.3615, + "step": 4057000 + }, + { + "epoch": 27.45709722823733, + "grad_norm": 0.3558937609195709, + "learning_rate": 4.7254290277176264e-05, + "loss": 0.3614, + "step": 4057500 + }, + { + "epoch": 27.4604807275877, + "grad_norm": 0.36616647243499756, + "learning_rate": 4.725395192724123e-05, + "loss": 0.3611, + "step": 4058000 + }, + { + "epoch": 27.463864226938068, + "grad_norm": 0.36483973264694214, + "learning_rate": 4.7253613577306195e-05, + "loss": 0.3623, + "step": 4058500 + }, + { + "epoch": 27.467247726288438, + "grad_norm": 0.3783068060874939, + "learning_rate": 4.725327522737116e-05, + "loss": 0.3615, + "step": 4059000 + }, + { + "epoch": 27.470631225638805, + "grad_norm": 0.36561059951782227, + "learning_rate": 4.725293687743612e-05, + "loss": 0.3627, + "step": 4059500 + }, + { + "epoch": 27.47401472498917, + "grad_norm": 0.36476603150367737, + "learning_rate": 4.725259852750109e-05, + "loss": 0.3627, + "step": 4060000 + }, + { + "epoch": 27.47739822433954, + "grad_norm": 0.35045233368873596, + "learning_rate": 4.725226017756605e-05, + "loss": 0.3611, + "step": 4060500 + }, + { + "epoch": 27.48078172368991, + "grad_norm": 0.33032065629959106, + "learning_rate": 4.725192182763101e-05, + "loss": 0.3627, + "step": 4061000 + }, + { + "epoch": 27.48416522304028, + "grad_norm": 0.3880566954612732, + "learning_rate": 4.7251583477695974e-05, + "loss": 0.3609, + "step": 4061500 + }, + { + "epoch": 27.487548722390645, + "grad_norm": 0.3635287880897522, + "learning_rate": 4.725124512776094e-05, + "loss": 0.3613, + "step": 4062000 + }, + { + "epoch": 27.490932221741012, + "grad_norm": 0.4103771150112152, + "learning_rate": 4.72509067778259e-05, + "loss": 0.3612, + "step": 4062500 + }, + { + "epoch": 27.494315721091382, + "grad_norm": 0.3320116698741913, + "learning_rate": 4.725056842789086e-05, + "loss": 0.3621, + "step": 4063000 + }, + { + "epoch": 27.49769922044175, + "grad_norm": 0.3799278736114502, + "learning_rate": 4.725023007795583e-05, + "loss": 0.3627, + "step": 4063500 + }, + { + "epoch": 27.50108271979212, + "grad_norm": 0.3894334137439728, + "learning_rate": 4.724989172802079e-05, + "loss": 0.3622, + "step": 4064000 + }, + { + "epoch": 27.504466219142486, + "grad_norm": 0.3801906704902649, + "learning_rate": 4.7249553378085754e-05, + "loss": 0.3628, + "step": 4064500 + }, + { + "epoch": 27.507849718492853, + "grad_norm": 0.35581132769584656, + "learning_rate": 4.7249215028150716e-05, + "loss": 0.3621, + "step": 4065000 + }, + { + "epoch": 27.511233217843223, + "grad_norm": 0.3754686117172241, + "learning_rate": 4.724887667821568e-05, + "loss": 0.3621, + "step": 4065500 + }, + { + "epoch": 27.51461671719359, + "grad_norm": 0.36557772755622864, + "learning_rate": 4.724853832828065e-05, + "loss": 0.3614, + "step": 4066000 + }, + { + "epoch": 27.518000216543957, + "grad_norm": 0.37378761172294617, + "learning_rate": 4.724819997834561e-05, + "loss": 0.3618, + "step": 4066500 + }, + { + "epoch": 27.521383715894327, + "grad_norm": 0.3569445312023163, + "learning_rate": 4.7247861628410564e-05, + "loss": 0.3621, + "step": 4067000 + }, + { + "epoch": 27.524767215244694, + "grad_norm": 0.4252917468547821, + "learning_rate": 4.724752327847553e-05, + "loss": 0.3605, + "step": 4067500 + }, + { + "epoch": 27.528150714595064, + "grad_norm": 0.3686460256576538, + "learning_rate": 4.7247184928540495e-05, + "loss": 0.3623, + "step": 4068000 + }, + { + "epoch": 27.53153421394543, + "grad_norm": 0.405404269695282, + "learning_rate": 4.724684657860546e-05, + "loss": 0.3622, + "step": 4068500 + }, + { + "epoch": 27.534917713295798, + "grad_norm": 0.37391921877861023, + "learning_rate": 4.724650822867042e-05, + "loss": 0.3615, + "step": 4069000 + }, + { + "epoch": 27.538301212646168, + "grad_norm": 0.3840598464012146, + "learning_rate": 4.724616987873539e-05, + "loss": 0.3608, + "step": 4069500 + }, + { + "epoch": 27.541684711996535, + "grad_norm": 0.36684784293174744, + "learning_rate": 4.724583152880035e-05, + "loss": 0.3594, + "step": 4070000 + }, + { + "epoch": 27.545068211346905, + "grad_norm": 0.36803022027015686, + "learning_rate": 4.724549317886531e-05, + "loss": 0.3617, + "step": 4070500 + }, + { + "epoch": 27.54845171069727, + "grad_norm": 0.3929801285266876, + "learning_rate": 4.7245154828930275e-05, + "loss": 0.3611, + "step": 4071000 + }, + { + "epoch": 27.55183521004764, + "grad_norm": 0.37724795937538147, + "learning_rate": 4.7244816478995244e-05, + "loss": 0.3625, + "step": 4071500 + }, + { + "epoch": 27.55521870939801, + "grad_norm": 0.352273553609848, + "learning_rate": 4.72444781290602e-05, + "loss": 0.3622, + "step": 4072000 + }, + { + "epoch": 27.558602208748376, + "grad_norm": 0.37165728211402893, + "learning_rate": 4.724413977912516e-05, + "loss": 0.3624, + "step": 4072500 + }, + { + "epoch": 27.561985708098746, + "grad_norm": 0.36808037757873535, + "learning_rate": 4.724380142919012e-05, + "loss": 0.3622, + "step": 4073000 + }, + { + "epoch": 27.565369207449113, + "grad_norm": 0.4029386341571808, + "learning_rate": 4.724346307925509e-05, + "loss": 0.3614, + "step": 4073500 + }, + { + "epoch": 27.56875270679948, + "grad_norm": 0.37487930059432983, + "learning_rate": 4.7243124729320054e-05, + "loss": 0.3624, + "step": 4074000 + }, + { + "epoch": 27.57213620614985, + "grad_norm": 0.3402683734893799, + "learning_rate": 4.7242786379385016e-05, + "loss": 0.363, + "step": 4074500 + }, + { + "epoch": 27.575519705500216, + "grad_norm": 0.3881003260612488, + "learning_rate": 4.724244802944998e-05, + "loss": 0.3615, + "step": 4075000 + }, + { + "epoch": 27.578903204850583, + "grad_norm": 0.34688401222229004, + "learning_rate": 4.724210967951495e-05, + "loss": 0.3616, + "step": 4075500 + }, + { + "epoch": 27.582286704200953, + "grad_norm": 0.4068271219730377, + "learning_rate": 4.724177132957991e-05, + "loss": 0.3599, + "step": 4076000 + }, + { + "epoch": 27.58567020355132, + "grad_norm": 0.39702337980270386, + "learning_rate": 4.724143297964487e-05, + "loss": 0.3641, + "step": 4076500 + }, + { + "epoch": 27.58905370290169, + "grad_norm": 0.31892362236976624, + "learning_rate": 4.7241094629709834e-05, + "loss": 0.3614, + "step": 4077000 + }, + { + "epoch": 27.592437202252057, + "grad_norm": 0.38383200764656067, + "learning_rate": 4.7240756279774796e-05, + "loss": 0.3611, + "step": 4077500 + }, + { + "epoch": 27.595820701602424, + "grad_norm": 0.3788059651851654, + "learning_rate": 4.724041792983976e-05, + "loss": 0.3614, + "step": 4078000 + }, + { + "epoch": 27.599204200952794, + "grad_norm": 0.38105660676956177, + "learning_rate": 4.724007957990472e-05, + "loss": 0.3601, + "step": 4078500 + }, + { + "epoch": 27.60258770030316, + "grad_norm": 0.3627316355705261, + "learning_rate": 4.723974122996969e-05, + "loss": 0.3617, + "step": 4079000 + }, + { + "epoch": 27.60597119965353, + "grad_norm": 0.4075861871242523, + "learning_rate": 4.723940288003465e-05, + "loss": 0.3613, + "step": 4079500 + }, + { + "epoch": 27.609354699003898, + "grad_norm": 0.39518603682518005, + "learning_rate": 4.723906453009961e-05, + "loss": 0.3621, + "step": 4080000 + }, + { + "epoch": 27.612738198354265, + "grad_norm": 0.3917335867881775, + "learning_rate": 4.7238726180164575e-05, + "loss": 0.3633, + "step": 4080500 + }, + { + "epoch": 27.616121697704635, + "grad_norm": 0.34773170948028564, + "learning_rate": 4.7238387830229544e-05, + "loss": 0.3626, + "step": 4081000 + }, + { + "epoch": 27.619505197055002, + "grad_norm": 0.364572674036026, + "learning_rate": 4.72380494802945e-05, + "loss": 0.3608, + "step": 4081500 + }, + { + "epoch": 27.62288869640537, + "grad_norm": 0.36159542202949524, + "learning_rate": 4.723771113035946e-05, + "loss": 0.3606, + "step": 4082000 + }, + { + "epoch": 27.62627219575574, + "grad_norm": 0.3716708719730377, + "learning_rate": 4.7237372780424424e-05, + "loss": 0.3614, + "step": 4082500 + }, + { + "epoch": 27.629655695106106, + "grad_norm": 0.3588971793651581, + "learning_rate": 4.723703443048939e-05, + "loss": 0.3633, + "step": 4083000 + }, + { + "epoch": 27.633039194456476, + "grad_norm": 0.37614136934280396, + "learning_rate": 4.7236696080554355e-05, + "loss": 0.3615, + "step": 4083500 + }, + { + "epoch": 27.636422693806843, + "grad_norm": 0.42491161823272705, + "learning_rate": 4.723635773061932e-05, + "loss": 0.3615, + "step": 4084000 + }, + { + "epoch": 27.63980619315721, + "grad_norm": 0.3415015637874603, + "learning_rate": 4.723601938068428e-05, + "loss": 0.3619, + "step": 4084500 + }, + { + "epoch": 27.64318969250758, + "grad_norm": 0.3611709773540497, + "learning_rate": 4.723568103074925e-05, + "loss": 0.3638, + "step": 4085000 + }, + { + "epoch": 27.646573191857946, + "grad_norm": 0.3997561037540436, + "learning_rate": 4.723534268081421e-05, + "loss": 0.3608, + "step": 4085500 + }, + { + "epoch": 27.649956691208317, + "grad_norm": 0.37621551752090454, + "learning_rate": 4.723500433087917e-05, + "loss": 0.362, + "step": 4086000 + }, + { + "epoch": 27.653340190558684, + "grad_norm": 0.3873799443244934, + "learning_rate": 4.7234665980944134e-05, + "loss": 0.362, + "step": 4086500 + }, + { + "epoch": 27.65672368990905, + "grad_norm": 0.3721669316291809, + "learning_rate": 4.7234327631009096e-05, + "loss": 0.3615, + "step": 4087000 + }, + { + "epoch": 27.66010718925942, + "grad_norm": 0.334337055683136, + "learning_rate": 4.723398928107406e-05, + "loss": 0.3617, + "step": 4087500 + }, + { + "epoch": 27.663490688609787, + "grad_norm": 0.3558754622936249, + "learning_rate": 4.723365093113902e-05, + "loss": 0.362, + "step": 4088000 + }, + { + "epoch": 27.666874187960158, + "grad_norm": 0.3540439307689667, + "learning_rate": 4.723331258120399e-05, + "loss": 0.3605, + "step": 4088500 + }, + { + "epoch": 27.670257687310524, + "grad_norm": 0.39304184913635254, + "learning_rate": 4.723297423126895e-05, + "loss": 0.3621, + "step": 4089000 + }, + { + "epoch": 27.67364118666089, + "grad_norm": 0.3880615532398224, + "learning_rate": 4.7232635881333914e-05, + "loss": 0.3617, + "step": 4089500 + }, + { + "epoch": 27.67702468601126, + "grad_norm": 0.41420215368270874, + "learning_rate": 4.7232297531398876e-05, + "loss": 0.3608, + "step": 4090000 + }, + { + "epoch": 27.680408185361628, + "grad_norm": 0.3796791732311249, + "learning_rate": 4.7231959181463845e-05, + "loss": 0.3633, + "step": 4090500 + }, + { + "epoch": 27.683791684711995, + "grad_norm": 0.39305371046066284, + "learning_rate": 4.72316208315288e-05, + "loss": 0.362, + "step": 4091000 + }, + { + "epoch": 27.687175184062365, + "grad_norm": 0.3777586817741394, + "learning_rate": 4.723128248159376e-05, + "loss": 0.3617, + "step": 4091500 + }, + { + "epoch": 27.690558683412732, + "grad_norm": 0.3716682195663452, + "learning_rate": 4.7230944131658724e-05, + "loss": 0.3611, + "step": 4092000 + }, + { + "epoch": 27.693942182763102, + "grad_norm": 0.37100502848625183, + "learning_rate": 4.723060578172369e-05, + "loss": 0.362, + "step": 4092500 + }, + { + "epoch": 27.69732568211347, + "grad_norm": 0.3646644651889801, + "learning_rate": 4.7230267431788655e-05, + "loss": 0.3616, + "step": 4093000 + }, + { + "epoch": 27.700709181463836, + "grad_norm": 0.41356727480888367, + "learning_rate": 4.722992908185362e-05, + "loss": 0.3609, + "step": 4093500 + }, + { + "epoch": 27.704092680814206, + "grad_norm": 0.3958253562450409, + "learning_rate": 4.722959073191858e-05, + "loss": 0.3619, + "step": 4094000 + }, + { + "epoch": 27.707476180164573, + "grad_norm": 0.342966765165329, + "learning_rate": 4.722925238198355e-05, + "loss": 0.3618, + "step": 4094500 + }, + { + "epoch": 27.710859679514943, + "grad_norm": 0.36626705527305603, + "learning_rate": 4.722891403204851e-05, + "loss": 0.3613, + "step": 4095000 + }, + { + "epoch": 27.71424317886531, + "grad_norm": 0.36850035190582275, + "learning_rate": 4.722857568211347e-05, + "loss": 0.3611, + "step": 4095500 + }, + { + "epoch": 27.717626678215677, + "grad_norm": 0.4046573340892792, + "learning_rate": 4.7228237332178435e-05, + "loss": 0.3608, + "step": 4096000 + }, + { + "epoch": 27.721010177566047, + "grad_norm": 0.3967227637767792, + "learning_rate": 4.72278989822434e-05, + "loss": 0.362, + "step": 4096500 + }, + { + "epoch": 27.724393676916414, + "grad_norm": 0.3719586730003357, + "learning_rate": 4.722756063230836e-05, + "loss": 0.3626, + "step": 4097000 + }, + { + "epoch": 27.727777176266784, + "grad_norm": 0.36543700098991394, + "learning_rate": 4.722722228237332e-05, + "loss": 0.3636, + "step": 4097500 + }, + { + "epoch": 27.73116067561715, + "grad_norm": 0.3697172999382019, + "learning_rate": 4.722688393243829e-05, + "loss": 0.3599, + "step": 4098000 + }, + { + "epoch": 27.734544174967517, + "grad_norm": 0.337506502866745, + "learning_rate": 4.722654558250325e-05, + "loss": 0.3614, + "step": 4098500 + }, + { + "epoch": 27.737927674317888, + "grad_norm": 0.36323779821395874, + "learning_rate": 4.7226207232568214e-05, + "loss": 0.3626, + "step": 4099000 + }, + { + "epoch": 27.741311173668254, + "grad_norm": 0.39795252680778503, + "learning_rate": 4.722586888263318e-05, + "loss": 0.3616, + "step": 4099500 + }, + { + "epoch": 27.74469467301862, + "grad_norm": 0.34694918990135193, + "learning_rate": 4.7225530532698146e-05, + "loss": 0.3634, + "step": 4100000 + }, + { + "epoch": 27.74807817236899, + "grad_norm": 0.37187129259109497, + "learning_rate": 4.72251921827631e-05, + "loss": 0.3619, + "step": 4100500 + }, + { + "epoch": 27.75146167171936, + "grad_norm": 0.38147711753845215, + "learning_rate": 4.722485383282806e-05, + "loss": 0.3611, + "step": 4101000 + }, + { + "epoch": 27.75484517106973, + "grad_norm": 0.3710109293460846, + "learning_rate": 4.7224515482893025e-05, + "loss": 0.3616, + "step": 4101500 + }, + { + "epoch": 27.758228670420095, + "grad_norm": 0.38604336977005005, + "learning_rate": 4.7224177132957994e-05, + "loss": 0.3611, + "step": 4102000 + }, + { + "epoch": 27.761612169770462, + "grad_norm": 0.40243417024612427, + "learning_rate": 4.7223838783022956e-05, + "loss": 0.3625, + "step": 4102500 + }, + { + "epoch": 27.764995669120832, + "grad_norm": 0.4230211675167084, + "learning_rate": 4.722350043308792e-05, + "loss": 0.3612, + "step": 4103000 + }, + { + "epoch": 27.7683791684712, + "grad_norm": 0.40428245067596436, + "learning_rate": 4.722316208315288e-05, + "loss": 0.3601, + "step": 4103500 + }, + { + "epoch": 27.77176266782157, + "grad_norm": 0.374149352312088, + "learning_rate": 4.722282373321785e-05, + "loss": 0.3618, + "step": 4104000 + }, + { + "epoch": 27.775146167171936, + "grad_norm": 0.3828555941581726, + "learning_rate": 4.722248538328281e-05, + "loss": 0.3633, + "step": 4104500 + }, + { + "epoch": 27.778529666522303, + "grad_norm": 0.38992854952812195, + "learning_rate": 4.7222147033347774e-05, + "loss": 0.3616, + "step": 4105000 + }, + { + "epoch": 27.781913165872673, + "grad_norm": 0.3347651958465576, + "learning_rate": 4.7221808683412736e-05, + "loss": 0.3628, + "step": 4105500 + }, + { + "epoch": 27.78529666522304, + "grad_norm": 0.3705926537513733, + "learning_rate": 4.72214703334777e-05, + "loss": 0.3622, + "step": 4106000 + }, + { + "epoch": 27.788680164573407, + "grad_norm": 0.3586915135383606, + "learning_rate": 4.722113198354266e-05, + "loss": 0.3618, + "step": 4106500 + }, + { + "epoch": 27.792063663923777, + "grad_norm": 0.4063935875892639, + "learning_rate": 4.722079363360762e-05, + "loss": 0.3612, + "step": 4107000 + }, + { + "epoch": 27.795447163274144, + "grad_norm": 0.36764734983444214, + "learning_rate": 4.722045528367259e-05, + "loss": 0.3618, + "step": 4107500 + }, + { + "epoch": 27.798830662624514, + "grad_norm": 0.3634496033191681, + "learning_rate": 4.722011693373755e-05, + "loss": 0.3618, + "step": 4108000 + }, + { + "epoch": 27.80221416197488, + "grad_norm": 0.382001668214798, + "learning_rate": 4.7219778583802515e-05, + "loss": 0.3635, + "step": 4108500 + }, + { + "epoch": 27.805597661325248, + "grad_norm": 0.38688212633132935, + "learning_rate": 4.721944023386748e-05, + "loss": 0.3634, + "step": 4109000 + }, + { + "epoch": 27.808981160675618, + "grad_norm": 0.3695046305656433, + "learning_rate": 4.7219101883932446e-05, + "loss": 0.3614, + "step": 4109500 + }, + { + "epoch": 27.812364660025985, + "grad_norm": 0.3804705739021301, + "learning_rate": 4.72187635339974e-05, + "loss": 0.3618, + "step": 4110000 + }, + { + "epoch": 27.815748159376355, + "grad_norm": 0.37848877906799316, + "learning_rate": 4.7218425184062364e-05, + "loss": 0.3629, + "step": 4110500 + }, + { + "epoch": 27.81913165872672, + "grad_norm": 0.40287455916404724, + "learning_rate": 4.7218086834127326e-05, + "loss": 0.3618, + "step": 4111000 + }, + { + "epoch": 27.82251515807709, + "grad_norm": 0.3197984993457794, + "learning_rate": 4.7217748484192295e-05, + "loss": 0.3627, + "step": 4111500 + }, + { + "epoch": 27.82589865742746, + "grad_norm": 0.37059545516967773, + "learning_rate": 4.721741013425726e-05, + "loss": 0.3615, + "step": 4112000 + }, + { + "epoch": 27.829282156777825, + "grad_norm": 0.38981738686561584, + "learning_rate": 4.721707178432222e-05, + "loss": 0.3621, + "step": 4112500 + }, + { + "epoch": 27.832665656128192, + "grad_norm": 0.3608679175376892, + "learning_rate": 4.721673343438718e-05, + "loss": 0.3622, + "step": 4113000 + }, + { + "epoch": 27.836049155478563, + "grad_norm": 0.37277328968048096, + "learning_rate": 4.721639508445215e-05, + "loss": 0.3642, + "step": 4113500 + }, + { + "epoch": 27.83943265482893, + "grad_norm": 0.389523446559906, + "learning_rate": 4.721605673451711e-05, + "loss": 0.3622, + "step": 4114000 + }, + { + "epoch": 27.8428161541793, + "grad_norm": 0.38738763332366943, + "learning_rate": 4.7215718384582074e-05, + "loss": 0.3632, + "step": 4114500 + }, + { + "epoch": 27.846199653529666, + "grad_norm": 0.3468678593635559, + "learning_rate": 4.7215380034647036e-05, + "loss": 0.361, + "step": 4115000 + }, + { + "epoch": 27.849583152880033, + "grad_norm": 0.38216879963874817, + "learning_rate": 4.7215041684712e-05, + "loss": 0.3625, + "step": 4115500 + }, + { + "epoch": 27.852966652230403, + "grad_norm": 0.3636903464794159, + "learning_rate": 4.721470333477696e-05, + "loss": 0.3623, + "step": 4116000 + }, + { + "epoch": 27.85635015158077, + "grad_norm": 0.3981262445449829, + "learning_rate": 4.721436498484192e-05, + "loss": 0.3622, + "step": 4116500 + }, + { + "epoch": 27.85973365093114, + "grad_norm": 0.36636820435523987, + "learning_rate": 4.721402663490689e-05, + "loss": 0.3614, + "step": 4117000 + }, + { + "epoch": 27.863117150281507, + "grad_norm": 0.35542821884155273, + "learning_rate": 4.7213688284971854e-05, + "loss": 0.3633, + "step": 4117500 + }, + { + "epoch": 27.866500649631874, + "grad_norm": 0.39117830991744995, + "learning_rate": 4.7213349935036816e-05, + "loss": 0.3617, + "step": 4118000 + }, + { + "epoch": 27.869884148982244, + "grad_norm": 0.36367714405059814, + "learning_rate": 4.721301158510178e-05, + "loss": 0.3612, + "step": 4118500 + }, + { + "epoch": 27.87326764833261, + "grad_norm": 0.36560484766960144, + "learning_rate": 4.721267323516674e-05, + "loss": 0.3607, + "step": 4119000 + }, + { + "epoch": 27.87665114768298, + "grad_norm": 0.39626118540763855, + "learning_rate": 4.72123348852317e-05, + "loss": 0.3626, + "step": 4119500 + }, + { + "epoch": 27.880034647033348, + "grad_norm": 0.38299208879470825, + "learning_rate": 4.7211996535296664e-05, + "loss": 0.3618, + "step": 4120000 + }, + { + "epoch": 27.883418146383715, + "grad_norm": 0.32079488039016724, + "learning_rate": 4.7211658185361626e-05, + "loss": 0.3635, + "step": 4120500 + }, + { + "epoch": 27.886801645734085, + "grad_norm": 0.4020163118839264, + "learning_rate": 4.7211319835426595e-05, + "loss": 0.3609, + "step": 4121000 + }, + { + "epoch": 27.890185145084452, + "grad_norm": 0.3395242691040039, + "learning_rate": 4.721098148549156e-05, + "loss": 0.3606, + "step": 4121500 + }, + { + "epoch": 27.893568644434822, + "grad_norm": 0.3686038851737976, + "learning_rate": 4.721064313555652e-05, + "loss": 0.3624, + "step": 4122000 + }, + { + "epoch": 27.89695214378519, + "grad_norm": 0.37975063920021057, + "learning_rate": 4.721030478562148e-05, + "loss": 0.3602, + "step": 4122500 + }, + { + "epoch": 27.900335643135556, + "grad_norm": 0.3796452581882477, + "learning_rate": 4.720996643568645e-05, + "loss": 0.3642, + "step": 4123000 + }, + { + "epoch": 27.903719142485926, + "grad_norm": 0.347863107919693, + "learning_rate": 4.720962808575141e-05, + "loss": 0.3607, + "step": 4123500 + }, + { + "epoch": 27.907102641836293, + "grad_norm": 0.3953015208244324, + "learning_rate": 4.7209289735816375e-05, + "loss": 0.3634, + "step": 4124000 + }, + { + "epoch": 27.91048614118666, + "grad_norm": 0.39113664627075195, + "learning_rate": 4.720895138588134e-05, + "loss": 0.3618, + "step": 4124500 + }, + { + "epoch": 27.91386964053703, + "grad_norm": 0.3630425035953522, + "learning_rate": 4.72086130359463e-05, + "loss": 0.3622, + "step": 4125000 + }, + { + "epoch": 27.917253139887396, + "grad_norm": 0.40215301513671875, + "learning_rate": 4.720827468601126e-05, + "loss": 0.3633, + "step": 4125500 + }, + { + "epoch": 27.920636639237767, + "grad_norm": 0.40930697321891785, + "learning_rate": 4.720793633607622e-05, + "loss": 0.3604, + "step": 4126000 + }, + { + "epoch": 27.924020138588133, + "grad_norm": 0.35423481464385986, + "learning_rate": 4.720759798614119e-05, + "loss": 0.361, + "step": 4126500 + }, + { + "epoch": 27.9274036379385, + "grad_norm": 0.3465040624141693, + "learning_rate": 4.7207259636206154e-05, + "loss": 0.362, + "step": 4127000 + }, + { + "epoch": 27.93078713728887, + "grad_norm": 0.4105958640575409, + "learning_rate": 4.7206921286271116e-05, + "loss": 0.3623, + "step": 4127500 + }, + { + "epoch": 27.934170636639237, + "grad_norm": 0.3818987011909485, + "learning_rate": 4.720658293633608e-05, + "loss": 0.3625, + "step": 4128000 + }, + { + "epoch": 27.937554135989608, + "grad_norm": 0.3856114149093628, + "learning_rate": 4.720624458640104e-05, + "loss": 0.3624, + "step": 4128500 + }, + { + "epoch": 27.940937635339974, + "grad_norm": 0.42435044050216675, + "learning_rate": 4.720590623646601e-05, + "loss": 0.3614, + "step": 4129000 + }, + { + "epoch": 27.94432113469034, + "grad_norm": 0.38874831795692444, + "learning_rate": 4.7205567886530965e-05, + "loss": 0.3609, + "step": 4129500 + }, + { + "epoch": 27.94770463404071, + "grad_norm": 0.43664172291755676, + "learning_rate": 4.720522953659593e-05, + "loss": 0.3617, + "step": 4130000 + }, + { + "epoch": 27.951088133391078, + "grad_norm": 0.3564797043800354, + "learning_rate": 4.7204891186660896e-05, + "loss": 0.3613, + "step": 4130500 + }, + { + "epoch": 27.954471632741445, + "grad_norm": 0.36907705664634705, + "learning_rate": 4.720455283672586e-05, + "loss": 0.3615, + "step": 4131000 + }, + { + "epoch": 27.957855132091815, + "grad_norm": 0.3607375919818878, + "learning_rate": 4.720421448679082e-05, + "loss": 0.3616, + "step": 4131500 + }, + { + "epoch": 27.961238631442182, + "grad_norm": 0.3759130537509918, + "learning_rate": 4.720387613685578e-05, + "loss": 0.3604, + "step": 4132000 + }, + { + "epoch": 27.964622130792552, + "grad_norm": 0.4064229130744934, + "learning_rate": 4.720353778692075e-05, + "loss": 0.3613, + "step": 4132500 + }, + { + "epoch": 27.96800563014292, + "grad_norm": 0.40052372217178345, + "learning_rate": 4.720319943698571e-05, + "loss": 0.3624, + "step": 4133000 + }, + { + "epoch": 27.971389129493286, + "grad_norm": 0.42291682958602905, + "learning_rate": 4.7202861087050675e-05, + "loss": 0.3634, + "step": 4133500 + }, + { + "epoch": 27.974772628843656, + "grad_norm": 0.40400055050849915, + "learning_rate": 4.720252273711564e-05, + "loss": 0.3626, + "step": 4134000 + }, + { + "epoch": 27.978156128194023, + "grad_norm": 0.38714271783828735, + "learning_rate": 4.72021843871806e-05, + "loss": 0.3623, + "step": 4134500 + }, + { + "epoch": 27.981539627544393, + "grad_norm": 0.33237898349761963, + "learning_rate": 4.720184603724556e-05, + "loss": 0.3625, + "step": 4135000 + }, + { + "epoch": 27.98492312689476, + "grad_norm": 0.3680419623851776, + "learning_rate": 4.7201507687310524e-05, + "loss": 0.3623, + "step": 4135500 + }, + { + "epoch": 27.988306626245127, + "grad_norm": 0.38814762234687805, + "learning_rate": 4.7201169337375486e-05, + "loss": 0.3625, + "step": 4136000 + }, + { + "epoch": 27.991690125595497, + "grad_norm": 0.37873101234436035, + "learning_rate": 4.7200830987440455e-05, + "loss": 0.3617, + "step": 4136500 + }, + { + "epoch": 27.995073624945864, + "grad_norm": 0.4092039465904236, + "learning_rate": 4.720049263750542e-05, + "loss": 0.3621, + "step": 4137000 + }, + { + "epoch": 27.99845712429623, + "grad_norm": 0.3645796775817871, + "learning_rate": 4.720015428757038e-05, + "loss": 0.3616, + "step": 4137500 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.8621847770921827, + "eval_loss": 0.5612766146659851, + "eval_runtime": 3363.4387, + "eval_samples_per_second": 86.442, + "eval_steps_per_second": 5.403, + "step": 4137728 + }, + { + "epoch": 28.0018406236466, + "grad_norm": 0.43283811211586, + "learning_rate": 4.719981593763534e-05, + "loss": 0.3593, + "step": 4138000 + }, + { + "epoch": 28.005224122996967, + "grad_norm": 0.37070658802986145, + "learning_rate": 4.719947758770031e-05, + "loss": 0.3594, + "step": 4138500 + }, + { + "epoch": 28.008607622347338, + "grad_norm": 0.37569138407707214, + "learning_rate": 4.7199139237765266e-05, + "loss": 0.3597, + "step": 4139000 + }, + { + "epoch": 28.011991121697704, + "grad_norm": 0.3612668514251709, + "learning_rate": 4.719880088783023e-05, + "loss": 0.3575, + "step": 4139500 + }, + { + "epoch": 28.01537462104807, + "grad_norm": 0.38011425733566284, + "learning_rate": 4.7198462537895197e-05, + "loss": 0.3592, + "step": 4140000 + }, + { + "epoch": 28.01875812039844, + "grad_norm": 0.3579834997653961, + "learning_rate": 4.719812418796016e-05, + "loss": 0.3612, + "step": 4140500 + }, + { + "epoch": 28.02214161974881, + "grad_norm": 0.368886262178421, + "learning_rate": 4.719778583802512e-05, + "loss": 0.3603, + "step": 4141000 + }, + { + "epoch": 28.02552511909918, + "grad_norm": 0.37519514560699463, + "learning_rate": 4.719744748809008e-05, + "loss": 0.3595, + "step": 4141500 + }, + { + "epoch": 28.028908618449545, + "grad_norm": 0.37708401679992676, + "learning_rate": 4.719710913815505e-05, + "loss": 0.3607, + "step": 4142000 + }, + { + "epoch": 28.032292117799912, + "grad_norm": 0.38011103868484497, + "learning_rate": 4.7196770788220014e-05, + "loss": 0.3596, + "step": 4142500 + }, + { + "epoch": 28.035675617150282, + "grad_norm": 0.4065256416797638, + "learning_rate": 4.7196432438284976e-05, + "loss": 0.3593, + "step": 4143000 + }, + { + "epoch": 28.03905911650065, + "grad_norm": 0.37354305386543274, + "learning_rate": 4.719609408834993e-05, + "loss": 0.3605, + "step": 4143500 + }, + { + "epoch": 28.04244261585102, + "grad_norm": 0.38129520416259766, + "learning_rate": 4.71957557384149e-05, + "loss": 0.3614, + "step": 4144000 + }, + { + "epoch": 28.045826115201386, + "grad_norm": 0.3501156270503998, + "learning_rate": 4.719541738847986e-05, + "loss": 0.3579, + "step": 4144500 + }, + { + "epoch": 28.049209614551753, + "grad_norm": 0.4075695872306824, + "learning_rate": 4.7195079038544825e-05, + "loss": 0.3613, + "step": 4145000 + }, + { + "epoch": 28.052593113902123, + "grad_norm": 0.3449297547340393, + "learning_rate": 4.719474068860979e-05, + "loss": 0.3591, + "step": 4145500 + }, + { + "epoch": 28.05597661325249, + "grad_norm": 0.3645080327987671, + "learning_rate": 4.7194402338674756e-05, + "loss": 0.36, + "step": 4146000 + }, + { + "epoch": 28.059360112602857, + "grad_norm": 0.3833574056625366, + "learning_rate": 4.719406398873972e-05, + "loss": 0.3594, + "step": 4146500 + }, + { + "epoch": 28.062743611953227, + "grad_norm": 0.40159744024276733, + "learning_rate": 4.719372563880468e-05, + "loss": 0.3601, + "step": 4147000 + }, + { + "epoch": 28.066127111303594, + "grad_norm": 0.38786768913269043, + "learning_rate": 4.719338728886964e-05, + "loss": 0.3607, + "step": 4147500 + }, + { + "epoch": 28.069510610653964, + "grad_norm": 0.35397759079933167, + "learning_rate": 4.719304893893461e-05, + "loss": 0.36, + "step": 4148000 + }, + { + "epoch": 28.07289411000433, + "grad_norm": 0.3652489483356476, + "learning_rate": 4.7192710588999566e-05, + "loss": 0.3604, + "step": 4148500 + }, + { + "epoch": 28.076277609354698, + "grad_norm": 0.4199112057685852, + "learning_rate": 4.719237223906453e-05, + "loss": 0.3606, + "step": 4149000 + }, + { + "epoch": 28.079661108705068, + "grad_norm": 0.40466582775115967, + "learning_rate": 4.71920338891295e-05, + "loss": 0.3602, + "step": 4149500 + }, + { + "epoch": 28.083044608055435, + "grad_norm": 0.4131191074848175, + "learning_rate": 4.719169553919446e-05, + "loss": 0.3604, + "step": 4150000 + }, + { + "epoch": 28.086428107405805, + "grad_norm": 0.38448336720466614, + "learning_rate": 4.719135718925942e-05, + "loss": 0.3593, + "step": 4150500 + }, + { + "epoch": 28.08981160675617, + "grad_norm": 0.41221994161605835, + "learning_rate": 4.7191018839324384e-05, + "loss": 0.3606, + "step": 4151000 + }, + { + "epoch": 28.09319510610654, + "grad_norm": 0.36168429255485535, + "learning_rate": 4.719068048938935e-05, + "loss": 0.3619, + "step": 4151500 + }, + { + "epoch": 28.09657860545691, + "grad_norm": 0.36486026644706726, + "learning_rate": 4.7190342139454315e-05, + "loss": 0.3619, + "step": 4152000 + }, + { + "epoch": 28.099962104807275, + "grad_norm": 0.3782370686531067, + "learning_rate": 4.719000378951928e-05, + "loss": 0.3608, + "step": 4152500 + }, + { + "epoch": 28.103345604157646, + "grad_norm": 0.3529711067676544, + "learning_rate": 4.718966543958423e-05, + "loss": 0.3601, + "step": 4153000 + }, + { + "epoch": 28.106729103508012, + "grad_norm": 0.3651941418647766, + "learning_rate": 4.71893270896492e-05, + "loss": 0.3595, + "step": 4153500 + }, + { + "epoch": 28.11011260285838, + "grad_norm": 0.37990447878837585, + "learning_rate": 4.718898873971416e-05, + "loss": 0.3608, + "step": 4154000 + }, + { + "epoch": 28.11349610220875, + "grad_norm": 0.3762222230434418, + "learning_rate": 4.7188650389779125e-05, + "loss": 0.3611, + "step": 4154500 + }, + { + "epoch": 28.116879601559116, + "grad_norm": 0.3797217607498169, + "learning_rate": 4.718831203984409e-05, + "loss": 0.3605, + "step": 4155000 + }, + { + "epoch": 28.120263100909483, + "grad_norm": 0.37965890765190125, + "learning_rate": 4.7187973689909056e-05, + "loss": 0.3616, + "step": 4155500 + }, + { + "epoch": 28.123646600259853, + "grad_norm": 0.350475549697876, + "learning_rate": 4.718763533997402e-05, + "loss": 0.3596, + "step": 4156000 + }, + { + "epoch": 28.12703009961022, + "grad_norm": 0.42017796635627747, + "learning_rate": 4.718729699003898e-05, + "loss": 0.3617, + "step": 4156500 + }, + { + "epoch": 28.13041359896059, + "grad_norm": 0.3585604131221771, + "learning_rate": 4.718695864010394e-05, + "loss": 0.3606, + "step": 4157000 + }, + { + "epoch": 28.133797098310957, + "grad_norm": 0.40403875708580017, + "learning_rate": 4.718662029016891e-05, + "loss": 0.3616, + "step": 4157500 + }, + { + "epoch": 28.137180597661324, + "grad_norm": 0.3596263527870178, + "learning_rate": 4.718628194023387e-05, + "loss": 0.3589, + "step": 4158000 + }, + { + "epoch": 28.140564097011694, + "grad_norm": 0.36147862672805786, + "learning_rate": 4.718594359029883e-05, + "loss": 0.3627, + "step": 4158500 + }, + { + "epoch": 28.14394759636206, + "grad_norm": 0.3843778669834137, + "learning_rate": 4.71856052403638e-05, + "loss": 0.3594, + "step": 4159000 + }, + { + "epoch": 28.14733109571243, + "grad_norm": 0.3639533817768097, + "learning_rate": 4.718526689042876e-05, + "loss": 0.3606, + "step": 4159500 + }, + { + "epoch": 28.150714595062798, + "grad_norm": 0.3854668438434601, + "learning_rate": 4.718492854049372e-05, + "loss": 0.3591, + "step": 4160000 + }, + { + "epoch": 28.154098094413165, + "grad_norm": 0.4005782902240753, + "learning_rate": 4.7184590190558684e-05, + "loss": 0.3603, + "step": 4160500 + }, + { + "epoch": 28.157481593763535, + "grad_norm": 0.3792315423488617, + "learning_rate": 4.718425184062365e-05, + "loss": 0.3623, + "step": 4161000 + }, + { + "epoch": 28.1608650931139, + "grad_norm": 0.38012492656707764, + "learning_rate": 4.7183913490688615e-05, + "loss": 0.3614, + "step": 4161500 + }, + { + "epoch": 28.16424859246427, + "grad_norm": 0.35042500495910645, + "learning_rate": 4.718357514075358e-05, + "loss": 0.3605, + "step": 4162000 + }, + { + "epoch": 28.16763209181464, + "grad_norm": 0.39792054891586304, + "learning_rate": 4.718323679081853e-05, + "loss": 0.3603, + "step": 4162500 + }, + { + "epoch": 28.171015591165006, + "grad_norm": 0.37952131032943726, + "learning_rate": 4.71828984408835e-05, + "loss": 0.3612, + "step": 4163000 + }, + { + "epoch": 28.174399090515376, + "grad_norm": 0.3857076168060303, + "learning_rate": 4.7182560090948464e-05, + "loss": 0.3603, + "step": 4163500 + }, + { + "epoch": 28.177782589865743, + "grad_norm": 0.3405747413635254, + "learning_rate": 4.7182221741013426e-05, + "loss": 0.3615, + "step": 4164000 + }, + { + "epoch": 28.18116608921611, + "grad_norm": 0.32335302233695984, + "learning_rate": 4.718188339107839e-05, + "loss": 0.3611, + "step": 4164500 + }, + { + "epoch": 28.18454958856648, + "grad_norm": 0.4017068147659302, + "learning_rate": 4.718154504114336e-05, + "loss": 0.362, + "step": 4165000 + }, + { + "epoch": 28.187933087916846, + "grad_norm": 0.3898669183254242, + "learning_rate": 4.718120669120832e-05, + "loss": 0.362, + "step": 4165500 + }, + { + "epoch": 28.191316587267217, + "grad_norm": 0.40183600783348083, + "learning_rate": 4.718086834127328e-05, + "loss": 0.3612, + "step": 4166000 + }, + { + "epoch": 28.194700086617583, + "grad_norm": 0.37525132298469543, + "learning_rate": 4.718052999133824e-05, + "loss": 0.3606, + "step": 4166500 + }, + { + "epoch": 28.19808358596795, + "grad_norm": 0.38127073645591736, + "learning_rate": 4.718019164140321e-05, + "loss": 0.3613, + "step": 4167000 + }, + { + "epoch": 28.20146708531832, + "grad_norm": 0.3760685324668884, + "learning_rate": 4.717985329146817e-05, + "loss": 0.3609, + "step": 4167500 + }, + { + "epoch": 28.204850584668687, + "grad_norm": 0.361402690410614, + "learning_rate": 4.717951494153313e-05, + "loss": 0.3613, + "step": 4168000 + }, + { + "epoch": 28.208234084019058, + "grad_norm": 0.3808455169200897, + "learning_rate": 4.71791765915981e-05, + "loss": 0.3607, + "step": 4168500 + }, + { + "epoch": 28.211617583369424, + "grad_norm": 0.37132528424263, + "learning_rate": 4.717883824166306e-05, + "loss": 0.3615, + "step": 4169000 + }, + { + "epoch": 28.21500108271979, + "grad_norm": 0.3817844092845917, + "learning_rate": 4.717849989172802e-05, + "loss": 0.3603, + "step": 4169500 + }, + { + "epoch": 28.21838458207016, + "grad_norm": 0.35797831416130066, + "learning_rate": 4.7178161541792985e-05, + "loss": 0.3611, + "step": 4170000 + }, + { + "epoch": 28.221768081420528, + "grad_norm": 0.41020745038986206, + "learning_rate": 4.7177823191857954e-05, + "loss": 0.362, + "step": 4170500 + }, + { + "epoch": 28.225151580770895, + "grad_norm": 0.362379252910614, + "learning_rate": 4.7177484841922916e-05, + "loss": 0.3616, + "step": 4171000 + }, + { + "epoch": 28.228535080121265, + "grad_norm": 0.34908509254455566, + "learning_rate": 4.717714649198788e-05, + "loss": 0.3621, + "step": 4171500 + }, + { + "epoch": 28.231918579471632, + "grad_norm": 0.3852998614311218, + "learning_rate": 4.717680814205283e-05, + "loss": 0.3611, + "step": 4172000 + }, + { + "epoch": 28.235302078822002, + "grad_norm": 0.39981764554977417, + "learning_rate": 4.71764697921178e-05, + "loss": 0.361, + "step": 4172500 + }, + { + "epoch": 28.23868557817237, + "grad_norm": 0.3649141192436218, + "learning_rate": 4.7176131442182764e-05, + "loss": 0.3631, + "step": 4173000 + }, + { + "epoch": 28.242069077522736, + "grad_norm": 0.3951915204524994, + "learning_rate": 4.7175793092247726e-05, + "loss": 0.3611, + "step": 4173500 + }, + { + "epoch": 28.245452576873106, + "grad_norm": 0.39952680468559265, + "learning_rate": 4.717545474231269e-05, + "loss": 0.3612, + "step": 4174000 + }, + { + "epoch": 28.248836076223473, + "grad_norm": 0.40479788184165955, + "learning_rate": 4.717511639237766e-05, + "loss": 0.362, + "step": 4174500 + }, + { + "epoch": 28.252219575573843, + "grad_norm": 0.37462568283081055, + "learning_rate": 4.717477804244262e-05, + "loss": 0.3623, + "step": 4175000 + }, + { + "epoch": 28.25560307492421, + "grad_norm": 0.3540719747543335, + "learning_rate": 4.717443969250758e-05, + "loss": 0.3613, + "step": 4175500 + }, + { + "epoch": 28.258986574274576, + "grad_norm": 0.3863060474395752, + "learning_rate": 4.7174101342572544e-05, + "loss": 0.3619, + "step": 4176000 + }, + { + "epoch": 28.262370073624947, + "grad_norm": 0.35178685188293457, + "learning_rate": 4.717376299263751e-05, + "loss": 0.3628, + "step": 4176500 + }, + { + "epoch": 28.265753572975314, + "grad_norm": 0.37595534324645996, + "learning_rate": 4.717342464270247e-05, + "loss": 0.3619, + "step": 4177000 + }, + { + "epoch": 28.269137072325684, + "grad_norm": 0.3724938631057739, + "learning_rate": 4.717308629276743e-05, + "loss": 0.3599, + "step": 4177500 + }, + { + "epoch": 28.27252057167605, + "grad_norm": 0.3593493700027466, + "learning_rate": 4.71727479428324e-05, + "loss": 0.3622, + "step": 4178000 + }, + { + "epoch": 28.275904071026417, + "grad_norm": 0.39943647384643555, + "learning_rate": 4.717240959289736e-05, + "loss": 0.3598, + "step": 4178500 + }, + { + "epoch": 28.279287570376788, + "grad_norm": 0.3845526874065399, + "learning_rate": 4.717207124296232e-05, + "loss": 0.3605, + "step": 4179000 + }, + { + "epoch": 28.282671069727154, + "grad_norm": 0.3796115219593048, + "learning_rate": 4.7171732893027285e-05, + "loss": 0.3631, + "step": 4179500 + }, + { + "epoch": 28.28605456907752, + "grad_norm": 0.37783804535865784, + "learning_rate": 4.7171394543092254e-05, + "loss": 0.362, + "step": 4180000 + }, + { + "epoch": 28.28943806842789, + "grad_norm": 0.3520191013813019, + "learning_rate": 4.7171056193157216e-05, + "loss": 0.3603, + "step": 4180500 + }, + { + "epoch": 28.292821567778258, + "grad_norm": 0.39619338512420654, + "learning_rate": 4.717071784322218e-05, + "loss": 0.3605, + "step": 4181000 + }, + { + "epoch": 28.29620506712863, + "grad_norm": 0.363335520029068, + "learning_rate": 4.7170379493287134e-05, + "loss": 0.3614, + "step": 4181500 + }, + { + "epoch": 28.299588566478995, + "grad_norm": 0.3505692183971405, + "learning_rate": 4.71700411433521e-05, + "loss": 0.3594, + "step": 4182000 + }, + { + "epoch": 28.302972065829362, + "grad_norm": 0.3883934020996094, + "learning_rate": 4.7169702793417065e-05, + "loss": 0.362, + "step": 4182500 + }, + { + "epoch": 28.306355565179732, + "grad_norm": 0.35601288080215454, + "learning_rate": 4.716936444348203e-05, + "loss": 0.361, + "step": 4183000 + }, + { + "epoch": 28.3097390645301, + "grad_norm": 0.3968811333179474, + "learning_rate": 4.716902609354699e-05, + "loss": 0.3612, + "step": 4183500 + }, + { + "epoch": 28.31312256388047, + "grad_norm": 0.3789132535457611, + "learning_rate": 4.716868774361196e-05, + "loss": 0.3595, + "step": 4184000 + }, + { + "epoch": 28.316506063230836, + "grad_norm": 0.39282628893852234, + "learning_rate": 4.716834939367692e-05, + "loss": 0.3607, + "step": 4184500 + }, + { + "epoch": 28.319889562581203, + "grad_norm": 0.37454989552497864, + "learning_rate": 4.716801104374188e-05, + "loss": 0.3603, + "step": 4185000 + }, + { + "epoch": 28.323273061931573, + "grad_norm": 0.412693053483963, + "learning_rate": 4.7167672693806844e-05, + "loss": 0.3603, + "step": 4185500 + }, + { + "epoch": 28.32665656128194, + "grad_norm": 0.39129626750946045, + "learning_rate": 4.716733434387181e-05, + "loss": 0.3614, + "step": 4186000 + }, + { + "epoch": 28.330040060632307, + "grad_norm": 0.3551253378391266, + "learning_rate": 4.716699599393677e-05, + "loss": 0.3607, + "step": 4186500 + }, + { + "epoch": 28.333423559982677, + "grad_norm": 0.3845466375350952, + "learning_rate": 4.716665764400173e-05, + "loss": 0.3612, + "step": 4187000 + }, + { + "epoch": 28.336807059333044, + "grad_norm": 0.39009878039360046, + "learning_rate": 4.71663192940667e-05, + "loss": 0.3614, + "step": 4187500 + }, + { + "epoch": 28.340190558683414, + "grad_norm": 0.3824731409549713, + "learning_rate": 4.716598094413166e-05, + "loss": 0.3614, + "step": 4188000 + }, + { + "epoch": 28.34357405803378, + "grad_norm": 0.37005728483200073, + "learning_rate": 4.7165642594196624e-05, + "loss": 0.3619, + "step": 4188500 + }, + { + "epoch": 28.346957557384147, + "grad_norm": 0.4180995225906372, + "learning_rate": 4.7165304244261586e-05, + "loss": 0.3593, + "step": 4189000 + }, + { + "epoch": 28.350341056734518, + "grad_norm": 0.3632233738899231, + "learning_rate": 4.716496589432655e-05, + "loss": 0.3615, + "step": 4189500 + }, + { + "epoch": 28.353724556084885, + "grad_norm": 0.33749160170555115, + "learning_rate": 4.716462754439152e-05, + "loss": 0.361, + "step": 4190000 + }, + { + "epoch": 28.357108055435255, + "grad_norm": 0.40541553497314453, + "learning_rate": 4.716428919445648e-05, + "loss": 0.3612, + "step": 4190500 + }, + { + "epoch": 28.36049155478562, + "grad_norm": 0.3835623860359192, + "learning_rate": 4.716395084452144e-05, + "loss": 0.3612, + "step": 4191000 + }, + { + "epoch": 28.36387505413599, + "grad_norm": 0.36845603585243225, + "learning_rate": 4.7163612494586403e-05, + "loss": 0.3627, + "step": 4191500 + }, + { + "epoch": 28.36725855348636, + "grad_norm": 0.3849983215332031, + "learning_rate": 4.7163274144651366e-05, + "loss": 0.3623, + "step": 4192000 + }, + { + "epoch": 28.370642052836725, + "grad_norm": 0.35379183292388916, + "learning_rate": 4.716293579471633e-05, + "loss": 0.3613, + "step": 4192500 + }, + { + "epoch": 28.374025552187096, + "grad_norm": 0.3451194167137146, + "learning_rate": 4.716259744478129e-05, + "loss": 0.3609, + "step": 4193000 + }, + { + "epoch": 28.377409051537462, + "grad_norm": 0.420551598072052, + "learning_rate": 4.716225909484626e-05, + "loss": 0.3626, + "step": 4193500 + }, + { + "epoch": 28.38079255088783, + "grad_norm": 0.3557862341403961, + "learning_rate": 4.716192074491122e-05, + "loss": 0.3604, + "step": 4194000 + }, + { + "epoch": 28.3841760502382, + "grad_norm": 0.35886773467063904, + "learning_rate": 4.716158239497618e-05, + "loss": 0.36, + "step": 4194500 + }, + { + "epoch": 28.387559549588566, + "grad_norm": 0.35705262422561646, + "learning_rate": 4.7161244045041145e-05, + "loss": 0.3617, + "step": 4195000 + }, + { + "epoch": 28.390943048938933, + "grad_norm": 0.3624837100505829, + "learning_rate": 4.7160905695106114e-05, + "loss": 0.3606, + "step": 4195500 + }, + { + "epoch": 28.394326548289303, + "grad_norm": 0.3872695565223694, + "learning_rate": 4.716056734517107e-05, + "loss": 0.3625, + "step": 4196000 + }, + { + "epoch": 28.39771004763967, + "grad_norm": 0.35200896859169006, + "learning_rate": 4.716022899523603e-05, + "loss": 0.3609, + "step": 4196500 + }, + { + "epoch": 28.40109354699004, + "grad_norm": 0.3648911714553833, + "learning_rate": 4.7159890645301e-05, + "loss": 0.362, + "step": 4197000 + }, + { + "epoch": 28.404477046340407, + "grad_norm": 0.3744070827960968, + "learning_rate": 4.715955229536596e-05, + "loss": 0.36, + "step": 4197500 + }, + { + "epoch": 28.407860545690774, + "grad_norm": 0.35949233174324036, + "learning_rate": 4.7159213945430925e-05, + "loss": 0.3612, + "step": 4198000 + }, + { + "epoch": 28.411244045041144, + "grad_norm": 0.3645084798336029, + "learning_rate": 4.715887559549589e-05, + "loss": 0.3608, + "step": 4198500 + }, + { + "epoch": 28.41462754439151, + "grad_norm": 0.37557467818260193, + "learning_rate": 4.715853724556085e-05, + "loss": 0.3607, + "step": 4199000 + }, + { + "epoch": 28.41801104374188, + "grad_norm": 0.39893093705177307, + "learning_rate": 4.715819889562582e-05, + "loss": 0.3615, + "step": 4199500 + }, + { + "epoch": 28.421394543092248, + "grad_norm": 0.3806627094745636, + "learning_rate": 4.715786054569078e-05, + "loss": 0.3632, + "step": 4200000 + }, + { + "epoch": 28.424778042442615, + "grad_norm": 0.40269455313682556, + "learning_rate": 4.715752219575574e-05, + "loss": 0.3608, + "step": 4200500 + }, + { + "epoch": 28.428161541792985, + "grad_norm": 0.34466657042503357, + "learning_rate": 4.7157183845820704e-05, + "loss": 0.3606, + "step": 4201000 + }, + { + "epoch": 28.43154504114335, + "grad_norm": 0.41121411323547363, + "learning_rate": 4.7156845495885666e-05, + "loss": 0.362, + "step": 4201500 + }, + { + "epoch": 28.434928540493722, + "grad_norm": 0.41764146089553833, + "learning_rate": 4.715650714595063e-05, + "loss": 0.3622, + "step": 4202000 + }, + { + "epoch": 28.43831203984409, + "grad_norm": 0.3867233097553253, + "learning_rate": 4.715616879601559e-05, + "loss": 0.363, + "step": 4202500 + }, + { + "epoch": 28.441695539194455, + "grad_norm": 0.3862224817276001, + "learning_rate": 4.715583044608056e-05, + "loss": 0.3617, + "step": 4203000 + }, + { + "epoch": 28.445079038544826, + "grad_norm": 0.38365626335144043, + "learning_rate": 4.715549209614552e-05, + "loss": 0.3611, + "step": 4203500 + }, + { + "epoch": 28.448462537895193, + "grad_norm": 0.3457985520362854, + "learning_rate": 4.7155153746210484e-05, + "loss": 0.3601, + "step": 4204000 + }, + { + "epoch": 28.45184603724556, + "grad_norm": 0.35866719484329224, + "learning_rate": 4.7154815396275446e-05, + "loss": 0.3613, + "step": 4204500 + }, + { + "epoch": 28.45522953659593, + "grad_norm": 0.3741433024406433, + "learning_rate": 4.7154477046340415e-05, + "loss": 0.3608, + "step": 4205000 + }, + { + "epoch": 28.458613035946296, + "grad_norm": 0.42533352971076965, + "learning_rate": 4.715413869640537e-05, + "loss": 0.3623, + "step": 4205500 + }, + { + "epoch": 28.461996535296667, + "grad_norm": 0.3592928946018219, + "learning_rate": 4.715380034647033e-05, + "loss": 0.3619, + "step": 4206000 + }, + { + "epoch": 28.465380034647033, + "grad_norm": 0.36465469002723694, + "learning_rate": 4.7153461996535294e-05, + "loss": 0.3634, + "step": 4206500 + }, + { + "epoch": 28.4687635339974, + "grad_norm": 0.36585113406181335, + "learning_rate": 4.715312364660026e-05, + "loss": 0.361, + "step": 4207000 + }, + { + "epoch": 28.47214703334777, + "grad_norm": 0.38405710458755493, + "learning_rate": 4.7152785296665225e-05, + "loss": 0.3608, + "step": 4207500 + }, + { + "epoch": 28.475530532698137, + "grad_norm": 0.39637649059295654, + "learning_rate": 4.715244694673019e-05, + "loss": 0.3617, + "step": 4208000 + }, + { + "epoch": 28.478914032048507, + "grad_norm": 0.37295740842819214, + "learning_rate": 4.715210859679515e-05, + "loss": 0.3614, + "step": 4208500 + }, + { + "epoch": 28.482297531398874, + "grad_norm": 0.3904971480369568, + "learning_rate": 4.715177024686012e-05, + "loss": 0.3621, + "step": 4209000 + }, + { + "epoch": 28.48568103074924, + "grad_norm": 0.35155045986175537, + "learning_rate": 4.715143189692508e-05, + "loss": 0.3609, + "step": 4209500 + }, + { + "epoch": 28.48906453009961, + "grad_norm": 0.355697363615036, + "learning_rate": 4.715109354699004e-05, + "loss": 0.3605, + "step": 4210000 + }, + { + "epoch": 28.492448029449978, + "grad_norm": 0.3459162414073944, + "learning_rate": 4.7150755197055005e-05, + "loss": 0.3626, + "step": 4210500 + }, + { + "epoch": 28.495831528800345, + "grad_norm": 0.36261093616485596, + "learning_rate": 4.715041684711997e-05, + "loss": 0.3612, + "step": 4211000 + }, + { + "epoch": 28.499215028150715, + "grad_norm": 0.3706207573413849, + "learning_rate": 4.715007849718493e-05, + "loss": 0.3611, + "step": 4211500 + }, + { + "epoch": 28.502598527501082, + "grad_norm": 0.39113545417785645, + "learning_rate": 4.714974014724989e-05, + "loss": 0.3632, + "step": 4212000 + }, + { + "epoch": 28.505982026851452, + "grad_norm": 0.388141393661499, + "learning_rate": 4.714940179731486e-05, + "loss": 0.3615, + "step": 4212500 + }, + { + "epoch": 28.50936552620182, + "grad_norm": 0.35941025614738464, + "learning_rate": 4.714906344737982e-05, + "loss": 0.3619, + "step": 4213000 + }, + { + "epoch": 28.512749025552186, + "grad_norm": 0.39670437574386597, + "learning_rate": 4.7148725097444784e-05, + "loss": 0.3609, + "step": 4213500 + }, + { + "epoch": 28.516132524902556, + "grad_norm": 0.41994255781173706, + "learning_rate": 4.7148386747509746e-05, + "loss": 0.3628, + "step": 4214000 + }, + { + "epoch": 28.519516024252923, + "grad_norm": 0.41873568296432495, + "learning_rate": 4.7148048397574715e-05, + "loss": 0.3601, + "step": 4214500 + }, + { + "epoch": 28.522899523603293, + "grad_norm": 0.36655986309051514, + "learning_rate": 4.714771004763967e-05, + "loss": 0.3627, + "step": 4215000 + }, + { + "epoch": 28.52628302295366, + "grad_norm": 0.38597121834754944, + "learning_rate": 4.714737169770463e-05, + "loss": 0.3624, + "step": 4215500 + }, + { + "epoch": 28.529666522304026, + "grad_norm": 0.34690356254577637, + "learning_rate": 4.7147033347769595e-05, + "loss": 0.3618, + "step": 4216000 + }, + { + "epoch": 28.533050021654397, + "grad_norm": 0.3700477182865143, + "learning_rate": 4.7146694997834564e-05, + "loss": 0.3618, + "step": 4216500 + }, + { + "epoch": 28.536433521004763, + "grad_norm": 0.38068437576293945, + "learning_rate": 4.7146356647899526e-05, + "loss": 0.3598, + "step": 4217000 + }, + { + "epoch": 28.539817020355134, + "grad_norm": 0.4197213053703308, + "learning_rate": 4.714601829796449e-05, + "loss": 0.3639, + "step": 4217500 + }, + { + "epoch": 28.5432005197055, + "grad_norm": 0.4043872356414795, + "learning_rate": 4.714567994802945e-05, + "loss": 0.3612, + "step": 4218000 + }, + { + "epoch": 28.546584019055867, + "grad_norm": 0.38576143980026245, + "learning_rate": 4.714534159809442e-05, + "loss": 0.3617, + "step": 4218500 + }, + { + "epoch": 28.549967518406238, + "grad_norm": 0.3723194897174835, + "learning_rate": 4.714500324815938e-05, + "loss": 0.3611, + "step": 4219000 + }, + { + "epoch": 28.553351017756604, + "grad_norm": 0.3344508707523346, + "learning_rate": 4.714466489822434e-05, + "loss": 0.3618, + "step": 4219500 + }, + { + "epoch": 28.55673451710697, + "grad_norm": 0.3709132969379425, + "learning_rate": 4.7144326548289305e-05, + "loss": 0.361, + "step": 4220000 + }, + { + "epoch": 28.56011801645734, + "grad_norm": 0.3418281078338623, + "learning_rate": 4.714398819835427e-05, + "loss": 0.3599, + "step": 4220500 + }, + { + "epoch": 28.563501515807708, + "grad_norm": 0.34958213567733765, + "learning_rate": 4.714364984841923e-05, + "loss": 0.3613, + "step": 4221000 + }, + { + "epoch": 28.56688501515808, + "grad_norm": 0.36479049921035767, + "learning_rate": 4.714331149848419e-05, + "loss": 0.3604, + "step": 4221500 + }, + { + "epoch": 28.570268514508445, + "grad_norm": 0.4019864797592163, + "learning_rate": 4.714297314854916e-05, + "loss": 0.361, + "step": 4222000 + }, + { + "epoch": 28.573652013858812, + "grad_norm": 0.37898576259613037, + "learning_rate": 4.714263479861412e-05, + "loss": 0.361, + "step": 4222500 + }, + { + "epoch": 28.577035513209182, + "grad_norm": 0.4045998454093933, + "learning_rate": 4.7142296448679085e-05, + "loss": 0.3622, + "step": 4223000 + }, + { + "epoch": 28.58041901255955, + "grad_norm": 0.3739849925041199, + "learning_rate": 4.714195809874405e-05, + "loss": 0.3619, + "step": 4223500 + }, + { + "epoch": 28.58380251190992, + "grad_norm": 0.3683318793773651, + "learning_rate": 4.7141619748809016e-05, + "loss": 0.3617, + "step": 4224000 + }, + { + "epoch": 28.587186011260286, + "grad_norm": 0.3913784325122833, + "learning_rate": 4.714128139887397e-05, + "loss": 0.3627, + "step": 4224500 + }, + { + "epoch": 28.590569510610653, + "grad_norm": 0.39089998602867126, + "learning_rate": 4.714094304893893e-05, + "loss": 0.3604, + "step": 4225000 + }, + { + "epoch": 28.593953009961023, + "grad_norm": 0.4031013548374176, + "learning_rate": 4.7140604699003895e-05, + "loss": 0.3618, + "step": 4225500 + }, + { + "epoch": 28.59733650931139, + "grad_norm": 0.36095115542411804, + "learning_rate": 4.7140266349068864e-05, + "loss": 0.3619, + "step": 4226000 + }, + { + "epoch": 28.60072000866176, + "grad_norm": 0.37524983286857605, + "learning_rate": 4.7139927999133826e-05, + "loss": 0.3612, + "step": 4226500 + }, + { + "epoch": 28.604103508012127, + "grad_norm": 0.3565714955329895, + "learning_rate": 4.713958964919879e-05, + "loss": 0.3608, + "step": 4227000 + }, + { + "epoch": 28.607487007362494, + "grad_norm": 0.35314592719078064, + "learning_rate": 4.713925129926375e-05, + "loss": 0.3618, + "step": 4227500 + }, + { + "epoch": 28.610870506712864, + "grad_norm": 0.3785061538219452, + "learning_rate": 4.713891294932872e-05, + "loss": 0.3619, + "step": 4228000 + }, + { + "epoch": 28.61425400606323, + "grad_norm": 0.40104159712791443, + "learning_rate": 4.713857459939368e-05, + "loss": 0.3612, + "step": 4228500 + }, + { + "epoch": 28.617637505413597, + "grad_norm": 0.373701274394989, + "learning_rate": 4.7138236249458644e-05, + "loss": 0.3609, + "step": 4229000 + }, + { + "epoch": 28.621021004763968, + "grad_norm": 0.3659369647502899, + "learning_rate": 4.7137897899523606e-05, + "loss": 0.3607, + "step": 4229500 + }, + { + "epoch": 28.624404504114334, + "grad_norm": 0.3658003807067871, + "learning_rate": 4.713755954958857e-05, + "loss": 0.3601, + "step": 4230000 + }, + { + "epoch": 28.627788003464705, + "grad_norm": 0.3933974802494049, + "learning_rate": 4.713722119965353e-05, + "loss": 0.3629, + "step": 4230500 + }, + { + "epoch": 28.63117150281507, + "grad_norm": 0.351081907749176, + "learning_rate": 4.713688284971849e-05, + "loss": 0.3621, + "step": 4231000 + }, + { + "epoch": 28.63455500216544, + "grad_norm": 0.3646312952041626, + "learning_rate": 4.713654449978346e-05, + "loss": 0.3607, + "step": 4231500 + }, + { + "epoch": 28.63793850151581, + "grad_norm": 0.3640258312225342, + "learning_rate": 4.713620614984842e-05, + "loss": 0.3619, + "step": 4232000 + }, + { + "epoch": 28.641322000866175, + "grad_norm": 0.38643383979797363, + "learning_rate": 4.7135867799913385e-05, + "loss": 0.3611, + "step": 4232500 + }, + { + "epoch": 28.644705500216546, + "grad_norm": 0.3771829903125763, + "learning_rate": 4.713552944997835e-05, + "loss": 0.3608, + "step": 4233000 + }, + { + "epoch": 28.648088999566912, + "grad_norm": 0.3571754992008209, + "learning_rate": 4.7135191100043317e-05, + "loss": 0.3625, + "step": 4233500 + }, + { + "epoch": 28.65147249891728, + "grad_norm": 0.3494188189506531, + "learning_rate": 4.713485275010827e-05, + "loss": 0.362, + "step": 4234000 + }, + { + "epoch": 28.65485599826765, + "grad_norm": 0.3332378566265106, + "learning_rate": 4.7134514400173234e-05, + "loss": 0.3616, + "step": 4234500 + }, + { + "epoch": 28.658239497618016, + "grad_norm": 0.3812815248966217, + "learning_rate": 4.7134176050238196e-05, + "loss": 0.3602, + "step": 4235000 + }, + { + "epoch": 28.661622996968383, + "grad_norm": 0.3675067722797394, + "learning_rate": 4.7133837700303165e-05, + "loss": 0.3605, + "step": 4235500 + }, + { + "epoch": 28.665006496318753, + "grad_norm": 0.38357430696487427, + "learning_rate": 4.713349935036813e-05, + "loss": 0.3605, + "step": 4236000 + }, + { + "epoch": 28.66838999566912, + "grad_norm": 0.3515216112136841, + "learning_rate": 4.713316100043309e-05, + "loss": 0.3624, + "step": 4236500 + }, + { + "epoch": 28.67177349501949, + "grad_norm": 0.36064037680625916, + "learning_rate": 4.713282265049805e-05, + "loss": 0.3605, + "step": 4237000 + }, + { + "epoch": 28.675156994369857, + "grad_norm": 0.40382781624794006, + "learning_rate": 4.713248430056302e-05, + "loss": 0.3627, + "step": 4237500 + }, + { + "epoch": 28.678540493720224, + "grad_norm": 0.3234970271587372, + "learning_rate": 4.713214595062798e-05, + "loss": 0.3633, + "step": 4238000 + }, + { + "epoch": 28.681923993070594, + "grad_norm": 0.40176334977149963, + "learning_rate": 4.7131807600692945e-05, + "loss": 0.3616, + "step": 4238500 + }, + { + "epoch": 28.68530749242096, + "grad_norm": 0.3842661678791046, + "learning_rate": 4.713146925075791e-05, + "loss": 0.3626, + "step": 4239000 + }, + { + "epoch": 28.68869099177133, + "grad_norm": 0.4055143892765045, + "learning_rate": 4.713113090082287e-05, + "loss": 0.3604, + "step": 4239500 + }, + { + "epoch": 28.692074491121698, + "grad_norm": 0.3751370310783386, + "learning_rate": 4.713079255088783e-05, + "loss": 0.3598, + "step": 4240000 + }, + { + "epoch": 28.695457990472065, + "grad_norm": 0.43553754687309265, + "learning_rate": 4.713045420095279e-05, + "loss": 0.362, + "step": 4240500 + }, + { + "epoch": 28.698841489822435, + "grad_norm": 0.3492582142353058, + "learning_rate": 4.713011585101776e-05, + "loss": 0.3609, + "step": 4241000 + }, + { + "epoch": 28.7022249891728, + "grad_norm": 0.36575913429260254, + "learning_rate": 4.7129777501082724e-05, + "loss": 0.3612, + "step": 4241500 + }, + { + "epoch": 28.70560848852317, + "grad_norm": 0.34025856852531433, + "learning_rate": 4.7129439151147686e-05, + "loss": 0.3612, + "step": 4242000 + }, + { + "epoch": 28.70899198787354, + "grad_norm": 0.36681750416755676, + "learning_rate": 4.712910080121265e-05, + "loss": 0.3617, + "step": 4242500 + }, + { + "epoch": 28.712375487223905, + "grad_norm": 0.3458787798881531, + "learning_rate": 4.712876245127762e-05, + "loss": 0.3598, + "step": 4243000 + }, + { + "epoch": 28.715758986574276, + "grad_norm": 0.3240845203399658, + "learning_rate": 4.712842410134258e-05, + "loss": 0.3609, + "step": 4243500 + }, + { + "epoch": 28.719142485924642, + "grad_norm": 0.3944730758666992, + "learning_rate": 4.7128085751407535e-05, + "loss": 0.3633, + "step": 4244000 + }, + { + "epoch": 28.72252598527501, + "grad_norm": 0.3892706036567688, + "learning_rate": 4.71277474014725e-05, + "loss": 0.3604, + "step": 4244500 + }, + { + "epoch": 28.72590948462538, + "grad_norm": 0.3503298759460449, + "learning_rate": 4.7127409051537466e-05, + "loss": 0.3617, + "step": 4245000 + }, + { + "epoch": 28.729292983975746, + "grad_norm": 0.3950233459472656, + "learning_rate": 4.712707070160243e-05, + "loss": 0.3618, + "step": 4245500 + }, + { + "epoch": 28.732676483326117, + "grad_norm": 0.3659052550792694, + "learning_rate": 4.712673235166739e-05, + "loss": 0.361, + "step": 4246000 + }, + { + "epoch": 28.736059982676483, + "grad_norm": 0.37944474816322327, + "learning_rate": 4.712639400173235e-05, + "loss": 0.3615, + "step": 4246500 + }, + { + "epoch": 28.73944348202685, + "grad_norm": 0.32573771476745605, + "learning_rate": 4.712605565179732e-05, + "loss": 0.3617, + "step": 4247000 + }, + { + "epoch": 28.74282698137722, + "grad_norm": 0.3730095326900482, + "learning_rate": 4.712571730186228e-05, + "loss": 0.3613, + "step": 4247500 + }, + { + "epoch": 28.746210480727587, + "grad_norm": 0.3818854093551636, + "learning_rate": 4.7125378951927245e-05, + "loss": 0.3619, + "step": 4248000 + }, + { + "epoch": 28.749593980077957, + "grad_norm": 0.3606642186641693, + "learning_rate": 4.712504060199221e-05, + "loss": 0.3616, + "step": 4248500 + }, + { + "epoch": 28.752977479428324, + "grad_norm": 0.408157616853714, + "learning_rate": 4.712470225205717e-05, + "loss": 0.359, + "step": 4249000 + }, + { + "epoch": 28.75636097877869, + "grad_norm": 0.3866164982318878, + "learning_rate": 4.712436390212213e-05, + "loss": 0.3625, + "step": 4249500 + }, + { + "epoch": 28.75974447812906, + "grad_norm": 0.3604215979576111, + "learning_rate": 4.7124025552187094e-05, + "loss": 0.3594, + "step": 4250000 + }, + { + "epoch": 28.763127977479428, + "grad_norm": 0.3600050210952759, + "learning_rate": 4.712368720225206e-05, + "loss": 0.3633, + "step": 4250500 + }, + { + "epoch": 28.7665114768298, + "grad_norm": 0.39066022634506226, + "learning_rate": 4.7123348852317025e-05, + "loss": 0.3622, + "step": 4251000 + }, + { + "epoch": 28.769894976180165, + "grad_norm": 0.41124704480171204, + "learning_rate": 4.712301050238199e-05, + "loss": 0.3624, + "step": 4251500 + }, + { + "epoch": 28.77327847553053, + "grad_norm": 0.32621800899505615, + "learning_rate": 4.712267215244695e-05, + "loss": 0.363, + "step": 4252000 + }, + { + "epoch": 28.776661974880902, + "grad_norm": 0.3741280436515808, + "learning_rate": 4.712233380251191e-05, + "loss": 0.3625, + "step": 4252500 + }, + { + "epoch": 28.78004547423127, + "grad_norm": 0.4018608629703522, + "learning_rate": 4.712199545257688e-05, + "loss": 0.3612, + "step": 4253000 + }, + { + "epoch": 28.783428973581636, + "grad_norm": 0.4042356610298157, + "learning_rate": 4.7121657102641835e-05, + "loss": 0.36, + "step": 4253500 + }, + { + "epoch": 28.786812472932006, + "grad_norm": 0.34805935621261597, + "learning_rate": 4.71213187527068e-05, + "loss": 0.3617, + "step": 4254000 + }, + { + "epoch": 28.790195972282373, + "grad_norm": 0.4501422047615051, + "learning_rate": 4.7120980402771766e-05, + "loss": 0.3628, + "step": 4254500 + }, + { + "epoch": 28.793579471632743, + "grad_norm": 0.4037505090236664, + "learning_rate": 4.712064205283673e-05, + "loss": 0.3604, + "step": 4255000 + }, + { + "epoch": 28.79696297098311, + "grad_norm": 0.3673800528049469, + "learning_rate": 4.712030370290169e-05, + "loss": 0.3615, + "step": 4255500 + }, + { + "epoch": 28.800346470333476, + "grad_norm": 0.35557159781455994, + "learning_rate": 4.711996535296665e-05, + "loss": 0.3591, + "step": 4256000 + }, + { + "epoch": 28.803729969683847, + "grad_norm": 0.36397409439086914, + "learning_rate": 4.711962700303162e-05, + "loss": 0.3614, + "step": 4256500 + }, + { + "epoch": 28.807113469034213, + "grad_norm": 0.40870311856269836, + "learning_rate": 4.7119288653096584e-05, + "loss": 0.3626, + "step": 4257000 + }, + { + "epoch": 28.810496968384584, + "grad_norm": 0.36634010076522827, + "learning_rate": 4.7118950303161546e-05, + "loss": 0.3618, + "step": 4257500 + }, + { + "epoch": 28.81388046773495, + "grad_norm": 0.33262279629707336, + "learning_rate": 4.711861195322651e-05, + "loss": 0.362, + "step": 4258000 + }, + { + "epoch": 28.817263967085317, + "grad_norm": 0.36668986082077026, + "learning_rate": 4.711827360329147e-05, + "loss": 0.3614, + "step": 4258500 + }, + { + "epoch": 28.820647466435688, + "grad_norm": 0.34490931034088135, + "learning_rate": 4.711793525335643e-05, + "loss": 0.3617, + "step": 4259000 + }, + { + "epoch": 28.824030965786054, + "grad_norm": 0.3578276038169861, + "learning_rate": 4.7117596903421394e-05, + "loss": 0.3628, + "step": 4259500 + }, + { + "epoch": 28.82741446513642, + "grad_norm": 0.3873530924320221, + "learning_rate": 4.7117258553486356e-05, + "loss": 0.3608, + "step": 4260000 + }, + { + "epoch": 28.83079796448679, + "grad_norm": 0.3942200541496277, + "learning_rate": 4.7116920203551325e-05, + "loss": 0.3613, + "step": 4260500 + }, + { + "epoch": 28.834181463837158, + "grad_norm": 0.3761383295059204, + "learning_rate": 4.711658185361629e-05, + "loss": 0.3613, + "step": 4261000 + }, + { + "epoch": 28.83756496318753, + "grad_norm": 0.3999760150909424, + "learning_rate": 4.711624350368125e-05, + "loss": 0.3618, + "step": 4261500 + }, + { + "epoch": 28.840948462537895, + "grad_norm": 0.36837732791900635, + "learning_rate": 4.711590515374621e-05, + "loss": 0.3629, + "step": 4262000 + }, + { + "epoch": 28.844331961888262, + "grad_norm": 0.3617350459098816, + "learning_rate": 4.711556680381118e-05, + "loss": 0.361, + "step": 4262500 + }, + { + "epoch": 28.847715461238632, + "grad_norm": 0.3469606339931488, + "learning_rate": 4.7115228453876136e-05, + "loss": 0.3617, + "step": 4263000 + }, + { + "epoch": 28.851098960589, + "grad_norm": 0.3613704442977905, + "learning_rate": 4.71148901039411e-05, + "loss": 0.3617, + "step": 4263500 + }, + { + "epoch": 28.85448245993937, + "grad_norm": 0.40201789140701294, + "learning_rate": 4.711455175400607e-05, + "loss": 0.361, + "step": 4264000 + }, + { + "epoch": 28.857865959289736, + "grad_norm": 0.44502681493759155, + "learning_rate": 4.711421340407103e-05, + "loss": 0.3612, + "step": 4264500 + }, + { + "epoch": 28.861249458640103, + "grad_norm": 0.34542232751846313, + "learning_rate": 4.711387505413599e-05, + "loss": 0.3618, + "step": 4265000 + }, + { + "epoch": 28.864632957990473, + "grad_norm": 0.3687632381916046, + "learning_rate": 4.711353670420095e-05, + "loss": 0.3617, + "step": 4265500 + }, + { + "epoch": 28.86801645734084, + "grad_norm": 0.3720606565475464, + "learning_rate": 4.711319835426592e-05, + "loss": 0.3617, + "step": 4266000 + }, + { + "epoch": 28.871399956691207, + "grad_norm": 0.35069751739501953, + "learning_rate": 4.7112860004330884e-05, + "loss": 0.3612, + "step": 4266500 + }, + { + "epoch": 28.874783456041577, + "grad_norm": 0.36152008175849915, + "learning_rate": 4.7112521654395846e-05, + "loss": 0.3615, + "step": 4267000 + }, + { + "epoch": 28.878166955391944, + "grad_norm": 0.3989510238170624, + "learning_rate": 4.711218330446081e-05, + "loss": 0.361, + "step": 4267500 + }, + { + "epoch": 28.881550454742314, + "grad_norm": 0.41541945934295654, + "learning_rate": 4.711184495452577e-05, + "loss": 0.3612, + "step": 4268000 + }, + { + "epoch": 28.88493395409268, + "grad_norm": 0.39647066593170166, + "learning_rate": 4.711150660459073e-05, + "loss": 0.3614, + "step": 4268500 + }, + { + "epoch": 28.888317453443047, + "grad_norm": 0.35431623458862305, + "learning_rate": 4.7111168254655695e-05, + "loss": 0.3627, + "step": 4269000 + }, + { + "epoch": 28.891700952793418, + "grad_norm": 0.3839551508426666, + "learning_rate": 4.711082990472066e-05, + "loss": 0.3601, + "step": 4269500 + }, + { + "epoch": 28.895084452143784, + "grad_norm": 0.3508188724517822, + "learning_rate": 4.7110491554785626e-05, + "loss": 0.3594, + "step": 4270000 + }, + { + "epoch": 28.898467951494155, + "grad_norm": 0.36134690046310425, + "learning_rate": 4.711015320485059e-05, + "loss": 0.3614, + "step": 4270500 + }, + { + "epoch": 28.90185145084452, + "grad_norm": 0.36609122157096863, + "learning_rate": 4.710981485491555e-05, + "loss": 0.361, + "step": 4271000 + }, + { + "epoch": 28.905234950194888, + "grad_norm": 0.3730999231338501, + "learning_rate": 4.710947650498051e-05, + "loss": 0.3617, + "step": 4271500 + }, + { + "epoch": 28.90861844954526, + "grad_norm": 0.41123491525650024, + "learning_rate": 4.710913815504548e-05, + "loss": 0.3614, + "step": 4272000 + }, + { + "epoch": 28.912001948895625, + "grad_norm": 0.3542698323726654, + "learning_rate": 4.7108799805110437e-05, + "loss": 0.3612, + "step": 4272500 + }, + { + "epoch": 28.915385448245996, + "grad_norm": 0.4087660014629364, + "learning_rate": 4.71084614551754e-05, + "loss": 0.3608, + "step": 4273000 + }, + { + "epoch": 28.918768947596362, + "grad_norm": 0.37405553460121155, + "learning_rate": 4.710812310524037e-05, + "loss": 0.3616, + "step": 4273500 + }, + { + "epoch": 28.92215244694673, + "grad_norm": 0.3734908401966095, + "learning_rate": 4.710778475530533e-05, + "loss": 0.3616, + "step": 4274000 + }, + { + "epoch": 28.9255359462971, + "grad_norm": 0.37422364950180054, + "learning_rate": 4.710744640537029e-05, + "loss": 0.3629, + "step": 4274500 + }, + { + "epoch": 28.928919445647466, + "grad_norm": 0.3265927731990814, + "learning_rate": 4.7107108055435254e-05, + "loss": 0.3601, + "step": 4275000 + }, + { + "epoch": 28.932302944997836, + "grad_norm": 0.3842279314994812, + "learning_rate": 4.710676970550022e-05, + "loss": 0.3611, + "step": 4275500 + }, + { + "epoch": 28.935686444348203, + "grad_norm": 0.3847258388996124, + "learning_rate": 4.7106431355565185e-05, + "loss": 0.3616, + "step": 4276000 + }, + { + "epoch": 28.93906994369857, + "grad_norm": 0.4233771860599518, + "learning_rate": 4.710609300563015e-05, + "loss": 0.3609, + "step": 4276500 + }, + { + "epoch": 28.94245344304894, + "grad_norm": 0.38106000423431396, + "learning_rate": 4.71057546556951e-05, + "loss": 0.3624, + "step": 4277000 + }, + { + "epoch": 28.945836942399307, + "grad_norm": 0.3772009015083313, + "learning_rate": 4.710541630576007e-05, + "loss": 0.3609, + "step": 4277500 + }, + { + "epoch": 28.949220441749674, + "grad_norm": 0.34100696444511414, + "learning_rate": 4.7105077955825033e-05, + "loss": 0.362, + "step": 4278000 + }, + { + "epoch": 28.952603941100044, + "grad_norm": 0.3932490646839142, + "learning_rate": 4.7104739605889996e-05, + "loss": 0.3628, + "step": 4278500 + }, + { + "epoch": 28.95598744045041, + "grad_norm": 0.36704206466674805, + "learning_rate": 4.710440125595496e-05, + "loss": 0.3617, + "step": 4279000 + }, + { + "epoch": 28.95937093980078, + "grad_norm": 0.393768846988678, + "learning_rate": 4.7104062906019927e-05, + "loss": 0.3609, + "step": 4279500 + }, + { + "epoch": 28.962754439151148, + "grad_norm": 0.40044960379600525, + "learning_rate": 4.710372455608489e-05, + "loss": 0.3613, + "step": 4280000 + }, + { + "epoch": 28.966137938501515, + "grad_norm": 0.37255147099494934, + "learning_rate": 4.710338620614985e-05, + "loss": 0.3619, + "step": 4280500 + }, + { + "epoch": 28.969521437851885, + "grad_norm": 0.4047803580760956, + "learning_rate": 4.710304785621481e-05, + "loss": 0.3612, + "step": 4281000 + }, + { + "epoch": 28.97290493720225, + "grad_norm": 0.3823975920677185, + "learning_rate": 4.710270950627978e-05, + "loss": 0.3605, + "step": 4281500 + }, + { + "epoch": 28.976288436552622, + "grad_norm": 0.4176645874977112, + "learning_rate": 4.710237115634474e-05, + "loss": 0.3621, + "step": 4282000 + }, + { + "epoch": 28.97967193590299, + "grad_norm": 0.3672531843185425, + "learning_rate": 4.71020328064097e-05, + "loss": 0.3615, + "step": 4282500 + }, + { + "epoch": 28.983055435253355, + "grad_norm": 0.36267173290252686, + "learning_rate": 4.710169445647467e-05, + "loss": 0.3616, + "step": 4283000 + }, + { + "epoch": 28.986438934603726, + "grad_norm": 0.3600069582462311, + "learning_rate": 4.710135610653963e-05, + "loss": 0.3616, + "step": 4283500 + }, + { + "epoch": 28.989822433954092, + "grad_norm": 0.3569662272930145, + "learning_rate": 4.710101775660459e-05, + "loss": 0.3609, + "step": 4284000 + }, + { + "epoch": 28.99320593330446, + "grad_norm": 0.3485254645347595, + "learning_rate": 4.7100679406669555e-05, + "loss": 0.3608, + "step": 4284500 + }, + { + "epoch": 28.99658943265483, + "grad_norm": 0.4217069149017334, + "learning_rate": 4.7100341056734523e-05, + "loss": 0.362, + "step": 4285000 + }, + { + "epoch": 28.999972932005196, + "grad_norm": 0.36723482608795166, + "learning_rate": 4.7100002706799486e-05, + "loss": 0.3631, + "step": 4285500 + }, + { + "epoch": 29.0, + "eval_accuracy": 0.8623626838338284, + "eval_loss": 0.5590023398399353, + "eval_runtime": 3353.1542, + "eval_samples_per_second": 86.708, + "eval_steps_per_second": 5.419, + "step": 4285504 + }, + { + "epoch": 29.003356431355567, + "grad_norm": 0.37278684973716736, + "learning_rate": 4.709966435686445e-05, + "loss": 0.3605, + "step": 4286000 + }, + { + "epoch": 29.006739930705933, + "grad_norm": 0.3531095087528229, + "learning_rate": 4.70993260069294e-05, + "loss": 0.3574, + "step": 4286500 + }, + { + "epoch": 29.0101234300563, + "grad_norm": 0.35951194167137146, + "learning_rate": 4.709898765699437e-05, + "loss": 0.3575, + "step": 4287000 + }, + { + "epoch": 29.01350692940667, + "grad_norm": 0.4008044898509979, + "learning_rate": 4.7098649307059334e-05, + "loss": 0.36, + "step": 4287500 + }, + { + "epoch": 29.016890428757037, + "grad_norm": 0.34333372116088867, + "learning_rate": 4.7098310957124296e-05, + "loss": 0.3602, + "step": 4288000 + }, + { + "epoch": 29.020273928107407, + "grad_norm": 0.34748372435569763, + "learning_rate": 4.709797260718926e-05, + "loss": 0.3594, + "step": 4288500 + }, + { + "epoch": 29.023657427457774, + "grad_norm": 0.3687286972999573, + "learning_rate": 4.709763425725423e-05, + "loss": 0.3606, + "step": 4289000 + }, + { + "epoch": 29.02704092680814, + "grad_norm": 0.37696537375450134, + "learning_rate": 4.709729590731919e-05, + "loss": 0.3597, + "step": 4289500 + }, + { + "epoch": 29.03042442615851, + "grad_norm": 0.3552592992782593, + "learning_rate": 4.709695755738415e-05, + "loss": 0.36, + "step": 4290000 + }, + { + "epoch": 29.033807925508878, + "grad_norm": 0.3937763571739197, + "learning_rate": 4.7096619207449114e-05, + "loss": 0.3604, + "step": 4290500 + }, + { + "epoch": 29.037191424859245, + "grad_norm": 0.41418910026550293, + "learning_rate": 4.709628085751408e-05, + "loss": 0.3599, + "step": 4291000 + }, + { + "epoch": 29.040574924209615, + "grad_norm": 0.3835686147212982, + "learning_rate": 4.709594250757904e-05, + "loss": 0.3587, + "step": 4291500 + }, + { + "epoch": 29.04395842355998, + "grad_norm": 0.36630573868751526, + "learning_rate": 4.7095604157644e-05, + "loss": 0.3597, + "step": 4292000 + }, + { + "epoch": 29.047341922910352, + "grad_norm": 0.32699382305145264, + "learning_rate": 4.709526580770897e-05, + "loss": 0.3601, + "step": 4292500 + }, + { + "epoch": 29.05072542226072, + "grad_norm": 0.37527844309806824, + "learning_rate": 4.709492745777393e-05, + "loss": 0.3595, + "step": 4293000 + }, + { + "epoch": 29.054108921611085, + "grad_norm": 0.39489516615867615, + "learning_rate": 4.709458910783889e-05, + "loss": 0.3607, + "step": 4293500 + }, + { + "epoch": 29.057492420961456, + "grad_norm": 0.3843926191329956, + "learning_rate": 4.7094250757903855e-05, + "loss": 0.3596, + "step": 4294000 + }, + { + "epoch": 29.060875920311823, + "grad_norm": 0.3965241014957428, + "learning_rate": 4.7093912407968824e-05, + "loss": 0.3581, + "step": 4294500 + }, + { + "epoch": 29.064259419662193, + "grad_norm": 0.393311083316803, + "learning_rate": 4.7093574058033786e-05, + "loss": 0.3608, + "step": 4295000 + }, + { + "epoch": 29.06764291901256, + "grad_norm": 0.40740710496902466, + "learning_rate": 4.709323570809875e-05, + "loss": 0.3594, + "step": 4295500 + }, + { + "epoch": 29.071026418362926, + "grad_norm": 0.40089091658592224, + "learning_rate": 4.7092897358163704e-05, + "loss": 0.3596, + "step": 4296000 + }, + { + "epoch": 29.074409917713297, + "grad_norm": 0.37659919261932373, + "learning_rate": 4.709255900822867e-05, + "loss": 0.359, + "step": 4296500 + }, + { + "epoch": 29.077793417063663, + "grad_norm": 0.3679037094116211, + "learning_rate": 4.7092220658293635e-05, + "loss": 0.3592, + "step": 4297000 + }, + { + "epoch": 29.081176916414034, + "grad_norm": 0.3677593767642975, + "learning_rate": 4.70918823083586e-05, + "loss": 0.3608, + "step": 4297500 + }, + { + "epoch": 29.0845604157644, + "grad_norm": 0.3663621246814728, + "learning_rate": 4.709154395842356e-05, + "loss": 0.3593, + "step": 4298000 + }, + { + "epoch": 29.087943915114767, + "grad_norm": 0.39356285333633423, + "learning_rate": 4.709120560848853e-05, + "loss": 0.3602, + "step": 4298500 + }, + { + "epoch": 29.091327414465137, + "grad_norm": 0.3937978744506836, + "learning_rate": 4.709086725855349e-05, + "loss": 0.3601, + "step": 4299000 + }, + { + "epoch": 29.094710913815504, + "grad_norm": 0.36316731572151184, + "learning_rate": 4.709052890861845e-05, + "loss": 0.3599, + "step": 4299500 + }, + { + "epoch": 29.09809441316587, + "grad_norm": 0.38304466009140015, + "learning_rate": 4.7090190558683414e-05, + "loss": 0.3593, + "step": 4300000 + }, + { + "epoch": 29.10147791251624, + "grad_norm": 0.35874873399734497, + "learning_rate": 4.708985220874838e-05, + "loss": 0.3598, + "step": 4300500 + }, + { + "epoch": 29.104861411866608, + "grad_norm": 0.402009516954422, + "learning_rate": 4.708951385881334e-05, + "loss": 0.3611, + "step": 4301000 + }, + { + "epoch": 29.10824491121698, + "grad_norm": 0.39570045471191406, + "learning_rate": 4.70891755088783e-05, + "loss": 0.3613, + "step": 4301500 + }, + { + "epoch": 29.111628410567345, + "grad_norm": 0.363852858543396, + "learning_rate": 4.708883715894327e-05, + "loss": 0.361, + "step": 4302000 + }, + { + "epoch": 29.115011909917712, + "grad_norm": 0.3463621437549591, + "learning_rate": 4.708849880900823e-05, + "loss": 0.362, + "step": 4302500 + }, + { + "epoch": 29.118395409268082, + "grad_norm": 0.3559868335723877, + "learning_rate": 4.7088160459073194e-05, + "loss": 0.3604, + "step": 4303000 + }, + { + "epoch": 29.12177890861845, + "grad_norm": 0.4086800813674927, + "learning_rate": 4.7087822109138156e-05, + "loss": 0.3598, + "step": 4303500 + }, + { + "epoch": 29.12516240796882, + "grad_norm": 0.3575659692287445, + "learning_rate": 4.7087483759203125e-05, + "loss": 0.3619, + "step": 4304000 + }, + { + "epoch": 29.128545907319186, + "grad_norm": 0.36323443055152893, + "learning_rate": 4.708714540926809e-05, + "loss": 0.3604, + "step": 4304500 + }, + { + "epoch": 29.131929406669553, + "grad_norm": 0.32497456669807434, + "learning_rate": 4.708680705933305e-05, + "loss": 0.361, + "step": 4305000 + }, + { + "epoch": 29.135312906019923, + "grad_norm": 0.4088630974292755, + "learning_rate": 4.708646870939801e-05, + "loss": 0.3608, + "step": 4305500 + }, + { + "epoch": 29.13869640537029, + "grad_norm": 0.3804304301738739, + "learning_rate": 4.708613035946297e-05, + "loss": 0.3601, + "step": 4306000 + }, + { + "epoch": 29.14207990472066, + "grad_norm": 0.420848548412323, + "learning_rate": 4.7085792009527935e-05, + "loss": 0.36, + "step": 4306500 + }, + { + "epoch": 29.145463404071027, + "grad_norm": 0.3965778648853302, + "learning_rate": 4.70854536595929e-05, + "loss": 0.3608, + "step": 4307000 + }, + { + "epoch": 29.148846903421394, + "grad_norm": 0.3896230161190033, + "learning_rate": 4.708511530965786e-05, + "loss": 0.3583, + "step": 4307500 + }, + { + "epoch": 29.152230402771764, + "grad_norm": 0.4677799344062805, + "learning_rate": 4.708477695972283e-05, + "loss": 0.3619, + "step": 4308000 + }, + { + "epoch": 29.15561390212213, + "grad_norm": 0.38909775018692017, + "learning_rate": 4.708443860978779e-05, + "loss": 0.3618, + "step": 4308500 + }, + { + "epoch": 29.158997401472497, + "grad_norm": 0.37043771147727966, + "learning_rate": 4.708410025985275e-05, + "loss": 0.3622, + "step": 4309000 + }, + { + "epoch": 29.162380900822868, + "grad_norm": 0.3786575496196747, + "learning_rate": 4.7083761909917715e-05, + "loss": 0.3612, + "step": 4309500 + }, + { + "epoch": 29.165764400173234, + "grad_norm": 0.3752582371234894, + "learning_rate": 4.7083423559982684e-05, + "loss": 0.3618, + "step": 4310000 + }, + { + "epoch": 29.169147899523605, + "grad_norm": 0.37404459714889526, + "learning_rate": 4.708308521004764e-05, + "loss": 0.359, + "step": 4310500 + }, + { + "epoch": 29.17253139887397, + "grad_norm": 0.37942689657211304, + "learning_rate": 4.70827468601126e-05, + "loss": 0.3612, + "step": 4311000 + }, + { + "epoch": 29.175914898224338, + "grad_norm": 0.3892790973186493, + "learning_rate": 4.708240851017757e-05, + "loss": 0.3616, + "step": 4311500 + }, + { + "epoch": 29.17929839757471, + "grad_norm": 0.38474565744400024, + "learning_rate": 4.708207016024253e-05, + "loss": 0.3597, + "step": 4312000 + }, + { + "epoch": 29.182681896925075, + "grad_norm": 0.3569866418838501, + "learning_rate": 4.7081731810307494e-05, + "loss": 0.36, + "step": 4312500 + }, + { + "epoch": 29.186065396275445, + "grad_norm": 0.41088053584098816, + "learning_rate": 4.7081393460372456e-05, + "loss": 0.3602, + "step": 4313000 + }, + { + "epoch": 29.189448895625812, + "grad_norm": 0.3667789697647095, + "learning_rate": 4.7081055110437425e-05, + "loss": 0.361, + "step": 4313500 + }, + { + "epoch": 29.19283239497618, + "grad_norm": 0.34484750032424927, + "learning_rate": 4.708071676050239e-05, + "loss": 0.36, + "step": 4314000 + }, + { + "epoch": 29.19621589432655, + "grad_norm": 0.3753925859928131, + "learning_rate": 4.708037841056735e-05, + "loss": 0.3593, + "step": 4314500 + }, + { + "epoch": 29.199599393676916, + "grad_norm": 0.3755471408367157, + "learning_rate": 4.708004006063231e-05, + "loss": 0.3607, + "step": 4315000 + }, + { + "epoch": 29.202982893027283, + "grad_norm": 0.3804199993610382, + "learning_rate": 4.7079701710697274e-05, + "loss": 0.362, + "step": 4315500 + }, + { + "epoch": 29.206366392377653, + "grad_norm": 0.4020286798477173, + "learning_rate": 4.7079363360762236e-05, + "loss": 0.3618, + "step": 4316000 + }, + { + "epoch": 29.20974989172802, + "grad_norm": 0.4113852381706238, + "learning_rate": 4.70790250108272e-05, + "loss": 0.3602, + "step": 4316500 + }, + { + "epoch": 29.21313339107839, + "grad_norm": 0.3878616690635681, + "learning_rate": 4.707868666089216e-05, + "loss": 0.3618, + "step": 4317000 + }, + { + "epoch": 29.216516890428757, + "grad_norm": 0.4221148192882538, + "learning_rate": 4.707834831095713e-05, + "loss": 0.3599, + "step": 4317500 + }, + { + "epoch": 29.219900389779124, + "grad_norm": 0.3535100817680359, + "learning_rate": 4.707800996102209e-05, + "loss": 0.3613, + "step": 4318000 + }, + { + "epoch": 29.223283889129494, + "grad_norm": 0.4416511356830597, + "learning_rate": 4.707767161108705e-05, + "loss": 0.3613, + "step": 4318500 + }, + { + "epoch": 29.22666738847986, + "grad_norm": 0.3685159981250763, + "learning_rate": 4.7077333261152015e-05, + "loss": 0.3607, + "step": 4319000 + }, + { + "epoch": 29.23005088783023, + "grad_norm": 0.3986661732196808, + "learning_rate": 4.7076994911216984e-05, + "loss": 0.3607, + "step": 4319500 + }, + { + "epoch": 29.233434387180598, + "grad_norm": 0.3640308976173401, + "learning_rate": 4.707665656128194e-05, + "loss": 0.3625, + "step": 4320000 + }, + { + "epoch": 29.236817886530964, + "grad_norm": 0.36600756645202637, + "learning_rate": 4.70763182113469e-05, + "loss": 0.3593, + "step": 4320500 + }, + { + "epoch": 29.240201385881335, + "grad_norm": 0.37366312742233276, + "learning_rate": 4.707597986141187e-05, + "loss": 0.3609, + "step": 4321000 + }, + { + "epoch": 29.2435848852317, + "grad_norm": 0.38794785737991333, + "learning_rate": 4.707564151147683e-05, + "loss": 0.3605, + "step": 4321500 + }, + { + "epoch": 29.246968384582072, + "grad_norm": 0.3888173997402191, + "learning_rate": 4.7075303161541795e-05, + "loss": 0.3602, + "step": 4322000 + }, + { + "epoch": 29.25035188393244, + "grad_norm": 0.3312968313694, + "learning_rate": 4.707496481160676e-05, + "loss": 0.3605, + "step": 4322500 + }, + { + "epoch": 29.253735383282805, + "grad_norm": 0.38750284910202026, + "learning_rate": 4.707462646167172e-05, + "loss": 0.3611, + "step": 4323000 + }, + { + "epoch": 29.257118882633176, + "grad_norm": 0.3781498670578003, + "learning_rate": 4.707428811173669e-05, + "loss": 0.3614, + "step": 4323500 + }, + { + "epoch": 29.260502381983542, + "grad_norm": 0.375742107629776, + "learning_rate": 4.707394976180165e-05, + "loss": 0.3607, + "step": 4324000 + }, + { + "epoch": 29.26388588133391, + "grad_norm": 0.3669828176498413, + "learning_rate": 4.707361141186661e-05, + "loss": 0.3608, + "step": 4324500 + }, + { + "epoch": 29.26726938068428, + "grad_norm": 0.3698820173740387, + "learning_rate": 4.7073273061931574e-05, + "loss": 0.36, + "step": 4325000 + }, + { + "epoch": 29.270652880034646, + "grad_norm": 0.33423253893852234, + "learning_rate": 4.7072934711996537e-05, + "loss": 0.3597, + "step": 4325500 + }, + { + "epoch": 29.274036379385016, + "grad_norm": 0.3763883113861084, + "learning_rate": 4.70725963620615e-05, + "loss": 0.36, + "step": 4326000 + }, + { + "epoch": 29.277419878735383, + "grad_norm": 0.3575267195701599, + "learning_rate": 4.707225801212646e-05, + "loss": 0.3584, + "step": 4326500 + }, + { + "epoch": 29.28080337808575, + "grad_norm": 0.39500075578689575, + "learning_rate": 4.707191966219143e-05, + "loss": 0.3601, + "step": 4327000 + }, + { + "epoch": 29.28418687743612, + "grad_norm": 0.46322932839393616, + "learning_rate": 4.707158131225639e-05, + "loss": 0.3611, + "step": 4327500 + }, + { + "epoch": 29.287570376786487, + "grad_norm": 0.4118308126926422, + "learning_rate": 4.7071242962321354e-05, + "loss": 0.3608, + "step": 4328000 + }, + { + "epoch": 29.290953876136857, + "grad_norm": 0.3777792751789093, + "learning_rate": 4.7070904612386316e-05, + "loss": 0.3606, + "step": 4328500 + }, + { + "epoch": 29.294337375487224, + "grad_norm": 0.35416123270988464, + "learning_rate": 4.7070566262451285e-05, + "loss": 0.3616, + "step": 4329000 + }, + { + "epoch": 29.29772087483759, + "grad_norm": 0.35928085446357727, + "learning_rate": 4.707022791251624e-05, + "loss": 0.3603, + "step": 4329500 + }, + { + "epoch": 29.30110437418796, + "grad_norm": 0.3598766326904297, + "learning_rate": 4.70698895625812e-05, + "loss": 0.3604, + "step": 4330000 + }, + { + "epoch": 29.304487873538328, + "grad_norm": 0.3819045424461365, + "learning_rate": 4.706955121264617e-05, + "loss": 0.361, + "step": 4330500 + }, + { + "epoch": 29.307871372888698, + "grad_norm": 0.3496757447719574, + "learning_rate": 4.7069212862711133e-05, + "loss": 0.3598, + "step": 4331000 + }, + { + "epoch": 29.311254872239065, + "grad_norm": 0.37243950366973877, + "learning_rate": 4.7068874512776096e-05, + "loss": 0.3612, + "step": 4331500 + }, + { + "epoch": 29.31463837158943, + "grad_norm": 0.3167635202407837, + "learning_rate": 4.706853616284106e-05, + "loss": 0.3609, + "step": 4332000 + }, + { + "epoch": 29.318021870939802, + "grad_norm": 0.3345939815044403, + "learning_rate": 4.706819781290602e-05, + "loss": 0.3601, + "step": 4332500 + }, + { + "epoch": 29.32140537029017, + "grad_norm": 0.3902365565299988, + "learning_rate": 4.706785946297099e-05, + "loss": 0.3621, + "step": 4333000 + }, + { + "epoch": 29.324788869640535, + "grad_norm": 0.38333427906036377, + "learning_rate": 4.706752111303595e-05, + "loss": 0.3606, + "step": 4333500 + }, + { + "epoch": 29.328172368990906, + "grad_norm": 0.34627583622932434, + "learning_rate": 4.706718276310091e-05, + "loss": 0.3609, + "step": 4334000 + }, + { + "epoch": 29.331555868341272, + "grad_norm": 0.35351866483688354, + "learning_rate": 4.7066844413165875e-05, + "loss": 0.3615, + "step": 4334500 + }, + { + "epoch": 29.334939367691643, + "grad_norm": 0.4443703889846802, + "learning_rate": 4.706650606323084e-05, + "loss": 0.3614, + "step": 4335000 + }, + { + "epoch": 29.33832286704201, + "grad_norm": 0.392807275056839, + "learning_rate": 4.70661677132958e-05, + "loss": 0.3603, + "step": 4335500 + }, + { + "epoch": 29.341706366392376, + "grad_norm": 0.39465057849884033, + "learning_rate": 4.706582936336076e-05, + "loss": 0.3621, + "step": 4336000 + }, + { + "epoch": 29.345089865742747, + "grad_norm": 0.37733396887779236, + "learning_rate": 4.706549101342573e-05, + "loss": 0.3625, + "step": 4336500 + }, + { + "epoch": 29.348473365093113, + "grad_norm": 0.3787976801395416, + "learning_rate": 4.706515266349069e-05, + "loss": 0.3625, + "step": 4337000 + }, + { + "epoch": 29.351856864443484, + "grad_norm": 0.36785629391670227, + "learning_rate": 4.7064814313555655e-05, + "loss": 0.3599, + "step": 4337500 + }, + { + "epoch": 29.35524036379385, + "grad_norm": 0.40150776505470276, + "learning_rate": 4.706447596362062e-05, + "loss": 0.3614, + "step": 4338000 + }, + { + "epoch": 29.358623863144217, + "grad_norm": 0.34027591347694397, + "learning_rate": 4.7064137613685586e-05, + "loss": 0.3612, + "step": 4338500 + }, + { + "epoch": 29.362007362494587, + "grad_norm": 0.41849809885025024, + "learning_rate": 4.706379926375054e-05, + "loss": 0.3602, + "step": 4339000 + }, + { + "epoch": 29.365390861844954, + "grad_norm": 0.3601183593273163, + "learning_rate": 4.70634609138155e-05, + "loss": 0.3608, + "step": 4339500 + }, + { + "epoch": 29.36877436119532, + "grad_norm": 0.36231401562690735, + "learning_rate": 4.7063122563880465e-05, + "loss": 0.3608, + "step": 4340000 + }, + { + "epoch": 29.37215786054569, + "grad_norm": 0.38266411423683167, + "learning_rate": 4.7062784213945434e-05, + "loss": 0.3604, + "step": 4340500 + }, + { + "epoch": 29.375541359896058, + "grad_norm": 0.4335971772670746, + "learning_rate": 4.7062445864010396e-05, + "loss": 0.3599, + "step": 4341000 + }, + { + "epoch": 29.37892485924643, + "grad_norm": 0.3665923774242401, + "learning_rate": 4.706210751407536e-05, + "loss": 0.3608, + "step": 4341500 + }, + { + "epoch": 29.382308358596795, + "grad_norm": 0.4152335226535797, + "learning_rate": 4.706176916414032e-05, + "loss": 0.3601, + "step": 4342000 + }, + { + "epoch": 29.38569185794716, + "grad_norm": 0.3795805871486664, + "learning_rate": 4.706143081420529e-05, + "loss": 0.361, + "step": 4342500 + }, + { + "epoch": 29.389075357297532, + "grad_norm": 0.3984297811985016, + "learning_rate": 4.706109246427025e-05, + "loss": 0.3607, + "step": 4343000 + }, + { + "epoch": 29.3924588566479, + "grad_norm": 0.39234739542007446, + "learning_rate": 4.7060754114335214e-05, + "loss": 0.361, + "step": 4343500 + }, + { + "epoch": 29.39584235599827, + "grad_norm": 0.40547654032707214, + "learning_rate": 4.7060415764400176e-05, + "loss": 0.3614, + "step": 4344000 + }, + { + "epoch": 29.399225855348636, + "grad_norm": 0.36906078457832336, + "learning_rate": 4.706007741446514e-05, + "loss": 0.3617, + "step": 4344500 + }, + { + "epoch": 29.402609354699003, + "grad_norm": 0.3373365104198456, + "learning_rate": 4.70597390645301e-05, + "loss": 0.3611, + "step": 4345000 + }, + { + "epoch": 29.405992854049373, + "grad_norm": 0.3218066096305847, + "learning_rate": 4.705940071459506e-05, + "loss": 0.3616, + "step": 4345500 + }, + { + "epoch": 29.40937635339974, + "grad_norm": 0.37889230251312256, + "learning_rate": 4.705906236466003e-05, + "loss": 0.3618, + "step": 4346000 + }, + { + "epoch": 29.41275985275011, + "grad_norm": 0.3492085039615631, + "learning_rate": 4.705872401472499e-05, + "loss": 0.3613, + "step": 4346500 + }, + { + "epoch": 29.416143352100477, + "grad_norm": 0.36156710982322693, + "learning_rate": 4.7058385664789955e-05, + "loss": 0.3589, + "step": 4347000 + }, + { + "epoch": 29.419526851450843, + "grad_norm": 0.3797200620174408, + "learning_rate": 4.705804731485492e-05, + "loss": 0.3608, + "step": 4347500 + }, + { + "epoch": 29.422910350801214, + "grad_norm": 0.3567017614841461, + "learning_rate": 4.7057708964919886e-05, + "loss": 0.3621, + "step": 4348000 + }, + { + "epoch": 29.42629385015158, + "grad_norm": 0.3498610258102417, + "learning_rate": 4.705737061498484e-05, + "loss": 0.36, + "step": 4348500 + }, + { + "epoch": 29.429677349501947, + "grad_norm": 0.376139760017395, + "learning_rate": 4.7057032265049804e-05, + "loss": 0.3596, + "step": 4349000 + }, + { + "epoch": 29.433060848852318, + "grad_norm": 0.37960341572761536, + "learning_rate": 4.7056693915114766e-05, + "loss": 0.3608, + "step": 4349500 + }, + { + "epoch": 29.436444348202684, + "grad_norm": 0.38942623138427734, + "learning_rate": 4.7056355565179735e-05, + "loss": 0.3614, + "step": 4350000 + }, + { + "epoch": 29.439827847553055, + "grad_norm": 0.38209375739097595, + "learning_rate": 4.70560172152447e-05, + "loss": 0.3608, + "step": 4350500 + }, + { + "epoch": 29.44321134690342, + "grad_norm": 0.41018059849739075, + "learning_rate": 4.705567886530966e-05, + "loss": 0.3622, + "step": 4351000 + }, + { + "epoch": 29.446594846253788, + "grad_norm": 0.39549022912979126, + "learning_rate": 4.705534051537462e-05, + "loss": 0.359, + "step": 4351500 + }, + { + "epoch": 29.44997834560416, + "grad_norm": 0.39096882939338684, + "learning_rate": 4.705500216543959e-05, + "loss": 0.3605, + "step": 4352000 + }, + { + "epoch": 29.453361844954525, + "grad_norm": 0.36013635993003845, + "learning_rate": 4.705466381550455e-05, + "loss": 0.3614, + "step": 4352500 + }, + { + "epoch": 29.456745344304895, + "grad_norm": 0.37079086899757385, + "learning_rate": 4.7054325465569514e-05, + "loss": 0.3598, + "step": 4353000 + }, + { + "epoch": 29.460128843655262, + "grad_norm": 0.3588239848613739, + "learning_rate": 4.7053987115634476e-05, + "loss": 0.3615, + "step": 4353500 + }, + { + "epoch": 29.46351234300563, + "grad_norm": 0.4153600335121155, + "learning_rate": 4.705364876569944e-05, + "loss": 0.3611, + "step": 4354000 + }, + { + "epoch": 29.466895842356, + "grad_norm": 0.35029229521751404, + "learning_rate": 4.70533104157644e-05, + "loss": 0.3609, + "step": 4354500 + }, + { + "epoch": 29.470279341706366, + "grad_norm": 0.39438584446907043, + "learning_rate": 4.705297206582936e-05, + "loss": 0.3603, + "step": 4355000 + }, + { + "epoch": 29.473662841056736, + "grad_norm": 0.35910433530807495, + "learning_rate": 4.705263371589433e-05, + "loss": 0.3599, + "step": 4355500 + }, + { + "epoch": 29.477046340407103, + "grad_norm": 0.3918461501598358, + "learning_rate": 4.7052295365959294e-05, + "loss": 0.3588, + "step": 4356000 + }, + { + "epoch": 29.48042983975747, + "grad_norm": 0.38227224349975586, + "learning_rate": 4.7051957016024256e-05, + "loss": 0.3596, + "step": 4356500 + }, + { + "epoch": 29.48381333910784, + "grad_norm": 0.3609776496887207, + "learning_rate": 4.705161866608922e-05, + "loss": 0.362, + "step": 4357000 + }, + { + "epoch": 29.487196838458207, + "grad_norm": 0.3666316270828247, + "learning_rate": 4.705128031615419e-05, + "loss": 0.3604, + "step": 4357500 + }, + { + "epoch": 29.490580337808574, + "grad_norm": 0.40913018584251404, + "learning_rate": 4.705094196621914e-05, + "loss": 0.3615, + "step": 4358000 + }, + { + "epoch": 29.493963837158944, + "grad_norm": 0.4122064709663391, + "learning_rate": 4.7050603616284104e-05, + "loss": 0.3604, + "step": 4358500 + }, + { + "epoch": 29.49734733650931, + "grad_norm": 0.3761943578720093, + "learning_rate": 4.7050265266349066e-05, + "loss": 0.3603, + "step": 4359000 + }, + { + "epoch": 29.50073083585968, + "grad_norm": 0.360890656709671, + "learning_rate": 4.7049926916414035e-05, + "loss": 0.3629, + "step": 4359500 + }, + { + "epoch": 29.504114335210048, + "grad_norm": 0.379151850938797, + "learning_rate": 4.7049588566479e-05, + "loss": 0.3621, + "step": 4360000 + }, + { + "epoch": 29.507497834560414, + "grad_norm": 0.3611099421977997, + "learning_rate": 4.704925021654396e-05, + "loss": 0.3605, + "step": 4360500 + }, + { + "epoch": 29.510881333910785, + "grad_norm": 0.3445838391780853, + "learning_rate": 4.704891186660892e-05, + "loss": 0.3615, + "step": 4361000 + }, + { + "epoch": 29.51426483326115, + "grad_norm": 0.38196510076522827, + "learning_rate": 4.704857351667389e-05, + "loss": 0.36, + "step": 4361500 + }, + { + "epoch": 29.51764833261152, + "grad_norm": 0.38769465684890747, + "learning_rate": 4.704823516673885e-05, + "loss": 0.3606, + "step": 4362000 + }, + { + "epoch": 29.52103183196189, + "grad_norm": 0.3828052878379822, + "learning_rate": 4.7047896816803815e-05, + "loss": 0.3621, + "step": 4362500 + }, + { + "epoch": 29.524415331312255, + "grad_norm": 0.40334826707839966, + "learning_rate": 4.704755846686878e-05, + "loss": 0.3623, + "step": 4363000 + }, + { + "epoch": 29.527798830662626, + "grad_norm": 0.38683021068573, + "learning_rate": 4.704722011693374e-05, + "loss": 0.359, + "step": 4363500 + }, + { + "epoch": 29.531182330012992, + "grad_norm": 0.37075749039649963, + "learning_rate": 4.70468817669987e-05, + "loss": 0.3612, + "step": 4364000 + }, + { + "epoch": 29.53456582936336, + "grad_norm": 0.3464714586734772, + "learning_rate": 4.704654341706366e-05, + "loss": 0.3618, + "step": 4364500 + }, + { + "epoch": 29.53794932871373, + "grad_norm": 0.3840615749359131, + "learning_rate": 4.704620506712863e-05, + "loss": 0.3609, + "step": 4365000 + }, + { + "epoch": 29.541332828064096, + "grad_norm": 0.3820098042488098, + "learning_rate": 4.7045866717193594e-05, + "loss": 0.3622, + "step": 4365500 + }, + { + "epoch": 29.544716327414466, + "grad_norm": 0.3600960075855255, + "learning_rate": 4.7045528367258556e-05, + "loss": 0.3596, + "step": 4366000 + }, + { + "epoch": 29.548099826764833, + "grad_norm": 0.37287211418151855, + "learning_rate": 4.704519001732352e-05, + "loss": 0.3618, + "step": 4366500 + }, + { + "epoch": 29.5514833261152, + "grad_norm": 0.368624746799469, + "learning_rate": 4.704485166738849e-05, + "loss": 0.3606, + "step": 4367000 + }, + { + "epoch": 29.55486682546557, + "grad_norm": 0.3669593632221222, + "learning_rate": 4.704451331745345e-05, + "loss": 0.3615, + "step": 4367500 + }, + { + "epoch": 29.558250324815937, + "grad_norm": 0.39893969893455505, + "learning_rate": 4.7044174967518405e-05, + "loss": 0.3618, + "step": 4368000 + }, + { + "epoch": 29.561633824166307, + "grad_norm": 0.36940744519233704, + "learning_rate": 4.704383661758337e-05, + "loss": 0.3612, + "step": 4368500 + }, + { + "epoch": 29.565017323516674, + "grad_norm": 0.35981062054634094, + "learning_rate": 4.7043498267648336e-05, + "loss": 0.3609, + "step": 4369000 + }, + { + "epoch": 29.56840082286704, + "grad_norm": 0.394185334444046, + "learning_rate": 4.70431599177133e-05, + "loss": 0.3604, + "step": 4369500 + }, + { + "epoch": 29.57178432221741, + "grad_norm": 0.35855773091316223, + "learning_rate": 4.704282156777826e-05, + "loss": 0.3594, + "step": 4370000 + }, + { + "epoch": 29.575167821567778, + "grad_norm": 0.3543767035007477, + "learning_rate": 4.704248321784322e-05, + "loss": 0.3615, + "step": 4370500 + }, + { + "epoch": 29.578551320918148, + "grad_norm": 0.3909660875797272, + "learning_rate": 4.704214486790819e-05, + "loss": 0.3593, + "step": 4371000 + }, + { + "epoch": 29.581934820268515, + "grad_norm": 0.39414045214653015, + "learning_rate": 4.704180651797315e-05, + "loss": 0.3599, + "step": 4371500 + }, + { + "epoch": 29.58531831961888, + "grad_norm": 0.36996832489967346, + "learning_rate": 4.7041468168038116e-05, + "loss": 0.3603, + "step": 4372000 + }, + { + "epoch": 29.588701818969252, + "grad_norm": 0.35564756393432617, + "learning_rate": 4.704112981810308e-05, + "loss": 0.3609, + "step": 4372500 + }, + { + "epoch": 29.59208531831962, + "grad_norm": 0.4340767562389374, + "learning_rate": 4.704079146816804e-05, + "loss": 0.3608, + "step": 4373000 + }, + { + "epoch": 29.595468817669985, + "grad_norm": 0.38144609332084656, + "learning_rate": 4.7040453118233e-05, + "loss": 0.3608, + "step": 4373500 + }, + { + "epoch": 29.598852317020356, + "grad_norm": 0.3498848080635071, + "learning_rate": 4.7040114768297964e-05, + "loss": 0.3624, + "step": 4374000 + }, + { + "epoch": 29.602235816370722, + "grad_norm": 0.4123530387878418, + "learning_rate": 4.703977641836293e-05, + "loss": 0.3612, + "step": 4374500 + }, + { + "epoch": 29.605619315721093, + "grad_norm": 0.3861576318740845, + "learning_rate": 4.7039438068427895e-05, + "loss": 0.3609, + "step": 4375000 + }, + { + "epoch": 29.60900281507146, + "grad_norm": 0.3890489339828491, + "learning_rate": 4.703909971849286e-05, + "loss": 0.3617, + "step": 4375500 + }, + { + "epoch": 29.612386314421826, + "grad_norm": 0.3864743709564209, + "learning_rate": 4.703876136855782e-05, + "loss": 0.3616, + "step": 4376000 + }, + { + "epoch": 29.615769813772197, + "grad_norm": 0.34650570154190063, + "learning_rate": 4.703842301862279e-05, + "loss": 0.361, + "step": 4376500 + }, + { + "epoch": 29.619153313122563, + "grad_norm": 0.39615970849990845, + "learning_rate": 4.703808466868775e-05, + "loss": 0.3614, + "step": 4377000 + }, + { + "epoch": 29.622536812472934, + "grad_norm": 0.4292544722557068, + "learning_rate": 4.7037746318752706e-05, + "loss": 0.3608, + "step": 4377500 + }, + { + "epoch": 29.6259203118233, + "grad_norm": 0.4034567177295685, + "learning_rate": 4.703740796881767e-05, + "loss": 0.3618, + "step": 4378000 + }, + { + "epoch": 29.629303811173667, + "grad_norm": 0.3960621953010559, + "learning_rate": 4.703706961888264e-05, + "loss": 0.3606, + "step": 4378500 + }, + { + "epoch": 29.632687310524037, + "grad_norm": 0.3851377069950104, + "learning_rate": 4.70367312689476e-05, + "loss": 0.3608, + "step": 4379000 + }, + { + "epoch": 29.636070809874404, + "grad_norm": 0.32946351170539856, + "learning_rate": 4.703639291901256e-05, + "loss": 0.361, + "step": 4379500 + }, + { + "epoch": 29.639454309224774, + "grad_norm": 0.39054372906684875, + "learning_rate": 4.703605456907752e-05, + "loss": 0.3621, + "step": 4380000 + }, + { + "epoch": 29.64283780857514, + "grad_norm": 0.39904502034187317, + "learning_rate": 4.703571621914249e-05, + "loss": 0.3626, + "step": 4380500 + }, + { + "epoch": 29.646221307925508, + "grad_norm": 0.3671392798423767, + "learning_rate": 4.7035377869207454e-05, + "loss": 0.3602, + "step": 4381000 + }, + { + "epoch": 29.649604807275878, + "grad_norm": 0.3751044273376465, + "learning_rate": 4.7035039519272416e-05, + "loss": 0.3598, + "step": 4381500 + }, + { + "epoch": 29.652988306626245, + "grad_norm": 0.3547898828983307, + "learning_rate": 4.703470116933738e-05, + "loss": 0.3634, + "step": 4382000 + }, + { + "epoch": 29.65637180597661, + "grad_norm": 0.3591013252735138, + "learning_rate": 4.703436281940234e-05, + "loss": 0.3623, + "step": 4382500 + }, + { + "epoch": 29.659755305326982, + "grad_norm": 0.42844271659851074, + "learning_rate": 4.70340244694673e-05, + "loss": 0.361, + "step": 4383000 + }, + { + "epoch": 29.66313880467735, + "grad_norm": 0.3706211447715759, + "learning_rate": 4.7033686119532265e-05, + "loss": 0.3616, + "step": 4383500 + }, + { + "epoch": 29.66652230402772, + "grad_norm": 0.3800713121891022, + "learning_rate": 4.7033347769597234e-05, + "loss": 0.3604, + "step": 4384000 + }, + { + "epoch": 29.669905803378086, + "grad_norm": 0.35419294238090515, + "learning_rate": 4.7033009419662196e-05, + "loss": 0.3608, + "step": 4384500 + }, + { + "epoch": 29.673289302728453, + "grad_norm": 0.39697718620300293, + "learning_rate": 4.703267106972716e-05, + "loss": 0.3613, + "step": 4385000 + }, + { + "epoch": 29.676672802078823, + "grad_norm": 0.388007789850235, + "learning_rate": 4.703233271979212e-05, + "loss": 0.3597, + "step": 4385500 + }, + { + "epoch": 29.68005630142919, + "grad_norm": 0.38864538073539734, + "learning_rate": 4.703199436985708e-05, + "loss": 0.3604, + "step": 4386000 + }, + { + "epoch": 29.68343980077956, + "grad_norm": 0.36027100682258606, + "learning_rate": 4.703165601992205e-05, + "loss": 0.3632, + "step": 4386500 + }, + { + "epoch": 29.686823300129927, + "grad_norm": 0.39807599782943726, + "learning_rate": 4.7031317669987006e-05, + "loss": 0.3612, + "step": 4387000 + }, + { + "epoch": 29.690206799480293, + "grad_norm": 0.3704375922679901, + "learning_rate": 4.703097932005197e-05, + "loss": 0.3611, + "step": 4387500 + }, + { + "epoch": 29.693590298830664, + "grad_norm": 0.35683444142341614, + "learning_rate": 4.703064097011694e-05, + "loss": 0.3624, + "step": 4388000 + }, + { + "epoch": 29.69697379818103, + "grad_norm": 0.38248318433761597, + "learning_rate": 4.70303026201819e-05, + "loss": 0.3599, + "step": 4388500 + }, + { + "epoch": 29.700357297531397, + "grad_norm": 0.39218348264694214, + "learning_rate": 4.702996427024686e-05, + "loss": 0.3601, + "step": 4389000 + }, + { + "epoch": 29.703740796881767, + "grad_norm": 0.3762940764427185, + "learning_rate": 4.7029625920311824e-05, + "loss": 0.3618, + "step": 4389500 + }, + { + "epoch": 29.707124296232134, + "grad_norm": 0.3820953369140625, + "learning_rate": 4.702928757037679e-05, + "loss": 0.3598, + "step": 4390000 + }, + { + "epoch": 29.710507795582505, + "grad_norm": 0.38454100489616394, + "learning_rate": 4.7028949220441755e-05, + "loss": 0.3609, + "step": 4390500 + }, + { + "epoch": 29.71389129493287, + "grad_norm": 0.40086978673934937, + "learning_rate": 4.702861087050672e-05, + "loss": 0.3596, + "step": 4391000 + }, + { + "epoch": 29.717274794283238, + "grad_norm": 0.39777833223342896, + "learning_rate": 4.702827252057168e-05, + "loss": 0.3609, + "step": 4391500 + }, + { + "epoch": 29.72065829363361, + "grad_norm": 0.36414191126823425, + "learning_rate": 4.702793417063664e-05, + "loss": 0.3621, + "step": 4392000 + }, + { + "epoch": 29.724041792983975, + "grad_norm": 0.4004693031311035, + "learning_rate": 4.70275958207016e-05, + "loss": 0.3623, + "step": 4392500 + }, + { + "epoch": 29.727425292334345, + "grad_norm": 0.3868340253829956, + "learning_rate": 4.7027257470766565e-05, + "loss": 0.3606, + "step": 4393000 + }, + { + "epoch": 29.730808791684712, + "grad_norm": 0.3559049367904663, + "learning_rate": 4.702691912083153e-05, + "loss": 0.3602, + "step": 4393500 + }, + { + "epoch": 29.73419229103508, + "grad_norm": 0.3866305351257324, + "learning_rate": 4.7026580770896496e-05, + "loss": 0.3592, + "step": 4394000 + }, + { + "epoch": 29.73757579038545, + "grad_norm": 0.33717986941337585, + "learning_rate": 4.702624242096146e-05, + "loss": 0.3617, + "step": 4394500 + }, + { + "epoch": 29.740959289735816, + "grad_norm": 0.3839470148086548, + "learning_rate": 4.702590407102642e-05, + "loss": 0.3605, + "step": 4395000 + }, + { + "epoch": 29.744342789086183, + "grad_norm": 0.3991214334964752, + "learning_rate": 4.702556572109138e-05, + "loss": 0.3593, + "step": 4395500 + }, + { + "epoch": 29.747726288436553, + "grad_norm": 0.3859218955039978, + "learning_rate": 4.702522737115635e-05, + "loss": 0.3609, + "step": 4396000 + }, + { + "epoch": 29.75110978778692, + "grad_norm": 0.3510134220123291, + "learning_rate": 4.702488902122131e-05, + "loss": 0.362, + "step": 4396500 + }, + { + "epoch": 29.75449328713729, + "grad_norm": 0.38294655084609985, + "learning_rate": 4.702455067128627e-05, + "loss": 0.3613, + "step": 4397000 + }, + { + "epoch": 29.757876786487657, + "grad_norm": 0.3909126818180084, + "learning_rate": 4.702421232135124e-05, + "loss": 0.3618, + "step": 4397500 + }, + { + "epoch": 29.761260285838024, + "grad_norm": 0.36475950479507446, + "learning_rate": 4.70238739714162e-05, + "loss": 0.3612, + "step": 4398000 + }, + { + "epoch": 29.764643785188394, + "grad_norm": 0.38487258553504944, + "learning_rate": 4.702353562148116e-05, + "loss": 0.3617, + "step": 4398500 + }, + { + "epoch": 29.76802728453876, + "grad_norm": 0.35010942816734314, + "learning_rate": 4.7023197271546124e-05, + "loss": 0.3615, + "step": 4399000 + }, + { + "epoch": 29.77141078388913, + "grad_norm": 0.34493643045425415, + "learning_rate": 4.702285892161109e-05, + "loss": 0.3618, + "step": 4399500 + }, + { + "epoch": 29.774794283239498, + "grad_norm": 0.3537772297859192, + "learning_rate": 4.7022520571676055e-05, + "loss": 0.3614, + "step": 4400000 + }, + { + "epoch": 29.778177782589864, + "grad_norm": 0.4162846505641937, + "learning_rate": 4.702218222174102e-05, + "loss": 0.3609, + "step": 4400500 + }, + { + "epoch": 29.781561281940235, + "grad_norm": 0.40141090750694275, + "learning_rate": 4.702184387180598e-05, + "loss": 0.3613, + "step": 4401000 + }, + { + "epoch": 29.7849447812906, + "grad_norm": 0.3717232048511505, + "learning_rate": 4.702150552187094e-05, + "loss": 0.3596, + "step": 4401500 + }, + { + "epoch": 29.78832828064097, + "grad_norm": 0.3388756513595581, + "learning_rate": 4.7021167171935904e-05, + "loss": 0.3608, + "step": 4402000 + }, + { + "epoch": 29.79171177999134, + "grad_norm": 0.417555570602417, + "learning_rate": 4.7020828822000866e-05, + "loss": 0.3615, + "step": 4402500 + }, + { + "epoch": 29.795095279341705, + "grad_norm": 0.36066934466362, + "learning_rate": 4.702049047206583e-05, + "loss": 0.3629, + "step": 4403000 + }, + { + "epoch": 29.798478778692076, + "grad_norm": 0.3946530222892761, + "learning_rate": 4.70201521221308e-05, + "loss": 0.3619, + "step": 4403500 + }, + { + "epoch": 29.801862278042442, + "grad_norm": 0.3465481996536255, + "learning_rate": 4.701981377219576e-05, + "loss": 0.3609, + "step": 4404000 + }, + { + "epoch": 29.805245777392813, + "grad_norm": 0.34671127796173096, + "learning_rate": 4.701947542226072e-05, + "loss": 0.3605, + "step": 4404500 + }, + { + "epoch": 29.80862927674318, + "grad_norm": 0.3607783019542694, + "learning_rate": 4.701913707232568e-05, + "loss": 0.3611, + "step": 4405000 + }, + { + "epoch": 29.812012776093546, + "grad_norm": 0.3950929343700409, + "learning_rate": 4.701879872239065e-05, + "loss": 0.3623, + "step": 4405500 + }, + { + "epoch": 29.815396275443916, + "grad_norm": 0.3737722933292389, + "learning_rate": 4.701846037245561e-05, + "loss": 0.3616, + "step": 4406000 + }, + { + "epoch": 29.818779774794283, + "grad_norm": 0.4050480127334595, + "learning_rate": 4.701812202252057e-05, + "loss": 0.3619, + "step": 4406500 + }, + { + "epoch": 29.82216327414465, + "grad_norm": 0.3761073350906372, + "learning_rate": 4.701778367258554e-05, + "loss": 0.3601, + "step": 4407000 + }, + { + "epoch": 29.82554677349502, + "grad_norm": 0.38306209444999695, + "learning_rate": 4.70174453226505e-05, + "loss": 0.3612, + "step": 4407500 + }, + { + "epoch": 29.828930272845387, + "grad_norm": 0.392617404460907, + "learning_rate": 4.701710697271546e-05, + "loss": 0.3602, + "step": 4408000 + }, + { + "epoch": 29.832313772195757, + "grad_norm": 0.3812628388404846, + "learning_rate": 4.7016768622780425e-05, + "loss": 0.3628, + "step": 4408500 + }, + { + "epoch": 29.835697271546124, + "grad_norm": 0.3694405257701874, + "learning_rate": 4.7016430272845394e-05, + "loss": 0.3619, + "step": 4409000 + }, + { + "epoch": 29.83908077089649, + "grad_norm": 0.36880505084991455, + "learning_rate": 4.7016091922910356e-05, + "loss": 0.3605, + "step": 4409500 + }, + { + "epoch": 29.84246427024686, + "grad_norm": 0.37738537788391113, + "learning_rate": 4.701575357297532e-05, + "loss": 0.3635, + "step": 4410000 + }, + { + "epoch": 29.845847769597228, + "grad_norm": 0.3428693115711212, + "learning_rate": 4.701541522304027e-05, + "loss": 0.3616, + "step": 4410500 + }, + { + "epoch": 29.849231268947598, + "grad_norm": 0.34460675716400146, + "learning_rate": 4.701507687310524e-05, + "loss": 0.3632, + "step": 4411000 + }, + { + "epoch": 29.852614768297965, + "grad_norm": 0.35981813073158264, + "learning_rate": 4.7014738523170204e-05, + "loss": 0.3603, + "step": 4411500 + }, + { + "epoch": 29.85599826764833, + "grad_norm": 0.4211750030517578, + "learning_rate": 4.7014400173235167e-05, + "loss": 0.3619, + "step": 4412000 + }, + { + "epoch": 29.859381766998702, + "grad_norm": 0.40512290596961975, + "learning_rate": 4.701406182330013e-05, + "loss": 0.3609, + "step": 4412500 + }, + { + "epoch": 29.86276526634907, + "grad_norm": 0.34953683614730835, + "learning_rate": 4.70137234733651e-05, + "loss": 0.3609, + "step": 4413000 + }, + { + "epoch": 29.866148765699435, + "grad_norm": 0.38703393936157227, + "learning_rate": 4.701338512343006e-05, + "loss": 0.3621, + "step": 4413500 + }, + { + "epoch": 29.869532265049806, + "grad_norm": 0.35878366231918335, + "learning_rate": 4.701304677349502e-05, + "loss": 0.3622, + "step": 4414000 + }, + { + "epoch": 29.872915764400172, + "grad_norm": 0.37528976798057556, + "learning_rate": 4.7012708423559984e-05, + "loss": 0.3601, + "step": 4414500 + }, + { + "epoch": 29.876299263750543, + "grad_norm": 0.3553240895271301, + "learning_rate": 4.701237007362495e-05, + "loss": 0.3617, + "step": 4415000 + }, + { + "epoch": 29.87968276310091, + "grad_norm": 0.3715384304523468, + "learning_rate": 4.701203172368991e-05, + "loss": 0.3623, + "step": 4415500 + }, + { + "epoch": 29.883066262451276, + "grad_norm": 0.3395177721977234, + "learning_rate": 4.701169337375487e-05, + "loss": 0.3605, + "step": 4416000 + }, + { + "epoch": 29.886449761801646, + "grad_norm": 0.3491855561733246, + "learning_rate": 4.701135502381984e-05, + "loss": 0.3601, + "step": 4416500 + }, + { + "epoch": 29.889833261152013, + "grad_norm": 0.3426545560359955, + "learning_rate": 4.70110166738848e-05, + "loss": 0.3606, + "step": 4417000 + }, + { + "epoch": 29.893216760502384, + "grad_norm": 0.42065492272377014, + "learning_rate": 4.7010678323949763e-05, + "loss": 0.361, + "step": 4417500 + }, + { + "epoch": 29.89660025985275, + "grad_norm": 0.4317997694015503, + "learning_rate": 4.7010339974014726e-05, + "loss": 0.3613, + "step": 4418000 + }, + { + "epoch": 29.899983759203117, + "grad_norm": 0.42664942145347595, + "learning_rate": 4.7010001624079694e-05, + "loss": 0.3604, + "step": 4418500 + }, + { + "epoch": 29.903367258553487, + "grad_norm": 0.3807823359966278, + "learning_rate": 4.7009663274144657e-05, + "loss": 0.3613, + "step": 4419000 + }, + { + "epoch": 29.906750757903854, + "grad_norm": 0.3887644410133362, + "learning_rate": 4.700932492420962e-05, + "loss": 0.3608, + "step": 4419500 + }, + { + "epoch": 29.91013425725422, + "grad_norm": 0.3543424606323242, + "learning_rate": 4.700898657427458e-05, + "loss": 0.3617, + "step": 4420000 + }, + { + "epoch": 29.91351775660459, + "grad_norm": 0.4223953187465668, + "learning_rate": 4.700864822433954e-05, + "loss": 0.3611, + "step": 4420500 + }, + { + "epoch": 29.916901255954958, + "grad_norm": 0.33534178137779236, + "learning_rate": 4.7008309874404505e-05, + "loss": 0.3606, + "step": 4421000 + }, + { + "epoch": 29.920284755305328, + "grad_norm": 0.38698020577430725, + "learning_rate": 4.700797152446947e-05, + "loss": 0.3597, + "step": 4421500 + }, + { + "epoch": 29.923668254655695, + "grad_norm": 0.39192628860473633, + "learning_rate": 4.700763317453443e-05, + "loss": 0.3617, + "step": 4422000 + }, + { + "epoch": 29.92705175400606, + "grad_norm": 0.4152633249759674, + "learning_rate": 4.70072948245994e-05, + "loss": 0.3604, + "step": 4422500 + }, + { + "epoch": 29.930435253356432, + "grad_norm": 0.3698878288269043, + "learning_rate": 4.700695647466436e-05, + "loss": 0.3625, + "step": 4423000 + }, + { + "epoch": 29.9338187527068, + "grad_norm": 0.3756166398525238, + "learning_rate": 4.700661812472932e-05, + "loss": 0.3621, + "step": 4423500 + }, + { + "epoch": 29.93720225205717, + "grad_norm": 0.395037442445755, + "learning_rate": 4.7006279774794285e-05, + "loss": 0.3604, + "step": 4424000 + }, + { + "epoch": 29.940585751407536, + "grad_norm": 0.40361082553863525, + "learning_rate": 4.7005941424859253e-05, + "loss": 0.3607, + "step": 4424500 + }, + { + "epoch": 29.943969250757903, + "grad_norm": 0.3730100393295288, + "learning_rate": 4.700560307492421e-05, + "loss": 0.362, + "step": 4425000 + }, + { + "epoch": 29.947352750108273, + "grad_norm": 0.3448823094367981, + "learning_rate": 4.700526472498917e-05, + "loss": 0.3619, + "step": 4425500 + }, + { + "epoch": 29.95073624945864, + "grad_norm": 0.3748168349266052, + "learning_rate": 4.700492637505414e-05, + "loss": 0.3613, + "step": 4426000 + }, + { + "epoch": 29.95411974880901, + "grad_norm": 0.4092998504638672, + "learning_rate": 4.70045880251191e-05, + "loss": 0.3614, + "step": 4426500 + }, + { + "epoch": 29.957503248159377, + "grad_norm": 0.37465614080429077, + "learning_rate": 4.7004249675184064e-05, + "loss": 0.3614, + "step": 4427000 + }, + { + "epoch": 29.960886747509743, + "grad_norm": 0.3548694849014282, + "learning_rate": 4.7003911325249026e-05, + "loss": 0.3598, + "step": 4427500 + }, + { + "epoch": 29.964270246860114, + "grad_norm": 0.3760984539985657, + "learning_rate": 4.7003572975313995e-05, + "loss": 0.3622, + "step": 4428000 + }, + { + "epoch": 29.96765374621048, + "grad_norm": 0.37645846605300903, + "learning_rate": 4.700323462537896e-05, + "loss": 0.3606, + "step": 4428500 + }, + { + "epoch": 29.971037245560847, + "grad_norm": 0.35915839672088623, + "learning_rate": 4.700289627544392e-05, + "loss": 0.3619, + "step": 4429000 + }, + { + "epoch": 29.974420744911217, + "grad_norm": 0.3876221776008606, + "learning_rate": 4.700255792550888e-05, + "loss": 0.3616, + "step": 4429500 + }, + { + "epoch": 29.977804244261584, + "grad_norm": 0.3990806043148041, + "learning_rate": 4.7002219575573844e-05, + "loss": 0.3609, + "step": 4430000 + }, + { + "epoch": 29.981187743611954, + "grad_norm": 0.40360915660858154, + "learning_rate": 4.7001881225638806e-05, + "loss": 0.3632, + "step": 4430500 + }, + { + "epoch": 29.98457124296232, + "grad_norm": 0.39829617738723755, + "learning_rate": 4.700154287570377e-05, + "loss": 0.3603, + "step": 4431000 + }, + { + "epoch": 29.987954742312688, + "grad_norm": 0.40149858593940735, + "learning_rate": 4.700120452576873e-05, + "loss": 0.3615, + "step": 4431500 + }, + { + "epoch": 29.99133824166306, + "grad_norm": 0.3309660255908966, + "learning_rate": 4.70008661758337e-05, + "loss": 0.3612, + "step": 4432000 + }, + { + "epoch": 29.994721741013425, + "grad_norm": 0.3783590495586395, + "learning_rate": 4.700052782589866e-05, + "loss": 0.3623, + "step": 4432500 + }, + { + "epoch": 29.998105240363795, + "grad_norm": 0.35361525416374207, + "learning_rate": 4.700018947596362e-05, + "loss": 0.361, + "step": 4433000 + }, + { + "epoch": 30.0, + "eval_accuracy": 0.8624615571487356, + "eval_loss": 0.5571216940879822, + "eval_runtime": 3351.2354, + "eval_samples_per_second": 86.757, + "eval_steps_per_second": 5.422, + "step": 4433280 + }, + { + "epoch": 30.001488739714162, + "grad_norm": 0.36750322580337524, + "learning_rate": 4.6999851126028585e-05, + "loss": 0.3598, + "step": 4433500 + }, + { + "epoch": 30.00487223906453, + "grad_norm": 0.3880546987056732, + "learning_rate": 4.6999512776093554e-05, + "loss": 0.3591, + "step": 4434000 + }, + { + "epoch": 30.0082557384149, + "grad_norm": 0.38808223605155945, + "learning_rate": 4.699917442615851e-05, + "loss": 0.3582, + "step": 4434500 + }, + { + "epoch": 30.011639237765266, + "grad_norm": 0.3704867362976074, + "learning_rate": 4.699883607622347e-05, + "loss": 0.3611, + "step": 4435000 + }, + { + "epoch": 30.015022737115636, + "grad_norm": 0.3950101435184479, + "learning_rate": 4.699849772628844e-05, + "loss": 0.3596, + "step": 4435500 + }, + { + "epoch": 30.018406236466003, + "grad_norm": 0.3647949993610382, + "learning_rate": 4.69981593763534e-05, + "loss": 0.3585, + "step": 4436000 + }, + { + "epoch": 30.02178973581637, + "grad_norm": 0.3902396857738495, + "learning_rate": 4.6997821026418365e-05, + "loss": 0.3582, + "step": 4436500 + }, + { + "epoch": 30.02517323516674, + "grad_norm": 0.3702434301376343, + "learning_rate": 4.699748267648333e-05, + "loss": 0.3596, + "step": 4437000 + }, + { + "epoch": 30.028556734517107, + "grad_norm": 0.4297676682472229, + "learning_rate": 4.6997144326548296e-05, + "loss": 0.3596, + "step": 4437500 + }, + { + "epoch": 30.031940233867473, + "grad_norm": 0.39627528190612793, + "learning_rate": 4.699680597661326e-05, + "loss": 0.3596, + "step": 4438000 + }, + { + "epoch": 30.035323733217844, + "grad_norm": 0.3526000678539276, + "learning_rate": 4.699646762667822e-05, + "loss": 0.3583, + "step": 4438500 + }, + { + "epoch": 30.03870723256821, + "grad_norm": 0.3624178171157837, + "learning_rate": 4.699612927674318e-05, + "loss": 0.3604, + "step": 4439000 + }, + { + "epoch": 30.04209073191858, + "grad_norm": 0.40386179089546204, + "learning_rate": 4.6995790926808144e-05, + "loss": 0.3587, + "step": 4439500 + }, + { + "epoch": 30.045474231268948, + "grad_norm": 0.3986361622810364, + "learning_rate": 4.6995452576873106e-05, + "loss": 0.3606, + "step": 4440000 + }, + { + "epoch": 30.048857730619314, + "grad_norm": 0.3725155293941498, + "learning_rate": 4.699511422693807e-05, + "loss": 0.3593, + "step": 4440500 + }, + { + "epoch": 30.052241229969685, + "grad_norm": 0.36626580357551575, + "learning_rate": 4.699477587700303e-05, + "loss": 0.3589, + "step": 4441000 + }, + { + "epoch": 30.05562472932005, + "grad_norm": 0.38600772619247437, + "learning_rate": 4.6994437527068e-05, + "loss": 0.3597, + "step": 4441500 + }, + { + "epoch": 30.05900822867042, + "grad_norm": 0.38901904225349426, + "learning_rate": 4.699409917713296e-05, + "loss": 0.3606, + "step": 4442000 + }, + { + "epoch": 30.06239172802079, + "grad_norm": 0.36698004603385925, + "learning_rate": 4.6993760827197924e-05, + "loss": 0.3615, + "step": 4442500 + }, + { + "epoch": 30.065775227371155, + "grad_norm": 0.3775622844696045, + "learning_rate": 4.6993422477262886e-05, + "loss": 0.3597, + "step": 4443000 + }, + { + "epoch": 30.069158726721525, + "grad_norm": 0.3821042478084564, + "learning_rate": 4.6993084127327855e-05, + "loss": 0.3594, + "step": 4443500 + }, + { + "epoch": 30.072542226071892, + "grad_norm": 0.3540192246437073, + "learning_rate": 4.699274577739281e-05, + "loss": 0.3604, + "step": 4444000 + }, + { + "epoch": 30.07592572542226, + "grad_norm": 0.3810232877731323, + "learning_rate": 4.699240742745777e-05, + "loss": 0.3587, + "step": 4444500 + }, + { + "epoch": 30.07930922477263, + "grad_norm": 0.3720387816429138, + "learning_rate": 4.699206907752274e-05, + "loss": 0.3603, + "step": 4445000 + }, + { + "epoch": 30.082692724122996, + "grad_norm": 0.4007198214530945, + "learning_rate": 4.69917307275877e-05, + "loss": 0.3588, + "step": 4445500 + }, + { + "epoch": 30.086076223473366, + "grad_norm": 0.40312501788139343, + "learning_rate": 4.6991392377652665e-05, + "loss": 0.3594, + "step": 4446000 + }, + { + "epoch": 30.089459722823733, + "grad_norm": 0.3461494445800781, + "learning_rate": 4.699105402771763e-05, + "loss": 0.3583, + "step": 4446500 + }, + { + "epoch": 30.0928432221741, + "grad_norm": 0.36141762137413025, + "learning_rate": 4.6990715677782596e-05, + "loss": 0.3594, + "step": 4447000 + }, + { + "epoch": 30.09622672152447, + "grad_norm": 0.39119812846183777, + "learning_rate": 4.699037732784756e-05, + "loss": 0.359, + "step": 4447500 + }, + { + "epoch": 30.099610220874837, + "grad_norm": 0.41577664017677307, + "learning_rate": 4.699003897791252e-05, + "loss": 0.3598, + "step": 4448000 + }, + { + "epoch": 30.102993720225207, + "grad_norm": 0.35464170575141907, + "learning_rate": 4.698970062797748e-05, + "loss": 0.3604, + "step": 4448500 + }, + { + "epoch": 30.106377219575574, + "grad_norm": 0.37684959173202515, + "learning_rate": 4.6989362278042445e-05, + "loss": 0.3603, + "step": 4449000 + }, + { + "epoch": 30.10976071892594, + "grad_norm": 0.38398608565330505, + "learning_rate": 4.698902392810741e-05, + "loss": 0.3597, + "step": 4449500 + }, + { + "epoch": 30.11314421827631, + "grad_norm": 0.3568044900894165, + "learning_rate": 4.698868557817237e-05, + "loss": 0.3612, + "step": 4450000 + }, + { + "epoch": 30.116527717626678, + "grad_norm": 0.3571385145187378, + "learning_rate": 4.698834722823733e-05, + "loss": 0.3606, + "step": 4450500 + }, + { + "epoch": 30.119911216977048, + "grad_norm": 0.3913380801677704, + "learning_rate": 4.69880088783023e-05, + "loss": 0.3597, + "step": 4451000 + }, + { + "epoch": 30.123294716327415, + "grad_norm": 0.390293151140213, + "learning_rate": 4.698767052836726e-05, + "loss": 0.3595, + "step": 4451500 + }, + { + "epoch": 30.12667821567778, + "grad_norm": 0.40033870935440063, + "learning_rate": 4.6987332178432224e-05, + "loss": 0.3605, + "step": 4452000 + }, + { + "epoch": 30.130061715028152, + "grad_norm": 0.3905723989009857, + "learning_rate": 4.6986993828497186e-05, + "loss": 0.3589, + "step": 4452500 + }, + { + "epoch": 30.13344521437852, + "grad_norm": 0.37469837069511414, + "learning_rate": 4.6986655478562155e-05, + "loss": 0.3591, + "step": 4453000 + }, + { + "epoch": 30.136828713728885, + "grad_norm": 0.36965039372444153, + "learning_rate": 4.698631712862711e-05, + "loss": 0.3595, + "step": 4453500 + }, + { + "epoch": 30.140212213079256, + "grad_norm": 0.36205413937568665, + "learning_rate": 4.698597877869207e-05, + "loss": 0.3599, + "step": 4454000 + }, + { + "epoch": 30.143595712429622, + "grad_norm": 0.4121004045009613, + "learning_rate": 4.698564042875704e-05, + "loss": 0.3597, + "step": 4454500 + }, + { + "epoch": 30.146979211779993, + "grad_norm": 0.3439054489135742, + "learning_rate": 4.6985302078822004e-05, + "loss": 0.3606, + "step": 4455000 + }, + { + "epoch": 30.15036271113036, + "grad_norm": 0.3691730499267578, + "learning_rate": 4.6984963728886966e-05, + "loss": 0.3594, + "step": 4455500 + }, + { + "epoch": 30.153746210480726, + "grad_norm": 0.39207255840301514, + "learning_rate": 4.698462537895193e-05, + "loss": 0.3602, + "step": 4456000 + }, + { + "epoch": 30.157129709831096, + "grad_norm": 0.4125577509403229, + "learning_rate": 4.698428702901689e-05, + "loss": 0.3615, + "step": 4456500 + }, + { + "epoch": 30.160513209181463, + "grad_norm": 0.371836394071579, + "learning_rate": 4.698394867908186e-05, + "loss": 0.359, + "step": 4457000 + }, + { + "epoch": 30.163896708531833, + "grad_norm": 0.38201549649238586, + "learning_rate": 4.698361032914682e-05, + "loss": 0.3606, + "step": 4457500 + }, + { + "epoch": 30.1672802078822, + "grad_norm": 0.3787565529346466, + "learning_rate": 4.698327197921178e-05, + "loss": 0.3619, + "step": 4458000 + }, + { + "epoch": 30.170663707232567, + "grad_norm": 0.3717700242996216, + "learning_rate": 4.6982933629276745e-05, + "loss": 0.3608, + "step": 4458500 + }, + { + "epoch": 30.174047206582937, + "grad_norm": 0.3638845384120941, + "learning_rate": 4.698259527934171e-05, + "loss": 0.3593, + "step": 4459000 + }, + { + "epoch": 30.177430705933304, + "grad_norm": 0.3781699538230896, + "learning_rate": 4.698225692940667e-05, + "loss": 0.3599, + "step": 4459500 + }, + { + "epoch": 30.180814205283674, + "grad_norm": 0.38198304176330566, + "learning_rate": 4.698191857947163e-05, + "loss": 0.3609, + "step": 4460000 + }, + { + "epoch": 30.18419770463404, + "grad_norm": 0.382424533367157, + "learning_rate": 4.69815802295366e-05, + "loss": 0.3601, + "step": 4460500 + }, + { + "epoch": 30.187581203984408, + "grad_norm": 0.37011992931365967, + "learning_rate": 4.698124187960156e-05, + "loss": 0.3611, + "step": 4461000 + }, + { + "epoch": 30.190964703334778, + "grad_norm": 0.4030992090702057, + "learning_rate": 4.6980903529666525e-05, + "loss": 0.3599, + "step": 4461500 + }, + { + "epoch": 30.194348202685145, + "grad_norm": 0.380354642868042, + "learning_rate": 4.698056517973149e-05, + "loss": 0.3594, + "step": 4462000 + }, + { + "epoch": 30.19773170203551, + "grad_norm": 0.36242347955703735, + "learning_rate": 4.6980226829796456e-05, + "loss": 0.3588, + "step": 4462500 + }, + { + "epoch": 30.201115201385882, + "grad_norm": 0.4033394753932953, + "learning_rate": 4.697988847986141e-05, + "loss": 0.3617, + "step": 4463000 + }, + { + "epoch": 30.20449870073625, + "grad_norm": 0.39557284116744995, + "learning_rate": 4.6979550129926373e-05, + "loss": 0.3606, + "step": 4463500 + }, + { + "epoch": 30.20788220008662, + "grad_norm": 0.3634182810783386, + "learning_rate": 4.6979211779991336e-05, + "loss": 0.3591, + "step": 4464000 + }, + { + "epoch": 30.211265699436986, + "grad_norm": 0.39265722036361694, + "learning_rate": 4.6978873430056304e-05, + "loss": 0.3611, + "step": 4464500 + }, + { + "epoch": 30.214649198787352, + "grad_norm": 0.40822023153305054, + "learning_rate": 4.6978535080121267e-05, + "loss": 0.361, + "step": 4465000 + }, + { + "epoch": 30.218032698137723, + "grad_norm": 0.41366010904312134, + "learning_rate": 4.697819673018623e-05, + "loss": 0.3597, + "step": 4465500 + }, + { + "epoch": 30.22141619748809, + "grad_norm": 0.40065518021583557, + "learning_rate": 4.697785838025119e-05, + "loss": 0.3619, + "step": 4466000 + }, + { + "epoch": 30.22479969683846, + "grad_norm": 0.39153268933296204, + "learning_rate": 4.697752003031616e-05, + "loss": 0.3602, + "step": 4466500 + }, + { + "epoch": 30.228183196188827, + "grad_norm": 0.3661727011203766, + "learning_rate": 4.697718168038112e-05, + "loss": 0.3608, + "step": 4467000 + }, + { + "epoch": 30.231566695539193, + "grad_norm": 0.3721162974834442, + "learning_rate": 4.6976843330446084e-05, + "loss": 0.3611, + "step": 4467500 + }, + { + "epoch": 30.234950194889564, + "grad_norm": 0.35852909088134766, + "learning_rate": 4.6976504980511046e-05, + "loss": 0.3603, + "step": 4468000 + }, + { + "epoch": 30.23833369423993, + "grad_norm": 0.37085092067718506, + "learning_rate": 4.697616663057601e-05, + "loss": 0.3613, + "step": 4468500 + }, + { + "epoch": 30.241717193590297, + "grad_norm": 0.3549633324146271, + "learning_rate": 4.697582828064097e-05, + "loss": 0.3589, + "step": 4469000 + }, + { + "epoch": 30.245100692940667, + "grad_norm": 0.423056036233902, + "learning_rate": 4.697548993070593e-05, + "loss": 0.3593, + "step": 4469500 + }, + { + "epoch": 30.248484192291034, + "grad_norm": 0.37763503193855286, + "learning_rate": 4.69751515807709e-05, + "loss": 0.3592, + "step": 4470000 + }, + { + "epoch": 30.251867691641404, + "grad_norm": 0.3798384964466095, + "learning_rate": 4.6974813230835863e-05, + "loss": 0.3593, + "step": 4470500 + }, + { + "epoch": 30.25525119099177, + "grad_norm": 0.42569223046302795, + "learning_rate": 4.6974474880900826e-05, + "loss": 0.3603, + "step": 4471000 + }, + { + "epoch": 30.258634690342138, + "grad_norm": 0.3724420666694641, + "learning_rate": 4.697413653096579e-05, + "loss": 0.3614, + "step": 4471500 + }, + { + "epoch": 30.26201818969251, + "grad_norm": 0.3679376244544983, + "learning_rate": 4.697379818103076e-05, + "loss": 0.3603, + "step": 4472000 + }, + { + "epoch": 30.265401689042875, + "grad_norm": 0.34181925654411316, + "learning_rate": 4.697345983109571e-05, + "loss": 0.3612, + "step": 4472500 + }, + { + "epoch": 30.268785188393245, + "grad_norm": 0.4061616063117981, + "learning_rate": 4.6973121481160674e-05, + "loss": 0.3599, + "step": 4473000 + }, + { + "epoch": 30.272168687743612, + "grad_norm": 0.4007011950016022, + "learning_rate": 4.6972783131225636e-05, + "loss": 0.3615, + "step": 4473500 + }, + { + "epoch": 30.27555218709398, + "grad_norm": 0.31149446964263916, + "learning_rate": 4.6972444781290605e-05, + "loss": 0.3598, + "step": 4474000 + }, + { + "epoch": 30.27893568644435, + "grad_norm": 0.3719044029712677, + "learning_rate": 4.697210643135557e-05, + "loss": 0.3603, + "step": 4474500 + }, + { + "epoch": 30.282319185794716, + "grad_norm": 0.35385191440582275, + "learning_rate": 4.697176808142053e-05, + "loss": 0.36, + "step": 4475000 + }, + { + "epoch": 30.285702685145086, + "grad_norm": 0.3812853693962097, + "learning_rate": 4.697142973148549e-05, + "loss": 0.3617, + "step": 4475500 + }, + { + "epoch": 30.289086184495453, + "grad_norm": 0.3912808299064636, + "learning_rate": 4.697109138155046e-05, + "loss": 0.3614, + "step": 4476000 + }, + { + "epoch": 30.29246968384582, + "grad_norm": 0.3804853856563568, + "learning_rate": 4.697075303161542e-05, + "loss": 0.3616, + "step": 4476500 + }, + { + "epoch": 30.29585318319619, + "grad_norm": 0.32981404662132263, + "learning_rate": 4.6970414681680385e-05, + "loss": 0.3598, + "step": 4477000 + }, + { + "epoch": 30.299236682546557, + "grad_norm": 0.3632580637931824, + "learning_rate": 4.697007633174535e-05, + "loss": 0.3613, + "step": 4477500 + }, + { + "epoch": 30.302620181896923, + "grad_norm": 0.36564409732818604, + "learning_rate": 4.696973798181031e-05, + "loss": 0.3622, + "step": 4478000 + }, + { + "epoch": 30.306003681247294, + "grad_norm": 0.4020627439022064, + "learning_rate": 4.696939963187527e-05, + "loss": 0.3601, + "step": 4478500 + }, + { + "epoch": 30.30938718059766, + "grad_norm": 0.37020233273506165, + "learning_rate": 4.696906128194023e-05, + "loss": 0.3609, + "step": 4479000 + }, + { + "epoch": 30.31277067994803, + "grad_norm": 0.38797226548194885, + "learning_rate": 4.69687229320052e-05, + "loss": 0.3603, + "step": 4479500 + }, + { + "epoch": 30.316154179298398, + "grad_norm": 0.37550729513168335, + "learning_rate": 4.6968384582070164e-05, + "loss": 0.3603, + "step": 4480000 + }, + { + "epoch": 30.319537678648764, + "grad_norm": 0.34766310453414917, + "learning_rate": 4.6968046232135126e-05, + "loss": 0.3604, + "step": 4480500 + }, + { + "epoch": 30.322921177999135, + "grad_norm": 0.4076807498931885, + "learning_rate": 4.696770788220009e-05, + "loss": 0.36, + "step": 4481000 + }, + { + "epoch": 30.3263046773495, + "grad_norm": 0.418477863073349, + "learning_rate": 4.696736953226506e-05, + "loss": 0.3617, + "step": 4481500 + }, + { + "epoch": 30.32968817669987, + "grad_norm": 0.38338854908943176, + "learning_rate": 4.696703118233002e-05, + "loss": 0.3595, + "step": 4482000 + }, + { + "epoch": 30.33307167605024, + "grad_norm": 0.37635135650634766, + "learning_rate": 4.6966692832394975e-05, + "loss": 0.3584, + "step": 4482500 + }, + { + "epoch": 30.336455175400605, + "grad_norm": 0.38679662346839905, + "learning_rate": 4.696635448245994e-05, + "loss": 0.3599, + "step": 4483000 + }, + { + "epoch": 30.339838674750975, + "grad_norm": 0.377916157245636, + "learning_rate": 4.6966016132524906e-05, + "loss": 0.3602, + "step": 4483500 + }, + { + "epoch": 30.343222174101342, + "grad_norm": 0.38286900520324707, + "learning_rate": 4.696567778258987e-05, + "loss": 0.361, + "step": 4484000 + }, + { + "epoch": 30.346605673451712, + "grad_norm": 0.372231662273407, + "learning_rate": 4.696533943265483e-05, + "loss": 0.3596, + "step": 4484500 + }, + { + "epoch": 30.34998917280208, + "grad_norm": 0.3833298087120056, + "learning_rate": 4.696500108271979e-05, + "loss": 0.3601, + "step": 4485000 + }, + { + "epoch": 30.353372672152446, + "grad_norm": 0.4139886498451233, + "learning_rate": 4.696466273278476e-05, + "loss": 0.3605, + "step": 4485500 + }, + { + "epoch": 30.356756171502816, + "grad_norm": 0.34204918146133423, + "learning_rate": 4.696432438284972e-05, + "loss": 0.3613, + "step": 4486000 + }, + { + "epoch": 30.360139670853183, + "grad_norm": 0.38752761483192444, + "learning_rate": 4.6963986032914685e-05, + "loss": 0.3615, + "step": 4486500 + }, + { + "epoch": 30.36352317020355, + "grad_norm": 0.3860863447189331, + "learning_rate": 4.696364768297965e-05, + "loss": 0.3607, + "step": 4487000 + }, + { + "epoch": 30.36690666955392, + "grad_norm": 0.4313901960849762, + "learning_rate": 4.696330933304461e-05, + "loss": 0.3608, + "step": 4487500 + }, + { + "epoch": 30.370290168904287, + "grad_norm": 0.39515420794487, + "learning_rate": 4.696297098310957e-05, + "loss": 0.3607, + "step": 4488000 + }, + { + "epoch": 30.373673668254657, + "grad_norm": 0.39293739199638367, + "learning_rate": 4.6962632633174534e-05, + "loss": 0.3601, + "step": 4488500 + }, + { + "epoch": 30.377057167605024, + "grad_norm": 0.37145382165908813, + "learning_rate": 4.69622942832395e-05, + "loss": 0.3606, + "step": 4489000 + }, + { + "epoch": 30.38044066695539, + "grad_norm": 0.4163252115249634, + "learning_rate": 4.6961955933304465e-05, + "loss": 0.3613, + "step": 4489500 + }, + { + "epoch": 30.38382416630576, + "grad_norm": 0.34504857659339905, + "learning_rate": 4.696161758336943e-05, + "loss": 0.3605, + "step": 4490000 + }, + { + "epoch": 30.387207665656128, + "grad_norm": 0.34117037057876587, + "learning_rate": 4.696127923343439e-05, + "loss": 0.3623, + "step": 4490500 + }, + { + "epoch": 30.390591165006498, + "grad_norm": 0.40235698223114014, + "learning_rate": 4.696094088349936e-05, + "loss": 0.3616, + "step": 4491000 + }, + { + "epoch": 30.393974664356865, + "grad_norm": 0.38580021262168884, + "learning_rate": 4.696060253356432e-05, + "loss": 0.3604, + "step": 4491500 + }, + { + "epoch": 30.39735816370723, + "grad_norm": 0.36691147089004517, + "learning_rate": 4.6960264183629275e-05, + "loss": 0.3599, + "step": 4492000 + }, + { + "epoch": 30.4007416630576, + "grad_norm": 0.39179420471191406, + "learning_rate": 4.695992583369424e-05, + "loss": 0.3618, + "step": 4492500 + }, + { + "epoch": 30.40412516240797, + "grad_norm": 0.3824407756328583, + "learning_rate": 4.6959587483759206e-05, + "loss": 0.3598, + "step": 4493000 + }, + { + "epoch": 30.407508661758335, + "grad_norm": 0.394069641828537, + "learning_rate": 4.695924913382417e-05, + "loss": 0.36, + "step": 4493500 + }, + { + "epoch": 30.410892161108706, + "grad_norm": 0.3963245153427124, + "learning_rate": 4.695891078388913e-05, + "loss": 0.3608, + "step": 4494000 + }, + { + "epoch": 30.414275660459072, + "grad_norm": 0.35123634338378906, + "learning_rate": 4.695857243395409e-05, + "loss": 0.3611, + "step": 4494500 + }, + { + "epoch": 30.417659159809443, + "grad_norm": 0.39586952328681946, + "learning_rate": 4.695823408401906e-05, + "loss": 0.3626, + "step": 4495000 + }, + { + "epoch": 30.42104265915981, + "grad_norm": 0.35105255246162415, + "learning_rate": 4.6957895734084024e-05, + "loss": 0.3599, + "step": 4495500 + }, + { + "epoch": 30.424426158510176, + "grad_norm": 0.342883825302124, + "learning_rate": 4.6957557384148986e-05, + "loss": 0.36, + "step": 4496000 + }, + { + "epoch": 30.427809657860546, + "grad_norm": 0.3733304440975189, + "learning_rate": 4.695721903421395e-05, + "loss": 0.3599, + "step": 4496500 + }, + { + "epoch": 30.431193157210913, + "grad_norm": 0.3445548415184021, + "learning_rate": 4.695688068427891e-05, + "loss": 0.3603, + "step": 4497000 + }, + { + "epoch": 30.434576656561283, + "grad_norm": 0.3503607213497162, + "learning_rate": 4.695654233434387e-05, + "loss": 0.3599, + "step": 4497500 + }, + { + "epoch": 30.43796015591165, + "grad_norm": 0.3949844241142273, + "learning_rate": 4.6956203984408834e-05, + "loss": 0.3617, + "step": 4498000 + }, + { + "epoch": 30.441343655262017, + "grad_norm": 0.40511298179626465, + "learning_rate": 4.69558656344738e-05, + "loss": 0.3609, + "step": 4498500 + }, + { + "epoch": 30.444727154612387, + "grad_norm": 0.37571224570274353, + "learning_rate": 4.6955527284538765e-05, + "loss": 0.3585, + "step": 4499000 + }, + { + "epoch": 30.448110653962754, + "grad_norm": 0.37550434470176697, + "learning_rate": 4.695518893460373e-05, + "loss": 0.3615, + "step": 4499500 + }, + { + "epoch": 30.451494153313124, + "grad_norm": 0.4065387547016144, + "learning_rate": 4.695485058466869e-05, + "loss": 0.3602, + "step": 4500000 + }, + { + "epoch": 30.45487765266349, + "grad_norm": 0.3657798171043396, + "learning_rate": 4.695451223473366e-05, + "loss": 0.3628, + "step": 4500500 + }, + { + "epoch": 30.458261152013858, + "grad_norm": 0.402597039937973, + "learning_rate": 4.695417388479862e-05, + "loss": 0.3611, + "step": 4501000 + }, + { + "epoch": 30.461644651364228, + "grad_norm": 0.3700020909309387, + "learning_rate": 4.6953835534863576e-05, + "loss": 0.3609, + "step": 4501500 + }, + { + "epoch": 30.465028150714595, + "grad_norm": 0.37999027967453003, + "learning_rate": 4.695349718492854e-05, + "loss": 0.3611, + "step": 4502000 + }, + { + "epoch": 30.46841165006496, + "grad_norm": 0.4107026755809784, + "learning_rate": 4.695315883499351e-05, + "loss": 0.3602, + "step": 4502500 + }, + { + "epoch": 30.471795149415332, + "grad_norm": 0.3572571277618408, + "learning_rate": 4.695282048505847e-05, + "loss": 0.3605, + "step": 4503000 + }, + { + "epoch": 30.4751786487657, + "grad_norm": 0.3318372368812561, + "learning_rate": 4.695248213512343e-05, + "loss": 0.3606, + "step": 4503500 + }, + { + "epoch": 30.47856214811607, + "grad_norm": 0.38756465911865234, + "learning_rate": 4.695214378518839e-05, + "loss": 0.3615, + "step": 4504000 + }, + { + "epoch": 30.481945647466436, + "grad_norm": 0.38692402839660645, + "learning_rate": 4.695180543525336e-05, + "loss": 0.3605, + "step": 4504500 + }, + { + "epoch": 30.485329146816802, + "grad_norm": 0.3692699670791626, + "learning_rate": 4.6951467085318324e-05, + "loss": 0.3595, + "step": 4505000 + }, + { + "epoch": 30.488712646167173, + "grad_norm": 0.3602524697780609, + "learning_rate": 4.6951128735383287e-05, + "loss": 0.3608, + "step": 4505500 + }, + { + "epoch": 30.49209614551754, + "grad_norm": 0.3955193758010864, + "learning_rate": 4.695079038544825e-05, + "loss": 0.36, + "step": 4506000 + }, + { + "epoch": 30.49547964486791, + "grad_norm": 0.37169477343559265, + "learning_rate": 4.695045203551321e-05, + "loss": 0.3605, + "step": 4506500 + }, + { + "epoch": 30.498863144218276, + "grad_norm": 0.37480148673057556, + "learning_rate": 4.695011368557817e-05, + "loss": 0.3618, + "step": 4507000 + }, + { + "epoch": 30.502246643568643, + "grad_norm": 0.36751168966293335, + "learning_rate": 4.6949775335643135e-05, + "loss": 0.3608, + "step": 4507500 + }, + { + "epoch": 30.505630142919014, + "grad_norm": 0.311517596244812, + "learning_rate": 4.6949436985708104e-05, + "loss": 0.3602, + "step": 4508000 + }, + { + "epoch": 30.50901364226938, + "grad_norm": 0.3592129349708557, + "learning_rate": 4.6949098635773066e-05, + "loss": 0.3604, + "step": 4508500 + }, + { + "epoch": 30.51239714161975, + "grad_norm": 0.3845657706260681, + "learning_rate": 4.694876028583803e-05, + "loss": 0.3602, + "step": 4509000 + }, + { + "epoch": 30.515780640970117, + "grad_norm": 0.37815576791763306, + "learning_rate": 4.694842193590299e-05, + "loss": 0.3603, + "step": 4509500 + }, + { + "epoch": 30.519164140320484, + "grad_norm": 0.36821088194847107, + "learning_rate": 4.694808358596795e-05, + "loss": 0.3617, + "step": 4510000 + }, + { + "epoch": 30.522547639670854, + "grad_norm": 0.3512398600578308, + "learning_rate": 4.694774523603292e-05, + "loss": 0.3624, + "step": 4510500 + }, + { + "epoch": 30.52593113902122, + "grad_norm": 0.30272915959358215, + "learning_rate": 4.694740688609788e-05, + "loss": 0.3631, + "step": 4511000 + }, + { + "epoch": 30.529314638371588, + "grad_norm": 0.37545034289360046, + "learning_rate": 4.694706853616284e-05, + "loss": 0.36, + "step": 4511500 + }, + { + "epoch": 30.532698137721958, + "grad_norm": 0.3488900363445282, + "learning_rate": 4.694673018622781e-05, + "loss": 0.361, + "step": 4512000 + }, + { + "epoch": 30.536081637072325, + "grad_norm": 0.3494974672794342, + "learning_rate": 4.694639183629277e-05, + "loss": 0.3613, + "step": 4512500 + }, + { + "epoch": 30.539465136422695, + "grad_norm": 0.37769636511802673, + "learning_rate": 4.694605348635773e-05, + "loss": 0.359, + "step": 4513000 + }, + { + "epoch": 30.542848635773062, + "grad_norm": 0.3510251045227051, + "learning_rate": 4.6945715136422694e-05, + "loss": 0.3607, + "step": 4513500 + }, + { + "epoch": 30.54623213512343, + "grad_norm": 0.43237999081611633, + "learning_rate": 4.694537678648766e-05, + "loss": 0.3607, + "step": 4514000 + }, + { + "epoch": 30.5496156344738, + "grad_norm": 0.40540429949760437, + "learning_rate": 4.6945038436552625e-05, + "loss": 0.3607, + "step": 4514500 + }, + { + "epoch": 30.552999133824166, + "grad_norm": 0.392507940530777, + "learning_rate": 4.694470008661759e-05, + "loss": 0.3608, + "step": 4515000 + }, + { + "epoch": 30.556382633174536, + "grad_norm": 0.403692364692688, + "learning_rate": 4.694436173668255e-05, + "loss": 0.3608, + "step": 4515500 + }, + { + "epoch": 30.559766132524903, + "grad_norm": 0.3994150459766388, + "learning_rate": 4.694402338674751e-05, + "loss": 0.3625, + "step": 4516000 + }, + { + "epoch": 30.56314963187527, + "grad_norm": 0.3970039486885071, + "learning_rate": 4.6943685036812474e-05, + "loss": 0.3603, + "step": 4516500 + }, + { + "epoch": 30.56653313122564, + "grad_norm": 0.35074129700660706, + "learning_rate": 4.6943346686877436e-05, + "loss": 0.3609, + "step": 4517000 + }, + { + "epoch": 30.569916630576007, + "grad_norm": 0.3436708152294159, + "learning_rate": 4.6943008336942405e-05, + "loss": 0.3615, + "step": 4517500 + }, + { + "epoch": 30.573300129926373, + "grad_norm": 0.3597992956638336, + "learning_rate": 4.694266998700737e-05, + "loss": 0.3614, + "step": 4518000 + }, + { + "epoch": 30.576683629276744, + "grad_norm": 0.40081173181533813, + "learning_rate": 4.694233163707233e-05, + "loss": 0.362, + "step": 4518500 + }, + { + "epoch": 30.58006712862711, + "grad_norm": 0.41162216663360596, + "learning_rate": 4.694199328713729e-05, + "loss": 0.3607, + "step": 4519000 + }, + { + "epoch": 30.58345062797748, + "grad_norm": 0.3673850893974304, + "learning_rate": 4.694165493720225e-05, + "loss": 0.3613, + "step": 4519500 + }, + { + "epoch": 30.586834127327847, + "grad_norm": 0.3754594326019287, + "learning_rate": 4.694131658726722e-05, + "loss": 0.3604, + "step": 4520000 + }, + { + "epoch": 30.590217626678214, + "grad_norm": 0.3868243992328644, + "learning_rate": 4.694097823733218e-05, + "loss": 0.3605, + "step": 4520500 + }, + { + "epoch": 30.593601126028585, + "grad_norm": 0.3701590299606323, + "learning_rate": 4.694063988739714e-05, + "loss": 0.3587, + "step": 4521000 + }, + { + "epoch": 30.59698462537895, + "grad_norm": 0.3930908739566803, + "learning_rate": 4.694030153746211e-05, + "loss": 0.3594, + "step": 4521500 + }, + { + "epoch": 30.60036812472932, + "grad_norm": 0.39562007784843445, + "learning_rate": 4.693996318752707e-05, + "loss": 0.3603, + "step": 4522000 + }, + { + "epoch": 30.60375162407969, + "grad_norm": 0.3444449305534363, + "learning_rate": 4.693962483759203e-05, + "loss": 0.3606, + "step": 4522500 + }, + { + "epoch": 30.607135123430055, + "grad_norm": 0.38593870401382446, + "learning_rate": 4.6939286487656995e-05, + "loss": 0.3598, + "step": 4523000 + }, + { + "epoch": 30.610518622780425, + "grad_norm": 0.3739345967769623, + "learning_rate": 4.6938948137721964e-05, + "loss": 0.3602, + "step": 4523500 + }, + { + "epoch": 30.613902122130792, + "grad_norm": 0.3817853629589081, + "learning_rate": 4.6938609787786926e-05, + "loss": 0.3609, + "step": 4524000 + }, + { + "epoch": 30.617285621481162, + "grad_norm": 0.39146387577056885, + "learning_rate": 4.693827143785189e-05, + "loss": 0.3619, + "step": 4524500 + }, + { + "epoch": 30.62066912083153, + "grad_norm": 0.39248788356781006, + "learning_rate": 4.693793308791685e-05, + "loss": 0.3594, + "step": 4525000 + }, + { + "epoch": 30.624052620181896, + "grad_norm": 0.4058881103992462, + "learning_rate": 4.693759473798181e-05, + "loss": 0.3597, + "step": 4525500 + }, + { + "epoch": 30.627436119532266, + "grad_norm": 0.39480721950531006, + "learning_rate": 4.6937256388046774e-05, + "loss": 0.3609, + "step": 4526000 + }, + { + "epoch": 30.630819618882633, + "grad_norm": 0.3836055099964142, + "learning_rate": 4.6936918038111736e-05, + "loss": 0.3614, + "step": 4526500 + }, + { + "epoch": 30.634203118233, + "grad_norm": 0.42320966720581055, + "learning_rate": 4.69365796881767e-05, + "loss": 0.3606, + "step": 4527000 + }, + { + "epoch": 30.63758661758337, + "grad_norm": 0.38261085748672485, + "learning_rate": 4.693624133824167e-05, + "loss": 0.3593, + "step": 4527500 + }, + { + "epoch": 30.640970116933737, + "grad_norm": 0.371084600687027, + "learning_rate": 4.693590298830663e-05, + "loss": 0.3598, + "step": 4528000 + }, + { + "epoch": 30.644353616284107, + "grad_norm": 0.3737763464450836, + "learning_rate": 4.693556463837159e-05, + "loss": 0.3616, + "step": 4528500 + }, + { + "epoch": 30.647737115634474, + "grad_norm": 0.34043997526168823, + "learning_rate": 4.6935226288436554e-05, + "loss": 0.3604, + "step": 4529000 + }, + { + "epoch": 30.65112061498484, + "grad_norm": 0.37720245122909546, + "learning_rate": 4.693488793850152e-05, + "loss": 0.3594, + "step": 4529500 + }, + { + "epoch": 30.65450411433521, + "grad_norm": 0.41709980368614197, + "learning_rate": 4.693454958856648e-05, + "loss": 0.362, + "step": 4530000 + }, + { + "epoch": 30.657887613685578, + "grad_norm": 0.336330771446228, + "learning_rate": 4.693421123863144e-05, + "loss": 0.3612, + "step": 4530500 + }, + { + "epoch": 30.661271113035948, + "grad_norm": 0.36730971932411194, + "learning_rate": 4.693387288869641e-05, + "loss": 0.3578, + "step": 4531000 + }, + { + "epoch": 30.664654612386315, + "grad_norm": 0.3970741629600525, + "learning_rate": 4.693353453876137e-05, + "loss": 0.3613, + "step": 4531500 + }, + { + "epoch": 30.66803811173668, + "grad_norm": 0.4086032509803772, + "learning_rate": 4.693319618882633e-05, + "loss": 0.3595, + "step": 4532000 + }, + { + "epoch": 30.67142161108705, + "grad_norm": 0.4054871201515198, + "learning_rate": 4.6932857838891295e-05, + "loss": 0.3606, + "step": 4532500 + }, + { + "epoch": 30.67480511043742, + "grad_norm": 0.3702140152454376, + "learning_rate": 4.6932519488956264e-05, + "loss": 0.3601, + "step": 4533000 + }, + { + "epoch": 30.67818860978779, + "grad_norm": 0.3790774941444397, + "learning_rate": 4.6932181139021226e-05, + "loss": 0.3623, + "step": 4533500 + }, + { + "epoch": 30.681572109138155, + "grad_norm": 0.3950534760951996, + "learning_rate": 4.693184278908619e-05, + "loss": 0.3607, + "step": 4534000 + }, + { + "epoch": 30.684955608488522, + "grad_norm": 0.3758167326450348, + "learning_rate": 4.693150443915115e-05, + "loss": 0.3599, + "step": 4534500 + }, + { + "epoch": 30.688339107838893, + "grad_norm": 0.41029444336891174, + "learning_rate": 4.693116608921611e-05, + "loss": 0.3598, + "step": 4535000 + }, + { + "epoch": 30.69172260718926, + "grad_norm": 0.39428913593292236, + "learning_rate": 4.6930827739281075e-05, + "loss": 0.36, + "step": 4535500 + }, + { + "epoch": 30.695106106539626, + "grad_norm": 0.33379751443862915, + "learning_rate": 4.693048938934604e-05, + "loss": 0.36, + "step": 4536000 + }, + { + "epoch": 30.698489605889996, + "grad_norm": 0.38447538018226624, + "learning_rate": 4.6930151039411e-05, + "loss": 0.3615, + "step": 4536500 + }, + { + "epoch": 30.701873105240363, + "grad_norm": 0.3635065257549286, + "learning_rate": 4.692981268947597e-05, + "loss": 0.361, + "step": 4537000 + }, + { + "epoch": 30.705256604590733, + "grad_norm": 0.3788832724094391, + "learning_rate": 4.692947433954093e-05, + "loss": 0.3614, + "step": 4537500 + }, + { + "epoch": 30.7086401039411, + "grad_norm": 0.3611948788166046, + "learning_rate": 4.692913598960589e-05, + "loss": 0.362, + "step": 4538000 + }, + { + "epoch": 30.712023603291467, + "grad_norm": 0.3582962155342102, + "learning_rate": 4.6928797639670854e-05, + "loss": 0.3599, + "step": 4538500 + }, + { + "epoch": 30.715407102641837, + "grad_norm": 0.3849903643131256, + "learning_rate": 4.692845928973582e-05, + "loss": 0.3603, + "step": 4539000 + }, + { + "epoch": 30.718790601992204, + "grad_norm": 0.38260653614997864, + "learning_rate": 4.692812093980078e-05, + "loss": 0.3601, + "step": 4539500 + }, + { + "epoch": 30.722174101342574, + "grad_norm": 0.3879907727241516, + "learning_rate": 4.692778258986574e-05, + "loss": 0.3619, + "step": 4540000 + }, + { + "epoch": 30.72555760069294, + "grad_norm": 0.3873540163040161, + "learning_rate": 4.692744423993071e-05, + "loss": 0.3615, + "step": 4540500 + }, + { + "epoch": 30.728941100043308, + "grad_norm": 0.3742757737636566, + "learning_rate": 4.692710588999567e-05, + "loss": 0.3597, + "step": 4541000 + }, + { + "epoch": 30.732324599393678, + "grad_norm": 0.39747047424316406, + "learning_rate": 4.6926767540060634e-05, + "loss": 0.3603, + "step": 4541500 + }, + { + "epoch": 30.735708098744045, + "grad_norm": 0.3961320221424103, + "learning_rate": 4.6926429190125596e-05, + "loss": 0.3619, + "step": 4542000 + }, + { + "epoch": 30.73909159809441, + "grad_norm": 0.38149750232696533, + "learning_rate": 4.6926090840190565e-05, + "loss": 0.3608, + "step": 4542500 + }, + { + "epoch": 30.742475097444782, + "grad_norm": 0.4243999719619751, + "learning_rate": 4.692575249025553e-05, + "loss": 0.36, + "step": 4543000 + }, + { + "epoch": 30.74585859679515, + "grad_norm": 0.36307740211486816, + "learning_rate": 4.692541414032049e-05, + "loss": 0.3607, + "step": 4543500 + }, + { + "epoch": 30.74924209614552, + "grad_norm": 0.3696213960647583, + "learning_rate": 4.692507579038545e-05, + "loss": 0.3623, + "step": 4544000 + }, + { + "epoch": 30.752625595495886, + "grad_norm": 0.34193122386932373, + "learning_rate": 4.692473744045041e-05, + "loss": 0.3615, + "step": 4544500 + }, + { + "epoch": 30.756009094846252, + "grad_norm": 0.3728967607021332, + "learning_rate": 4.6924399090515375e-05, + "loss": 0.3598, + "step": 4545000 + }, + { + "epoch": 30.759392594196623, + "grad_norm": 0.3507138788700104, + "learning_rate": 4.692406074058034e-05, + "loss": 0.3605, + "step": 4545500 + }, + { + "epoch": 30.76277609354699, + "grad_norm": 0.36319103837013245, + "learning_rate": 4.69237223906453e-05, + "loss": 0.3617, + "step": 4546000 + }, + { + "epoch": 30.76615959289736, + "grad_norm": 0.3437182307243347, + "learning_rate": 4.692338404071027e-05, + "loss": 0.362, + "step": 4546500 + }, + { + "epoch": 30.769543092247726, + "grad_norm": 0.3841271996498108, + "learning_rate": 4.692304569077523e-05, + "loss": 0.3605, + "step": 4547000 + }, + { + "epoch": 30.772926591598093, + "grad_norm": 0.3654472529888153, + "learning_rate": 4.692270734084019e-05, + "loss": 0.3614, + "step": 4547500 + }, + { + "epoch": 30.776310090948463, + "grad_norm": 0.3569715917110443, + "learning_rate": 4.6922368990905155e-05, + "loss": 0.3613, + "step": 4548000 + }, + { + "epoch": 30.77969359029883, + "grad_norm": 0.37520235776901245, + "learning_rate": 4.6922030640970124e-05, + "loss": 0.3602, + "step": 4548500 + }, + { + "epoch": 30.783077089649197, + "grad_norm": 0.3281329870223999, + "learning_rate": 4.692169229103508e-05, + "loss": 0.3602, + "step": 4549000 + }, + { + "epoch": 30.786460588999567, + "grad_norm": 0.4030281603336334, + "learning_rate": 4.692135394110004e-05, + "loss": 0.3609, + "step": 4549500 + }, + { + "epoch": 30.789844088349934, + "grad_norm": 0.40497538447380066, + "learning_rate": 4.692101559116501e-05, + "loss": 0.362, + "step": 4550000 + }, + { + "epoch": 30.793227587700304, + "grad_norm": 0.3655402362346649, + "learning_rate": 4.692067724122997e-05, + "loss": 0.3611, + "step": 4550500 + }, + { + "epoch": 30.79661108705067, + "grad_norm": 0.37955793738365173, + "learning_rate": 4.6920338891294934e-05, + "loss": 0.3622, + "step": 4551000 + }, + { + "epoch": 30.799994586401038, + "grad_norm": 0.36361464858055115, + "learning_rate": 4.6920000541359897e-05, + "loss": 0.3599, + "step": 4551500 + }, + { + "epoch": 30.803378085751408, + "grad_norm": 0.3851824998855591, + "learning_rate": 4.6919662191424865e-05, + "loss": 0.3619, + "step": 4552000 + }, + { + "epoch": 30.806761585101775, + "grad_norm": 0.3443010449409485, + "learning_rate": 4.691932384148983e-05, + "loss": 0.3598, + "step": 4552500 + }, + { + "epoch": 30.810145084452145, + "grad_norm": 0.35350069403648376, + "learning_rate": 4.691898549155479e-05, + "loss": 0.3599, + "step": 4553000 + }, + { + "epoch": 30.813528583802512, + "grad_norm": 0.36163684725761414, + "learning_rate": 4.691864714161975e-05, + "loss": 0.362, + "step": 4553500 + }, + { + "epoch": 30.81691208315288, + "grad_norm": 0.3298378586769104, + "learning_rate": 4.6918308791684714e-05, + "loss": 0.3609, + "step": 4554000 + }, + { + "epoch": 30.82029558250325, + "grad_norm": 0.4053889811038971, + "learning_rate": 4.6917970441749676e-05, + "loss": 0.3603, + "step": 4554500 + }, + { + "epoch": 30.823679081853616, + "grad_norm": 0.40790021419525146, + "learning_rate": 4.691763209181464e-05, + "loss": 0.3612, + "step": 4555000 + }, + { + "epoch": 30.827062581203986, + "grad_norm": 0.373309850692749, + "learning_rate": 4.69172937418796e-05, + "loss": 0.3604, + "step": 4555500 + }, + { + "epoch": 30.830446080554353, + "grad_norm": 0.3495636582374573, + "learning_rate": 4.691695539194457e-05, + "loss": 0.3601, + "step": 4556000 + }, + { + "epoch": 30.83382957990472, + "grad_norm": 0.4138449728488922, + "learning_rate": 4.691661704200953e-05, + "loss": 0.3625, + "step": 4556500 + }, + { + "epoch": 30.83721307925509, + "grad_norm": 0.4141077995300293, + "learning_rate": 4.6916278692074493e-05, + "loss": 0.361, + "step": 4557000 + }, + { + "epoch": 30.840596578605457, + "grad_norm": 0.42484527826309204, + "learning_rate": 4.6915940342139456e-05, + "loss": 0.3623, + "step": 4557500 + }, + { + "epoch": 30.843980077955827, + "grad_norm": 0.3764879107475281, + "learning_rate": 4.6915601992204424e-05, + "loss": 0.3611, + "step": 4558000 + }, + { + "epoch": 30.847363577306194, + "grad_norm": 0.39300256967544556, + "learning_rate": 4.691526364226938e-05, + "loss": 0.3597, + "step": 4558500 + }, + { + "epoch": 30.85074707665656, + "grad_norm": 0.34704139828681946, + "learning_rate": 4.691492529233434e-05, + "loss": 0.3596, + "step": 4559000 + }, + { + "epoch": 30.85413057600693, + "grad_norm": 0.3790600895881653, + "learning_rate": 4.691458694239931e-05, + "loss": 0.36, + "step": 4559500 + }, + { + "epoch": 30.857514075357297, + "grad_norm": 0.3805277347564697, + "learning_rate": 4.691424859246427e-05, + "loss": 0.3623, + "step": 4560000 + }, + { + "epoch": 30.860897574707664, + "grad_norm": 0.4104638993740082, + "learning_rate": 4.6913910242529235e-05, + "loss": 0.3611, + "step": 4560500 + }, + { + "epoch": 30.864281074058034, + "grad_norm": 0.40832528471946716, + "learning_rate": 4.69135718925942e-05, + "loss": 0.3614, + "step": 4561000 + }, + { + "epoch": 30.8676645734084, + "grad_norm": 0.362267404794693, + "learning_rate": 4.6913233542659166e-05, + "loss": 0.3608, + "step": 4561500 + }, + { + "epoch": 30.87104807275877, + "grad_norm": 0.34720054268836975, + "learning_rate": 4.691289519272413e-05, + "loss": 0.3592, + "step": 4562000 + }, + { + "epoch": 30.87443157210914, + "grad_norm": 0.4220869541168213, + "learning_rate": 4.691255684278909e-05, + "loss": 0.3618, + "step": 4562500 + }, + { + "epoch": 30.877815071459505, + "grad_norm": 0.3882518708705902, + "learning_rate": 4.691221849285405e-05, + "loss": 0.3594, + "step": 4563000 + }, + { + "epoch": 30.881198570809875, + "grad_norm": 0.39041945338249207, + "learning_rate": 4.6911880142919015e-05, + "loss": 0.3616, + "step": 4563500 + }, + { + "epoch": 30.884582070160242, + "grad_norm": 0.40342891216278076, + "learning_rate": 4.691154179298398e-05, + "loss": 0.3609, + "step": 4564000 + }, + { + "epoch": 30.887965569510612, + "grad_norm": 0.3788714110851288, + "learning_rate": 4.691120344304894e-05, + "loss": 0.3605, + "step": 4564500 + }, + { + "epoch": 30.89134906886098, + "grad_norm": 0.37482404708862305, + "learning_rate": 4.69108650931139e-05, + "loss": 0.3606, + "step": 4565000 + }, + { + "epoch": 30.894732568211346, + "grad_norm": 0.3604934513568878, + "learning_rate": 4.691052674317887e-05, + "loss": 0.3619, + "step": 4565500 + }, + { + "epoch": 30.898116067561716, + "grad_norm": 0.38737213611602783, + "learning_rate": 4.691018839324383e-05, + "loss": 0.3614, + "step": 4566000 + }, + { + "epoch": 30.901499566912083, + "grad_norm": 0.37785857915878296, + "learning_rate": 4.6909850043308794e-05, + "loss": 0.3608, + "step": 4566500 + }, + { + "epoch": 30.90488306626245, + "grad_norm": 0.39249515533447266, + "learning_rate": 4.6909511693373756e-05, + "loss": 0.3614, + "step": 4567000 + }, + { + "epoch": 30.90826656561282, + "grad_norm": 0.3591851592063904, + "learning_rate": 4.6909173343438725e-05, + "loss": 0.3613, + "step": 4567500 + }, + { + "epoch": 30.911650064963187, + "grad_norm": 0.3635176420211792, + "learning_rate": 4.690883499350368e-05, + "loss": 0.3601, + "step": 4568000 + }, + { + "epoch": 30.915033564313557, + "grad_norm": 0.3742623031139374, + "learning_rate": 4.690849664356864e-05, + "loss": 0.3604, + "step": 4568500 + }, + { + "epoch": 30.918417063663924, + "grad_norm": 0.3985554277896881, + "learning_rate": 4.690815829363361e-05, + "loss": 0.3606, + "step": 4569000 + }, + { + "epoch": 30.92180056301429, + "grad_norm": 0.3568021059036255, + "learning_rate": 4.6907819943698574e-05, + "loss": 0.3616, + "step": 4569500 + }, + { + "epoch": 30.92518406236466, + "grad_norm": 0.4350185692310333, + "learning_rate": 4.6907481593763536e-05, + "loss": 0.3606, + "step": 4570000 + }, + { + "epoch": 30.928567561715028, + "grad_norm": 0.39672911167144775, + "learning_rate": 4.69071432438285e-05, + "loss": 0.3609, + "step": 4570500 + }, + { + "epoch": 30.931951061065398, + "grad_norm": 0.34133613109588623, + "learning_rate": 4.690680489389347e-05, + "loss": 0.361, + "step": 4571000 + }, + { + "epoch": 30.935334560415765, + "grad_norm": 0.37614962458610535, + "learning_rate": 4.690646654395843e-05, + "loss": 0.3605, + "step": 4571500 + }, + { + "epoch": 30.93871805976613, + "grad_norm": 0.3928428888320923, + "learning_rate": 4.690612819402339e-05, + "loss": 0.3598, + "step": 4572000 + }, + { + "epoch": 30.9421015591165, + "grad_norm": 0.37217843532562256, + "learning_rate": 4.690578984408835e-05, + "loss": 0.3615, + "step": 4572500 + }, + { + "epoch": 30.94548505846687, + "grad_norm": 0.3554867208003998, + "learning_rate": 4.6905451494153315e-05, + "loss": 0.3617, + "step": 4573000 + }, + { + "epoch": 30.948868557817235, + "grad_norm": 0.3619111180305481, + "learning_rate": 4.690511314421828e-05, + "loss": 0.3578, + "step": 4573500 + }, + { + "epoch": 30.952252057167605, + "grad_norm": 0.3643021583557129, + "learning_rate": 4.690477479428324e-05, + "loss": 0.3602, + "step": 4574000 + }, + { + "epoch": 30.955635556517972, + "grad_norm": 0.3516826927661896, + "learning_rate": 4.69044364443482e-05, + "loss": 0.3608, + "step": 4574500 + }, + { + "epoch": 30.959019055868342, + "grad_norm": 0.40577948093414307, + "learning_rate": 4.690409809441317e-05, + "loss": 0.361, + "step": 4575000 + }, + { + "epoch": 30.96240255521871, + "grad_norm": 0.382333368062973, + "learning_rate": 4.690375974447813e-05, + "loss": 0.3602, + "step": 4575500 + }, + { + "epoch": 30.965786054569076, + "grad_norm": 0.36798179149627686, + "learning_rate": 4.6903421394543095e-05, + "loss": 0.3614, + "step": 4576000 + }, + { + "epoch": 30.969169553919446, + "grad_norm": 0.3688673973083496, + "learning_rate": 4.690308304460806e-05, + "loss": 0.3603, + "step": 4576500 + }, + { + "epoch": 30.972553053269813, + "grad_norm": 0.3547976016998291, + "learning_rate": 4.6902744694673026e-05, + "loss": 0.3595, + "step": 4577000 + }, + { + "epoch": 30.975936552620183, + "grad_norm": 0.3677893280982971, + "learning_rate": 4.690240634473798e-05, + "loss": 0.3605, + "step": 4577500 + }, + { + "epoch": 30.97932005197055, + "grad_norm": 0.3358895480632782, + "learning_rate": 4.690206799480294e-05, + "loss": 0.3603, + "step": 4578000 + }, + { + "epoch": 30.982703551320917, + "grad_norm": 0.3861973285675049, + "learning_rate": 4.690172964486791e-05, + "loss": 0.3606, + "step": 4578500 + }, + { + "epoch": 30.986087050671287, + "grad_norm": 0.43521299958229065, + "learning_rate": 4.6901391294932874e-05, + "loss": 0.3597, + "step": 4579000 + }, + { + "epoch": 30.989470550021654, + "grad_norm": 0.3649062216281891, + "learning_rate": 4.6901052944997836e-05, + "loss": 0.3608, + "step": 4579500 + }, + { + "epoch": 30.992854049372024, + "grad_norm": 0.3704669177532196, + "learning_rate": 4.69007145950628e-05, + "loss": 0.3602, + "step": 4580000 + }, + { + "epoch": 30.99623754872239, + "grad_norm": 0.3746252655982971, + "learning_rate": 4.690037624512777e-05, + "loss": 0.3623, + "step": 4580500 + }, + { + "epoch": 30.999621048072758, + "grad_norm": 0.38075169920921326, + "learning_rate": 4.690003789519273e-05, + "loss": 0.36, + "step": 4581000 + }, + { + "epoch": 31.0, + "eval_accuracy": 0.8624102377023692, + "eval_loss": 0.5580710172653198, + "eval_runtime": 3344.7912, + "eval_samples_per_second": 86.924, + "eval_steps_per_second": 5.433, + "step": 4581056 + }, + { + "epoch": 31.003004547423128, + "grad_norm": 0.3490939438343048, + "learning_rate": 4.689969954525769e-05, + "loss": 0.3585, + "step": 4581500 + }, + { + "epoch": 31.006388046773495, + "grad_norm": 0.34328708052635193, + "learning_rate": 4.6899361195322654e-05, + "loss": 0.3583, + "step": 4582000 + }, + { + "epoch": 31.00977154612386, + "grad_norm": 0.37582507729530334, + "learning_rate": 4.6899022845387616e-05, + "loss": 0.3595, + "step": 4582500 + }, + { + "epoch": 31.01315504547423, + "grad_norm": 0.36721697449684143, + "learning_rate": 4.689868449545258e-05, + "loss": 0.3587, + "step": 4583000 + }, + { + "epoch": 31.0165385448246, + "grad_norm": 0.3577655255794525, + "learning_rate": 4.689834614551754e-05, + "loss": 0.3561, + "step": 4583500 + }, + { + "epoch": 31.01992204417497, + "grad_norm": 0.37689751386642456, + "learning_rate": 4.68980077955825e-05, + "loss": 0.3586, + "step": 4584000 + }, + { + "epoch": 31.023305543525336, + "grad_norm": 0.3436678349971771, + "learning_rate": 4.689766944564747e-05, + "loss": 0.3581, + "step": 4584500 + }, + { + "epoch": 31.026689042875702, + "grad_norm": 0.37823644280433655, + "learning_rate": 4.689733109571243e-05, + "loss": 0.3589, + "step": 4585000 + }, + { + "epoch": 31.030072542226073, + "grad_norm": 0.4318449795246124, + "learning_rate": 4.6896992745777395e-05, + "loss": 0.3594, + "step": 4585500 + }, + { + "epoch": 31.03345604157644, + "grad_norm": 0.38328883051872253, + "learning_rate": 4.689665439584236e-05, + "loss": 0.3596, + "step": 4586000 + }, + { + "epoch": 31.03683954092681, + "grad_norm": 0.3845657706260681, + "learning_rate": 4.6896316045907326e-05, + "loss": 0.3585, + "step": 4586500 + }, + { + "epoch": 31.040223040277176, + "grad_norm": 0.4173922538757324, + "learning_rate": 4.689597769597228e-05, + "loss": 0.3592, + "step": 4587000 + }, + { + "epoch": 31.043606539627543, + "grad_norm": 0.4044972360134125, + "learning_rate": 4.6895639346037244e-05, + "loss": 0.3597, + "step": 4587500 + }, + { + "epoch": 31.046990038977913, + "grad_norm": 0.3396126925945282, + "learning_rate": 4.689530099610221e-05, + "loss": 0.3581, + "step": 4588000 + }, + { + "epoch": 31.05037353832828, + "grad_norm": 0.3836978077888489, + "learning_rate": 4.6894962646167175e-05, + "loss": 0.3594, + "step": 4588500 + }, + { + "epoch": 31.05375703767865, + "grad_norm": 0.3947596251964569, + "learning_rate": 4.689462429623214e-05, + "loss": 0.359, + "step": 4589000 + }, + { + "epoch": 31.057140537029017, + "grad_norm": 0.3396083414554596, + "learning_rate": 4.68942859462971e-05, + "loss": 0.3609, + "step": 4589500 + }, + { + "epoch": 31.060524036379384, + "grad_norm": 0.3815324902534485, + "learning_rate": 4.689394759636206e-05, + "loss": 0.359, + "step": 4590000 + }, + { + "epoch": 31.063907535729754, + "grad_norm": 0.40731295943260193, + "learning_rate": 4.689360924642703e-05, + "loss": 0.3594, + "step": 4590500 + }, + { + "epoch": 31.06729103508012, + "grad_norm": 0.40843304991722107, + "learning_rate": 4.689327089649199e-05, + "loss": 0.3592, + "step": 4591000 + }, + { + "epoch": 31.070674534430488, + "grad_norm": 0.39855343103408813, + "learning_rate": 4.6892932546556954e-05, + "loss": 0.359, + "step": 4591500 + }, + { + "epoch": 31.074058033780858, + "grad_norm": 0.4003579318523407, + "learning_rate": 4.6892594196621916e-05, + "loss": 0.3581, + "step": 4592000 + }, + { + "epoch": 31.077441533131225, + "grad_norm": 0.4000629484653473, + "learning_rate": 4.689225584668688e-05, + "loss": 0.3591, + "step": 4592500 + }, + { + "epoch": 31.080825032481595, + "grad_norm": 0.40287065505981445, + "learning_rate": 4.689191749675184e-05, + "loss": 0.3597, + "step": 4593000 + }, + { + "epoch": 31.084208531831962, + "grad_norm": 0.3625352680683136, + "learning_rate": 4.68915791468168e-05, + "loss": 0.36, + "step": 4593500 + }, + { + "epoch": 31.08759203118233, + "grad_norm": 0.40282487869262695, + "learning_rate": 4.689124079688177e-05, + "loss": 0.359, + "step": 4594000 + }, + { + "epoch": 31.0909755305327, + "grad_norm": 0.37409570813179016, + "learning_rate": 4.6890902446946734e-05, + "loss": 0.3595, + "step": 4594500 + }, + { + "epoch": 31.094359029883066, + "grad_norm": 0.33406898379325867, + "learning_rate": 4.6890564097011696e-05, + "loss": 0.3591, + "step": 4595000 + }, + { + "epoch": 31.097742529233436, + "grad_norm": 0.3854425251483917, + "learning_rate": 4.689022574707666e-05, + "loss": 0.3594, + "step": 4595500 + }, + { + "epoch": 31.101126028583803, + "grad_norm": 0.402474582195282, + "learning_rate": 4.688988739714163e-05, + "loss": 0.3604, + "step": 4596000 + }, + { + "epoch": 31.10450952793417, + "grad_norm": 0.38899245858192444, + "learning_rate": 4.688954904720659e-05, + "loss": 0.3606, + "step": 4596500 + }, + { + "epoch": 31.10789302728454, + "grad_norm": 0.4109959304332733, + "learning_rate": 4.6889210697271544e-05, + "loss": 0.3592, + "step": 4597000 + }, + { + "epoch": 31.111276526634907, + "grad_norm": 0.3785685896873474, + "learning_rate": 4.6888872347336507e-05, + "loss": 0.3604, + "step": 4597500 + }, + { + "epoch": 31.114660025985273, + "grad_norm": 0.3987598121166229, + "learning_rate": 4.6888533997401475e-05, + "loss": 0.3601, + "step": 4598000 + }, + { + "epoch": 31.118043525335644, + "grad_norm": 0.38963502645492554, + "learning_rate": 4.688819564746644e-05, + "loss": 0.3597, + "step": 4598500 + }, + { + "epoch": 31.12142702468601, + "grad_norm": 0.38518026471138, + "learning_rate": 4.68878572975314e-05, + "loss": 0.3609, + "step": 4599000 + }, + { + "epoch": 31.12481052403638, + "grad_norm": 0.4085533022880554, + "learning_rate": 4.688751894759636e-05, + "loss": 0.3602, + "step": 4599500 + }, + { + "epoch": 31.128194023386747, + "grad_norm": 0.35115405917167664, + "learning_rate": 4.688718059766133e-05, + "loss": 0.3591, + "step": 4600000 + }, + { + "epoch": 31.131577522737114, + "grad_norm": 0.3623368442058563, + "learning_rate": 4.688684224772629e-05, + "loss": 0.36, + "step": 4600500 + }, + { + "epoch": 31.134961022087484, + "grad_norm": 0.36518803238868713, + "learning_rate": 4.6886503897791255e-05, + "loss": 0.3588, + "step": 4601000 + }, + { + "epoch": 31.13834452143785, + "grad_norm": 0.40113335847854614, + "learning_rate": 4.688616554785622e-05, + "loss": 0.3592, + "step": 4601500 + }, + { + "epoch": 31.14172802078822, + "grad_norm": 0.35938259959220886, + "learning_rate": 4.688582719792118e-05, + "loss": 0.3597, + "step": 4602000 + }, + { + "epoch": 31.145111520138588, + "grad_norm": 0.3931177258491516, + "learning_rate": 4.688548884798614e-05, + "loss": 0.3592, + "step": 4602500 + }, + { + "epoch": 31.148495019488955, + "grad_norm": 0.41567766666412354, + "learning_rate": 4.6885150498051103e-05, + "loss": 0.3603, + "step": 4603000 + }, + { + "epoch": 31.151878518839325, + "grad_norm": 0.4345746338367462, + "learning_rate": 4.688481214811607e-05, + "loss": 0.3614, + "step": 4603500 + }, + { + "epoch": 31.155262018189692, + "grad_norm": 0.36789408326148987, + "learning_rate": 4.6884473798181034e-05, + "loss": 0.3604, + "step": 4604000 + }, + { + "epoch": 31.158645517540062, + "grad_norm": 0.3862924575805664, + "learning_rate": 4.6884135448245997e-05, + "loss": 0.3582, + "step": 4604500 + }, + { + "epoch": 31.16202901689043, + "grad_norm": 0.384548544883728, + "learning_rate": 4.688379709831096e-05, + "loss": 0.3579, + "step": 4605000 + }, + { + "epoch": 31.165412516240796, + "grad_norm": 0.3928464353084564, + "learning_rate": 4.688345874837593e-05, + "loss": 0.3612, + "step": 4605500 + }, + { + "epoch": 31.168796015591166, + "grad_norm": 0.36854758858680725, + "learning_rate": 4.688312039844089e-05, + "loss": 0.3608, + "step": 4606000 + }, + { + "epoch": 31.172179514941533, + "grad_norm": 0.36410751938819885, + "learning_rate": 4.6882782048505845e-05, + "loss": 0.3594, + "step": 4606500 + }, + { + "epoch": 31.1755630142919, + "grad_norm": 0.3804601728916168, + "learning_rate": 4.688244369857081e-05, + "loss": 0.3597, + "step": 4607000 + }, + { + "epoch": 31.17894651364227, + "grad_norm": 0.36085045337677, + "learning_rate": 4.6882105348635776e-05, + "loss": 0.3607, + "step": 4607500 + }, + { + "epoch": 31.182330012992637, + "grad_norm": 0.3745473325252533, + "learning_rate": 4.688176699870074e-05, + "loss": 0.3591, + "step": 4608000 + }, + { + "epoch": 31.185713512343007, + "grad_norm": 0.39781248569488525, + "learning_rate": 4.68814286487657e-05, + "loss": 0.3596, + "step": 4608500 + }, + { + "epoch": 31.189097011693374, + "grad_norm": 0.39101073145866394, + "learning_rate": 4.688109029883066e-05, + "loss": 0.3619, + "step": 4609000 + }, + { + "epoch": 31.19248051104374, + "grad_norm": 0.37476101517677307, + "learning_rate": 4.688075194889563e-05, + "loss": 0.3585, + "step": 4609500 + }, + { + "epoch": 31.19586401039411, + "grad_norm": 0.39833685755729675, + "learning_rate": 4.6880413598960593e-05, + "loss": 0.36, + "step": 4610000 + }, + { + "epoch": 31.199247509744477, + "grad_norm": 0.39467480778694153, + "learning_rate": 4.6880075249025556e-05, + "loss": 0.359, + "step": 4610500 + }, + { + "epoch": 31.202631009094848, + "grad_norm": 0.3747001588344574, + "learning_rate": 4.687973689909052e-05, + "loss": 0.3619, + "step": 4611000 + }, + { + "epoch": 31.206014508445215, + "grad_norm": 0.4207741916179657, + "learning_rate": 4.687939854915548e-05, + "loss": 0.3614, + "step": 4611500 + }, + { + "epoch": 31.20939800779558, + "grad_norm": 0.3837501108646393, + "learning_rate": 4.687906019922044e-05, + "loss": 0.3598, + "step": 4612000 + }, + { + "epoch": 31.21278150714595, + "grad_norm": 0.36298200488090515, + "learning_rate": 4.6878721849285404e-05, + "loss": 0.3595, + "step": 4612500 + }, + { + "epoch": 31.21616500649632, + "grad_norm": 0.4514963626861572, + "learning_rate": 4.687838349935037e-05, + "loss": 0.3576, + "step": 4613000 + }, + { + "epoch": 31.21954850584669, + "grad_norm": 0.4059322774410248, + "learning_rate": 4.6878045149415335e-05, + "loss": 0.3601, + "step": 4613500 + }, + { + "epoch": 31.222932005197055, + "grad_norm": 0.39534878730773926, + "learning_rate": 4.68777067994803e-05, + "loss": 0.3604, + "step": 4614000 + }, + { + "epoch": 31.226315504547422, + "grad_norm": 0.38197407126426697, + "learning_rate": 4.687736844954526e-05, + "loss": 0.3593, + "step": 4614500 + }, + { + "epoch": 31.229699003897792, + "grad_norm": 0.3725822865962982, + "learning_rate": 4.687703009961023e-05, + "loss": 0.3613, + "step": 4615000 + }, + { + "epoch": 31.23308250324816, + "grad_norm": 0.36064666509628296, + "learning_rate": 4.687669174967519e-05, + "loss": 0.3612, + "step": 4615500 + }, + { + "epoch": 31.236466002598526, + "grad_norm": 0.361724317073822, + "learning_rate": 4.6876353399740146e-05, + "loss": 0.3586, + "step": 4616000 + }, + { + "epoch": 31.239849501948896, + "grad_norm": 0.37389740347862244, + "learning_rate": 4.687601504980511e-05, + "loss": 0.3596, + "step": 4616500 + }, + { + "epoch": 31.243233001299263, + "grad_norm": 0.3896735906600952, + "learning_rate": 4.687567669987008e-05, + "loss": 0.3614, + "step": 4617000 + }, + { + "epoch": 31.246616500649633, + "grad_norm": 0.36957865953445435, + "learning_rate": 4.687533834993504e-05, + "loss": 0.3581, + "step": 4617500 + }, + { + "epoch": 31.25, + "grad_norm": 0.37003257870674133, + "learning_rate": 4.6875e-05, + "loss": 0.3604, + "step": 4618000 + }, + { + "epoch": 31.253383499350367, + "grad_norm": 0.3659612834453583, + "learning_rate": 4.687466165006496e-05, + "loss": 0.3609, + "step": 4618500 + }, + { + "epoch": 31.256766998700737, + "grad_norm": 0.3906296491622925, + "learning_rate": 4.687432330012993e-05, + "loss": 0.3606, + "step": 4619000 + }, + { + "epoch": 31.260150498051104, + "grad_norm": 0.37453439831733704, + "learning_rate": 4.6873984950194894e-05, + "loss": 0.3604, + "step": 4619500 + }, + { + "epoch": 31.263533997401474, + "grad_norm": 0.40176907181739807, + "learning_rate": 4.6873646600259856e-05, + "loss": 0.3591, + "step": 4620000 + }, + { + "epoch": 31.26691749675184, + "grad_norm": 0.3883937895298004, + "learning_rate": 4.687330825032482e-05, + "loss": 0.3588, + "step": 4620500 + }, + { + "epoch": 31.270300996102208, + "grad_norm": 0.3804904818534851, + "learning_rate": 4.687296990038978e-05, + "loss": 0.3596, + "step": 4621000 + }, + { + "epoch": 31.273684495452578, + "grad_norm": 0.3401568531990051, + "learning_rate": 4.687263155045474e-05, + "loss": 0.3607, + "step": 4621500 + }, + { + "epoch": 31.277067994802945, + "grad_norm": 0.397629052400589, + "learning_rate": 4.6872293200519705e-05, + "loss": 0.3597, + "step": 4622000 + }, + { + "epoch": 31.28045149415331, + "grad_norm": 0.40658038854599, + "learning_rate": 4.6871954850584674e-05, + "loss": 0.3608, + "step": 4622500 + }, + { + "epoch": 31.28383499350368, + "grad_norm": 0.38460561633110046, + "learning_rate": 4.6871616500649636e-05, + "loss": 0.3596, + "step": 4623000 + }, + { + "epoch": 31.28721849285405, + "grad_norm": 0.4182147979736328, + "learning_rate": 4.68712781507146e-05, + "loss": 0.3604, + "step": 4623500 + }, + { + "epoch": 31.29060199220442, + "grad_norm": 0.38921990990638733, + "learning_rate": 4.687093980077956e-05, + "loss": 0.3601, + "step": 4624000 + }, + { + "epoch": 31.293985491554785, + "grad_norm": 0.40398287773132324, + "learning_rate": 4.687060145084453e-05, + "loss": 0.3618, + "step": 4624500 + }, + { + "epoch": 31.297368990905152, + "grad_norm": 0.38871896266937256, + "learning_rate": 4.687026310090949e-05, + "loss": 0.3617, + "step": 4625000 + }, + { + "epoch": 31.300752490255523, + "grad_norm": 0.4172930419445038, + "learning_rate": 4.6869924750974446e-05, + "loss": 0.3603, + "step": 4625500 + }, + { + "epoch": 31.30413598960589, + "grad_norm": 0.38299837708473206, + "learning_rate": 4.686958640103941e-05, + "loss": 0.3602, + "step": 4626000 + }, + { + "epoch": 31.30751948895626, + "grad_norm": 0.3578115999698639, + "learning_rate": 4.686924805110438e-05, + "loss": 0.3597, + "step": 4626500 + }, + { + "epoch": 31.310902988306626, + "grad_norm": 0.3577576279640198, + "learning_rate": 4.686890970116934e-05, + "loss": 0.3599, + "step": 4627000 + }, + { + "epoch": 31.314286487656993, + "grad_norm": 0.35940787196159363, + "learning_rate": 4.68685713512343e-05, + "loss": 0.3588, + "step": 4627500 + }, + { + "epoch": 31.317669987007363, + "grad_norm": 0.37951037287712097, + "learning_rate": 4.6868233001299264e-05, + "loss": 0.3605, + "step": 4628000 + }, + { + "epoch": 31.32105348635773, + "grad_norm": 0.4084251821041107, + "learning_rate": 4.686789465136423e-05, + "loss": 0.3606, + "step": 4628500 + }, + { + "epoch": 31.3244369857081, + "grad_norm": 0.3893352150917053, + "learning_rate": 4.6867556301429195e-05, + "loss": 0.3592, + "step": 4629000 + }, + { + "epoch": 31.327820485058467, + "grad_norm": 0.37116125226020813, + "learning_rate": 4.686721795149416e-05, + "loss": 0.3598, + "step": 4629500 + }, + { + "epoch": 31.331203984408834, + "grad_norm": 0.4443565905094147, + "learning_rate": 4.686687960155912e-05, + "loss": 0.3601, + "step": 4630000 + }, + { + "epoch": 31.334587483759204, + "grad_norm": 0.3604671061038971, + "learning_rate": 4.686654125162408e-05, + "loss": 0.3599, + "step": 4630500 + }, + { + "epoch": 31.33797098310957, + "grad_norm": 0.35667118430137634, + "learning_rate": 4.686620290168904e-05, + "loss": 0.3591, + "step": 4631000 + }, + { + "epoch": 31.341354482459938, + "grad_norm": 0.3471163213253021, + "learning_rate": 4.6865864551754005e-05, + "loss": 0.3587, + "step": 4631500 + }, + { + "epoch": 31.344737981810308, + "grad_norm": 0.3652501106262207, + "learning_rate": 4.6865526201818974e-05, + "loss": 0.3599, + "step": 4632000 + }, + { + "epoch": 31.348121481160675, + "grad_norm": 0.3757786154747009, + "learning_rate": 4.6865187851883936e-05, + "loss": 0.3615, + "step": 4632500 + }, + { + "epoch": 31.351504980511045, + "grad_norm": 0.3931577801704407, + "learning_rate": 4.68648495019489e-05, + "loss": 0.3613, + "step": 4633000 + }, + { + "epoch": 31.354888479861412, + "grad_norm": 0.4004794657230377, + "learning_rate": 4.686451115201386e-05, + "loss": 0.3608, + "step": 4633500 + }, + { + "epoch": 31.35827197921178, + "grad_norm": 0.42484724521636963, + "learning_rate": 4.686417280207883e-05, + "loss": 0.3614, + "step": 4634000 + }, + { + "epoch": 31.36165547856215, + "grad_norm": 0.4051852226257324, + "learning_rate": 4.686383445214379e-05, + "loss": 0.3596, + "step": 4634500 + }, + { + "epoch": 31.365038977912516, + "grad_norm": 0.3806665539741516, + "learning_rate": 4.686349610220875e-05, + "loss": 0.3601, + "step": 4635000 + }, + { + "epoch": 31.368422477262886, + "grad_norm": 0.3938251733779907, + "learning_rate": 4.686315775227371e-05, + "loss": 0.3603, + "step": 4635500 + }, + { + "epoch": 31.371805976613253, + "grad_norm": 0.36172622442245483, + "learning_rate": 4.686281940233868e-05, + "loss": 0.3614, + "step": 4636000 + }, + { + "epoch": 31.37518947596362, + "grad_norm": 0.40758925676345825, + "learning_rate": 4.686248105240364e-05, + "loss": 0.3592, + "step": 4636500 + }, + { + "epoch": 31.37857297531399, + "grad_norm": 0.3601266145706177, + "learning_rate": 4.68621427024686e-05, + "loss": 0.3614, + "step": 4637000 + }, + { + "epoch": 31.381956474664356, + "grad_norm": 0.38919326663017273, + "learning_rate": 4.6861804352533564e-05, + "loss": 0.3603, + "step": 4637500 + }, + { + "epoch": 31.385339974014727, + "grad_norm": 0.35700440406799316, + "learning_rate": 4.686146600259853e-05, + "loss": 0.3607, + "step": 4638000 + }, + { + "epoch": 31.388723473365093, + "grad_norm": 0.3763803541660309, + "learning_rate": 4.6861127652663495e-05, + "loss": 0.3595, + "step": 4638500 + }, + { + "epoch": 31.39210697271546, + "grad_norm": 0.4129246175289154, + "learning_rate": 4.686078930272846e-05, + "loss": 0.3612, + "step": 4639000 + }, + { + "epoch": 31.39549047206583, + "grad_norm": 0.38882720470428467, + "learning_rate": 4.686045095279342e-05, + "loss": 0.3608, + "step": 4639500 + }, + { + "epoch": 31.398873971416197, + "grad_norm": 0.3492278456687927, + "learning_rate": 4.686011260285838e-05, + "loss": 0.361, + "step": 4640000 + }, + { + "epoch": 31.402257470766564, + "grad_norm": 0.36681997776031494, + "learning_rate": 4.6859774252923344e-05, + "loss": 0.3616, + "step": 4640500 + }, + { + "epoch": 31.405640970116934, + "grad_norm": 0.3288574516773224, + "learning_rate": 4.6859435902988306e-05, + "loss": 0.3616, + "step": 4641000 + }, + { + "epoch": 31.4090244694673, + "grad_norm": 0.3875277042388916, + "learning_rate": 4.6859097553053275e-05, + "loss": 0.3604, + "step": 4641500 + }, + { + "epoch": 31.41240796881767, + "grad_norm": 0.3879646956920624, + "learning_rate": 4.685875920311824e-05, + "loss": 0.3618, + "step": 4642000 + }, + { + "epoch": 31.415791468168038, + "grad_norm": 0.367970734834671, + "learning_rate": 4.68584208531832e-05, + "loss": 0.36, + "step": 4642500 + }, + { + "epoch": 31.419174967518405, + "grad_norm": 0.3683345913887024, + "learning_rate": 4.685808250324816e-05, + "loss": 0.3608, + "step": 4643000 + }, + { + "epoch": 31.422558466868775, + "grad_norm": 0.3478187322616577, + "learning_rate": 4.685774415331312e-05, + "loss": 0.3597, + "step": 4643500 + }, + { + "epoch": 31.425941966219142, + "grad_norm": 0.37414970993995667, + "learning_rate": 4.685740580337809e-05, + "loss": 0.3599, + "step": 4644000 + }, + { + "epoch": 31.429325465569512, + "grad_norm": 0.3836267590522766, + "learning_rate": 4.685706745344305e-05, + "loss": 0.3589, + "step": 4644500 + }, + { + "epoch": 31.43270896491988, + "grad_norm": 0.3911152482032776, + "learning_rate": 4.685672910350801e-05, + "loss": 0.361, + "step": 4645000 + }, + { + "epoch": 31.436092464270246, + "grad_norm": 0.3513597249984741, + "learning_rate": 4.685639075357298e-05, + "loss": 0.3602, + "step": 4645500 + }, + { + "epoch": 31.439475963620616, + "grad_norm": 0.42742282152175903, + "learning_rate": 4.685605240363794e-05, + "loss": 0.3618, + "step": 4646000 + }, + { + "epoch": 31.442859462970983, + "grad_norm": 0.4075610637664795, + "learning_rate": 4.68557140537029e-05, + "loss": 0.3613, + "step": 4646500 + }, + { + "epoch": 31.44624296232135, + "grad_norm": 0.36794495582580566, + "learning_rate": 4.6855375703767865e-05, + "loss": 0.3609, + "step": 4647000 + }, + { + "epoch": 31.44962646167172, + "grad_norm": 0.38682469725608826, + "learning_rate": 4.6855037353832834e-05, + "loss": 0.3623, + "step": 4647500 + }, + { + "epoch": 31.453009961022087, + "grad_norm": 0.39508524537086487, + "learning_rate": 4.6854699003897796e-05, + "loss": 0.3607, + "step": 4648000 + }, + { + "epoch": 31.456393460372457, + "grad_norm": 0.3804340362548828, + "learning_rate": 4.685436065396276e-05, + "loss": 0.3601, + "step": 4648500 + }, + { + "epoch": 31.459776959722824, + "grad_norm": 0.39548158645629883, + "learning_rate": 4.685402230402772e-05, + "loss": 0.3597, + "step": 4649000 + }, + { + "epoch": 31.46316045907319, + "grad_norm": 0.3658633232116699, + "learning_rate": 4.685368395409268e-05, + "loss": 0.3596, + "step": 4649500 + }, + { + "epoch": 31.46654395842356, + "grad_norm": 0.382374107837677, + "learning_rate": 4.6853345604157645e-05, + "loss": 0.3595, + "step": 4650000 + }, + { + "epoch": 31.469927457773927, + "grad_norm": 0.40996837615966797, + "learning_rate": 4.685300725422261e-05, + "loss": 0.3614, + "step": 4650500 + }, + { + "epoch": 31.473310957124298, + "grad_norm": 0.4142885208129883, + "learning_rate": 4.6852668904287576e-05, + "loss": 0.3584, + "step": 4651000 + }, + { + "epoch": 31.476694456474664, + "grad_norm": 0.4049195647239685, + "learning_rate": 4.685233055435254e-05, + "loss": 0.3593, + "step": 4651500 + }, + { + "epoch": 31.48007795582503, + "grad_norm": 0.4016236662864685, + "learning_rate": 4.68519922044175e-05, + "loss": 0.3615, + "step": 4652000 + }, + { + "epoch": 31.4834614551754, + "grad_norm": 0.39193233847618103, + "learning_rate": 4.685165385448246e-05, + "loss": 0.36, + "step": 4652500 + }, + { + "epoch": 31.48684495452577, + "grad_norm": 0.33804407715797424, + "learning_rate": 4.6851315504547424e-05, + "loss": 0.3608, + "step": 4653000 + }, + { + "epoch": 31.49022845387614, + "grad_norm": 0.40392035245895386, + "learning_rate": 4.685097715461239e-05, + "loss": 0.3597, + "step": 4653500 + }, + { + "epoch": 31.493611953226505, + "grad_norm": 0.37518763542175293, + "learning_rate": 4.685063880467735e-05, + "loss": 0.3611, + "step": 4654000 + }, + { + "epoch": 31.496995452576872, + "grad_norm": 0.3940230906009674, + "learning_rate": 4.685030045474231e-05, + "loss": 0.3596, + "step": 4654500 + }, + { + "epoch": 31.500378951927242, + "grad_norm": 0.35870859026908875, + "learning_rate": 4.684996210480728e-05, + "loss": 0.3611, + "step": 4655000 + }, + { + "epoch": 31.50376245127761, + "grad_norm": 0.3652559518814087, + "learning_rate": 4.684962375487224e-05, + "loss": 0.3584, + "step": 4655500 + }, + { + "epoch": 31.507145950627976, + "grad_norm": 0.3551163375377655, + "learning_rate": 4.6849285404937204e-05, + "loss": 0.3606, + "step": 4656000 + }, + { + "epoch": 31.510529449978346, + "grad_norm": 0.3690461814403534, + "learning_rate": 4.6848947055002166e-05, + "loss": 0.3606, + "step": 4656500 + }, + { + "epoch": 31.513912949328713, + "grad_norm": 0.3778305947780609, + "learning_rate": 4.6848608705067135e-05, + "loss": 0.3603, + "step": 4657000 + }, + { + "epoch": 31.517296448679083, + "grad_norm": 0.359300434589386, + "learning_rate": 4.68482703551321e-05, + "loss": 0.36, + "step": 4657500 + }, + { + "epoch": 31.52067994802945, + "grad_norm": 0.4256763756275177, + "learning_rate": 4.684793200519706e-05, + "loss": 0.3606, + "step": 4658000 + }, + { + "epoch": 31.524063447379817, + "grad_norm": 0.37461280822753906, + "learning_rate": 4.684759365526202e-05, + "loss": 0.36, + "step": 4658500 + }, + { + "epoch": 31.527446946730187, + "grad_norm": 0.40624773502349854, + "learning_rate": 4.684725530532698e-05, + "loss": 0.3599, + "step": 4659000 + }, + { + "epoch": 31.530830446080554, + "grad_norm": 0.3790808916091919, + "learning_rate": 4.6846916955391945e-05, + "loss": 0.3609, + "step": 4659500 + }, + { + "epoch": 31.534213945430924, + "grad_norm": 0.4063674807548523, + "learning_rate": 4.684657860545691e-05, + "loss": 0.3604, + "step": 4660000 + }, + { + "epoch": 31.53759744478129, + "grad_norm": 0.3722701966762543, + "learning_rate": 4.684624025552187e-05, + "loss": 0.3596, + "step": 4660500 + }, + { + "epoch": 31.540980944131658, + "grad_norm": 0.4062025249004364, + "learning_rate": 4.684590190558684e-05, + "loss": 0.3611, + "step": 4661000 + }, + { + "epoch": 31.544364443482028, + "grad_norm": 0.3837735056877136, + "learning_rate": 4.68455635556518e-05, + "loss": 0.3603, + "step": 4661500 + }, + { + "epoch": 31.547747942832395, + "grad_norm": 0.3805675208568573, + "learning_rate": 4.684522520571676e-05, + "loss": 0.3596, + "step": 4662000 + }, + { + "epoch": 31.551131442182765, + "grad_norm": 0.3643147051334381, + "learning_rate": 4.6844886855781725e-05, + "loss": 0.3609, + "step": 4662500 + }, + { + "epoch": 31.55451494153313, + "grad_norm": 0.3622497022151947, + "learning_rate": 4.6844548505846694e-05, + "loss": 0.3583, + "step": 4663000 + }, + { + "epoch": 31.5578984408835, + "grad_norm": 0.3624139428138733, + "learning_rate": 4.684421015591165e-05, + "loss": 0.3609, + "step": 4663500 + }, + { + "epoch": 31.56128194023387, + "grad_norm": 0.3771470785140991, + "learning_rate": 4.684387180597661e-05, + "loss": 0.3601, + "step": 4664000 + }, + { + "epoch": 31.564665439584235, + "grad_norm": 0.3783354163169861, + "learning_rate": 4.684353345604158e-05, + "loss": 0.3622, + "step": 4664500 + }, + { + "epoch": 31.568048938934602, + "grad_norm": 0.3971211314201355, + "learning_rate": 4.684319510610654e-05, + "loss": 0.3596, + "step": 4665000 + }, + { + "epoch": 31.571432438284972, + "grad_norm": 0.3950501084327698, + "learning_rate": 4.6842856756171504e-05, + "loss": 0.361, + "step": 4665500 + }, + { + "epoch": 31.57481593763534, + "grad_norm": 0.3610475957393646, + "learning_rate": 4.6842518406236466e-05, + "loss": 0.3603, + "step": 4666000 + }, + { + "epoch": 31.57819943698571, + "grad_norm": 0.3884837031364441, + "learning_rate": 4.6842180056301435e-05, + "loss": 0.3606, + "step": 4666500 + }, + { + "epoch": 31.581582936336076, + "grad_norm": 0.3898885250091553, + "learning_rate": 4.68418417063664e-05, + "loss": 0.3604, + "step": 4667000 + }, + { + "epoch": 31.584966435686443, + "grad_norm": 0.38525745272636414, + "learning_rate": 4.684150335643136e-05, + "loss": 0.3623, + "step": 4667500 + }, + { + "epoch": 31.588349935036813, + "grad_norm": 0.40762364864349365, + "learning_rate": 4.684116500649632e-05, + "loss": 0.3597, + "step": 4668000 + }, + { + "epoch": 31.59173343438718, + "grad_norm": 0.37447842955589294, + "learning_rate": 4.6840826656561284e-05, + "loss": 0.3588, + "step": 4668500 + }, + { + "epoch": 31.59511693373755, + "grad_norm": 0.40254485607147217, + "learning_rate": 4.6840488306626246e-05, + "loss": 0.3594, + "step": 4669000 + }, + { + "epoch": 31.598500433087917, + "grad_norm": 0.39968255162239075, + "learning_rate": 4.684014995669121e-05, + "loss": 0.361, + "step": 4669500 + }, + { + "epoch": 31.601883932438284, + "grad_norm": 0.33560580015182495, + "learning_rate": 4.683981160675617e-05, + "loss": 0.3596, + "step": 4670000 + }, + { + "epoch": 31.605267431788654, + "grad_norm": 0.39426931738853455, + "learning_rate": 4.683947325682114e-05, + "loss": 0.3596, + "step": 4670500 + }, + { + "epoch": 31.60865093113902, + "grad_norm": 0.4080418646335602, + "learning_rate": 4.68391349068861e-05, + "loss": 0.3581, + "step": 4671000 + }, + { + "epoch": 31.612034430489388, + "grad_norm": 0.33502811193466187, + "learning_rate": 4.683879655695106e-05, + "loss": 0.3599, + "step": 4671500 + }, + { + "epoch": 31.615417929839758, + "grad_norm": 0.36781832575798035, + "learning_rate": 4.6838458207016025e-05, + "loss": 0.3623, + "step": 4672000 + }, + { + "epoch": 31.618801429190125, + "grad_norm": 0.42620423436164856, + "learning_rate": 4.6838119857080994e-05, + "loss": 0.3588, + "step": 4672500 + }, + { + "epoch": 31.622184928540495, + "grad_norm": 0.3697637617588043, + "learning_rate": 4.683778150714595e-05, + "loss": 0.3604, + "step": 4673000 + }, + { + "epoch": 31.62556842789086, + "grad_norm": 0.3921816945075989, + "learning_rate": 4.683744315721091e-05, + "loss": 0.3601, + "step": 4673500 + }, + { + "epoch": 31.62895192724123, + "grad_norm": 0.38950610160827637, + "learning_rate": 4.683710480727588e-05, + "loss": 0.3595, + "step": 4674000 + }, + { + "epoch": 31.6323354265916, + "grad_norm": 0.3828791677951813, + "learning_rate": 4.683676645734084e-05, + "loss": 0.3605, + "step": 4674500 + }, + { + "epoch": 31.635718925941966, + "grad_norm": 0.3760644495487213, + "learning_rate": 4.6836428107405805e-05, + "loss": 0.3605, + "step": 4675000 + }, + { + "epoch": 31.639102425292336, + "grad_norm": 0.3590831458568573, + "learning_rate": 4.683608975747077e-05, + "loss": 0.3611, + "step": 4675500 + }, + { + "epoch": 31.642485924642703, + "grad_norm": 0.38537073135375977, + "learning_rate": 4.6835751407535736e-05, + "loss": 0.361, + "step": 4676000 + }, + { + "epoch": 31.64586942399307, + "grad_norm": 0.4026997685432434, + "learning_rate": 4.68354130576007e-05, + "loss": 0.36, + "step": 4676500 + }, + { + "epoch": 31.64925292334344, + "grad_norm": 0.4079139530658722, + "learning_rate": 4.683507470766566e-05, + "loss": 0.3604, + "step": 4677000 + }, + { + "epoch": 31.652636422693806, + "grad_norm": 0.3519248068332672, + "learning_rate": 4.683473635773062e-05, + "loss": 0.3607, + "step": 4677500 + }, + { + "epoch": 31.656019922044173, + "grad_norm": 0.422990620136261, + "learning_rate": 4.6834398007795584e-05, + "loss": 0.3605, + "step": 4678000 + }, + { + "epoch": 31.659403421394543, + "grad_norm": 0.36565840244293213, + "learning_rate": 4.6834059657860546e-05, + "loss": 0.3603, + "step": 4678500 + }, + { + "epoch": 31.66278692074491, + "grad_norm": 0.39964473247528076, + "learning_rate": 4.683372130792551e-05, + "loss": 0.3616, + "step": 4679000 + }, + { + "epoch": 31.66617042009528, + "grad_norm": 0.3694780170917511, + "learning_rate": 4.683338295799047e-05, + "loss": 0.3601, + "step": 4679500 + }, + { + "epoch": 31.669553919445647, + "grad_norm": 0.34623003005981445, + "learning_rate": 4.683304460805544e-05, + "loss": 0.3626, + "step": 4680000 + }, + { + "epoch": 31.672937418796014, + "grad_norm": 0.3790321946144104, + "learning_rate": 4.68327062581204e-05, + "loss": 0.3618, + "step": 4680500 + }, + { + "epoch": 31.676320918146384, + "grad_norm": 0.34973055124282837, + "learning_rate": 4.6832367908185364e-05, + "loss": 0.3612, + "step": 4681000 + }, + { + "epoch": 31.67970441749675, + "grad_norm": 0.3752252459526062, + "learning_rate": 4.6832029558250326e-05, + "loss": 0.3584, + "step": 4681500 + }, + { + "epoch": 31.68308791684712, + "grad_norm": 0.364828884601593, + "learning_rate": 4.6831691208315295e-05, + "loss": 0.3619, + "step": 4682000 + }, + { + "epoch": 31.686471416197488, + "grad_norm": 0.4167463183403015, + "learning_rate": 4.683135285838025e-05, + "loss": 0.3609, + "step": 4682500 + }, + { + "epoch": 31.689854915547855, + "grad_norm": 0.3400552272796631, + "learning_rate": 4.683101450844521e-05, + "loss": 0.3607, + "step": 4683000 + }, + { + "epoch": 31.693238414898225, + "grad_norm": 0.3850339651107788, + "learning_rate": 4.683067615851018e-05, + "loss": 0.3591, + "step": 4683500 + }, + { + "epoch": 31.696621914248592, + "grad_norm": 0.3828616142272949, + "learning_rate": 4.683033780857514e-05, + "loss": 0.361, + "step": 4684000 + }, + { + "epoch": 31.700005413598962, + "grad_norm": 0.3609130084514618, + "learning_rate": 4.6829999458640105e-05, + "loss": 0.3619, + "step": 4684500 + }, + { + "epoch": 31.70338891294933, + "grad_norm": 0.3726107180118561, + "learning_rate": 4.682966110870507e-05, + "loss": 0.3617, + "step": 4685000 + }, + { + "epoch": 31.706772412299696, + "grad_norm": 0.4102441668510437, + "learning_rate": 4.6829322758770036e-05, + "loss": 0.36, + "step": 4685500 + }, + { + "epoch": 31.710155911650066, + "grad_norm": 0.3822396397590637, + "learning_rate": 4.6828984408835e-05, + "loss": 0.3606, + "step": 4686000 + }, + { + "epoch": 31.713539411000433, + "grad_norm": 0.40908774733543396, + "learning_rate": 4.682864605889996e-05, + "loss": 0.3601, + "step": 4686500 + }, + { + "epoch": 31.716922910350803, + "grad_norm": 0.34824812412261963, + "learning_rate": 4.682830770896492e-05, + "loss": 0.3598, + "step": 4687000 + }, + { + "epoch": 31.72030640970117, + "grad_norm": 0.3778131306171417, + "learning_rate": 4.6827969359029885e-05, + "loss": 0.3619, + "step": 4687500 + }, + { + "epoch": 31.723689909051537, + "grad_norm": 0.34503281116485596, + "learning_rate": 4.682763100909485e-05, + "loss": 0.3607, + "step": 4688000 + }, + { + "epoch": 31.727073408401907, + "grad_norm": 0.38387107849121094, + "learning_rate": 4.682729265915981e-05, + "loss": 0.3619, + "step": 4688500 + }, + { + "epoch": 31.730456907752274, + "grad_norm": 0.379212886095047, + "learning_rate": 4.682695430922477e-05, + "loss": 0.3605, + "step": 4689000 + }, + { + "epoch": 31.73384040710264, + "grad_norm": 0.37629860639572144, + "learning_rate": 4.682661595928974e-05, + "loss": 0.361, + "step": 4689500 + }, + { + "epoch": 31.73722390645301, + "grad_norm": 0.332742303609848, + "learning_rate": 4.68262776093547e-05, + "loss": 0.3589, + "step": 4690000 + }, + { + "epoch": 31.740607405803377, + "grad_norm": 0.3826378285884857, + "learning_rate": 4.6825939259419664e-05, + "loss": 0.3604, + "step": 4690500 + }, + { + "epoch": 31.743990905153748, + "grad_norm": 0.3938352167606354, + "learning_rate": 4.6825600909484627e-05, + "loss": 0.3603, + "step": 4691000 + }, + { + "epoch": 31.747374404504114, + "grad_norm": 0.45102402567863464, + "learning_rate": 4.6825262559549595e-05, + "loss": 0.3603, + "step": 4691500 + }, + { + "epoch": 31.75075790385448, + "grad_norm": 0.3734528422355652, + "learning_rate": 4.682492420961455e-05, + "loss": 0.3608, + "step": 4692000 + }, + { + "epoch": 31.75414140320485, + "grad_norm": 0.379263311624527, + "learning_rate": 4.682458585967951e-05, + "loss": 0.3602, + "step": 4692500 + }, + { + "epoch": 31.757524902555218, + "grad_norm": 0.40262335538864136, + "learning_rate": 4.682424750974448e-05, + "loss": 0.3591, + "step": 4693000 + }, + { + "epoch": 31.76090840190559, + "grad_norm": 0.38411325216293335, + "learning_rate": 4.6823909159809444e-05, + "loss": 0.3609, + "step": 4693500 + }, + { + "epoch": 31.764291901255955, + "grad_norm": 0.34137555956840515, + "learning_rate": 4.6823570809874406e-05, + "loss": 0.3612, + "step": 4694000 + }, + { + "epoch": 31.767675400606322, + "grad_norm": 0.4293920695781708, + "learning_rate": 4.682323245993937e-05, + "loss": 0.3608, + "step": 4694500 + }, + { + "epoch": 31.771058899956692, + "grad_norm": 0.4304783344268799, + "learning_rate": 4.682289411000434e-05, + "loss": 0.3611, + "step": 4695000 + }, + { + "epoch": 31.77444239930706, + "grad_norm": 0.3868955075740814, + "learning_rate": 4.68225557600693e-05, + "loss": 0.3601, + "step": 4695500 + }, + { + "epoch": 31.777825898657426, + "grad_norm": 0.35239407420158386, + "learning_rate": 4.682221741013426e-05, + "loss": 0.3616, + "step": 4696000 + }, + { + "epoch": 31.781209398007796, + "grad_norm": 0.35901278257369995, + "learning_rate": 4.6821879060199223e-05, + "loss": 0.361, + "step": 4696500 + }, + { + "epoch": 31.784592897358163, + "grad_norm": 0.3940626084804535, + "learning_rate": 4.6821540710264186e-05, + "loss": 0.361, + "step": 4697000 + }, + { + "epoch": 31.787976396708533, + "grad_norm": 0.4088931977748871, + "learning_rate": 4.682120236032915e-05, + "loss": 0.3593, + "step": 4697500 + }, + { + "epoch": 31.7913598960589, + "grad_norm": 0.38752543926239014, + "learning_rate": 4.682086401039411e-05, + "loss": 0.3604, + "step": 4698000 + }, + { + "epoch": 31.794743395409267, + "grad_norm": 0.3893696665763855, + "learning_rate": 4.682052566045907e-05, + "loss": 0.3588, + "step": 4698500 + }, + { + "epoch": 31.798126894759637, + "grad_norm": 0.34849750995635986, + "learning_rate": 4.682018731052404e-05, + "loss": 0.3597, + "step": 4699000 + }, + { + "epoch": 31.801510394110004, + "grad_norm": 0.32492223381996155, + "learning_rate": 4.6819848960589e-05, + "loss": 0.3603, + "step": 4699500 + }, + { + "epoch": 31.804893893460374, + "grad_norm": 0.38688844442367554, + "learning_rate": 4.6819510610653965e-05, + "loss": 0.3614, + "step": 4700000 + }, + { + "epoch": 31.80827739281074, + "grad_norm": 0.3697317838668823, + "learning_rate": 4.681917226071893e-05, + "loss": 0.3595, + "step": 4700500 + }, + { + "epoch": 31.811660892161107, + "grad_norm": 0.359293133020401, + "learning_rate": 4.6818833910783896e-05, + "loss": 0.36, + "step": 4701000 + }, + { + "epoch": 31.815044391511478, + "grad_norm": 0.39656931161880493, + "learning_rate": 4.681849556084885e-05, + "loss": 0.3615, + "step": 4701500 + }, + { + "epoch": 31.818427890861845, + "grad_norm": 0.3779048025608063, + "learning_rate": 4.6818157210913814e-05, + "loss": 0.36, + "step": 4702000 + }, + { + "epoch": 31.82181139021221, + "grad_norm": 0.42255547642707825, + "learning_rate": 4.681781886097878e-05, + "loss": 0.3608, + "step": 4702500 + }, + { + "epoch": 31.82519488956258, + "grad_norm": 0.36566025018692017, + "learning_rate": 4.6817480511043745e-05, + "loss": 0.3613, + "step": 4703000 + }, + { + "epoch": 31.82857838891295, + "grad_norm": 0.3470553755760193, + "learning_rate": 4.681714216110871e-05, + "loss": 0.361, + "step": 4703500 + }, + { + "epoch": 31.83196188826332, + "grad_norm": 0.37342017889022827, + "learning_rate": 4.681680381117367e-05, + "loss": 0.3595, + "step": 4704000 + }, + { + "epoch": 31.835345387613685, + "grad_norm": 0.3994550108909607, + "learning_rate": 4.681646546123864e-05, + "loss": 0.3606, + "step": 4704500 + }, + { + "epoch": 31.838728886964052, + "grad_norm": 0.36774635314941406, + "learning_rate": 4.68161271113036e-05, + "loss": 0.3609, + "step": 4705000 + }, + { + "epoch": 31.842112386314422, + "grad_norm": 0.37347596883773804, + "learning_rate": 4.681578876136856e-05, + "loss": 0.3613, + "step": 4705500 + }, + { + "epoch": 31.84549588566479, + "grad_norm": 0.36244910955429077, + "learning_rate": 4.6815450411433524e-05, + "loss": 0.36, + "step": 4706000 + }, + { + "epoch": 31.84887938501516, + "grad_norm": 0.3883986175060272, + "learning_rate": 4.6815112061498486e-05, + "loss": 0.3593, + "step": 4706500 + }, + { + "epoch": 31.852262884365526, + "grad_norm": 0.4014483094215393, + "learning_rate": 4.681477371156345e-05, + "loss": 0.3617, + "step": 4707000 + }, + { + "epoch": 31.855646383715893, + "grad_norm": 0.3901611268520355, + "learning_rate": 4.681443536162841e-05, + "loss": 0.3597, + "step": 4707500 + }, + { + "epoch": 31.859029883066263, + "grad_norm": 0.3857150375843048, + "learning_rate": 4.681409701169337e-05, + "loss": 0.3604, + "step": 4708000 + }, + { + "epoch": 31.86241338241663, + "grad_norm": 0.39421144127845764, + "learning_rate": 4.681375866175834e-05, + "loss": 0.3607, + "step": 4708500 + }, + { + "epoch": 31.865796881767, + "grad_norm": 0.3842255771160126, + "learning_rate": 4.6813420311823304e-05, + "loss": 0.3608, + "step": 4709000 + }, + { + "epoch": 31.869180381117367, + "grad_norm": 0.3913736045360565, + "learning_rate": 4.6813081961888266e-05, + "loss": 0.3614, + "step": 4709500 + }, + { + "epoch": 31.872563880467734, + "grad_norm": 0.328275591135025, + "learning_rate": 4.681274361195323e-05, + "loss": 0.3608, + "step": 4710000 + }, + { + "epoch": 31.875947379818104, + "grad_norm": 0.3807612359523773, + "learning_rate": 4.68124052620182e-05, + "loss": 0.3601, + "step": 4710500 + }, + { + "epoch": 31.87933087916847, + "grad_norm": 0.35134157538414, + "learning_rate": 4.681206691208316e-05, + "loss": 0.3597, + "step": 4711000 + }, + { + "epoch": 31.882714378518838, + "grad_norm": 0.34368059039115906, + "learning_rate": 4.6811728562148114e-05, + "loss": 0.3611, + "step": 4711500 + }, + { + "epoch": 31.886097877869208, + "grad_norm": 0.3971202075481415, + "learning_rate": 4.681139021221308e-05, + "loss": 0.3606, + "step": 4712000 + }, + { + "epoch": 31.889481377219575, + "grad_norm": 0.3558181822299957, + "learning_rate": 4.6811051862278045e-05, + "loss": 0.3599, + "step": 4712500 + }, + { + "epoch": 31.892864876569945, + "grad_norm": 0.3947175145149231, + "learning_rate": 4.681071351234301e-05, + "loss": 0.3594, + "step": 4713000 + }, + { + "epoch": 31.89624837592031, + "grad_norm": 0.4031308889389038, + "learning_rate": 4.681037516240797e-05, + "loss": 0.3608, + "step": 4713500 + }, + { + "epoch": 31.89963187527068, + "grad_norm": 0.3834698498249054, + "learning_rate": 4.681003681247293e-05, + "loss": 0.3594, + "step": 4714000 + }, + { + "epoch": 31.90301537462105, + "grad_norm": 0.35495537519454956, + "learning_rate": 4.68096984625379e-05, + "loss": 0.3596, + "step": 4714500 + }, + { + "epoch": 31.906398873971415, + "grad_norm": 0.3593955338001251, + "learning_rate": 4.680936011260286e-05, + "loss": 0.361, + "step": 4715000 + }, + { + "epoch": 31.909782373321786, + "grad_norm": 0.37075990438461304, + "learning_rate": 4.6809021762667825e-05, + "loss": 0.3607, + "step": 4715500 + }, + { + "epoch": 31.913165872672153, + "grad_norm": 0.3548746109008789, + "learning_rate": 4.680868341273279e-05, + "loss": 0.359, + "step": 4716000 + }, + { + "epoch": 31.91654937202252, + "grad_norm": 0.36042797565460205, + "learning_rate": 4.680834506279775e-05, + "loss": 0.3597, + "step": 4716500 + }, + { + "epoch": 31.91993287137289, + "grad_norm": 0.40099388360977173, + "learning_rate": 4.680800671286271e-05, + "loss": 0.3614, + "step": 4717000 + }, + { + "epoch": 31.923316370723256, + "grad_norm": 0.4233962893486023, + "learning_rate": 4.680766836292767e-05, + "loss": 0.3597, + "step": 4717500 + }, + { + "epoch": 31.926699870073627, + "grad_norm": 0.36417707800865173, + "learning_rate": 4.680733001299264e-05, + "loss": 0.3604, + "step": 4718000 + }, + { + "epoch": 31.930083369423993, + "grad_norm": 0.3779143989086151, + "learning_rate": 4.6806991663057604e-05, + "loss": 0.3596, + "step": 4718500 + }, + { + "epoch": 31.93346686877436, + "grad_norm": 0.35195863246917725, + "learning_rate": 4.6806653313122566e-05, + "loss": 0.36, + "step": 4719000 + }, + { + "epoch": 31.93685036812473, + "grad_norm": 0.39064928889274597, + "learning_rate": 4.680631496318753e-05, + "loss": 0.3608, + "step": 4719500 + }, + { + "epoch": 31.940233867475097, + "grad_norm": 0.4043065309524536, + "learning_rate": 4.68059766132525e-05, + "loss": 0.3606, + "step": 4720000 + }, + { + "epoch": 31.943617366825464, + "grad_norm": 0.391126811504364, + "learning_rate": 4.680563826331746e-05, + "loss": 0.3608, + "step": 4720500 + }, + { + "epoch": 31.947000866175834, + "grad_norm": 0.3295779526233673, + "learning_rate": 4.6805299913382415e-05, + "loss": 0.3599, + "step": 4721000 + }, + { + "epoch": 31.9503843655262, + "grad_norm": 0.3799513876438141, + "learning_rate": 4.6804961563447384e-05, + "loss": 0.3609, + "step": 4721500 + }, + { + "epoch": 31.95376786487657, + "grad_norm": 0.3539388179779053, + "learning_rate": 4.6804623213512346e-05, + "loss": 0.3592, + "step": 4722000 + }, + { + "epoch": 31.957151364226938, + "grad_norm": 0.35162782669067383, + "learning_rate": 4.680428486357731e-05, + "loss": 0.3612, + "step": 4722500 + }, + { + "epoch": 31.960534863577305, + "grad_norm": 0.35630112886428833, + "learning_rate": 4.680394651364227e-05, + "loss": 0.3607, + "step": 4723000 + }, + { + "epoch": 31.963918362927675, + "grad_norm": 0.3836091458797455, + "learning_rate": 4.680360816370723e-05, + "loss": 0.3619, + "step": 4723500 + }, + { + "epoch": 31.967301862278042, + "grad_norm": 0.3872407078742981, + "learning_rate": 4.68032698137722e-05, + "loss": 0.3602, + "step": 4724000 + }, + { + "epoch": 31.970685361628412, + "grad_norm": 0.3915506601333618, + "learning_rate": 4.680293146383716e-05, + "loss": 0.3602, + "step": 4724500 + }, + { + "epoch": 31.97406886097878, + "grad_norm": 0.36311158537864685, + "learning_rate": 4.6802593113902125e-05, + "loss": 0.3592, + "step": 4725000 + }, + { + "epoch": 31.977452360329146, + "grad_norm": 0.31301188468933105, + "learning_rate": 4.680225476396709e-05, + "loss": 0.362, + "step": 4725500 + }, + { + "epoch": 31.980835859679516, + "grad_norm": 0.37805527448654175, + "learning_rate": 4.680191641403205e-05, + "loss": 0.3605, + "step": 4726000 + }, + { + "epoch": 31.984219359029883, + "grad_norm": 0.40406933426856995, + "learning_rate": 4.680157806409701e-05, + "loss": 0.3609, + "step": 4726500 + }, + { + "epoch": 31.98760285838025, + "grad_norm": 0.3633565604686737, + "learning_rate": 4.6801239714161974e-05, + "loss": 0.3615, + "step": 4727000 + }, + { + "epoch": 31.99098635773062, + "grad_norm": 0.37993454933166504, + "learning_rate": 4.680090136422694e-05, + "loss": 0.3605, + "step": 4727500 + }, + { + "epoch": 31.994369857080986, + "grad_norm": 0.40591785311698914, + "learning_rate": 4.6800563014291905e-05, + "loss": 0.3599, + "step": 4728000 + }, + { + "epoch": 31.997753356431357, + "grad_norm": 0.3888760805130005, + "learning_rate": 4.680022466435687e-05, + "loss": 0.3603, + "step": 4728500 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.8626005641265935, + "eval_loss": 0.5565376281738281, + "eval_runtime": 3342.5449, + "eval_samples_per_second": 86.983, + "eval_steps_per_second": 5.437, + "step": 4728832 + }, + { + "epoch": 32.00113685578172, + "grad_norm": 0.39983639121055603, + "learning_rate": 4.679988631442183e-05, + "loss": 0.3589, + "step": 4729000 + }, + { + "epoch": 32.004520355132094, + "grad_norm": 0.34574589133262634, + "learning_rate": 4.67995479644868e-05, + "loss": 0.3593, + "step": 4729500 + }, + { + "epoch": 32.00790385448246, + "grad_norm": 0.37997013330459595, + "learning_rate": 4.679920961455176e-05, + "loss": 0.3575, + "step": 4730000 + }, + { + "epoch": 32.01128735383283, + "grad_norm": 0.39169222116470337, + "learning_rate": 4.6798871264616715e-05, + "loss": 0.3577, + "step": 4730500 + }, + { + "epoch": 32.0146708531832, + "grad_norm": 0.36011025309562683, + "learning_rate": 4.679853291468168e-05, + "loss": 0.3582, + "step": 4731000 + }, + { + "epoch": 32.01805435253356, + "grad_norm": 0.3974354565143585, + "learning_rate": 4.6798194564746646e-05, + "loss": 0.3589, + "step": 4731500 + }, + { + "epoch": 32.02143785188393, + "grad_norm": 0.38111263513565063, + "learning_rate": 4.679785621481161e-05, + "loss": 0.3583, + "step": 4732000 + }, + { + "epoch": 32.0248213512343, + "grad_norm": 0.3499109148979187, + "learning_rate": 4.679751786487657e-05, + "loss": 0.3584, + "step": 4732500 + }, + { + "epoch": 32.02820485058467, + "grad_norm": 0.4800916612148285, + "learning_rate": 4.679717951494153e-05, + "loss": 0.3591, + "step": 4733000 + }, + { + "epoch": 32.031588349935035, + "grad_norm": 0.3772190809249878, + "learning_rate": 4.67968411650065e-05, + "loss": 0.359, + "step": 4733500 + }, + { + "epoch": 32.034971849285405, + "grad_norm": 0.3727220892906189, + "learning_rate": 4.6796502815071464e-05, + "loss": 0.3599, + "step": 4734000 + }, + { + "epoch": 32.038355348635775, + "grad_norm": 0.41529178619384766, + "learning_rate": 4.6796164465136426e-05, + "loss": 0.3586, + "step": 4734500 + }, + { + "epoch": 32.04173884798614, + "grad_norm": 0.4112264811992645, + "learning_rate": 4.679582611520139e-05, + "loss": 0.3594, + "step": 4735000 + }, + { + "epoch": 32.04512234733651, + "grad_norm": 0.3342491686344147, + "learning_rate": 4.679548776526635e-05, + "loss": 0.3589, + "step": 4735500 + }, + { + "epoch": 32.04850584668688, + "grad_norm": 0.3887941241264343, + "learning_rate": 4.679514941533131e-05, + "loss": 0.3588, + "step": 4736000 + }, + { + "epoch": 32.05188934603724, + "grad_norm": 0.37229710817337036, + "learning_rate": 4.6794811065396274e-05, + "loss": 0.3602, + "step": 4736500 + }, + { + "epoch": 32.05527284538761, + "grad_norm": 0.46064630150794983, + "learning_rate": 4.679447271546124e-05, + "loss": 0.3597, + "step": 4737000 + }, + { + "epoch": 32.05865634473798, + "grad_norm": 0.39698526263237, + "learning_rate": 4.6794134365526205e-05, + "loss": 0.3597, + "step": 4737500 + }, + { + "epoch": 32.06203984408835, + "grad_norm": 0.3707379996776581, + "learning_rate": 4.679379601559117e-05, + "loss": 0.3585, + "step": 4738000 + }, + { + "epoch": 32.06542334343872, + "grad_norm": 0.4082753658294678, + "learning_rate": 4.679345766565613e-05, + "loss": 0.3571, + "step": 4738500 + }, + { + "epoch": 32.06880684278909, + "grad_norm": 0.3864918649196625, + "learning_rate": 4.67931193157211e-05, + "loss": 0.3588, + "step": 4739000 + }, + { + "epoch": 32.07219034213946, + "grad_norm": 0.36663374304771423, + "learning_rate": 4.679278096578606e-05, + "loss": 0.3582, + "step": 4739500 + }, + { + "epoch": 32.07557384148982, + "grad_norm": 0.37536200881004333, + "learning_rate": 4.6792442615851016e-05, + "loss": 0.3581, + "step": 4740000 + }, + { + "epoch": 32.07895734084019, + "grad_norm": 0.37068191170692444, + "learning_rate": 4.679210426591598e-05, + "loss": 0.3593, + "step": 4740500 + }, + { + "epoch": 32.08234084019056, + "grad_norm": 0.41226398944854736, + "learning_rate": 4.679176591598095e-05, + "loss": 0.3582, + "step": 4741000 + }, + { + "epoch": 32.085724339540924, + "grad_norm": 0.35647261142730713, + "learning_rate": 4.679142756604591e-05, + "loss": 0.3607, + "step": 4741500 + }, + { + "epoch": 32.089107838891294, + "grad_norm": 0.37242090702056885, + "learning_rate": 4.679108921611087e-05, + "loss": 0.3582, + "step": 4742000 + }, + { + "epoch": 32.092491338241665, + "grad_norm": 0.39995208382606506, + "learning_rate": 4.6790750866175833e-05, + "loss": 0.3591, + "step": 4742500 + }, + { + "epoch": 32.09587483759203, + "grad_norm": 0.3676217198371887, + "learning_rate": 4.67904125162408e-05, + "loss": 0.3603, + "step": 4743000 + }, + { + "epoch": 32.0992583369424, + "grad_norm": 0.37295204401016235, + "learning_rate": 4.6790074166305764e-05, + "loss": 0.3582, + "step": 4743500 + }, + { + "epoch": 32.10264183629277, + "grad_norm": 0.38864821195602417, + "learning_rate": 4.6789735816370727e-05, + "loss": 0.3596, + "step": 4744000 + }, + { + "epoch": 32.10602533564314, + "grad_norm": 0.3998364210128784, + "learning_rate": 4.678939746643569e-05, + "loss": 0.3579, + "step": 4744500 + }, + { + "epoch": 32.1094088349935, + "grad_norm": 0.44700887799263, + "learning_rate": 4.678905911650065e-05, + "loss": 0.3596, + "step": 4745000 + }, + { + "epoch": 32.11279233434387, + "grad_norm": 0.34449413418769836, + "learning_rate": 4.678872076656561e-05, + "loss": 0.3592, + "step": 4745500 + }, + { + "epoch": 32.11617583369424, + "grad_norm": 0.3522138297557831, + "learning_rate": 4.6788382416630575e-05, + "loss": 0.3591, + "step": 4746000 + }, + { + "epoch": 32.119559333044606, + "grad_norm": 0.4162333607673645, + "learning_rate": 4.6788044066695544e-05, + "loss": 0.3589, + "step": 4746500 + }, + { + "epoch": 32.122942832394976, + "grad_norm": 0.42545434832572937, + "learning_rate": 4.6787705716760506e-05, + "loss": 0.3604, + "step": 4747000 + }, + { + "epoch": 32.12632633174535, + "grad_norm": 0.3827410042285919, + "learning_rate": 4.678736736682547e-05, + "loss": 0.3586, + "step": 4747500 + }, + { + "epoch": 32.12970983109571, + "grad_norm": 0.3557436168193817, + "learning_rate": 4.678702901689043e-05, + "loss": 0.3601, + "step": 4748000 + }, + { + "epoch": 32.13309333044608, + "grad_norm": 0.3585735857486725, + "learning_rate": 4.67866906669554e-05, + "loss": 0.3587, + "step": 4748500 + }, + { + "epoch": 32.13647682979645, + "grad_norm": 0.37218043208122253, + "learning_rate": 4.678635231702036e-05, + "loss": 0.3584, + "step": 4749000 + }, + { + "epoch": 32.13986032914681, + "grad_norm": 0.37905794382095337, + "learning_rate": 4.678601396708532e-05, + "loss": 0.3594, + "step": 4749500 + }, + { + "epoch": 32.143243828497184, + "grad_norm": 0.34577351808547974, + "learning_rate": 4.678567561715028e-05, + "loss": 0.3602, + "step": 4750000 + }, + { + "epoch": 32.146627327847554, + "grad_norm": 0.3684971034526825, + "learning_rate": 4.678533726721525e-05, + "loss": 0.3599, + "step": 4750500 + }, + { + "epoch": 32.150010827197924, + "grad_norm": 0.3612573444843292, + "learning_rate": 4.678499891728021e-05, + "loss": 0.3586, + "step": 4751000 + }, + { + "epoch": 32.15339432654829, + "grad_norm": 0.3678892254829407, + "learning_rate": 4.678466056734517e-05, + "loss": 0.3591, + "step": 4751500 + }, + { + "epoch": 32.15677782589866, + "grad_norm": 0.404162734746933, + "learning_rate": 4.6784322217410134e-05, + "loss": 0.3605, + "step": 4752000 + }, + { + "epoch": 32.16016132524903, + "grad_norm": 0.41183698177337646, + "learning_rate": 4.67839838674751e-05, + "loss": 0.3615, + "step": 4752500 + }, + { + "epoch": 32.16354482459939, + "grad_norm": 0.3945522606372833, + "learning_rate": 4.6783645517540065e-05, + "loss": 0.3611, + "step": 4753000 + }, + { + "epoch": 32.16692832394976, + "grad_norm": 0.3847481906414032, + "learning_rate": 4.678330716760503e-05, + "loss": 0.3587, + "step": 4753500 + }, + { + "epoch": 32.17031182330013, + "grad_norm": 0.37639281153678894, + "learning_rate": 4.678296881766999e-05, + "loss": 0.3596, + "step": 4754000 + }, + { + "epoch": 32.173695322650495, + "grad_norm": 0.3757501542568207, + "learning_rate": 4.678263046773495e-05, + "loss": 0.3607, + "step": 4754500 + }, + { + "epoch": 32.177078822000865, + "grad_norm": 0.3711193799972534, + "learning_rate": 4.6782292117799914e-05, + "loss": 0.3588, + "step": 4755000 + }, + { + "epoch": 32.180462321351236, + "grad_norm": 0.35680362582206726, + "learning_rate": 4.6781953767864876e-05, + "loss": 0.3611, + "step": 4755500 + }, + { + "epoch": 32.1838458207016, + "grad_norm": 0.3715004026889801, + "learning_rate": 4.6781615417929845e-05, + "loss": 0.3595, + "step": 4756000 + }, + { + "epoch": 32.18722932005197, + "grad_norm": 0.3905080556869507, + "learning_rate": 4.678127706799481e-05, + "loss": 0.36, + "step": 4756500 + }, + { + "epoch": 32.19061281940234, + "grad_norm": 0.3584302067756653, + "learning_rate": 4.678093871805977e-05, + "loss": 0.359, + "step": 4757000 + }, + { + "epoch": 32.19399631875271, + "grad_norm": 0.37233132123947144, + "learning_rate": 4.678060036812473e-05, + "loss": 0.3586, + "step": 4757500 + }, + { + "epoch": 32.19737981810307, + "grad_norm": 0.3986111879348755, + "learning_rate": 4.67802620181897e-05, + "loss": 0.3601, + "step": 4758000 + }, + { + "epoch": 32.20076331745344, + "grad_norm": 0.39952966570854187, + "learning_rate": 4.677992366825466e-05, + "loss": 0.3602, + "step": 4758500 + }, + { + "epoch": 32.204146816803814, + "grad_norm": 0.3870437443256378, + "learning_rate": 4.677958531831962e-05, + "loss": 0.36, + "step": 4759000 + }, + { + "epoch": 32.20753031615418, + "grad_norm": 0.4017411172389984, + "learning_rate": 4.677924696838458e-05, + "loss": 0.3598, + "step": 4759500 + }, + { + "epoch": 32.21091381550455, + "grad_norm": 0.3741097152233124, + "learning_rate": 4.677890861844955e-05, + "loss": 0.359, + "step": 4760000 + }, + { + "epoch": 32.21429731485492, + "grad_norm": 0.34826579689979553, + "learning_rate": 4.677857026851451e-05, + "loss": 0.3603, + "step": 4760500 + }, + { + "epoch": 32.21768081420528, + "grad_norm": 0.3822405934333801, + "learning_rate": 4.677823191857947e-05, + "loss": 0.3602, + "step": 4761000 + }, + { + "epoch": 32.22106431355565, + "grad_norm": 0.42045676708221436, + "learning_rate": 4.6777893568644435e-05, + "loss": 0.3608, + "step": 4761500 + }, + { + "epoch": 32.22444781290602, + "grad_norm": 0.3952271044254303, + "learning_rate": 4.6777555218709404e-05, + "loss": 0.3608, + "step": 4762000 + }, + { + "epoch": 32.22783131225639, + "grad_norm": 0.34460505843162537, + "learning_rate": 4.6777216868774366e-05, + "loss": 0.3583, + "step": 4762500 + }, + { + "epoch": 32.231214811606755, + "grad_norm": 0.4037352204322815, + "learning_rate": 4.677687851883933e-05, + "loss": 0.3601, + "step": 4763000 + }, + { + "epoch": 32.234598310957125, + "grad_norm": 0.35192182660102844, + "learning_rate": 4.677654016890429e-05, + "loss": 0.3587, + "step": 4763500 + }, + { + "epoch": 32.237981810307495, + "grad_norm": 0.3796621561050415, + "learning_rate": 4.677620181896925e-05, + "loss": 0.36, + "step": 4764000 + }, + { + "epoch": 32.24136530965786, + "grad_norm": 0.4029964208602905, + "learning_rate": 4.6775863469034214e-05, + "loss": 0.3596, + "step": 4764500 + }, + { + "epoch": 32.24474880900823, + "grad_norm": 0.3702819049358368, + "learning_rate": 4.6775525119099176e-05, + "loss": 0.3596, + "step": 4765000 + }, + { + "epoch": 32.2481323083586, + "grad_norm": 0.3341525197029114, + "learning_rate": 4.6775186769164145e-05, + "loss": 0.3597, + "step": 4765500 + }, + { + "epoch": 32.25151580770896, + "grad_norm": 0.37364503741264343, + "learning_rate": 4.677484841922911e-05, + "loss": 0.3597, + "step": 4766000 + }, + { + "epoch": 32.25489930705933, + "grad_norm": 0.3837050795555115, + "learning_rate": 4.677451006929407e-05, + "loss": 0.3593, + "step": 4766500 + }, + { + "epoch": 32.2582828064097, + "grad_norm": 0.390402227640152, + "learning_rate": 4.677417171935903e-05, + "loss": 0.359, + "step": 4767000 + }, + { + "epoch": 32.261666305760066, + "grad_norm": 0.37794673442840576, + "learning_rate": 4.6773833369424e-05, + "loss": 0.359, + "step": 4767500 + }, + { + "epoch": 32.265049805110436, + "grad_norm": 0.38157832622528076, + "learning_rate": 4.677349501948896e-05, + "loss": 0.3594, + "step": 4768000 + }, + { + "epoch": 32.26843330446081, + "grad_norm": 0.3674872815608978, + "learning_rate": 4.677315666955392e-05, + "loss": 0.3611, + "step": 4768500 + }, + { + "epoch": 32.27181680381118, + "grad_norm": 0.3956109881401062, + "learning_rate": 4.677281831961888e-05, + "loss": 0.3602, + "step": 4769000 + }, + { + "epoch": 32.27520030316154, + "grad_norm": 0.37928852438926697, + "learning_rate": 4.677247996968385e-05, + "loss": 0.3594, + "step": 4769500 + }, + { + "epoch": 32.27858380251191, + "grad_norm": 0.38557955622673035, + "learning_rate": 4.677214161974881e-05, + "loss": 0.3595, + "step": 4770000 + }, + { + "epoch": 32.28196730186228, + "grad_norm": 0.38661181926727295, + "learning_rate": 4.677180326981377e-05, + "loss": 0.3581, + "step": 4770500 + }, + { + "epoch": 32.285350801212644, + "grad_norm": 0.38425928354263306, + "learning_rate": 4.6771464919878735e-05, + "loss": 0.3598, + "step": 4771000 + }, + { + "epoch": 32.288734300563014, + "grad_norm": 0.3826102614402771, + "learning_rate": 4.6771126569943704e-05, + "loss": 0.36, + "step": 4771500 + }, + { + "epoch": 32.292117799913385, + "grad_norm": 0.40827110409736633, + "learning_rate": 4.6770788220008666e-05, + "loss": 0.3592, + "step": 4772000 + }, + { + "epoch": 32.29550129926375, + "grad_norm": 0.3643607497215271, + "learning_rate": 4.677044987007363e-05, + "loss": 0.3614, + "step": 4772500 + }, + { + "epoch": 32.29888479861412, + "grad_norm": 0.3747618496417999, + "learning_rate": 4.677011152013859e-05, + "loss": 0.3588, + "step": 4773000 + }, + { + "epoch": 32.30226829796449, + "grad_norm": 0.35724756121635437, + "learning_rate": 4.676977317020355e-05, + "loss": 0.3606, + "step": 4773500 + }, + { + "epoch": 32.30565179731485, + "grad_norm": 0.3933773338794708, + "learning_rate": 4.6769434820268515e-05, + "loss": 0.3594, + "step": 4774000 + }, + { + "epoch": 32.30903529666522, + "grad_norm": 0.36762532591819763, + "learning_rate": 4.676909647033348e-05, + "loss": 0.3599, + "step": 4774500 + }, + { + "epoch": 32.31241879601559, + "grad_norm": 0.37173232436180115, + "learning_rate": 4.6768758120398446e-05, + "loss": 0.3596, + "step": 4775000 + }, + { + "epoch": 32.31580229536596, + "grad_norm": 0.39160171151161194, + "learning_rate": 4.676841977046341e-05, + "loss": 0.3604, + "step": 4775500 + }, + { + "epoch": 32.319185794716326, + "grad_norm": 0.3913881182670593, + "learning_rate": 4.676808142052837e-05, + "loss": 0.3596, + "step": 4776000 + }, + { + "epoch": 32.322569294066696, + "grad_norm": 0.37087172269821167, + "learning_rate": 4.676774307059333e-05, + "loss": 0.3595, + "step": 4776500 + }, + { + "epoch": 32.325952793417066, + "grad_norm": 0.4002458155155182, + "learning_rate": 4.6767404720658294e-05, + "loss": 0.3597, + "step": 4777000 + }, + { + "epoch": 32.32933629276743, + "grad_norm": 0.392475962638855, + "learning_rate": 4.676706637072326e-05, + "loss": 0.3601, + "step": 4777500 + }, + { + "epoch": 32.3327197921178, + "grad_norm": 0.37204068899154663, + "learning_rate": 4.676672802078822e-05, + "loss": 0.3593, + "step": 4778000 + }, + { + "epoch": 32.33610329146817, + "grad_norm": 0.35469821095466614, + "learning_rate": 4.676638967085318e-05, + "loss": 0.3597, + "step": 4778500 + }, + { + "epoch": 32.33948679081853, + "grad_norm": 0.4092603027820587, + "learning_rate": 4.676605132091815e-05, + "loss": 0.3582, + "step": 4779000 + }, + { + "epoch": 32.342870290168904, + "grad_norm": 0.36772671341896057, + "learning_rate": 4.676571297098311e-05, + "loss": 0.359, + "step": 4779500 + }, + { + "epoch": 32.346253789519274, + "grad_norm": 0.3744930922985077, + "learning_rate": 4.6765374621048074e-05, + "loss": 0.3605, + "step": 4780000 + }, + { + "epoch": 32.34963728886964, + "grad_norm": 0.4097921550273895, + "learning_rate": 4.6765036271113036e-05, + "loss": 0.3601, + "step": 4780500 + }, + { + "epoch": 32.35302078822001, + "grad_norm": 0.3940643072128296, + "learning_rate": 4.6764697921178005e-05, + "loss": 0.36, + "step": 4781000 + }, + { + "epoch": 32.35640428757038, + "grad_norm": 0.37415140867233276, + "learning_rate": 4.676435957124297e-05, + "loss": 0.3594, + "step": 4781500 + }, + { + "epoch": 32.35978778692075, + "grad_norm": 0.3879052400588989, + "learning_rate": 4.676402122130793e-05, + "loss": 0.3597, + "step": 4782000 + }, + { + "epoch": 32.36317128627111, + "grad_norm": 0.4095955193042755, + "learning_rate": 4.676368287137289e-05, + "loss": 0.3587, + "step": 4782500 + }, + { + "epoch": 32.36655478562148, + "grad_norm": 0.3983441889286041, + "learning_rate": 4.676334452143785e-05, + "loss": 0.36, + "step": 4783000 + }, + { + "epoch": 32.36993828497185, + "grad_norm": 0.38272032141685486, + "learning_rate": 4.6763006171502816e-05, + "loss": 0.3613, + "step": 4783500 + }, + { + "epoch": 32.373321784322215, + "grad_norm": 0.3874930143356323, + "learning_rate": 4.676266782156778e-05, + "loss": 0.3589, + "step": 4784000 + }, + { + "epoch": 32.376705283672585, + "grad_norm": 0.3716520369052887, + "learning_rate": 4.6762329471632747e-05, + "loss": 0.3599, + "step": 4784500 + }, + { + "epoch": 32.380088783022956, + "grad_norm": 0.3528953492641449, + "learning_rate": 4.676199112169771e-05, + "loss": 0.3598, + "step": 4785000 + }, + { + "epoch": 32.38347228237332, + "grad_norm": 0.3771120309829712, + "learning_rate": 4.676165277176267e-05, + "loss": 0.3577, + "step": 4785500 + }, + { + "epoch": 32.38685578172369, + "grad_norm": 0.37628158926963806, + "learning_rate": 4.676131442182763e-05, + "loss": 0.3605, + "step": 4786000 + }, + { + "epoch": 32.39023928107406, + "grad_norm": 0.37140750885009766, + "learning_rate": 4.6760976071892595e-05, + "loss": 0.3598, + "step": 4786500 + }, + { + "epoch": 32.39362278042443, + "grad_norm": 0.40377941727638245, + "learning_rate": 4.6760637721957564e-05, + "loss": 0.3595, + "step": 4787000 + }, + { + "epoch": 32.39700627977479, + "grad_norm": 0.3502059578895569, + "learning_rate": 4.676029937202252e-05, + "loss": 0.3597, + "step": 4787500 + }, + { + "epoch": 32.40038977912516, + "grad_norm": 0.3411078155040741, + "learning_rate": 4.675996102208748e-05, + "loss": 0.3597, + "step": 4788000 + }, + { + "epoch": 32.40377327847553, + "grad_norm": 0.3946812152862549, + "learning_rate": 4.675962267215245e-05, + "loss": 0.36, + "step": 4788500 + }, + { + "epoch": 32.4071567778259, + "grad_norm": 0.39661461114883423, + "learning_rate": 4.675928432221741e-05, + "loss": 0.3618, + "step": 4789000 + }, + { + "epoch": 32.41054027717627, + "grad_norm": 0.40929490327835083, + "learning_rate": 4.6758945972282375e-05, + "loss": 0.3597, + "step": 4789500 + }, + { + "epoch": 32.41392377652664, + "grad_norm": 0.37436795234680176, + "learning_rate": 4.675860762234734e-05, + "loss": 0.3599, + "step": 4790000 + }, + { + "epoch": 32.417307275877, + "grad_norm": 0.3523145914077759, + "learning_rate": 4.6758269272412306e-05, + "loss": 0.3591, + "step": 4790500 + }, + { + "epoch": 32.42069077522737, + "grad_norm": 0.3629417419433594, + "learning_rate": 4.675793092247727e-05, + "loss": 0.3609, + "step": 4791000 + }, + { + "epoch": 32.42407427457774, + "grad_norm": 0.42778724431991577, + "learning_rate": 4.675759257254223e-05, + "loss": 0.3592, + "step": 4791500 + }, + { + "epoch": 32.427457773928104, + "grad_norm": 0.3381754457950592, + "learning_rate": 4.675725422260719e-05, + "loss": 0.3603, + "step": 4792000 + }, + { + "epoch": 32.430841273278475, + "grad_norm": 0.3926127851009369, + "learning_rate": 4.6756915872672154e-05, + "loss": 0.3599, + "step": 4792500 + }, + { + "epoch": 32.434224772628845, + "grad_norm": 0.4157543182373047, + "learning_rate": 4.6756577522737116e-05, + "loss": 0.3611, + "step": 4793000 + }, + { + "epoch": 32.437608271979215, + "grad_norm": 0.3517511785030365, + "learning_rate": 4.675623917280208e-05, + "loss": 0.3607, + "step": 4793500 + }, + { + "epoch": 32.44099177132958, + "grad_norm": 0.377532035112381, + "learning_rate": 4.675590082286704e-05, + "loss": 0.3587, + "step": 4794000 + }, + { + "epoch": 32.44437527067995, + "grad_norm": 0.45183494687080383, + "learning_rate": 4.675556247293201e-05, + "loss": 0.3605, + "step": 4794500 + }, + { + "epoch": 32.44775877003032, + "grad_norm": 0.4093371331691742, + "learning_rate": 4.675522412299697e-05, + "loss": 0.3611, + "step": 4795000 + }, + { + "epoch": 32.45114226938068, + "grad_norm": 0.38602185249328613, + "learning_rate": 4.6754885773061934e-05, + "loss": 0.3602, + "step": 4795500 + }, + { + "epoch": 32.45452576873105, + "grad_norm": 0.3903023600578308, + "learning_rate": 4.6754547423126896e-05, + "loss": 0.3591, + "step": 4796000 + }, + { + "epoch": 32.45790926808142, + "grad_norm": 0.3937007486820221, + "learning_rate": 4.6754209073191865e-05, + "loss": 0.3576, + "step": 4796500 + }, + { + "epoch": 32.461292767431786, + "grad_norm": 0.37577611207962036, + "learning_rate": 4.675387072325682e-05, + "loss": 0.3592, + "step": 4797000 + }, + { + "epoch": 32.464676266782156, + "grad_norm": 0.3842858076095581, + "learning_rate": 4.675353237332178e-05, + "loss": 0.3596, + "step": 4797500 + }, + { + "epoch": 32.46805976613253, + "grad_norm": 0.3671128451824188, + "learning_rate": 4.675319402338675e-05, + "loss": 0.3595, + "step": 4798000 + }, + { + "epoch": 32.47144326548289, + "grad_norm": 0.40977221727371216, + "learning_rate": 4.675285567345171e-05, + "loss": 0.3591, + "step": 4798500 + }, + { + "epoch": 32.47482676483326, + "grad_norm": 0.3818470239639282, + "learning_rate": 4.6752517323516675e-05, + "loss": 0.3584, + "step": 4799000 + }, + { + "epoch": 32.47821026418363, + "grad_norm": 0.429762065410614, + "learning_rate": 4.675217897358164e-05, + "loss": 0.3608, + "step": 4799500 + }, + { + "epoch": 32.481593763534, + "grad_norm": 0.3330877125263214, + "learning_rate": 4.6751840623646606e-05, + "loss": 0.3603, + "step": 4800000 + }, + { + "epoch": 32.484977262884364, + "grad_norm": 0.3307543396949768, + "learning_rate": 4.675150227371157e-05, + "loss": 0.3603, + "step": 4800500 + }, + { + "epoch": 32.488360762234734, + "grad_norm": 0.4174194037914276, + "learning_rate": 4.675116392377653e-05, + "loss": 0.3612, + "step": 4801000 + }, + { + "epoch": 32.491744261585104, + "grad_norm": 0.39622727036476135, + "learning_rate": 4.675082557384149e-05, + "loss": 0.3587, + "step": 4801500 + }, + { + "epoch": 32.49512776093547, + "grad_norm": 0.38315004110336304, + "learning_rate": 4.6750487223906455e-05, + "loss": 0.3598, + "step": 4802000 + }, + { + "epoch": 32.49851126028584, + "grad_norm": 0.3557627201080322, + "learning_rate": 4.675014887397142e-05, + "loss": 0.3595, + "step": 4802500 + }, + { + "epoch": 32.50189475963621, + "grad_norm": 0.3378044366836548, + "learning_rate": 4.674981052403638e-05, + "loss": 0.3598, + "step": 4803000 + }, + { + "epoch": 32.50527825898657, + "grad_norm": 0.3784140348434448, + "learning_rate": 4.674947217410134e-05, + "loss": 0.3594, + "step": 4803500 + }, + { + "epoch": 32.50866175833694, + "grad_norm": 0.3857872188091278, + "learning_rate": 4.674913382416631e-05, + "loss": 0.3604, + "step": 4804000 + }, + { + "epoch": 32.51204525768731, + "grad_norm": 0.37597906589508057, + "learning_rate": 4.674879547423127e-05, + "loss": 0.3583, + "step": 4804500 + }, + { + "epoch": 32.515428757037675, + "grad_norm": 0.3755928575992584, + "learning_rate": 4.6748457124296234e-05, + "loss": 0.3581, + "step": 4805000 + }, + { + "epoch": 32.518812256388046, + "grad_norm": 0.38332635164260864, + "learning_rate": 4.6748118774361196e-05, + "loss": 0.3601, + "step": 4805500 + }, + { + "epoch": 32.522195755738416, + "grad_norm": 0.40942707657814026, + "learning_rate": 4.6747780424426165e-05, + "loss": 0.361, + "step": 4806000 + }, + { + "epoch": 32.525579255088786, + "grad_norm": 0.3584476709365845, + "learning_rate": 4.674744207449112e-05, + "loss": 0.3581, + "step": 4806500 + }, + { + "epoch": 32.52896275443915, + "grad_norm": 0.363699734210968, + "learning_rate": 4.674710372455608e-05, + "loss": 0.3595, + "step": 4807000 + }, + { + "epoch": 32.53234625378952, + "grad_norm": 0.38536036014556885, + "learning_rate": 4.674676537462105e-05, + "loss": 0.3594, + "step": 4807500 + }, + { + "epoch": 32.53572975313989, + "grad_norm": 0.36360061168670654, + "learning_rate": 4.6746427024686014e-05, + "loss": 0.3588, + "step": 4808000 + }, + { + "epoch": 32.53911325249025, + "grad_norm": 0.3698710799217224, + "learning_rate": 4.6746088674750976e-05, + "loss": 0.3603, + "step": 4808500 + }, + { + "epoch": 32.54249675184062, + "grad_norm": 0.4059138596057892, + "learning_rate": 4.674575032481594e-05, + "loss": 0.36, + "step": 4809000 + }, + { + "epoch": 32.545880251190994, + "grad_norm": 0.4048005938529968, + "learning_rate": 4.674541197488091e-05, + "loss": 0.3596, + "step": 4809500 + }, + { + "epoch": 32.54926375054136, + "grad_norm": 0.37766924500465393, + "learning_rate": 4.674507362494587e-05, + "loss": 0.3606, + "step": 4810000 + }, + { + "epoch": 32.55264724989173, + "grad_norm": 0.3820647597312927, + "learning_rate": 4.674473527501083e-05, + "loss": 0.361, + "step": 4810500 + }, + { + "epoch": 32.5560307492421, + "grad_norm": 0.338340163230896, + "learning_rate": 4.674439692507579e-05, + "loss": 0.3601, + "step": 4811000 + }, + { + "epoch": 32.55941424859246, + "grad_norm": 0.36598479747772217, + "learning_rate": 4.6744058575140755e-05, + "loss": 0.3583, + "step": 4811500 + }, + { + "epoch": 32.56279774794283, + "grad_norm": 0.34915420413017273, + "learning_rate": 4.674372022520572e-05, + "loss": 0.3599, + "step": 4812000 + }, + { + "epoch": 32.5661812472932, + "grad_norm": 0.3632654845714569, + "learning_rate": 4.674338187527068e-05, + "loss": 0.3617, + "step": 4812500 + }, + { + "epoch": 32.56956474664357, + "grad_norm": 0.40251436829566956, + "learning_rate": 4.674304352533564e-05, + "loss": 0.3595, + "step": 4813000 + }, + { + "epoch": 32.572948245993935, + "grad_norm": 0.39226406812667847, + "learning_rate": 4.674270517540061e-05, + "loss": 0.3579, + "step": 4813500 + }, + { + "epoch": 32.576331745344305, + "grad_norm": 0.3955608606338501, + "learning_rate": 4.674236682546557e-05, + "loss": 0.3601, + "step": 4814000 + }, + { + "epoch": 32.579715244694675, + "grad_norm": 0.3937000632286072, + "learning_rate": 4.6742028475530535e-05, + "loss": 0.3597, + "step": 4814500 + }, + { + "epoch": 32.58309874404504, + "grad_norm": 0.38861674070358276, + "learning_rate": 4.67416901255955e-05, + "loss": 0.3599, + "step": 4815000 + }, + { + "epoch": 32.58648224339541, + "grad_norm": 0.40989431738853455, + "learning_rate": 4.6741351775660466e-05, + "loss": 0.3584, + "step": 4815500 + }, + { + "epoch": 32.58986574274578, + "grad_norm": 0.3231464624404907, + "learning_rate": 4.674101342572542e-05, + "loss": 0.3611, + "step": 4816000 + }, + { + "epoch": 32.59324924209614, + "grad_norm": 0.3813242018222809, + "learning_rate": 4.674067507579038e-05, + "loss": 0.3609, + "step": 4816500 + }, + { + "epoch": 32.59663274144651, + "grad_norm": 0.3899689316749573, + "learning_rate": 4.674033672585535e-05, + "loss": 0.36, + "step": 4817000 + }, + { + "epoch": 32.60001624079688, + "grad_norm": 0.36689555644989014, + "learning_rate": 4.6739998375920314e-05, + "loss": 0.361, + "step": 4817500 + }, + { + "epoch": 32.60339974014725, + "grad_norm": 0.40323251485824585, + "learning_rate": 4.6739660025985276e-05, + "loss": 0.3602, + "step": 4818000 + }, + { + "epoch": 32.60678323949762, + "grad_norm": 0.388261079788208, + "learning_rate": 4.673932167605024e-05, + "loss": 0.3607, + "step": 4818500 + }, + { + "epoch": 32.61016673884799, + "grad_norm": 0.3827062249183655, + "learning_rate": 4.673898332611521e-05, + "loss": 0.3599, + "step": 4819000 + }, + { + "epoch": 32.61355023819836, + "grad_norm": 0.37925106287002563, + "learning_rate": 4.673864497618017e-05, + "loss": 0.362, + "step": 4819500 + }, + { + "epoch": 32.61693373754872, + "grad_norm": 0.35628315806388855, + "learning_rate": 4.673830662624513e-05, + "loss": 0.3597, + "step": 4820000 + }, + { + "epoch": 32.62031723689909, + "grad_norm": 0.37639716267585754, + "learning_rate": 4.6737968276310094e-05, + "loss": 0.3591, + "step": 4820500 + }, + { + "epoch": 32.62370073624946, + "grad_norm": 0.37759920954704285, + "learning_rate": 4.6737629926375056e-05, + "loss": 0.3618, + "step": 4821000 + }, + { + "epoch": 32.627084235599824, + "grad_norm": 0.3671201765537262, + "learning_rate": 4.673729157644002e-05, + "loss": 0.3609, + "step": 4821500 + }, + { + "epoch": 32.630467734950194, + "grad_norm": 0.32618698477745056, + "learning_rate": 4.673695322650498e-05, + "loss": 0.36, + "step": 4822000 + }, + { + "epoch": 32.633851234300565, + "grad_norm": 0.3773176074028015, + "learning_rate": 4.673661487656994e-05, + "loss": 0.3591, + "step": 4822500 + }, + { + "epoch": 32.63723473365093, + "grad_norm": 0.39158689975738525, + "learning_rate": 4.673627652663491e-05, + "loss": 0.3604, + "step": 4823000 + }, + { + "epoch": 32.6406182330013, + "grad_norm": 0.38822489976882935, + "learning_rate": 4.673593817669987e-05, + "loss": 0.3607, + "step": 4823500 + }, + { + "epoch": 32.64400173235167, + "grad_norm": 0.36614277958869934, + "learning_rate": 4.6735599826764835e-05, + "loss": 0.3594, + "step": 4824000 + }, + { + "epoch": 32.64738523170204, + "grad_norm": 0.39221155643463135, + "learning_rate": 4.67352614768298e-05, + "loss": 0.3608, + "step": 4824500 + }, + { + "epoch": 32.6507687310524, + "grad_norm": 0.33508527278900146, + "learning_rate": 4.6734923126894766e-05, + "loss": 0.3603, + "step": 4825000 + }, + { + "epoch": 32.65415223040277, + "grad_norm": 0.3844468295574188, + "learning_rate": 4.673458477695973e-05, + "loss": 0.3604, + "step": 4825500 + }, + { + "epoch": 32.65753572975314, + "grad_norm": 0.35471558570861816, + "learning_rate": 4.6734246427024684e-05, + "loss": 0.3601, + "step": 4826000 + }, + { + "epoch": 32.660919229103506, + "grad_norm": 0.3607887029647827, + "learning_rate": 4.673390807708965e-05, + "loss": 0.3592, + "step": 4826500 + }, + { + "epoch": 32.664302728453876, + "grad_norm": 0.3825249671936035, + "learning_rate": 4.6733569727154615e-05, + "loss": 0.3613, + "step": 4827000 + }, + { + "epoch": 32.667686227804246, + "grad_norm": 0.3158179819583893, + "learning_rate": 4.673323137721958e-05, + "loss": 0.3599, + "step": 4827500 + }, + { + "epoch": 32.67106972715461, + "grad_norm": 0.37185347080230713, + "learning_rate": 4.673289302728454e-05, + "loss": 0.3614, + "step": 4828000 + }, + { + "epoch": 32.67445322650498, + "grad_norm": 0.3913642168045044, + "learning_rate": 4.673255467734951e-05, + "loss": 0.3598, + "step": 4828500 + }, + { + "epoch": 32.67783672585535, + "grad_norm": 0.36755040287971497, + "learning_rate": 4.673221632741447e-05, + "loss": 0.358, + "step": 4829000 + }, + { + "epoch": 32.68122022520571, + "grad_norm": 0.3860124349594116, + "learning_rate": 4.673187797747943e-05, + "loss": 0.36, + "step": 4829500 + }, + { + "epoch": 32.684603724556084, + "grad_norm": 0.4249396026134491, + "learning_rate": 4.6731539627544394e-05, + "loss": 0.3605, + "step": 4830000 + }, + { + "epoch": 32.687987223906454, + "grad_norm": 0.39165258407592773, + "learning_rate": 4.6731201277609357e-05, + "loss": 0.3593, + "step": 4830500 + }, + { + "epoch": 32.691370723256824, + "grad_norm": 0.39184510707855225, + "learning_rate": 4.673086292767432e-05, + "loss": 0.3595, + "step": 4831000 + }, + { + "epoch": 32.69475422260719, + "grad_norm": 0.3789173662662506, + "learning_rate": 4.673052457773928e-05, + "loss": 0.3587, + "step": 4831500 + }, + { + "epoch": 32.69813772195756, + "grad_norm": 0.3882516920566559, + "learning_rate": 4.673018622780424e-05, + "loss": 0.3605, + "step": 4832000 + }, + { + "epoch": 32.70152122130793, + "grad_norm": 0.39031362533569336, + "learning_rate": 4.672984787786921e-05, + "loss": 0.36, + "step": 4832500 + }, + { + "epoch": 32.70490472065829, + "grad_norm": 0.3862682282924652, + "learning_rate": 4.6729509527934174e-05, + "loss": 0.3595, + "step": 4833000 + }, + { + "epoch": 32.70828822000866, + "grad_norm": 0.3593071699142456, + "learning_rate": 4.6729171177999136e-05, + "loss": 0.36, + "step": 4833500 + }, + { + "epoch": 32.71167171935903, + "grad_norm": 0.36724409461021423, + "learning_rate": 4.67288328280641e-05, + "loss": 0.3619, + "step": 4834000 + }, + { + "epoch": 32.715055218709395, + "grad_norm": 0.36828306317329407, + "learning_rate": 4.672849447812907e-05, + "loss": 0.3615, + "step": 4834500 + }, + { + "epoch": 32.718438718059765, + "grad_norm": 0.36529555916786194, + "learning_rate": 4.672815612819403e-05, + "loss": 0.3605, + "step": 4835000 + }, + { + "epoch": 32.721822217410136, + "grad_norm": 0.36009782552719116, + "learning_rate": 4.6727817778258985e-05, + "loss": 0.3617, + "step": 4835500 + }, + { + "epoch": 32.7252057167605, + "grad_norm": 0.3862138092517853, + "learning_rate": 4.6727479428323953e-05, + "loss": 0.36, + "step": 4836000 + }, + { + "epoch": 32.72858921611087, + "grad_norm": 0.45396357774734497, + "learning_rate": 4.6727141078388916e-05, + "loss": 0.3611, + "step": 4836500 + }, + { + "epoch": 32.73197271546124, + "grad_norm": 0.3386971354484558, + "learning_rate": 4.672680272845388e-05, + "loss": 0.3605, + "step": 4837000 + }, + { + "epoch": 32.73535621481161, + "grad_norm": 0.39177754521369934, + "learning_rate": 4.672646437851884e-05, + "loss": 0.3595, + "step": 4837500 + }, + { + "epoch": 32.73873971416197, + "grad_norm": 0.3584165871143341, + "learning_rate": 4.672612602858381e-05, + "loss": 0.3598, + "step": 4838000 + }, + { + "epoch": 32.74212321351234, + "grad_norm": 0.3619323968887329, + "learning_rate": 4.672578767864877e-05, + "loss": 0.3605, + "step": 4838500 + }, + { + "epoch": 32.74550671286271, + "grad_norm": 0.4178721010684967, + "learning_rate": 4.672544932871373e-05, + "loss": 0.3603, + "step": 4839000 + }, + { + "epoch": 32.74889021221308, + "grad_norm": 0.34086382389068604, + "learning_rate": 4.6725110978778695e-05, + "loss": 0.3589, + "step": 4839500 + }, + { + "epoch": 32.75227371156345, + "grad_norm": 0.36982759833335876, + "learning_rate": 4.672477262884366e-05, + "loss": 0.3608, + "step": 4840000 + }, + { + "epoch": 32.75565721091382, + "grad_norm": 0.34839120507240295, + "learning_rate": 4.672443427890862e-05, + "loss": 0.361, + "step": 4840500 + }, + { + "epoch": 32.75904071026418, + "grad_norm": 0.39314019680023193, + "learning_rate": 4.672409592897358e-05, + "loss": 0.3602, + "step": 4841000 + }, + { + "epoch": 32.76242420961455, + "grad_norm": 0.3656735122203827, + "learning_rate": 4.6723757579038544e-05, + "loss": 0.3606, + "step": 4841500 + }, + { + "epoch": 32.76580770896492, + "grad_norm": 0.3888089954853058, + "learning_rate": 4.672341922910351e-05, + "loss": 0.3609, + "step": 4842000 + }, + { + "epoch": 32.76919120831529, + "grad_norm": 0.3699166476726532, + "learning_rate": 4.6723080879168475e-05, + "loss": 0.3597, + "step": 4842500 + }, + { + "epoch": 32.772574707665655, + "grad_norm": 0.3918304443359375, + "learning_rate": 4.672274252923344e-05, + "loss": 0.3619, + "step": 4843000 + }, + { + "epoch": 32.775958207016025, + "grad_norm": 0.3193145990371704, + "learning_rate": 4.67224041792984e-05, + "loss": 0.3607, + "step": 4843500 + }, + { + "epoch": 32.779341706366395, + "grad_norm": 0.38491323590278625, + "learning_rate": 4.672206582936337e-05, + "loss": 0.3613, + "step": 4844000 + }, + { + "epoch": 32.78272520571676, + "grad_norm": 0.3670559823513031, + "learning_rate": 4.672172747942833e-05, + "loss": 0.36, + "step": 4844500 + }, + { + "epoch": 32.78610870506713, + "grad_norm": 0.35742372274398804, + "learning_rate": 4.6721389129493285e-05, + "loss": 0.3591, + "step": 4845000 + }, + { + "epoch": 32.7894922044175, + "grad_norm": 0.39176833629608154, + "learning_rate": 4.6721050779558254e-05, + "loss": 0.3599, + "step": 4845500 + }, + { + "epoch": 32.79287570376786, + "grad_norm": 0.3562770187854767, + "learning_rate": 4.6720712429623216e-05, + "loss": 0.3596, + "step": 4846000 + }, + { + "epoch": 32.79625920311823, + "grad_norm": 0.40307995676994324, + "learning_rate": 4.672037407968818e-05, + "loss": 0.3599, + "step": 4846500 + }, + { + "epoch": 32.7996427024686, + "grad_norm": 0.3937683701515198, + "learning_rate": 4.672003572975314e-05, + "loss": 0.3609, + "step": 4847000 + }, + { + "epoch": 32.803026201818966, + "grad_norm": 0.3605369031429291, + "learning_rate": 4.67196973798181e-05, + "loss": 0.3599, + "step": 4847500 + }, + { + "epoch": 32.806409701169336, + "grad_norm": 0.3692830801010132, + "learning_rate": 4.671935902988307e-05, + "loss": 0.3589, + "step": 4848000 + }, + { + "epoch": 32.80979320051971, + "grad_norm": 0.3484083116054535, + "learning_rate": 4.6719020679948034e-05, + "loss": 0.3594, + "step": 4848500 + }, + { + "epoch": 32.81317669987008, + "grad_norm": 0.35944950580596924, + "learning_rate": 4.6718682330012996e-05, + "loss": 0.3596, + "step": 4849000 + }, + { + "epoch": 32.81656019922044, + "grad_norm": 0.374080091714859, + "learning_rate": 4.671834398007796e-05, + "loss": 0.3611, + "step": 4849500 + }, + { + "epoch": 32.81994369857081, + "grad_norm": 0.41749081015586853, + "learning_rate": 4.671800563014292e-05, + "loss": 0.3601, + "step": 4850000 + }, + { + "epoch": 32.82332719792118, + "grad_norm": 0.3499038517475128, + "learning_rate": 4.671766728020788e-05, + "loss": 0.3597, + "step": 4850500 + }, + { + "epoch": 32.826710697271544, + "grad_norm": 0.3616945743560791, + "learning_rate": 4.6717328930272844e-05, + "loss": 0.3604, + "step": 4851000 + }, + { + "epoch": 32.830094196621914, + "grad_norm": 0.3701276183128357, + "learning_rate": 4.671699058033781e-05, + "loss": 0.3589, + "step": 4851500 + }, + { + "epoch": 32.833477695972284, + "grad_norm": 0.4054557979106903, + "learning_rate": 4.6716652230402775e-05, + "loss": 0.3605, + "step": 4852000 + }, + { + "epoch": 32.83686119532265, + "grad_norm": 0.3850688338279724, + "learning_rate": 4.671631388046774e-05, + "loss": 0.3607, + "step": 4852500 + }, + { + "epoch": 32.84024469467302, + "grad_norm": 0.3332974314689636, + "learning_rate": 4.67159755305327e-05, + "loss": 0.361, + "step": 4853000 + }, + { + "epoch": 32.84362819402339, + "grad_norm": 0.40008342266082764, + "learning_rate": 4.671563718059767e-05, + "loss": 0.3588, + "step": 4853500 + }, + { + "epoch": 32.84701169337375, + "grad_norm": 0.3763919174671173, + "learning_rate": 4.671529883066263e-05, + "loss": 0.3595, + "step": 4854000 + }, + { + "epoch": 32.85039519272412, + "grad_norm": 0.3576953113079071, + "learning_rate": 4.6714960480727586e-05, + "loss": 0.3602, + "step": 4854500 + }, + { + "epoch": 32.85377869207449, + "grad_norm": 0.38050130009651184, + "learning_rate": 4.6714622130792555e-05, + "loss": 0.3601, + "step": 4855000 + }, + { + "epoch": 32.85716219142486, + "grad_norm": 0.35217756032943726, + "learning_rate": 4.671428378085752e-05, + "loss": 0.3594, + "step": 4855500 + }, + { + "epoch": 32.860545690775226, + "grad_norm": 0.33961227536201477, + "learning_rate": 4.671394543092248e-05, + "loss": 0.36, + "step": 4856000 + }, + { + "epoch": 32.863929190125596, + "grad_norm": 0.3544408977031708, + "learning_rate": 4.671360708098744e-05, + "loss": 0.361, + "step": 4856500 + }, + { + "epoch": 32.867312689475966, + "grad_norm": 0.431446373462677, + "learning_rate": 4.67132687310524e-05, + "loss": 0.3598, + "step": 4857000 + }, + { + "epoch": 32.87069618882633, + "grad_norm": 0.39491701126098633, + "learning_rate": 4.671293038111737e-05, + "loss": 0.3601, + "step": 4857500 + }, + { + "epoch": 32.8740796881767, + "grad_norm": 0.36959463357925415, + "learning_rate": 4.6712592031182334e-05, + "loss": 0.3603, + "step": 4858000 + }, + { + "epoch": 32.87746318752707, + "grad_norm": 0.37260931730270386, + "learning_rate": 4.6712253681247296e-05, + "loss": 0.3602, + "step": 4858500 + }, + { + "epoch": 32.88084668687743, + "grad_norm": 0.3644866347312927, + "learning_rate": 4.671191533131226e-05, + "loss": 0.3607, + "step": 4859000 + }, + { + "epoch": 32.8842301862278, + "grad_norm": 0.3580959439277649, + "learning_rate": 4.671157698137722e-05, + "loss": 0.3599, + "step": 4859500 + }, + { + "epoch": 32.887613685578174, + "grad_norm": 0.37152689695358276, + "learning_rate": 4.671123863144218e-05, + "loss": 0.3601, + "step": 4860000 + }, + { + "epoch": 32.89099718492854, + "grad_norm": 0.3975582420825958, + "learning_rate": 4.6710900281507145e-05, + "loss": 0.3601, + "step": 4860500 + }, + { + "epoch": 32.89438068427891, + "grad_norm": 0.35651257634162903, + "learning_rate": 4.6710561931572114e-05, + "loss": 0.3608, + "step": 4861000 + }, + { + "epoch": 32.89776418362928, + "grad_norm": 0.36524707078933716, + "learning_rate": 4.6710223581637076e-05, + "loss": 0.3594, + "step": 4861500 + }, + { + "epoch": 32.90114768297965, + "grad_norm": 0.3711818754673004, + "learning_rate": 4.670988523170204e-05, + "loss": 0.3584, + "step": 4862000 + }, + { + "epoch": 32.90453118233001, + "grad_norm": 0.39311543107032776, + "learning_rate": 4.6709546881767e-05, + "loss": 0.3595, + "step": 4862500 + }, + { + "epoch": 32.90791468168038, + "grad_norm": 0.3769167363643646, + "learning_rate": 4.670920853183197e-05, + "loss": 0.3623, + "step": 4863000 + }, + { + "epoch": 32.91129818103075, + "grad_norm": 0.40175095200538635, + "learning_rate": 4.670887018189693e-05, + "loss": 0.3613, + "step": 4863500 + }, + { + "epoch": 32.914681680381115, + "grad_norm": 0.38338661193847656, + "learning_rate": 4.6708531831961886e-05, + "loss": 0.3593, + "step": 4864000 + }, + { + "epoch": 32.918065179731485, + "grad_norm": 0.37028419971466064, + "learning_rate": 4.670819348202685e-05, + "loss": 0.3617, + "step": 4864500 + }, + { + "epoch": 32.921448679081855, + "grad_norm": 0.3657394349575043, + "learning_rate": 4.670785513209182e-05, + "loss": 0.3597, + "step": 4865000 + }, + { + "epoch": 32.92483217843222, + "grad_norm": 0.3838844299316406, + "learning_rate": 4.670751678215678e-05, + "loss": 0.3596, + "step": 4865500 + }, + { + "epoch": 32.92821567778259, + "grad_norm": 0.3631505072116852, + "learning_rate": 4.670717843222174e-05, + "loss": 0.3587, + "step": 4866000 + }, + { + "epoch": 32.93159917713296, + "grad_norm": 0.3473980128765106, + "learning_rate": 4.6706840082286704e-05, + "loss": 0.3604, + "step": 4866500 + }, + { + "epoch": 32.93498267648333, + "grad_norm": 0.3915610611438751, + "learning_rate": 4.670650173235167e-05, + "loss": 0.3594, + "step": 4867000 + }, + { + "epoch": 32.93836617583369, + "grad_norm": 0.3296775221824646, + "learning_rate": 4.6706163382416635e-05, + "loss": 0.3595, + "step": 4867500 + }, + { + "epoch": 32.94174967518406, + "grad_norm": 0.35316115617752075, + "learning_rate": 4.67058250324816e-05, + "loss": 0.3616, + "step": 4868000 + }, + { + "epoch": 32.94513317453443, + "grad_norm": 0.3577609062194824, + "learning_rate": 4.670548668254656e-05, + "loss": 0.3616, + "step": 4868500 + }, + { + "epoch": 32.9485166738848, + "grad_norm": 0.3541504740715027, + "learning_rate": 4.670514833261152e-05, + "loss": 0.3601, + "step": 4869000 + }, + { + "epoch": 32.95190017323517, + "grad_norm": 0.3470286726951599, + "learning_rate": 4.670480998267648e-05, + "loss": 0.3599, + "step": 4869500 + }, + { + "epoch": 32.95528367258554, + "grad_norm": 0.3620125651359558, + "learning_rate": 4.6704471632741445e-05, + "loss": 0.3595, + "step": 4870000 + }, + { + "epoch": 32.9586671719359, + "grad_norm": 0.371470183134079, + "learning_rate": 4.6704133282806414e-05, + "loss": 0.3622, + "step": 4870500 + }, + { + "epoch": 32.96205067128627, + "grad_norm": 0.3785405457019806, + "learning_rate": 4.6703794932871376e-05, + "loss": 0.3598, + "step": 4871000 + }, + { + "epoch": 32.96543417063664, + "grad_norm": 0.372321218252182, + "learning_rate": 4.670345658293634e-05, + "loss": 0.3601, + "step": 4871500 + }, + { + "epoch": 32.968817669987004, + "grad_norm": 0.36906686425209045, + "learning_rate": 4.67031182330013e-05, + "loss": 0.3588, + "step": 4872000 + }, + { + "epoch": 32.972201169337374, + "grad_norm": 0.4017215371131897, + "learning_rate": 4.670277988306627e-05, + "loss": 0.36, + "step": 4872500 + }, + { + "epoch": 32.975584668687745, + "grad_norm": 0.3841962516307831, + "learning_rate": 4.670244153313123e-05, + "loss": 0.3593, + "step": 4873000 + }, + { + "epoch": 32.978968168038115, + "grad_norm": 0.3314242362976074, + "learning_rate": 4.670210318319619e-05, + "loss": 0.3613, + "step": 4873500 + }, + { + "epoch": 32.98235166738848, + "grad_norm": 0.37515687942504883, + "learning_rate": 4.670176483326115e-05, + "loss": 0.3598, + "step": 4874000 + }, + { + "epoch": 32.98573516673885, + "grad_norm": 0.37607279419898987, + "learning_rate": 4.670142648332612e-05, + "loss": 0.361, + "step": 4874500 + }, + { + "epoch": 32.98911866608922, + "grad_norm": 0.39258554577827454, + "learning_rate": 4.670108813339108e-05, + "loss": 0.3606, + "step": 4875000 + }, + { + "epoch": 32.99250216543958, + "grad_norm": 0.39667394757270813, + "learning_rate": 4.670074978345604e-05, + "loss": 0.3608, + "step": 4875500 + }, + { + "epoch": 32.99588566478995, + "grad_norm": 0.3682968318462372, + "learning_rate": 4.6700411433521004e-05, + "loss": 0.3602, + "step": 4876000 + }, + { + "epoch": 32.99926916414032, + "grad_norm": 0.3392215967178345, + "learning_rate": 4.670007308358597e-05, + "loss": 0.3602, + "step": 4876500 + }, + { + "epoch": 33.0, + "eval_accuracy": 0.862800375099966, + "eval_loss": 0.5578371286392212, + "eval_runtime": 3360.5352, + "eval_samples_per_second": 86.517, + "eval_steps_per_second": 5.407, + "step": 4876608 + }, + { + "epoch": 33.002652663490686, + "grad_norm": 0.40107080340385437, + "learning_rate": 4.6699734733650935e-05, + "loss": 0.3596, + "step": 4877000 + }, + { + "epoch": 33.006036162841056, + "grad_norm": 0.366609126329422, + "learning_rate": 4.66993963837159e-05, + "loss": 0.3588, + "step": 4877500 + }, + { + "epoch": 33.009419662191426, + "grad_norm": 0.37344497442245483, + "learning_rate": 4.669905803378086e-05, + "loss": 0.358, + "step": 4878000 + }, + { + "epoch": 33.01280316154179, + "grad_norm": 0.3847726285457611, + "learning_rate": 4.669871968384582e-05, + "loss": 0.3586, + "step": 4878500 + }, + { + "epoch": 33.01618666089216, + "grad_norm": 0.3856114447116852, + "learning_rate": 4.6698381333910784e-05, + "loss": 0.3557, + "step": 4879000 + }, + { + "epoch": 33.01957016024253, + "grad_norm": 0.40235546231269836, + "learning_rate": 4.6698042983975746e-05, + "loss": 0.3593, + "step": 4879500 + }, + { + "epoch": 33.0229536595929, + "grad_norm": 0.37816309928894043, + "learning_rate": 4.6697704634040715e-05, + "loss": 0.3573, + "step": 4880000 + }, + { + "epoch": 33.026337158943264, + "grad_norm": 0.3726678192615509, + "learning_rate": 4.669736628410568e-05, + "loss": 0.3558, + "step": 4880500 + }, + { + "epoch": 33.029720658293634, + "grad_norm": 0.3437913954257965, + "learning_rate": 4.669702793417064e-05, + "loss": 0.3588, + "step": 4881000 + }, + { + "epoch": 33.033104157644004, + "grad_norm": 0.40929800271987915, + "learning_rate": 4.66966895842356e-05, + "loss": 0.3574, + "step": 4881500 + }, + { + "epoch": 33.03648765699437, + "grad_norm": 0.3646358251571655, + "learning_rate": 4.669635123430057e-05, + "loss": 0.3566, + "step": 4882000 + }, + { + "epoch": 33.03987115634474, + "grad_norm": 0.38432562351226807, + "learning_rate": 4.669601288436553e-05, + "loss": 0.3575, + "step": 4882500 + }, + { + "epoch": 33.04325465569511, + "grad_norm": 0.3522011935710907, + "learning_rate": 4.669567453443049e-05, + "loss": 0.3586, + "step": 4883000 + }, + { + "epoch": 33.04663815504547, + "grad_norm": 0.3728903830051422, + "learning_rate": 4.669533618449545e-05, + "loss": 0.3585, + "step": 4883500 + }, + { + "epoch": 33.05002165439584, + "grad_norm": 0.366802453994751, + "learning_rate": 4.669499783456042e-05, + "loss": 0.3586, + "step": 4884000 + }, + { + "epoch": 33.05340515374621, + "grad_norm": 0.3676486611366272, + "learning_rate": 4.669465948462538e-05, + "loss": 0.3582, + "step": 4884500 + }, + { + "epoch": 33.056788653096575, + "grad_norm": 0.38962480425834656, + "learning_rate": 4.669432113469034e-05, + "loss": 0.3586, + "step": 4885000 + }, + { + "epoch": 33.060172152446945, + "grad_norm": 0.4253799319267273, + "learning_rate": 4.6693982784755305e-05, + "loss": 0.3583, + "step": 4885500 + }, + { + "epoch": 33.063555651797316, + "grad_norm": 0.3932543992996216, + "learning_rate": 4.6693644434820274e-05, + "loss": 0.3582, + "step": 4886000 + }, + { + "epoch": 33.066939151147686, + "grad_norm": 0.39341434836387634, + "learning_rate": 4.6693306084885236e-05, + "loss": 0.358, + "step": 4886500 + }, + { + "epoch": 33.07032265049805, + "grad_norm": 0.38702037930488586, + "learning_rate": 4.66929677349502e-05, + "loss": 0.3588, + "step": 4887000 + }, + { + "epoch": 33.07370614984842, + "grad_norm": 0.3913222849369049, + "learning_rate": 4.669262938501516e-05, + "loss": 0.3594, + "step": 4887500 + }, + { + "epoch": 33.07708964919879, + "grad_norm": 0.3729381561279297, + "learning_rate": 4.669229103508012e-05, + "loss": 0.3587, + "step": 4888000 + }, + { + "epoch": 33.08047314854915, + "grad_norm": 0.3916638493537903, + "learning_rate": 4.6691952685145085e-05, + "loss": 0.3596, + "step": 4888500 + }, + { + "epoch": 33.08385664789952, + "grad_norm": 0.3831269145011902, + "learning_rate": 4.669161433521005e-05, + "loss": 0.3584, + "step": 4889000 + }, + { + "epoch": 33.087240147249894, + "grad_norm": 0.3752813935279846, + "learning_rate": 4.6691275985275016e-05, + "loss": 0.3596, + "step": 4889500 + }, + { + "epoch": 33.09062364660026, + "grad_norm": 0.37731239199638367, + "learning_rate": 4.669093763533998e-05, + "loss": 0.3589, + "step": 4890000 + }, + { + "epoch": 33.09400714595063, + "grad_norm": 0.40924587845802307, + "learning_rate": 4.669059928540494e-05, + "loss": 0.3581, + "step": 4890500 + }, + { + "epoch": 33.097390645301, + "grad_norm": 0.3785557150840759, + "learning_rate": 4.66902609354699e-05, + "loss": 0.3586, + "step": 4891000 + }, + { + "epoch": 33.10077414465137, + "grad_norm": 0.39560848474502563, + "learning_rate": 4.668992258553487e-05, + "loss": 0.3566, + "step": 4891500 + }, + { + "epoch": 33.10415764400173, + "grad_norm": 0.4397170841693878, + "learning_rate": 4.668958423559983e-05, + "loss": 0.3584, + "step": 4892000 + }, + { + "epoch": 33.1075411433521, + "grad_norm": 0.4220854640007019, + "learning_rate": 4.668924588566479e-05, + "loss": 0.3603, + "step": 4892500 + }, + { + "epoch": 33.11092464270247, + "grad_norm": 0.4177074730396271, + "learning_rate": 4.668890753572975e-05, + "loss": 0.3579, + "step": 4893000 + }, + { + "epoch": 33.114308142052835, + "grad_norm": 0.35741767287254333, + "learning_rate": 4.668856918579472e-05, + "loss": 0.3576, + "step": 4893500 + }, + { + "epoch": 33.117691641403205, + "grad_norm": 0.36441221833229065, + "learning_rate": 4.668823083585968e-05, + "loss": 0.3579, + "step": 4894000 + }, + { + "epoch": 33.121075140753575, + "grad_norm": 0.3814169466495514, + "learning_rate": 4.6687892485924644e-05, + "loss": 0.359, + "step": 4894500 + }, + { + "epoch": 33.12445864010394, + "grad_norm": 0.39792677760124207, + "learning_rate": 4.6687554135989606e-05, + "loss": 0.3582, + "step": 4895000 + }, + { + "epoch": 33.12784213945431, + "grad_norm": 0.38604456186294556, + "learning_rate": 4.6687215786054575e-05, + "loss": 0.3575, + "step": 4895500 + }, + { + "epoch": 33.13122563880468, + "grad_norm": 0.3650904595851898, + "learning_rate": 4.668687743611954e-05, + "loss": 0.3598, + "step": 4896000 + }, + { + "epoch": 33.13460913815504, + "grad_norm": 0.41717758774757385, + "learning_rate": 4.66865390861845e-05, + "loss": 0.3583, + "step": 4896500 + }, + { + "epoch": 33.13799263750541, + "grad_norm": 0.3828420639038086, + "learning_rate": 4.668620073624946e-05, + "loss": 0.3593, + "step": 4897000 + }, + { + "epoch": 33.14137613685578, + "grad_norm": 0.41209474205970764, + "learning_rate": 4.668586238631442e-05, + "loss": 0.3582, + "step": 4897500 + }, + { + "epoch": 33.14475963620615, + "grad_norm": 0.3780669867992401, + "learning_rate": 4.6685524036379385e-05, + "loss": 0.3591, + "step": 4898000 + }, + { + "epoch": 33.148143135556516, + "grad_norm": 0.35185787081718445, + "learning_rate": 4.668518568644435e-05, + "loss": 0.3577, + "step": 4898500 + }, + { + "epoch": 33.15152663490689, + "grad_norm": 0.3798390030860901, + "learning_rate": 4.6684847336509316e-05, + "loss": 0.3575, + "step": 4899000 + }, + { + "epoch": 33.15491013425726, + "grad_norm": 0.38691797852516174, + "learning_rate": 4.668450898657428e-05, + "loss": 0.359, + "step": 4899500 + }, + { + "epoch": 33.15829363360762, + "grad_norm": 0.34414389729499817, + "learning_rate": 4.668417063663924e-05, + "loss": 0.3597, + "step": 4900000 + }, + { + "epoch": 33.16167713295799, + "grad_norm": 0.3660948574542999, + "learning_rate": 4.66838322867042e-05, + "loss": 0.3598, + "step": 4900500 + }, + { + "epoch": 33.16506063230836, + "grad_norm": 0.3499019145965576, + "learning_rate": 4.668349393676917e-05, + "loss": 0.3602, + "step": 4901000 + }, + { + "epoch": 33.168444131658724, + "grad_norm": 0.3914853036403656, + "learning_rate": 4.6683155586834134e-05, + "loss": 0.3587, + "step": 4901500 + }, + { + "epoch": 33.171827631009094, + "grad_norm": 0.37145745754241943, + "learning_rate": 4.668281723689909e-05, + "loss": 0.3585, + "step": 4902000 + }, + { + "epoch": 33.175211130359465, + "grad_norm": 0.3621034622192383, + "learning_rate": 4.668247888696405e-05, + "loss": 0.3594, + "step": 4902500 + }, + { + "epoch": 33.17859462970983, + "grad_norm": 0.3783204257488251, + "learning_rate": 4.668214053702902e-05, + "loss": 0.3592, + "step": 4903000 + }, + { + "epoch": 33.1819781290602, + "grad_norm": 0.3723049759864807, + "learning_rate": 4.668180218709398e-05, + "loss": 0.3578, + "step": 4903500 + }, + { + "epoch": 33.18536162841057, + "grad_norm": 0.4073820114135742, + "learning_rate": 4.6681463837158944e-05, + "loss": 0.3595, + "step": 4904000 + }, + { + "epoch": 33.18874512776094, + "grad_norm": 0.36406978964805603, + "learning_rate": 4.6681125487223906e-05, + "loss": 0.3605, + "step": 4904500 + }, + { + "epoch": 33.1921286271113, + "grad_norm": 0.3732823133468628, + "learning_rate": 4.6680787137288875e-05, + "loss": 0.3593, + "step": 4905000 + }, + { + "epoch": 33.19551212646167, + "grad_norm": 0.3324179947376251, + "learning_rate": 4.668044878735384e-05, + "loss": 0.3593, + "step": 4905500 + }, + { + "epoch": 33.19889562581204, + "grad_norm": 0.3814818561077118, + "learning_rate": 4.66801104374188e-05, + "loss": 0.3595, + "step": 4906000 + }, + { + "epoch": 33.202279125162406, + "grad_norm": 0.3528575897216797, + "learning_rate": 4.667977208748376e-05, + "loss": 0.3589, + "step": 4906500 + }, + { + "epoch": 33.205662624512776, + "grad_norm": 0.4369259178638458, + "learning_rate": 4.6679433737548724e-05, + "loss": 0.3591, + "step": 4907000 + }, + { + "epoch": 33.209046123863146, + "grad_norm": 0.3980875015258789, + "learning_rate": 4.6679095387613686e-05, + "loss": 0.3606, + "step": 4907500 + }, + { + "epoch": 33.21242962321351, + "grad_norm": 0.3704988956451416, + "learning_rate": 4.667875703767865e-05, + "loss": 0.3599, + "step": 4908000 + }, + { + "epoch": 33.21581312256388, + "grad_norm": 0.3260713219642639, + "learning_rate": 4.667841868774362e-05, + "loss": 0.3594, + "step": 4908500 + }, + { + "epoch": 33.21919662191425, + "grad_norm": 0.3785054683685303, + "learning_rate": 4.667808033780858e-05, + "loss": 0.3585, + "step": 4909000 + }, + { + "epoch": 33.22258012126461, + "grad_norm": 0.39522457122802734, + "learning_rate": 4.667774198787354e-05, + "loss": 0.3599, + "step": 4909500 + }, + { + "epoch": 33.22596362061498, + "grad_norm": 0.3919886350631714, + "learning_rate": 4.66774036379385e-05, + "loss": 0.3592, + "step": 4910000 + }, + { + "epoch": 33.229347119965354, + "grad_norm": 0.35814350843429565, + "learning_rate": 4.6677065288003465e-05, + "loss": 0.3585, + "step": 4910500 + }, + { + "epoch": 33.232730619315724, + "grad_norm": 0.37814611196517944, + "learning_rate": 4.6676726938068434e-05, + "loss": 0.3595, + "step": 4911000 + }, + { + "epoch": 33.23611411866609, + "grad_norm": 0.3492003381252289, + "learning_rate": 4.667638858813339e-05, + "loss": 0.3581, + "step": 4911500 + }, + { + "epoch": 33.23949761801646, + "grad_norm": 0.3310892581939697, + "learning_rate": 4.667605023819835e-05, + "loss": 0.3574, + "step": 4912000 + }, + { + "epoch": 33.24288111736683, + "grad_norm": 0.32191982865333557, + "learning_rate": 4.667571188826332e-05, + "loss": 0.3585, + "step": 4912500 + }, + { + "epoch": 33.24626461671719, + "grad_norm": 0.37318795919418335, + "learning_rate": 4.667537353832828e-05, + "loss": 0.3596, + "step": 4913000 + }, + { + "epoch": 33.24964811606756, + "grad_norm": 0.41358447074890137, + "learning_rate": 4.6675035188393245e-05, + "loss": 0.3585, + "step": 4913500 + }, + { + "epoch": 33.25303161541793, + "grad_norm": 0.3911883533000946, + "learning_rate": 4.667469683845821e-05, + "loss": 0.3601, + "step": 4914000 + }, + { + "epoch": 33.256415114768295, + "grad_norm": 0.38080286979675293, + "learning_rate": 4.6674358488523176e-05, + "loss": 0.3594, + "step": 4914500 + }, + { + "epoch": 33.259798614118665, + "grad_norm": 0.3826783001422882, + "learning_rate": 4.667402013858814e-05, + "loss": 0.3603, + "step": 4915000 + }, + { + "epoch": 33.263182113469036, + "grad_norm": 0.36232760548591614, + "learning_rate": 4.66736817886531e-05, + "loss": 0.3589, + "step": 4915500 + }, + { + "epoch": 33.266565612819406, + "grad_norm": 0.42275017499923706, + "learning_rate": 4.667334343871806e-05, + "loss": 0.3592, + "step": 4916000 + }, + { + "epoch": 33.26994911216977, + "grad_norm": 0.3786555230617523, + "learning_rate": 4.6673005088783024e-05, + "loss": 0.3602, + "step": 4916500 + }, + { + "epoch": 33.27333261152014, + "grad_norm": 0.36606207489967346, + "learning_rate": 4.6672666738847987e-05, + "loss": 0.3587, + "step": 4917000 + }, + { + "epoch": 33.27671611087051, + "grad_norm": 0.4194417893886566, + "learning_rate": 4.667232838891295e-05, + "loss": 0.358, + "step": 4917500 + }, + { + "epoch": 33.28009961022087, + "grad_norm": 0.3580016493797302, + "learning_rate": 4.667199003897791e-05, + "loss": 0.3582, + "step": 4918000 + }, + { + "epoch": 33.28348310957124, + "grad_norm": 0.3689776659011841, + "learning_rate": 4.667165168904288e-05, + "loss": 0.3606, + "step": 4918500 + }, + { + "epoch": 33.28686660892161, + "grad_norm": 0.41763901710510254, + "learning_rate": 4.667131333910784e-05, + "loss": 0.3585, + "step": 4919000 + }, + { + "epoch": 33.29025010827198, + "grad_norm": 0.33973315358161926, + "learning_rate": 4.6670974989172804e-05, + "loss": 0.3587, + "step": 4919500 + }, + { + "epoch": 33.29363360762235, + "grad_norm": 0.3305444121360779, + "learning_rate": 4.6670636639237766e-05, + "loss": 0.3598, + "step": 4920000 + }, + { + "epoch": 33.29701710697272, + "grad_norm": 0.3761975169181824, + "learning_rate": 4.6670298289302735e-05, + "loss": 0.3592, + "step": 4920500 + }, + { + "epoch": 33.30040060632308, + "grad_norm": 0.3960320055484772, + "learning_rate": 4.666995993936769e-05, + "loss": 0.3583, + "step": 4921000 + }, + { + "epoch": 33.30378410567345, + "grad_norm": 0.3718136250972748, + "learning_rate": 4.666962158943265e-05, + "loss": 0.3572, + "step": 4921500 + }, + { + "epoch": 33.30716760502382, + "grad_norm": 0.384726345539093, + "learning_rate": 4.666928323949762e-05, + "loss": 0.3594, + "step": 4922000 + }, + { + "epoch": 33.31055110437419, + "grad_norm": 0.3793955147266388, + "learning_rate": 4.6668944889562583e-05, + "loss": 0.3598, + "step": 4922500 + }, + { + "epoch": 33.313934603724555, + "grad_norm": 0.3863762319087982, + "learning_rate": 4.6668606539627546e-05, + "loss": 0.3601, + "step": 4923000 + }, + { + "epoch": 33.317318103074925, + "grad_norm": 0.3898181617259979, + "learning_rate": 4.666826818969251e-05, + "loss": 0.3599, + "step": 4923500 + }, + { + "epoch": 33.320701602425295, + "grad_norm": 0.37115055322647095, + "learning_rate": 4.6667929839757477e-05, + "loss": 0.3596, + "step": 4924000 + }, + { + "epoch": 33.32408510177566, + "grad_norm": 0.3558864891529083, + "learning_rate": 4.666759148982244e-05, + "loss": 0.3576, + "step": 4924500 + }, + { + "epoch": 33.32746860112603, + "grad_norm": 0.3862520754337311, + "learning_rate": 4.66672531398874e-05, + "loss": 0.3589, + "step": 4925000 + }, + { + "epoch": 33.3308521004764, + "grad_norm": 0.3589293956756592, + "learning_rate": 4.666691478995236e-05, + "loss": 0.3601, + "step": 4925500 + }, + { + "epoch": 33.33423559982676, + "grad_norm": 0.35759809613227844, + "learning_rate": 4.6666576440017325e-05, + "loss": 0.3595, + "step": 4926000 + }, + { + "epoch": 33.33761909917713, + "grad_norm": 0.3903287947177887, + "learning_rate": 4.666623809008229e-05, + "loss": 0.3604, + "step": 4926500 + }, + { + "epoch": 33.3410025985275, + "grad_norm": 0.3714323341846466, + "learning_rate": 4.666589974014725e-05, + "loss": 0.3607, + "step": 4927000 + }, + { + "epoch": 33.344386097877866, + "grad_norm": 0.4133627712726593, + "learning_rate": 4.666556139021221e-05, + "loss": 0.3599, + "step": 4927500 + }, + { + "epoch": 33.347769597228236, + "grad_norm": 0.3589901626110077, + "learning_rate": 4.666522304027718e-05, + "loss": 0.3596, + "step": 4928000 + }, + { + "epoch": 33.35115309657861, + "grad_norm": 0.40413162112236023, + "learning_rate": 4.666488469034214e-05, + "loss": 0.3586, + "step": 4928500 + }, + { + "epoch": 33.35453659592898, + "grad_norm": 0.4117417633533478, + "learning_rate": 4.6664546340407105e-05, + "loss": 0.3605, + "step": 4929000 + }, + { + "epoch": 33.35792009527934, + "grad_norm": 0.4197719693183899, + "learning_rate": 4.666420799047207e-05, + "loss": 0.3595, + "step": 4929500 + }, + { + "epoch": 33.36130359462971, + "grad_norm": 0.4158882200717926, + "learning_rate": 4.6663869640537036e-05, + "loss": 0.3589, + "step": 4930000 + }, + { + "epoch": 33.36468709398008, + "grad_norm": 0.3937288820743561, + "learning_rate": 4.666353129060199e-05, + "loss": 0.3591, + "step": 4930500 + }, + { + "epoch": 33.368070593330444, + "grad_norm": 0.4039243459701538, + "learning_rate": 4.666319294066695e-05, + "loss": 0.3587, + "step": 4931000 + }, + { + "epoch": 33.371454092680814, + "grad_norm": 0.3809860348701477, + "learning_rate": 4.666285459073192e-05, + "loss": 0.3596, + "step": 4931500 + }, + { + "epoch": 33.374837592031184, + "grad_norm": 0.39591655135154724, + "learning_rate": 4.6662516240796884e-05, + "loss": 0.359, + "step": 4932000 + }, + { + "epoch": 33.37822109138155, + "grad_norm": 0.36925196647644043, + "learning_rate": 4.6662177890861846e-05, + "loss": 0.3579, + "step": 4932500 + }, + { + "epoch": 33.38160459073192, + "grad_norm": 0.3570270836353302, + "learning_rate": 4.666183954092681e-05, + "loss": 0.3594, + "step": 4933000 + }, + { + "epoch": 33.38498809008229, + "grad_norm": 0.4062257409095764, + "learning_rate": 4.666150119099178e-05, + "loss": 0.3596, + "step": 4933500 + }, + { + "epoch": 33.38837158943265, + "grad_norm": 0.3464130461215973, + "learning_rate": 4.666116284105674e-05, + "loss": 0.3612, + "step": 4934000 + }, + { + "epoch": 33.39175508878302, + "grad_norm": 0.3435489237308502, + "learning_rate": 4.66608244911217e-05, + "loss": 0.3601, + "step": 4934500 + }, + { + "epoch": 33.39513858813339, + "grad_norm": 0.37780943512916565, + "learning_rate": 4.6660486141186664e-05, + "loss": 0.3589, + "step": 4935000 + }, + { + "epoch": 33.39852208748376, + "grad_norm": 0.373794823884964, + "learning_rate": 4.6660147791251626e-05, + "loss": 0.3614, + "step": 4935500 + }, + { + "epoch": 33.401905586834125, + "grad_norm": 0.43545180559158325, + "learning_rate": 4.665980944131659e-05, + "loss": 0.3608, + "step": 4936000 + }, + { + "epoch": 33.405289086184496, + "grad_norm": 0.3868274986743927, + "learning_rate": 4.665947109138155e-05, + "loss": 0.3588, + "step": 4936500 + }, + { + "epoch": 33.408672585534866, + "grad_norm": 0.3833744525909424, + "learning_rate": 4.665913274144651e-05, + "loss": 0.3587, + "step": 4937000 + }, + { + "epoch": 33.41205608488523, + "grad_norm": 0.3601362109184265, + "learning_rate": 4.665879439151148e-05, + "loss": 0.3605, + "step": 4937500 + }, + { + "epoch": 33.4154395842356, + "grad_norm": 0.36071741580963135, + "learning_rate": 4.665845604157644e-05, + "loss": 0.3581, + "step": 4938000 + }, + { + "epoch": 33.41882308358597, + "grad_norm": 0.3734799027442932, + "learning_rate": 4.6658117691641405e-05, + "loss": 0.3596, + "step": 4938500 + }, + { + "epoch": 33.42220658293633, + "grad_norm": 0.35517418384552, + "learning_rate": 4.665777934170637e-05, + "loss": 0.3606, + "step": 4939000 + }, + { + "epoch": 33.4255900822867, + "grad_norm": 0.37884628772735596, + "learning_rate": 4.6657440991771336e-05, + "loss": 0.3585, + "step": 4939500 + }, + { + "epoch": 33.428973581637074, + "grad_norm": 0.38768133521080017, + "learning_rate": 4.66571026418363e-05, + "loss": 0.3589, + "step": 4940000 + }, + { + "epoch": 33.432357080987444, + "grad_norm": 0.3687038719654083, + "learning_rate": 4.6656764291901254e-05, + "loss": 0.3599, + "step": 4940500 + }, + { + "epoch": 33.43574058033781, + "grad_norm": 0.3995122015476227, + "learning_rate": 4.665642594196622e-05, + "loss": 0.3594, + "step": 4941000 + }, + { + "epoch": 33.43912407968818, + "grad_norm": 0.40393587946891785, + "learning_rate": 4.6656087592031185e-05, + "loss": 0.3598, + "step": 4941500 + }, + { + "epoch": 33.44250757903855, + "grad_norm": 0.4265615940093994, + "learning_rate": 4.665574924209615e-05, + "loss": 0.3585, + "step": 4942000 + }, + { + "epoch": 33.44589107838891, + "grad_norm": 0.3678419888019562, + "learning_rate": 4.665541089216111e-05, + "loss": 0.362, + "step": 4942500 + }, + { + "epoch": 33.44927457773928, + "grad_norm": 0.344463586807251, + "learning_rate": 4.665507254222608e-05, + "loss": 0.3591, + "step": 4943000 + }, + { + "epoch": 33.45265807708965, + "grad_norm": 0.3779900372028351, + "learning_rate": 4.665473419229104e-05, + "loss": 0.3601, + "step": 4943500 + }, + { + "epoch": 33.456041576440015, + "grad_norm": 0.384981632232666, + "learning_rate": 4.6654395842356e-05, + "loss": 0.3588, + "step": 4944000 + }, + { + "epoch": 33.459425075790385, + "grad_norm": 0.3845674693584442, + "learning_rate": 4.6654057492420964e-05, + "loss": 0.3592, + "step": 4944500 + }, + { + "epoch": 33.462808575140755, + "grad_norm": 0.39477214217185974, + "learning_rate": 4.6653719142485926e-05, + "loss": 0.3583, + "step": 4945000 + }, + { + "epoch": 33.46619207449112, + "grad_norm": 0.3745117485523224, + "learning_rate": 4.665338079255089e-05, + "loss": 0.3588, + "step": 4945500 + }, + { + "epoch": 33.46957557384149, + "grad_norm": 0.39083191752433777, + "learning_rate": 4.665304244261585e-05, + "loss": 0.3603, + "step": 4946000 + }, + { + "epoch": 33.47295907319186, + "grad_norm": 0.3955481946468353, + "learning_rate": 4.665270409268081e-05, + "loss": 0.3603, + "step": 4946500 + }, + { + "epoch": 33.47634257254223, + "grad_norm": 0.3859942853450775, + "learning_rate": 4.665236574274578e-05, + "loss": 0.3596, + "step": 4947000 + }, + { + "epoch": 33.47972607189259, + "grad_norm": 0.41049662232398987, + "learning_rate": 4.6652027392810744e-05, + "loss": 0.3597, + "step": 4947500 + }, + { + "epoch": 33.48310957124296, + "grad_norm": 0.38303178548812866, + "learning_rate": 4.6651689042875706e-05, + "loss": 0.3583, + "step": 4948000 + }, + { + "epoch": 33.48649307059333, + "grad_norm": 0.37434321641921997, + "learning_rate": 4.665135069294067e-05, + "loss": 0.3591, + "step": 4948500 + }, + { + "epoch": 33.489876569943696, + "grad_norm": 0.35679763555526733, + "learning_rate": 4.665101234300564e-05, + "loss": 0.3599, + "step": 4949000 + }, + { + "epoch": 33.49326006929407, + "grad_norm": 0.38214111328125, + "learning_rate": 4.66506739930706e-05, + "loss": 0.3611, + "step": 4949500 + }, + { + "epoch": 33.49664356864444, + "grad_norm": 0.3885679543018341, + "learning_rate": 4.6650335643135554e-05, + "loss": 0.3592, + "step": 4950000 + }, + { + "epoch": 33.5000270679948, + "grad_norm": 0.3565627336502075, + "learning_rate": 4.664999729320052e-05, + "loss": 0.3595, + "step": 4950500 + }, + { + "epoch": 33.50341056734517, + "grad_norm": 0.4018453359603882, + "learning_rate": 4.6649658943265485e-05, + "loss": 0.3582, + "step": 4951000 + }, + { + "epoch": 33.50679406669554, + "grad_norm": 0.3506716787815094, + "learning_rate": 4.664932059333045e-05, + "loss": 0.3605, + "step": 4951500 + }, + { + "epoch": 33.510177566045904, + "grad_norm": 0.34621524810791016, + "learning_rate": 4.664898224339541e-05, + "loss": 0.3589, + "step": 4952000 + }, + { + "epoch": 33.513561065396274, + "grad_norm": 0.3600829541683197, + "learning_rate": 4.664864389346038e-05, + "loss": 0.3601, + "step": 4952500 + }, + { + "epoch": 33.516944564746645, + "grad_norm": 0.4031408429145813, + "learning_rate": 4.664830554352534e-05, + "loss": 0.3608, + "step": 4953000 + }, + { + "epoch": 33.520328064097015, + "grad_norm": 0.38953065872192383, + "learning_rate": 4.66479671935903e-05, + "loss": 0.3597, + "step": 4953500 + }, + { + "epoch": 33.52371156344738, + "grad_norm": 0.41566869616508484, + "learning_rate": 4.6647628843655265e-05, + "loss": 0.3585, + "step": 4954000 + }, + { + "epoch": 33.52709506279775, + "grad_norm": 0.4138035178184509, + "learning_rate": 4.664729049372023e-05, + "loss": 0.3581, + "step": 4954500 + }, + { + "epoch": 33.53047856214812, + "grad_norm": 0.4077489674091339, + "learning_rate": 4.664695214378519e-05, + "loss": 0.3591, + "step": 4955000 + }, + { + "epoch": 33.53386206149848, + "grad_norm": 0.38625746965408325, + "learning_rate": 4.664661379385015e-05, + "loss": 0.3607, + "step": 4955500 + }, + { + "epoch": 33.53724556084885, + "grad_norm": 0.4039769172668457, + "learning_rate": 4.664627544391511e-05, + "loss": 0.3623, + "step": 4956000 + }, + { + "epoch": 33.54062906019922, + "grad_norm": 0.4044813811779022, + "learning_rate": 4.664593709398008e-05, + "loss": 0.358, + "step": 4956500 + }, + { + "epoch": 33.544012559549586, + "grad_norm": 0.3929738402366638, + "learning_rate": 4.6645598744045044e-05, + "loss": 0.3588, + "step": 4957000 + }, + { + "epoch": 33.547396058899956, + "grad_norm": 0.3623534142971039, + "learning_rate": 4.6645260394110006e-05, + "loss": 0.3588, + "step": 4957500 + }, + { + "epoch": 33.550779558250326, + "grad_norm": 0.3804514706134796, + "learning_rate": 4.664492204417497e-05, + "loss": 0.3585, + "step": 4958000 + }, + { + "epoch": 33.55416305760069, + "grad_norm": 0.39171263575553894, + "learning_rate": 4.664458369423994e-05, + "loss": 0.3584, + "step": 4958500 + }, + { + "epoch": 33.55754655695106, + "grad_norm": 0.3710254728794098, + "learning_rate": 4.66442453443049e-05, + "loss": 0.3606, + "step": 4959000 + }, + { + "epoch": 33.56093005630143, + "grad_norm": 0.37038999795913696, + "learning_rate": 4.6643906994369855e-05, + "loss": 0.3587, + "step": 4959500 + }, + { + "epoch": 33.5643135556518, + "grad_norm": 0.3657047748565674, + "learning_rate": 4.6643568644434824e-05, + "loss": 0.362, + "step": 4960000 + }, + { + "epoch": 33.567697055002164, + "grad_norm": 0.3724454939365387, + "learning_rate": 4.6643230294499786e-05, + "loss": 0.3581, + "step": 4960500 + }, + { + "epoch": 33.571080554352534, + "grad_norm": 0.415170282125473, + "learning_rate": 4.664289194456475e-05, + "loss": 0.3601, + "step": 4961000 + }, + { + "epoch": 33.574464053702904, + "grad_norm": 0.37282267212867737, + "learning_rate": 4.664255359462971e-05, + "loss": 0.3608, + "step": 4961500 + }, + { + "epoch": 33.57784755305327, + "grad_norm": 0.4100336730480194, + "learning_rate": 4.664221524469468e-05, + "loss": 0.3605, + "step": 4962000 + }, + { + "epoch": 33.58123105240364, + "grad_norm": 0.38508278131484985, + "learning_rate": 4.664187689475964e-05, + "loss": 0.3614, + "step": 4962500 + }, + { + "epoch": 33.58461455175401, + "grad_norm": 0.3460726737976074, + "learning_rate": 4.66415385448246e-05, + "loss": 0.3576, + "step": 4963000 + }, + { + "epoch": 33.58799805110437, + "grad_norm": 0.398885577917099, + "learning_rate": 4.6641200194889565e-05, + "loss": 0.3598, + "step": 4963500 + }, + { + "epoch": 33.59138155045474, + "grad_norm": 0.40901660919189453, + "learning_rate": 4.664086184495453e-05, + "loss": 0.3596, + "step": 4964000 + }, + { + "epoch": 33.59476504980511, + "grad_norm": 0.3406643867492676, + "learning_rate": 4.664052349501949e-05, + "loss": 0.3584, + "step": 4964500 + }, + { + "epoch": 33.59814854915548, + "grad_norm": 0.40784019231796265, + "learning_rate": 4.664018514508445e-05, + "loss": 0.3599, + "step": 4965000 + }, + { + "epoch": 33.601532048505845, + "grad_norm": 0.4412790536880493, + "learning_rate": 4.6639846795149414e-05, + "loss": 0.3595, + "step": 4965500 + }, + { + "epoch": 33.604915547856216, + "grad_norm": 0.3802646994590759, + "learning_rate": 4.663950844521438e-05, + "loss": 0.3606, + "step": 4966000 + }, + { + "epoch": 33.608299047206586, + "grad_norm": 0.37169966101646423, + "learning_rate": 4.6639170095279345e-05, + "loss": 0.3592, + "step": 4966500 + }, + { + "epoch": 33.61168254655695, + "grad_norm": 0.3817939758300781, + "learning_rate": 4.663883174534431e-05, + "loss": 0.3603, + "step": 4967000 + }, + { + "epoch": 33.61506604590732, + "grad_norm": 0.34958896040916443, + "learning_rate": 4.663849339540927e-05, + "loss": 0.3581, + "step": 4967500 + }, + { + "epoch": 33.61844954525769, + "grad_norm": 0.3952915072441101, + "learning_rate": 4.663815504547424e-05, + "loss": 0.3605, + "step": 4968000 + }, + { + "epoch": 33.62183304460805, + "grad_norm": 0.36356744170188904, + "learning_rate": 4.66378166955392e-05, + "loss": 0.3617, + "step": 4968500 + }, + { + "epoch": 33.62521654395842, + "grad_norm": 0.4046655297279358, + "learning_rate": 4.6637478345604156e-05, + "loss": 0.3585, + "step": 4969000 + }, + { + "epoch": 33.62860004330879, + "grad_norm": 0.3863518536090851, + "learning_rate": 4.6637139995669124e-05, + "loss": 0.3585, + "step": 4969500 + }, + { + "epoch": 33.63198354265916, + "grad_norm": 0.35415220260620117, + "learning_rate": 4.6636801645734087e-05, + "loss": 0.3596, + "step": 4970000 + }, + { + "epoch": 33.63536704200953, + "grad_norm": 0.3962274491786957, + "learning_rate": 4.663646329579905e-05, + "loss": 0.3593, + "step": 4970500 + }, + { + "epoch": 33.6387505413599, + "grad_norm": 0.3565498888492584, + "learning_rate": 4.663612494586401e-05, + "loss": 0.3596, + "step": 4971000 + }, + { + "epoch": 33.64213404071027, + "grad_norm": 0.3514151871204376, + "learning_rate": 4.663578659592898e-05, + "loss": 0.359, + "step": 4971500 + }, + { + "epoch": 33.64551754006063, + "grad_norm": 0.35873162746429443, + "learning_rate": 4.663544824599394e-05, + "loss": 0.3597, + "step": 4972000 + }, + { + "epoch": 33.648901039411, + "grad_norm": 0.38362917304039, + "learning_rate": 4.6635109896058904e-05, + "loss": 0.3594, + "step": 4972500 + }, + { + "epoch": 33.65228453876137, + "grad_norm": 0.3515843152999878, + "learning_rate": 4.6634771546123866e-05, + "loss": 0.361, + "step": 4973000 + }, + { + "epoch": 33.655668038111735, + "grad_norm": 0.3710431754589081, + "learning_rate": 4.663443319618883e-05, + "loss": 0.3594, + "step": 4973500 + }, + { + "epoch": 33.659051537462105, + "grad_norm": 0.4550579786300659, + "learning_rate": 4.663409484625379e-05, + "loss": 0.3588, + "step": 4974000 + }, + { + "epoch": 33.662435036812475, + "grad_norm": 0.36737725138664246, + "learning_rate": 4.663375649631875e-05, + "loss": 0.3596, + "step": 4974500 + }, + { + "epoch": 33.66581853616284, + "grad_norm": 0.40225350856781006, + "learning_rate": 4.6633418146383715e-05, + "loss": 0.3579, + "step": 4975000 + }, + { + "epoch": 33.66920203551321, + "grad_norm": 0.39821138978004456, + "learning_rate": 4.6633079796448683e-05, + "loss": 0.3605, + "step": 4975500 + }, + { + "epoch": 33.67258553486358, + "grad_norm": 0.36197197437286377, + "learning_rate": 4.6632741446513646e-05, + "loss": 0.3588, + "step": 4976000 + }, + { + "epoch": 33.67596903421394, + "grad_norm": 0.3673679828643799, + "learning_rate": 4.663240309657861e-05, + "loss": 0.3602, + "step": 4976500 + }, + { + "epoch": 33.67935253356431, + "grad_norm": 0.38268110156059265, + "learning_rate": 4.663206474664357e-05, + "loss": 0.3612, + "step": 4977000 + }, + { + "epoch": 33.68273603291468, + "grad_norm": 0.36730483174324036, + "learning_rate": 4.663172639670854e-05, + "loss": 0.3618, + "step": 4977500 + }, + { + "epoch": 33.68611953226505, + "grad_norm": 0.39497968554496765, + "learning_rate": 4.66313880467735e-05, + "loss": 0.3596, + "step": 4978000 + }, + { + "epoch": 33.689503031615416, + "grad_norm": 0.36848193407058716, + "learning_rate": 4.6631049696838456e-05, + "loss": 0.3604, + "step": 4978500 + }, + { + "epoch": 33.69288653096579, + "grad_norm": 0.3875355124473572, + "learning_rate": 4.6630711346903425e-05, + "loss": 0.3596, + "step": 4979000 + }, + { + "epoch": 33.69627003031616, + "grad_norm": 0.38768208026885986, + "learning_rate": 4.663037299696839e-05, + "loss": 0.3592, + "step": 4979500 + }, + { + "epoch": 33.69965352966652, + "grad_norm": 0.34618815779685974, + "learning_rate": 4.663003464703335e-05, + "loss": 0.3598, + "step": 4980000 + }, + { + "epoch": 33.70303702901689, + "grad_norm": 0.377130389213562, + "learning_rate": 4.662969629709831e-05, + "loss": 0.3604, + "step": 4980500 + }, + { + "epoch": 33.70642052836726, + "grad_norm": 0.35038310289382935, + "learning_rate": 4.6629357947163274e-05, + "loss": 0.3608, + "step": 4981000 + }, + { + "epoch": 33.709804027717624, + "grad_norm": 0.36757394671440125, + "learning_rate": 4.662901959722824e-05, + "loss": 0.3602, + "step": 4981500 + }, + { + "epoch": 33.713187527067994, + "grad_norm": 0.3615155816078186, + "learning_rate": 4.6628681247293205e-05, + "loss": 0.3607, + "step": 4982000 + }, + { + "epoch": 33.716571026418364, + "grad_norm": 0.37312600016593933, + "learning_rate": 4.662834289735817e-05, + "loss": 0.3587, + "step": 4982500 + }, + { + "epoch": 33.71995452576873, + "grad_norm": 0.3604985475540161, + "learning_rate": 4.662800454742313e-05, + "loss": 0.3593, + "step": 4983000 + }, + { + "epoch": 33.7233380251191, + "grad_norm": 0.38975080847740173, + "learning_rate": 4.662766619748809e-05, + "loss": 0.3603, + "step": 4983500 + }, + { + "epoch": 33.72672152446947, + "grad_norm": 0.36984366178512573, + "learning_rate": 4.662732784755305e-05, + "loss": 0.3598, + "step": 4984000 + }, + { + "epoch": 33.73010502381984, + "grad_norm": 0.3986571729183197, + "learning_rate": 4.6626989497618015e-05, + "loss": 0.3606, + "step": 4984500 + }, + { + "epoch": 33.7334885231702, + "grad_norm": 0.41753271222114563, + "learning_rate": 4.6626651147682984e-05, + "loss": 0.3604, + "step": 4985000 + }, + { + "epoch": 33.73687202252057, + "grad_norm": 0.35058414936065674, + "learning_rate": 4.6626312797747946e-05, + "loss": 0.3602, + "step": 4985500 + }, + { + "epoch": 33.74025552187094, + "grad_norm": 0.3672487139701843, + "learning_rate": 4.662597444781291e-05, + "loss": 0.3601, + "step": 4986000 + }, + { + "epoch": 33.743639021221306, + "grad_norm": 0.43293723464012146, + "learning_rate": 4.662563609787787e-05, + "loss": 0.3594, + "step": 4986500 + }, + { + "epoch": 33.747022520571676, + "grad_norm": 0.3658154308795929, + "learning_rate": 4.662529774794284e-05, + "loss": 0.3609, + "step": 4987000 + }, + { + "epoch": 33.750406019922046, + "grad_norm": 0.3655095100402832, + "learning_rate": 4.66249593980078e-05, + "loss": 0.361, + "step": 4987500 + }, + { + "epoch": 33.75378951927241, + "grad_norm": 0.40919873118400574, + "learning_rate": 4.662462104807276e-05, + "loss": 0.3618, + "step": 4988000 + }, + { + "epoch": 33.75717301862278, + "grad_norm": 0.3246718943119049, + "learning_rate": 4.662428269813772e-05, + "loss": 0.3581, + "step": 4988500 + }, + { + "epoch": 33.76055651797315, + "grad_norm": 0.37892502546310425, + "learning_rate": 4.662394434820269e-05, + "loss": 0.3603, + "step": 4989000 + }, + { + "epoch": 33.76394001732352, + "grad_norm": 0.3403722047805786, + "learning_rate": 4.662360599826765e-05, + "loss": 0.3582, + "step": 4989500 + }, + { + "epoch": 33.76732351667388, + "grad_norm": 0.352461040019989, + "learning_rate": 4.662326764833261e-05, + "loss": 0.3607, + "step": 4990000 + }, + { + "epoch": 33.770707016024254, + "grad_norm": 0.38694116473197937, + "learning_rate": 4.6622929298397574e-05, + "loss": 0.3601, + "step": 4990500 + }, + { + "epoch": 33.774090515374624, + "grad_norm": 0.3797401189804077, + "learning_rate": 4.662259094846254e-05, + "loss": 0.3607, + "step": 4991000 + }, + { + "epoch": 33.77747401472499, + "grad_norm": 0.3736964464187622, + "learning_rate": 4.6622252598527505e-05, + "loss": 0.3601, + "step": 4991500 + }, + { + "epoch": 33.78085751407536, + "grad_norm": 0.3719107508659363, + "learning_rate": 4.662191424859247e-05, + "loss": 0.3607, + "step": 4992000 + }, + { + "epoch": 33.78424101342573, + "grad_norm": 0.410574734210968, + "learning_rate": 4.662157589865743e-05, + "loss": 0.3598, + "step": 4992500 + }, + { + "epoch": 33.78762451277609, + "grad_norm": 0.3776076138019562, + "learning_rate": 4.662123754872239e-05, + "loss": 0.3598, + "step": 4993000 + }, + { + "epoch": 33.79100801212646, + "grad_norm": 0.3880300521850586, + "learning_rate": 4.6620899198787354e-05, + "loss": 0.359, + "step": 4993500 + }, + { + "epoch": 33.79439151147683, + "grad_norm": 0.36968597769737244, + "learning_rate": 4.6620560848852316e-05, + "loss": 0.3591, + "step": 4994000 + }, + { + "epoch": 33.797775010827195, + "grad_norm": 0.357287734746933, + "learning_rate": 4.6620222498917285e-05, + "loss": 0.359, + "step": 4994500 + }, + { + "epoch": 33.801158510177565, + "grad_norm": 0.36389797925949097, + "learning_rate": 4.661988414898225e-05, + "loss": 0.3608, + "step": 4995000 + }, + { + "epoch": 33.804542009527935, + "grad_norm": 0.3828994631767273, + "learning_rate": 4.661954579904721e-05, + "loss": 0.3591, + "step": 4995500 + }, + { + "epoch": 33.807925508878306, + "grad_norm": 0.3847697973251343, + "learning_rate": 4.661920744911217e-05, + "loss": 0.3595, + "step": 4996000 + }, + { + "epoch": 33.81130900822867, + "grad_norm": 0.3801291584968567, + "learning_rate": 4.661886909917714e-05, + "loss": 0.3595, + "step": 4996500 + }, + { + "epoch": 33.81469250757904, + "grad_norm": 0.3958672285079956, + "learning_rate": 4.66185307492421e-05, + "loss": 0.3608, + "step": 4997000 + }, + { + "epoch": 33.81807600692941, + "grad_norm": 0.36948850750923157, + "learning_rate": 4.661819239930706e-05, + "loss": 0.3622, + "step": 4997500 + }, + { + "epoch": 33.82145950627977, + "grad_norm": 0.3512585461139679, + "learning_rate": 4.661785404937202e-05, + "loss": 0.3595, + "step": 4998000 + }, + { + "epoch": 33.82484300563014, + "grad_norm": 0.38217779994010925, + "learning_rate": 4.661751569943699e-05, + "loss": 0.3593, + "step": 4998500 + }, + { + "epoch": 33.82822650498051, + "grad_norm": 0.35705459117889404, + "learning_rate": 4.661717734950195e-05, + "loss": 0.3605, + "step": 4999000 + }, + { + "epoch": 33.83161000433088, + "grad_norm": 0.4054109752178192, + "learning_rate": 4.661683899956691e-05, + "loss": 0.3606, + "step": 4999500 + }, + { + "epoch": 33.83499350368125, + "grad_norm": 0.37336549162864685, + "learning_rate": 4.6616500649631875e-05, + "loss": 0.3588, + "step": 5000000 + }, + { + "epoch": 33.83837700303162, + "grad_norm": 0.4157043993473053, + "learning_rate": 4.6616162299696844e-05, + "loss": 0.3609, + "step": 5000500 + }, + { + "epoch": 33.84176050238198, + "grad_norm": 0.3988616466522217, + "learning_rate": 4.6615823949761806e-05, + "loss": 0.3601, + "step": 5001000 + }, + { + "epoch": 33.84514400173235, + "grad_norm": 0.3774484395980835, + "learning_rate": 4.661548559982677e-05, + "loss": 0.3596, + "step": 5001500 + }, + { + "epoch": 33.84852750108272, + "grad_norm": 0.37425515055656433, + "learning_rate": 4.661514724989173e-05, + "loss": 0.3598, + "step": 5002000 + }, + { + "epoch": 33.85191100043309, + "grad_norm": 0.40442442893981934, + "learning_rate": 4.661480889995669e-05, + "loss": 0.3613, + "step": 5002500 + }, + { + "epoch": 33.855294499783454, + "grad_norm": 0.40121543407440186, + "learning_rate": 4.6614470550021654e-05, + "loss": 0.3605, + "step": 5003000 + }, + { + "epoch": 33.858677999133825, + "grad_norm": 0.3543761968612671, + "learning_rate": 4.6614132200086616e-05, + "loss": 0.3587, + "step": 5003500 + }, + { + "epoch": 33.862061498484195, + "grad_norm": 0.4035884141921997, + "learning_rate": 4.6613793850151585e-05, + "loss": 0.3598, + "step": 5004000 + }, + { + "epoch": 33.86544499783456, + "grad_norm": 0.40483883023262024, + "learning_rate": 4.661345550021655e-05, + "loss": 0.3595, + "step": 5004500 + }, + { + "epoch": 33.86882849718493, + "grad_norm": 0.36776548624038696, + "learning_rate": 4.661311715028151e-05, + "loss": 0.3595, + "step": 5005000 + }, + { + "epoch": 33.8722119965353, + "grad_norm": 0.35366418957710266, + "learning_rate": 4.661277880034647e-05, + "loss": 0.3597, + "step": 5005500 + }, + { + "epoch": 33.87559549588566, + "grad_norm": 0.36323100328445435, + "learning_rate": 4.661244045041144e-05, + "loss": 0.3592, + "step": 5006000 + }, + { + "epoch": 33.87897899523603, + "grad_norm": 0.3393199145793915, + "learning_rate": 4.66121021004764e-05, + "loss": 0.3605, + "step": 5006500 + }, + { + "epoch": 33.8823624945864, + "grad_norm": 0.3459985852241516, + "learning_rate": 4.661176375054136e-05, + "loss": 0.3597, + "step": 5007000 + }, + { + "epoch": 33.885745993936766, + "grad_norm": 0.37933433055877686, + "learning_rate": 4.661142540060632e-05, + "loss": 0.3595, + "step": 5007500 + }, + { + "epoch": 33.889129493287136, + "grad_norm": 0.39216548204421997, + "learning_rate": 4.661108705067129e-05, + "loss": 0.3596, + "step": 5008000 + }, + { + "epoch": 33.892512992637506, + "grad_norm": 0.363120436668396, + "learning_rate": 4.661074870073625e-05, + "loss": 0.3596, + "step": 5008500 + }, + { + "epoch": 33.89589649198788, + "grad_norm": 0.4088318347930908, + "learning_rate": 4.661041035080121e-05, + "loss": 0.3598, + "step": 5009000 + }, + { + "epoch": 33.89927999133824, + "grad_norm": 0.3933384418487549, + "learning_rate": 4.6610072000866175e-05, + "loss": 0.3586, + "step": 5009500 + }, + { + "epoch": 33.90266349068861, + "grad_norm": 0.3848974406719208, + "learning_rate": 4.6609733650931144e-05, + "loss": 0.3586, + "step": 5010000 + }, + { + "epoch": 33.90604699003898, + "grad_norm": 0.36713850498199463, + "learning_rate": 4.6609395300996106e-05, + "loss": 0.3624, + "step": 5010500 + }, + { + "epoch": 33.909430489389344, + "grad_norm": 0.36642223596572876, + "learning_rate": 4.660905695106107e-05, + "loss": 0.3597, + "step": 5011000 + }, + { + "epoch": 33.912813988739714, + "grad_norm": 0.40728092193603516, + "learning_rate": 4.660871860112603e-05, + "loss": 0.3602, + "step": 5011500 + }, + { + "epoch": 33.916197488090084, + "grad_norm": 0.3996118903160095, + "learning_rate": 4.660838025119099e-05, + "loss": 0.3616, + "step": 5012000 + }, + { + "epoch": 33.91958098744045, + "grad_norm": 0.41957518458366394, + "learning_rate": 4.6608041901255955e-05, + "loss": 0.3601, + "step": 5012500 + }, + { + "epoch": 33.92296448679082, + "grad_norm": 0.35961875319480896, + "learning_rate": 4.660770355132092e-05, + "loss": 0.36, + "step": 5013000 + }, + { + "epoch": 33.92634798614119, + "grad_norm": 0.42392638325691223, + "learning_rate": 4.6607365201385886e-05, + "loss": 0.3596, + "step": 5013500 + }, + { + "epoch": 33.92973148549156, + "grad_norm": 0.3994753658771515, + "learning_rate": 4.660702685145085e-05, + "loss": 0.36, + "step": 5014000 + }, + { + "epoch": 33.93311498484192, + "grad_norm": 0.376228392124176, + "learning_rate": 4.660668850151581e-05, + "loss": 0.3597, + "step": 5014500 + }, + { + "epoch": 33.93649848419229, + "grad_norm": 0.3853345513343811, + "learning_rate": 4.660635015158077e-05, + "loss": 0.3607, + "step": 5015000 + }, + { + "epoch": 33.93988198354266, + "grad_norm": 0.36617282032966614, + "learning_rate": 4.660601180164574e-05, + "loss": 0.3604, + "step": 5015500 + }, + { + "epoch": 33.943265482893025, + "grad_norm": 0.37868422269821167, + "learning_rate": 4.66056734517107e-05, + "loss": 0.3613, + "step": 5016000 + }, + { + "epoch": 33.946648982243396, + "grad_norm": 0.34545958042144775, + "learning_rate": 4.660533510177566e-05, + "loss": 0.3599, + "step": 5016500 + }, + { + "epoch": 33.950032481593766, + "grad_norm": 0.3555082380771637, + "learning_rate": 4.660499675184062e-05, + "loss": 0.3605, + "step": 5017000 + }, + { + "epoch": 33.95341598094413, + "grad_norm": 0.3674757182598114, + "learning_rate": 4.660465840190559e-05, + "loss": 0.3597, + "step": 5017500 + }, + { + "epoch": 33.9567994802945, + "grad_norm": 0.371110200881958, + "learning_rate": 4.660432005197055e-05, + "loss": 0.3596, + "step": 5018000 + }, + { + "epoch": 33.96018297964487, + "grad_norm": 0.3983413875102997, + "learning_rate": 4.6603981702035514e-05, + "loss": 0.3594, + "step": 5018500 + }, + { + "epoch": 33.96356647899523, + "grad_norm": 0.37326234579086304, + "learning_rate": 4.6603643352100476e-05, + "loss": 0.3581, + "step": 5019000 + }, + { + "epoch": 33.9669499783456, + "grad_norm": 0.3510463237762451, + "learning_rate": 4.6603305002165445e-05, + "loss": 0.3601, + "step": 5019500 + }, + { + "epoch": 33.970333477695974, + "grad_norm": 0.3734467923641205, + "learning_rate": 4.660296665223041e-05, + "loss": 0.3597, + "step": 5020000 + }, + { + "epoch": 33.973716977046344, + "grad_norm": 0.3781816363334656, + "learning_rate": 4.660262830229537e-05, + "loss": 0.3592, + "step": 5020500 + }, + { + "epoch": 33.97710047639671, + "grad_norm": 0.35887932777404785, + "learning_rate": 4.660228995236033e-05, + "loss": 0.3604, + "step": 5021000 + }, + { + "epoch": 33.98048397574708, + "grad_norm": 0.38643041253089905, + "learning_rate": 4.6601951602425293e-05, + "loss": 0.3591, + "step": 5021500 + }, + { + "epoch": 33.98386747509745, + "grad_norm": 0.3732677102088928, + "learning_rate": 4.6601613252490256e-05, + "loss": 0.3607, + "step": 5022000 + }, + { + "epoch": 33.98725097444781, + "grad_norm": 0.392314612865448, + "learning_rate": 4.660127490255522e-05, + "loss": 0.3609, + "step": 5022500 + }, + { + "epoch": 33.99063447379818, + "grad_norm": 0.3450537323951721, + "learning_rate": 4.660093655262019e-05, + "loss": 0.3605, + "step": 5023000 + }, + { + "epoch": 33.99401797314855, + "grad_norm": 0.3555243909358978, + "learning_rate": 4.660059820268515e-05, + "loss": 0.3595, + "step": 5023500 + }, + { + "epoch": 33.997401472498915, + "grad_norm": 0.3801630437374115, + "learning_rate": 4.660025985275011e-05, + "loss": 0.3599, + "step": 5024000 + }, + { + "epoch": 34.0, + "eval_accuracy": 0.862768993043337, + "eval_loss": 0.556894838809967, + "eval_runtime": 3349.7517, + "eval_samples_per_second": 86.796, + "eval_steps_per_second": 5.425, + "step": 5024384 + }, + { + "epoch": 34.000784971849285, + "grad_norm": 0.3564847707748413, + "learning_rate": 4.659992150281507e-05, + "loss": 0.3599, + "step": 5024500 + }, + { + "epoch": 34.004168471199655, + "grad_norm": 0.38413703441619873, + "learning_rate": 4.659958315288004e-05, + "loss": 0.3566, + "step": 5025000 + }, + { + "epoch": 34.00755197055002, + "grad_norm": 0.39659664034843445, + "learning_rate": 4.6599244802945004e-05, + "loss": 0.3583, + "step": 5025500 + }, + { + "epoch": 34.01093546990039, + "grad_norm": 0.3901703357696533, + "learning_rate": 4.659890645300996e-05, + "loss": 0.3584, + "step": 5026000 + }, + { + "epoch": 34.01431896925076, + "grad_norm": 0.3614691197872162, + "learning_rate": 4.659856810307492e-05, + "loss": 0.3565, + "step": 5026500 + }, + { + "epoch": 34.01770246860113, + "grad_norm": 0.35379400849342346, + "learning_rate": 4.659822975313989e-05, + "loss": 0.3554, + "step": 5027000 + }, + { + "epoch": 34.02108596795149, + "grad_norm": 0.37780022621154785, + "learning_rate": 4.659789140320485e-05, + "loss": 0.359, + "step": 5027500 + }, + { + "epoch": 34.02446946730186, + "grad_norm": 0.4037107229232788, + "learning_rate": 4.6597553053269815e-05, + "loss": 0.3585, + "step": 5028000 + }, + { + "epoch": 34.02785296665223, + "grad_norm": 0.3732971251010895, + "learning_rate": 4.659721470333478e-05, + "loss": 0.3581, + "step": 5028500 + }, + { + "epoch": 34.031236466002596, + "grad_norm": 0.4241940379142761, + "learning_rate": 4.6596876353399746e-05, + "loss": 0.3581, + "step": 5029000 + }, + { + "epoch": 34.03461996535297, + "grad_norm": 0.4099807143211365, + "learning_rate": 4.659653800346471e-05, + "loss": 0.3557, + "step": 5029500 + }, + { + "epoch": 34.03800346470334, + "grad_norm": 0.3990079462528229, + "learning_rate": 4.659619965352967e-05, + "loss": 0.3582, + "step": 5030000 + }, + { + "epoch": 34.0413869640537, + "grad_norm": 0.38755473494529724, + "learning_rate": 4.659586130359463e-05, + "loss": 0.3583, + "step": 5030500 + }, + { + "epoch": 34.04477046340407, + "grad_norm": 0.376600056886673, + "learning_rate": 4.6595522953659594e-05, + "loss": 0.3573, + "step": 5031000 + }, + { + "epoch": 34.04815396275444, + "grad_norm": 0.39983510971069336, + "learning_rate": 4.6595184603724556e-05, + "loss": 0.3579, + "step": 5031500 + }, + { + "epoch": 34.051537462104804, + "grad_norm": 0.4000255763530731, + "learning_rate": 4.659484625378952e-05, + "loss": 0.3584, + "step": 5032000 + }, + { + "epoch": 34.054920961455174, + "grad_norm": 0.39626163244247437, + "learning_rate": 4.659450790385449e-05, + "loss": 0.359, + "step": 5032500 + }, + { + "epoch": 34.058304460805545, + "grad_norm": 0.37869513034820557, + "learning_rate": 4.659416955391945e-05, + "loss": 0.358, + "step": 5033000 + }, + { + "epoch": 34.061687960155915, + "grad_norm": 0.33970946073532104, + "learning_rate": 4.659383120398441e-05, + "loss": 0.3576, + "step": 5033500 + }, + { + "epoch": 34.06507145950628, + "grad_norm": 0.40400931239128113, + "learning_rate": 4.6593492854049374e-05, + "loss": 0.3565, + "step": 5034000 + }, + { + "epoch": 34.06845495885665, + "grad_norm": 0.38513997197151184, + "learning_rate": 4.659315450411434e-05, + "loss": 0.3576, + "step": 5034500 + }, + { + "epoch": 34.07183845820702, + "grad_norm": 0.376578152179718, + "learning_rate": 4.6592816154179305e-05, + "loss": 0.3588, + "step": 5035000 + }, + { + "epoch": 34.07522195755738, + "grad_norm": 0.4014889895915985, + "learning_rate": 4.659247780424426e-05, + "loss": 0.3593, + "step": 5035500 + }, + { + "epoch": 34.07860545690775, + "grad_norm": 0.3878783583641052, + "learning_rate": 4.659213945430922e-05, + "loss": 0.359, + "step": 5036000 + }, + { + "epoch": 34.08198895625812, + "grad_norm": 0.3966333270072937, + "learning_rate": 4.659180110437419e-05, + "loss": 0.3586, + "step": 5036500 + }, + { + "epoch": 34.085372455608486, + "grad_norm": 0.356948584318161, + "learning_rate": 4.659146275443915e-05, + "loss": 0.3594, + "step": 5037000 + }, + { + "epoch": 34.088755954958856, + "grad_norm": 0.43688029050827026, + "learning_rate": 4.6591124404504115e-05, + "loss": 0.3598, + "step": 5037500 + }, + { + "epoch": 34.092139454309226, + "grad_norm": 0.34125515818595886, + "learning_rate": 4.659078605456908e-05, + "loss": 0.358, + "step": 5038000 + }, + { + "epoch": 34.09552295365959, + "grad_norm": 0.403870165348053, + "learning_rate": 4.6590447704634046e-05, + "loss": 0.3578, + "step": 5038500 + }, + { + "epoch": 34.09890645300996, + "grad_norm": 0.42687666416168213, + "learning_rate": 4.659010935469901e-05, + "loss": 0.3579, + "step": 5039000 + }, + { + "epoch": 34.10228995236033, + "grad_norm": 0.37997591495513916, + "learning_rate": 4.658977100476397e-05, + "loss": 0.3583, + "step": 5039500 + }, + { + "epoch": 34.1056734517107, + "grad_norm": 0.37498998641967773, + "learning_rate": 4.658943265482893e-05, + "loss": 0.3586, + "step": 5040000 + }, + { + "epoch": 34.10905695106106, + "grad_norm": 0.39867478609085083, + "learning_rate": 4.6589094304893895e-05, + "loss": 0.3583, + "step": 5040500 + }, + { + "epoch": 34.112440450411434, + "grad_norm": 0.38414129614830017, + "learning_rate": 4.658875595495886e-05, + "loss": 0.3578, + "step": 5041000 + }, + { + "epoch": 34.115823949761804, + "grad_norm": 0.3924691379070282, + "learning_rate": 4.658841760502382e-05, + "loss": 0.3592, + "step": 5041500 + }, + { + "epoch": 34.11920744911217, + "grad_norm": 0.3739098608493805, + "learning_rate": 4.658807925508879e-05, + "loss": 0.3579, + "step": 5042000 + }, + { + "epoch": 34.12259094846254, + "grad_norm": 0.3583349883556366, + "learning_rate": 4.658774090515375e-05, + "loss": 0.3591, + "step": 5042500 + }, + { + "epoch": 34.12597444781291, + "grad_norm": 0.40906092524528503, + "learning_rate": 4.658740255521871e-05, + "loss": 0.3594, + "step": 5043000 + }, + { + "epoch": 34.12935794716327, + "grad_norm": 0.33160528540611267, + "learning_rate": 4.6587064205283674e-05, + "loss": 0.3577, + "step": 5043500 + }, + { + "epoch": 34.13274144651364, + "grad_norm": 0.41338586807250977, + "learning_rate": 4.6586725855348636e-05, + "loss": 0.3586, + "step": 5044000 + }, + { + "epoch": 34.13612494586401, + "grad_norm": 0.3884548246860504, + "learning_rate": 4.6586387505413605e-05, + "loss": 0.3604, + "step": 5044500 + }, + { + "epoch": 34.13950844521438, + "grad_norm": 0.3763774037361145, + "learning_rate": 4.658604915547856e-05, + "loss": 0.357, + "step": 5045000 + }, + { + "epoch": 34.142891944564745, + "grad_norm": 0.381849080324173, + "learning_rate": 4.658571080554352e-05, + "loss": 0.3595, + "step": 5045500 + }, + { + "epoch": 34.146275443915115, + "grad_norm": 0.39528530836105347, + "learning_rate": 4.658537245560849e-05, + "loss": 0.3591, + "step": 5046000 + }, + { + "epoch": 34.149658943265486, + "grad_norm": 0.36974236369132996, + "learning_rate": 4.6585034105673454e-05, + "loss": 0.359, + "step": 5046500 + }, + { + "epoch": 34.15304244261585, + "grad_norm": 0.3644038736820221, + "learning_rate": 4.6584695755738416e-05, + "loss": 0.36, + "step": 5047000 + }, + { + "epoch": 34.15642594196622, + "grad_norm": 0.3983452618122101, + "learning_rate": 4.658435740580338e-05, + "loss": 0.3585, + "step": 5047500 + }, + { + "epoch": 34.15980944131659, + "grad_norm": 0.3650473952293396, + "learning_rate": 4.658401905586835e-05, + "loss": 0.3591, + "step": 5048000 + }, + { + "epoch": 34.16319294066695, + "grad_norm": 0.37225234508514404, + "learning_rate": 4.658368070593331e-05, + "loss": 0.3594, + "step": 5048500 + }, + { + "epoch": 34.16657644001732, + "grad_norm": 0.41506242752075195, + "learning_rate": 4.658334235599827e-05, + "loss": 0.3586, + "step": 5049000 + }, + { + "epoch": 34.16995993936769, + "grad_norm": 0.3849523067474365, + "learning_rate": 4.658300400606323e-05, + "loss": 0.3576, + "step": 5049500 + }, + { + "epoch": 34.17334343871806, + "grad_norm": 0.4018872380256653, + "learning_rate": 4.6582665656128195e-05, + "loss": 0.3579, + "step": 5050000 + }, + { + "epoch": 34.17672693806843, + "grad_norm": 0.3655732274055481, + "learning_rate": 4.658232730619316e-05, + "loss": 0.3578, + "step": 5050500 + }, + { + "epoch": 34.1801104374188, + "grad_norm": 0.3754114508628845, + "learning_rate": 4.658198895625812e-05, + "loss": 0.3585, + "step": 5051000 + }, + { + "epoch": 34.18349393676917, + "grad_norm": 0.37415942549705505, + "learning_rate": 4.658165060632308e-05, + "loss": 0.3587, + "step": 5051500 + }, + { + "epoch": 34.18687743611953, + "grad_norm": 0.35716935992240906, + "learning_rate": 4.658131225638805e-05, + "loss": 0.3593, + "step": 5052000 + }, + { + "epoch": 34.1902609354699, + "grad_norm": 0.3869902789592743, + "learning_rate": 4.658097390645301e-05, + "loss": 0.36, + "step": 5052500 + }, + { + "epoch": 34.19364443482027, + "grad_norm": 0.355259507894516, + "learning_rate": 4.6580635556517975e-05, + "loss": 0.3597, + "step": 5053000 + }, + { + "epoch": 34.197027934170634, + "grad_norm": 0.3885345458984375, + "learning_rate": 4.658029720658294e-05, + "loss": 0.3578, + "step": 5053500 + }, + { + "epoch": 34.200411433521005, + "grad_norm": 0.37546995282173157, + "learning_rate": 4.6579958856647906e-05, + "loss": 0.3601, + "step": 5054000 + }, + { + "epoch": 34.203794932871375, + "grad_norm": 0.39922034740448, + "learning_rate": 4.657962050671287e-05, + "loss": 0.3573, + "step": 5054500 + }, + { + "epoch": 34.20717843222174, + "grad_norm": 0.37996113300323486, + "learning_rate": 4.657928215677782e-05, + "loss": 0.3583, + "step": 5055000 + }, + { + "epoch": 34.21056193157211, + "grad_norm": 0.39103591442108154, + "learning_rate": 4.657894380684279e-05, + "loss": 0.3586, + "step": 5055500 + }, + { + "epoch": 34.21394543092248, + "grad_norm": 0.3796161115169525, + "learning_rate": 4.6578605456907754e-05, + "loss": 0.3595, + "step": 5056000 + }, + { + "epoch": 34.21732893027284, + "grad_norm": 0.3596300184726715, + "learning_rate": 4.6578267106972717e-05, + "loss": 0.3579, + "step": 5056500 + }, + { + "epoch": 34.22071242962321, + "grad_norm": 0.38669246435165405, + "learning_rate": 4.657792875703768e-05, + "loss": 0.3594, + "step": 5057000 + }, + { + "epoch": 34.22409592897358, + "grad_norm": 0.419877290725708, + "learning_rate": 4.657759040710265e-05, + "loss": 0.3579, + "step": 5057500 + }, + { + "epoch": 34.22747942832395, + "grad_norm": 0.40100720524787903, + "learning_rate": 4.657725205716761e-05, + "loss": 0.359, + "step": 5058000 + }, + { + "epoch": 34.230862927674316, + "grad_norm": 0.4499681293964386, + "learning_rate": 4.657691370723257e-05, + "loss": 0.359, + "step": 5058500 + }, + { + "epoch": 34.23424642702469, + "grad_norm": 0.3997453451156616, + "learning_rate": 4.6576575357297534e-05, + "loss": 0.3589, + "step": 5059000 + }, + { + "epoch": 34.23762992637506, + "grad_norm": 0.3672904968261719, + "learning_rate": 4.6576237007362496e-05, + "loss": 0.3582, + "step": 5059500 + }, + { + "epoch": 34.24101342572542, + "grad_norm": 0.39281371235847473, + "learning_rate": 4.657589865742746e-05, + "loss": 0.3597, + "step": 5060000 + }, + { + "epoch": 34.24439692507579, + "grad_norm": 0.3908872902393341, + "learning_rate": 4.657556030749242e-05, + "loss": 0.3591, + "step": 5060500 + }, + { + "epoch": 34.24778042442616, + "grad_norm": 0.3856948912143707, + "learning_rate": 4.657522195755738e-05, + "loss": 0.3574, + "step": 5061000 + }, + { + "epoch": 34.251163923776524, + "grad_norm": 0.389735609292984, + "learning_rate": 4.657488360762235e-05, + "loss": 0.3579, + "step": 5061500 + }, + { + "epoch": 34.254547423126894, + "grad_norm": 0.3987472653388977, + "learning_rate": 4.6574545257687313e-05, + "loss": 0.36, + "step": 5062000 + }, + { + "epoch": 34.257930922477264, + "grad_norm": 0.38841792941093445, + "learning_rate": 4.6574206907752276e-05, + "loss": 0.3576, + "step": 5062500 + }, + { + "epoch": 34.26131442182763, + "grad_norm": 0.3690048158168793, + "learning_rate": 4.657386855781724e-05, + "loss": 0.3586, + "step": 5063000 + }, + { + "epoch": 34.264697921178, + "grad_norm": 0.41382020711898804, + "learning_rate": 4.6573530207882207e-05, + "loss": 0.36, + "step": 5063500 + }, + { + "epoch": 34.26808142052837, + "grad_norm": 0.39931240677833557, + "learning_rate": 4.657319185794717e-05, + "loss": 0.3598, + "step": 5064000 + }, + { + "epoch": 34.27146491987874, + "grad_norm": 0.4021117687225342, + "learning_rate": 4.6572853508012124e-05, + "loss": 0.3575, + "step": 5064500 + }, + { + "epoch": 34.2748484192291, + "grad_norm": 0.3797292411327362, + "learning_rate": 4.657251515807709e-05, + "loss": 0.3583, + "step": 5065000 + }, + { + "epoch": 34.27823191857947, + "grad_norm": 0.39234450459480286, + "learning_rate": 4.6572176808142055e-05, + "loss": 0.3586, + "step": 5065500 + }, + { + "epoch": 34.28161541792984, + "grad_norm": 0.4292556643486023, + "learning_rate": 4.657183845820702e-05, + "loss": 0.3583, + "step": 5066000 + }, + { + "epoch": 34.284998917280205, + "grad_norm": 0.3685343861579895, + "learning_rate": 4.657150010827198e-05, + "loss": 0.3599, + "step": 5066500 + }, + { + "epoch": 34.288382416630576, + "grad_norm": 0.40842705965042114, + "learning_rate": 4.657116175833695e-05, + "loss": 0.3594, + "step": 5067000 + }, + { + "epoch": 34.291765915980946, + "grad_norm": 0.3621181547641754, + "learning_rate": 4.657082340840191e-05, + "loss": 0.359, + "step": 5067500 + }, + { + "epoch": 34.29514941533131, + "grad_norm": 0.3603003919124603, + "learning_rate": 4.657048505846687e-05, + "loss": 0.3597, + "step": 5068000 + }, + { + "epoch": 34.29853291468168, + "grad_norm": 0.3800334930419922, + "learning_rate": 4.6570146708531835e-05, + "loss": 0.3617, + "step": 5068500 + }, + { + "epoch": 34.30191641403205, + "grad_norm": 0.379797101020813, + "learning_rate": 4.65698083585968e-05, + "loss": 0.3588, + "step": 5069000 + }, + { + "epoch": 34.30529991338242, + "grad_norm": 0.3685661852359772, + "learning_rate": 4.656947000866176e-05, + "loss": 0.3595, + "step": 5069500 + }, + { + "epoch": 34.30868341273278, + "grad_norm": 0.3684749901294708, + "learning_rate": 4.656913165872672e-05, + "loss": 0.3574, + "step": 5070000 + }, + { + "epoch": 34.312066912083154, + "grad_norm": 0.3760967552661896, + "learning_rate": 4.656879330879168e-05, + "loss": 0.3594, + "step": 5070500 + }, + { + "epoch": 34.315450411433524, + "grad_norm": 0.3654816150665283, + "learning_rate": 4.656845495885665e-05, + "loss": 0.3593, + "step": 5071000 + }, + { + "epoch": 34.31883391078389, + "grad_norm": 0.3774130642414093, + "learning_rate": 4.6568116608921614e-05, + "loss": 0.3585, + "step": 5071500 + }, + { + "epoch": 34.32221741013426, + "grad_norm": 0.35825785994529724, + "learning_rate": 4.6567778258986576e-05, + "loss": 0.359, + "step": 5072000 + }, + { + "epoch": 34.32560090948463, + "grad_norm": 0.41758641600608826, + "learning_rate": 4.656743990905154e-05, + "loss": 0.3589, + "step": 5072500 + }, + { + "epoch": 34.32898440883499, + "grad_norm": 0.3906160295009613, + "learning_rate": 4.656710155911651e-05, + "loss": 0.3591, + "step": 5073000 + }, + { + "epoch": 34.33236790818536, + "grad_norm": 0.3692469894886017, + "learning_rate": 4.656676320918147e-05, + "loss": 0.3586, + "step": 5073500 + }, + { + "epoch": 34.33575140753573, + "grad_norm": 0.35179731249809265, + "learning_rate": 4.6566424859246425e-05, + "loss": 0.3596, + "step": 5074000 + }, + { + "epoch": 34.339134906886095, + "grad_norm": 0.3839186429977417, + "learning_rate": 4.6566086509311394e-05, + "loss": 0.3586, + "step": 5074500 + }, + { + "epoch": 34.342518406236465, + "grad_norm": 0.34909674525260925, + "learning_rate": 4.6565748159376356e-05, + "loss": 0.3595, + "step": 5075000 + }, + { + "epoch": 34.345901905586835, + "grad_norm": 0.3728690445423126, + "learning_rate": 4.656540980944132e-05, + "loss": 0.3568, + "step": 5075500 + }, + { + "epoch": 34.349285404937206, + "grad_norm": 0.3859647810459137, + "learning_rate": 4.656507145950628e-05, + "loss": 0.359, + "step": 5076000 + }, + { + "epoch": 34.35266890428757, + "grad_norm": 0.38348260521888733, + "learning_rate": 4.656473310957125e-05, + "loss": 0.3566, + "step": 5076500 + }, + { + "epoch": 34.35605240363794, + "grad_norm": 0.37911906838417053, + "learning_rate": 4.656439475963621e-05, + "loss": 0.3583, + "step": 5077000 + }, + { + "epoch": 34.35943590298831, + "grad_norm": 0.36353886127471924, + "learning_rate": 4.656405640970117e-05, + "loss": 0.3586, + "step": 5077500 + }, + { + "epoch": 34.36281940233867, + "grad_norm": 0.4255317449569702, + "learning_rate": 4.6563718059766135e-05, + "loss": 0.36, + "step": 5078000 + }, + { + "epoch": 34.36620290168904, + "grad_norm": 0.36470532417297363, + "learning_rate": 4.65633797098311e-05, + "loss": 0.3596, + "step": 5078500 + }, + { + "epoch": 34.36958640103941, + "grad_norm": 0.3753197193145752, + "learning_rate": 4.656304135989606e-05, + "loss": 0.3584, + "step": 5079000 + }, + { + "epoch": 34.372969900389776, + "grad_norm": 0.40687665343284607, + "learning_rate": 4.656270300996102e-05, + "loss": 0.3584, + "step": 5079500 + }, + { + "epoch": 34.37635339974015, + "grad_norm": 0.37256020307540894, + "learning_rate": 4.6562364660025984e-05, + "loss": 0.3579, + "step": 5080000 + }, + { + "epoch": 34.37973689909052, + "grad_norm": 0.41435477137565613, + "learning_rate": 4.656202631009095e-05, + "loss": 0.3594, + "step": 5080500 + }, + { + "epoch": 34.38312039844088, + "grad_norm": 0.36371898651123047, + "learning_rate": 4.6561687960155915e-05, + "loss": 0.3597, + "step": 5081000 + }, + { + "epoch": 34.38650389779125, + "grad_norm": 0.4099665582180023, + "learning_rate": 4.656134961022088e-05, + "loss": 0.3594, + "step": 5081500 + }, + { + "epoch": 34.38988739714162, + "grad_norm": 0.37060919404029846, + "learning_rate": 4.656101126028584e-05, + "loss": 0.3596, + "step": 5082000 + }, + { + "epoch": 34.39327089649199, + "grad_norm": 0.38851264119148254, + "learning_rate": 4.656067291035081e-05, + "loss": 0.358, + "step": 5082500 + }, + { + "epoch": 34.396654395842354, + "grad_norm": 0.3856896162033081, + "learning_rate": 4.656033456041577e-05, + "loss": 0.3598, + "step": 5083000 + }, + { + "epoch": 34.400037895192725, + "grad_norm": 0.3771379590034485, + "learning_rate": 4.6559996210480725e-05, + "loss": 0.3606, + "step": 5083500 + }, + { + "epoch": 34.403421394543095, + "grad_norm": 0.3658657670021057, + "learning_rate": 4.6559657860545694e-05, + "loss": 0.3586, + "step": 5084000 + }, + { + "epoch": 34.40680489389346, + "grad_norm": 0.3816579580307007, + "learning_rate": 4.6559319510610656e-05, + "loss": 0.3591, + "step": 5084500 + }, + { + "epoch": 34.41018839324383, + "grad_norm": 0.3742064833641052, + "learning_rate": 4.655898116067562e-05, + "loss": 0.3585, + "step": 5085000 + }, + { + "epoch": 34.4135718925942, + "grad_norm": 0.3874446153640747, + "learning_rate": 4.655864281074058e-05, + "loss": 0.3593, + "step": 5085500 + }, + { + "epoch": 34.41695539194456, + "grad_norm": 0.377986341714859, + "learning_rate": 4.655830446080555e-05, + "loss": 0.3608, + "step": 5086000 + }, + { + "epoch": 34.42033889129493, + "grad_norm": 0.39694201946258545, + "learning_rate": 4.655796611087051e-05, + "loss": 0.3593, + "step": 5086500 + }, + { + "epoch": 34.4237223906453, + "grad_norm": 0.3425546884536743, + "learning_rate": 4.6557627760935474e-05, + "loss": 0.3592, + "step": 5087000 + }, + { + "epoch": 34.427105889995666, + "grad_norm": 0.3719537854194641, + "learning_rate": 4.6557289411000436e-05, + "loss": 0.3599, + "step": 5087500 + }, + { + "epoch": 34.430489389346036, + "grad_norm": 0.37190452218055725, + "learning_rate": 4.65569510610654e-05, + "loss": 0.3595, + "step": 5088000 + }, + { + "epoch": 34.433872888696406, + "grad_norm": 0.4227867126464844, + "learning_rate": 4.655661271113036e-05, + "loss": 0.3588, + "step": 5088500 + }, + { + "epoch": 34.43725638804678, + "grad_norm": 0.3775515854358673, + "learning_rate": 4.655627436119532e-05, + "loss": 0.3596, + "step": 5089000 + }, + { + "epoch": 34.44063988739714, + "grad_norm": 0.3955373764038086, + "learning_rate": 4.6555936011260284e-05, + "loss": 0.3596, + "step": 5089500 + }, + { + "epoch": 34.44402338674751, + "grad_norm": 0.3818521201610565, + "learning_rate": 4.655559766132525e-05, + "loss": 0.3588, + "step": 5090000 + }, + { + "epoch": 34.44740688609788, + "grad_norm": 0.3896339535713196, + "learning_rate": 4.6555259311390215e-05, + "loss": 0.3575, + "step": 5090500 + }, + { + "epoch": 34.450790385448244, + "grad_norm": 0.34918251633644104, + "learning_rate": 4.655492096145518e-05, + "loss": 0.3599, + "step": 5091000 + }, + { + "epoch": 34.454173884798614, + "grad_norm": 0.3901742696762085, + "learning_rate": 4.655458261152014e-05, + "loss": 0.36, + "step": 5091500 + }, + { + "epoch": 34.457557384148984, + "grad_norm": 0.3786981701850891, + "learning_rate": 4.655424426158511e-05, + "loss": 0.3584, + "step": 5092000 + }, + { + "epoch": 34.46094088349935, + "grad_norm": 0.420971155166626, + "learning_rate": 4.655390591165007e-05, + "loss": 0.3582, + "step": 5092500 + }, + { + "epoch": 34.46432438284972, + "grad_norm": 0.3856641352176666, + "learning_rate": 4.6553567561715026e-05, + "loss": 0.3595, + "step": 5093000 + }, + { + "epoch": 34.46770788220009, + "grad_norm": 0.38781335949897766, + "learning_rate": 4.6553229211779995e-05, + "loss": 0.3597, + "step": 5093500 + }, + { + "epoch": 34.47109138155046, + "grad_norm": 0.3730041980743408, + "learning_rate": 4.655289086184496e-05, + "loss": 0.358, + "step": 5094000 + }, + { + "epoch": 34.47447488090082, + "grad_norm": 0.4017457067966461, + "learning_rate": 4.655255251190992e-05, + "loss": 0.3584, + "step": 5094500 + }, + { + "epoch": 34.47785838025119, + "grad_norm": 0.38262927532196045, + "learning_rate": 4.655221416197488e-05, + "loss": 0.3596, + "step": 5095000 + }, + { + "epoch": 34.48124187960156, + "grad_norm": 0.34982404112815857, + "learning_rate": 4.655187581203985e-05, + "loss": 0.36, + "step": 5095500 + }, + { + "epoch": 34.484625378951925, + "grad_norm": 0.39245760440826416, + "learning_rate": 4.655153746210481e-05, + "loss": 0.3588, + "step": 5096000 + }, + { + "epoch": 34.488008878302296, + "grad_norm": 0.4110981523990631, + "learning_rate": 4.6551199112169774e-05, + "loss": 0.3585, + "step": 5096500 + }, + { + "epoch": 34.491392377652666, + "grad_norm": 0.34749066829681396, + "learning_rate": 4.6550860762234736e-05, + "loss": 0.3589, + "step": 5097000 + }, + { + "epoch": 34.49477587700303, + "grad_norm": 0.3863259553909302, + "learning_rate": 4.65505224122997e-05, + "loss": 0.3609, + "step": 5097500 + }, + { + "epoch": 34.4981593763534, + "grad_norm": 0.35212135314941406, + "learning_rate": 4.655018406236466e-05, + "loss": 0.3597, + "step": 5098000 + }, + { + "epoch": 34.50154287570377, + "grad_norm": 0.38584738969802856, + "learning_rate": 4.654984571242962e-05, + "loss": 0.3578, + "step": 5098500 + }, + { + "epoch": 34.50492637505413, + "grad_norm": 0.35282817482948303, + "learning_rate": 4.6549507362494585e-05, + "loss": 0.3592, + "step": 5099000 + }, + { + "epoch": 34.5083098744045, + "grad_norm": 0.3928869068622589, + "learning_rate": 4.6549169012559554e-05, + "loss": 0.3589, + "step": 5099500 + }, + { + "epoch": 34.51169337375487, + "grad_norm": 0.394925594329834, + "learning_rate": 4.6548830662624516e-05, + "loss": 0.358, + "step": 5100000 + }, + { + "epoch": 34.515076873105244, + "grad_norm": 0.379443883895874, + "learning_rate": 4.654849231268948e-05, + "loss": 0.3578, + "step": 5100500 + }, + { + "epoch": 34.51846037245561, + "grad_norm": 0.37842902541160583, + "learning_rate": 4.654815396275444e-05, + "loss": 0.359, + "step": 5101000 + }, + { + "epoch": 34.52184387180598, + "grad_norm": 0.3890233635902405, + "learning_rate": 4.654781561281941e-05, + "loss": 0.3584, + "step": 5101500 + }, + { + "epoch": 34.52522737115635, + "grad_norm": 0.3421824872493744, + "learning_rate": 4.654747726288437e-05, + "loss": 0.3597, + "step": 5102000 + }, + { + "epoch": 34.52861087050671, + "grad_norm": 0.3568973243236542, + "learning_rate": 4.6547138912949327e-05, + "loss": 0.3594, + "step": 5102500 + }, + { + "epoch": 34.53199436985708, + "grad_norm": 0.3865303695201874, + "learning_rate": 4.6546800563014295e-05, + "loss": 0.3604, + "step": 5103000 + }, + { + "epoch": 34.53537786920745, + "grad_norm": 0.34883520007133484, + "learning_rate": 4.654646221307926e-05, + "loss": 0.3602, + "step": 5103500 + }, + { + "epoch": 34.538761368557815, + "grad_norm": 0.4210364818572998, + "learning_rate": 4.654612386314422e-05, + "loss": 0.3604, + "step": 5104000 + }, + { + "epoch": 34.542144867908185, + "grad_norm": 0.41963812708854675, + "learning_rate": 4.654578551320918e-05, + "loss": 0.3609, + "step": 5104500 + }, + { + "epoch": 34.545528367258555, + "grad_norm": 0.40737205743789673, + "learning_rate": 4.654544716327415e-05, + "loss": 0.3581, + "step": 5105000 + }, + { + "epoch": 34.54891186660892, + "grad_norm": 0.35566970705986023, + "learning_rate": 4.654510881333911e-05, + "loss": 0.3601, + "step": 5105500 + }, + { + "epoch": 34.55229536595929, + "grad_norm": 0.3322935402393341, + "learning_rate": 4.6544770463404075e-05, + "loss": 0.3584, + "step": 5106000 + }, + { + "epoch": 34.55567886530966, + "grad_norm": 0.3786003887653351, + "learning_rate": 4.654443211346904e-05, + "loss": 0.3599, + "step": 5106500 + }, + { + "epoch": 34.55906236466003, + "grad_norm": 0.36978060007095337, + "learning_rate": 4.6544093763534e-05, + "loss": 0.3592, + "step": 5107000 + }, + { + "epoch": 34.56244586401039, + "grad_norm": 0.3569822609424591, + "learning_rate": 4.654375541359896e-05, + "loss": 0.3575, + "step": 5107500 + }, + { + "epoch": 34.56582936336076, + "grad_norm": 0.3197338581085205, + "learning_rate": 4.6543417063663923e-05, + "loss": 0.3605, + "step": 5108000 + }, + { + "epoch": 34.56921286271113, + "grad_norm": 0.36754342913627625, + "learning_rate": 4.6543078713728886e-05, + "loss": 0.3599, + "step": 5108500 + }, + { + "epoch": 34.572596362061496, + "grad_norm": 0.42086338996887207, + "learning_rate": 4.6542740363793854e-05, + "loss": 0.3592, + "step": 5109000 + }, + { + "epoch": 34.57597986141187, + "grad_norm": 0.3892146050930023, + "learning_rate": 4.6542402013858817e-05, + "loss": 0.36, + "step": 5109500 + }, + { + "epoch": 34.57936336076224, + "grad_norm": 0.3757900297641754, + "learning_rate": 4.654206366392378e-05, + "loss": 0.3587, + "step": 5110000 + }, + { + "epoch": 34.5827468601126, + "grad_norm": 0.3483287990093231, + "learning_rate": 4.654172531398874e-05, + "loss": 0.359, + "step": 5110500 + }, + { + "epoch": 34.58613035946297, + "grad_norm": 0.3535486161708832, + "learning_rate": 4.654138696405371e-05, + "loss": 0.3602, + "step": 5111000 + }, + { + "epoch": 34.58951385881334, + "grad_norm": 0.387491911649704, + "learning_rate": 4.654104861411867e-05, + "loss": 0.3587, + "step": 5111500 + }, + { + "epoch": 34.592897358163704, + "grad_norm": 0.39275237917900085, + "learning_rate": 4.654071026418363e-05, + "loss": 0.3606, + "step": 5112000 + }, + { + "epoch": 34.596280857514074, + "grad_norm": 0.4202045202255249, + "learning_rate": 4.6540371914248596e-05, + "loss": 0.3603, + "step": 5112500 + }, + { + "epoch": 34.599664356864444, + "grad_norm": 0.3850978910923004, + "learning_rate": 4.654003356431356e-05, + "loss": 0.3591, + "step": 5113000 + }, + { + "epoch": 34.603047856214815, + "grad_norm": 0.4012092351913452, + "learning_rate": 4.653969521437852e-05, + "loss": 0.3584, + "step": 5113500 + }, + { + "epoch": 34.60643135556518, + "grad_norm": 0.37787535786628723, + "learning_rate": 4.653935686444348e-05, + "loss": 0.3593, + "step": 5114000 + }, + { + "epoch": 34.60981485491555, + "grad_norm": 0.3980405032634735, + "learning_rate": 4.6539018514508445e-05, + "loss": 0.3604, + "step": 5114500 + }, + { + "epoch": 34.61319835426592, + "grad_norm": 0.39192214608192444, + "learning_rate": 4.6538680164573413e-05, + "loss": 0.36, + "step": 5115000 + }, + { + "epoch": 34.61658185361628, + "grad_norm": 0.3803424537181854, + "learning_rate": 4.6538341814638376e-05, + "loss": 0.3605, + "step": 5115500 + }, + { + "epoch": 34.61996535296665, + "grad_norm": 0.32273218035697937, + "learning_rate": 4.653800346470334e-05, + "loss": 0.3601, + "step": 5116000 + }, + { + "epoch": 34.62334885231702, + "grad_norm": 0.36341169476509094, + "learning_rate": 4.65376651147683e-05, + "loss": 0.3602, + "step": 5116500 + }, + { + "epoch": 34.626732351667386, + "grad_norm": 0.3712911605834961, + "learning_rate": 4.653732676483326e-05, + "loss": 0.3602, + "step": 5117000 + }, + { + "epoch": 34.630115851017756, + "grad_norm": 0.35886746644973755, + "learning_rate": 4.6536988414898224e-05, + "loss": 0.3602, + "step": 5117500 + }, + { + "epoch": 34.633499350368126, + "grad_norm": 0.433685302734375, + "learning_rate": 4.6536650064963186e-05, + "loss": 0.3584, + "step": 5118000 + }, + { + "epoch": 34.63688284971849, + "grad_norm": 0.37122201919555664, + "learning_rate": 4.6536311715028155e-05, + "loss": 0.3593, + "step": 5118500 + }, + { + "epoch": 34.64026634906886, + "grad_norm": 0.35382482409477234, + "learning_rate": 4.653597336509312e-05, + "loss": 0.3606, + "step": 5119000 + }, + { + "epoch": 34.64364984841923, + "grad_norm": 0.3513053357601166, + "learning_rate": 4.653563501515808e-05, + "loss": 0.3602, + "step": 5119500 + }, + { + "epoch": 34.6470333477696, + "grad_norm": 0.3879770338535309, + "learning_rate": 4.653529666522304e-05, + "loss": 0.3596, + "step": 5120000 + }, + { + "epoch": 34.65041684711996, + "grad_norm": 0.3754001259803772, + "learning_rate": 4.653495831528801e-05, + "loss": 0.3591, + "step": 5120500 + }, + { + "epoch": 34.653800346470334, + "grad_norm": 0.3843109905719757, + "learning_rate": 4.653461996535297e-05, + "loss": 0.3601, + "step": 5121000 + }, + { + "epoch": 34.657183845820704, + "grad_norm": 0.38966867327690125, + "learning_rate": 4.653428161541793e-05, + "loss": 0.3589, + "step": 5121500 + }, + { + "epoch": 34.66056734517107, + "grad_norm": 0.3803916871547699, + "learning_rate": 4.653394326548289e-05, + "loss": 0.3589, + "step": 5122000 + }, + { + "epoch": 34.66395084452144, + "grad_norm": 0.3781992197036743, + "learning_rate": 4.653360491554786e-05, + "loss": 0.3585, + "step": 5122500 + }, + { + "epoch": 34.66733434387181, + "grad_norm": 0.3852325975894928, + "learning_rate": 4.653326656561282e-05, + "loss": 0.3599, + "step": 5123000 + }, + { + "epoch": 34.67071784322217, + "grad_norm": 0.37221723794937134, + "learning_rate": 4.653292821567778e-05, + "loss": 0.361, + "step": 5123500 + }, + { + "epoch": 34.67410134257254, + "grad_norm": 0.3999648094177246, + "learning_rate": 4.6532589865742745e-05, + "loss": 0.3607, + "step": 5124000 + }, + { + "epoch": 34.67748484192291, + "grad_norm": 0.3421521484851837, + "learning_rate": 4.6532251515807714e-05, + "loss": 0.3577, + "step": 5124500 + }, + { + "epoch": 34.68086834127328, + "grad_norm": 0.37870514392852783, + "learning_rate": 4.6531913165872676e-05, + "loss": 0.359, + "step": 5125000 + }, + { + "epoch": 34.684251840623645, + "grad_norm": 0.3588518500328064, + "learning_rate": 4.653157481593764e-05, + "loss": 0.3597, + "step": 5125500 + }, + { + "epoch": 34.687635339974015, + "grad_norm": 0.390232652425766, + "learning_rate": 4.65312364660026e-05, + "loss": 0.3595, + "step": 5126000 + }, + { + "epoch": 34.691018839324386, + "grad_norm": 0.38923120498657227, + "learning_rate": 4.653089811606756e-05, + "loss": 0.36, + "step": 5126500 + }, + { + "epoch": 34.69440233867475, + "grad_norm": 0.39458635449409485, + "learning_rate": 4.6530559766132525e-05, + "loss": 0.3603, + "step": 5127000 + }, + { + "epoch": 34.69778583802512, + "grad_norm": 0.4159116744995117, + "learning_rate": 4.653022141619749e-05, + "loss": 0.3592, + "step": 5127500 + }, + { + "epoch": 34.70116933737549, + "grad_norm": 0.3766118586063385, + "learning_rate": 4.6529883066262456e-05, + "loss": 0.3599, + "step": 5128000 + }, + { + "epoch": 34.70455283672585, + "grad_norm": 0.3492608964443207, + "learning_rate": 4.652954471632742e-05, + "loss": 0.359, + "step": 5128500 + }, + { + "epoch": 34.70793633607622, + "grad_norm": 0.4123823642730713, + "learning_rate": 4.652920636639238e-05, + "loss": 0.3585, + "step": 5129000 + }, + { + "epoch": 34.71131983542659, + "grad_norm": 0.3934713304042816, + "learning_rate": 4.652886801645734e-05, + "loss": 0.3584, + "step": 5129500 + }, + { + "epoch": 34.71470333477696, + "grad_norm": 0.34409138560295105, + "learning_rate": 4.652852966652231e-05, + "loss": 0.3589, + "step": 5130000 + }, + { + "epoch": 34.71808683412733, + "grad_norm": 0.3865695595741272, + "learning_rate": 4.652819131658727e-05, + "loss": 0.359, + "step": 5130500 + }, + { + "epoch": 34.7214703334777, + "grad_norm": 0.366634339094162, + "learning_rate": 4.652785296665223e-05, + "loss": 0.3604, + "step": 5131000 + }, + { + "epoch": 34.72485383282807, + "grad_norm": 0.4457561671733856, + "learning_rate": 4.652751461671719e-05, + "loss": 0.3595, + "step": 5131500 + }, + { + "epoch": 34.72823733217843, + "grad_norm": 0.36967164278030396, + "learning_rate": 4.652717626678216e-05, + "loss": 0.3612, + "step": 5132000 + }, + { + "epoch": 34.7316208315288, + "grad_norm": 0.37795010209083557, + "learning_rate": 4.652683791684712e-05, + "loss": 0.3599, + "step": 5132500 + }, + { + "epoch": 34.73500433087917, + "grad_norm": 0.38283535838127136, + "learning_rate": 4.6526499566912084e-05, + "loss": 0.361, + "step": 5133000 + }, + { + "epoch": 34.738387830229534, + "grad_norm": 0.40486639738082886, + "learning_rate": 4.6526161216977046e-05, + "loss": 0.3584, + "step": 5133500 + }, + { + "epoch": 34.741771329579905, + "grad_norm": 0.41568008065223694, + "learning_rate": 4.6525822867042015e-05, + "loss": 0.3585, + "step": 5134000 + }, + { + "epoch": 34.745154828930275, + "grad_norm": 0.3480258584022522, + "learning_rate": 4.652548451710698e-05, + "loss": 0.3591, + "step": 5134500 + }, + { + "epoch": 34.74853832828064, + "grad_norm": 0.3713112473487854, + "learning_rate": 4.652514616717194e-05, + "loss": 0.3602, + "step": 5135000 + }, + { + "epoch": 34.75192182763101, + "grad_norm": 0.36227697134017944, + "learning_rate": 4.65248078172369e-05, + "loss": 0.3595, + "step": 5135500 + }, + { + "epoch": 34.75530532698138, + "grad_norm": 0.3820270597934723, + "learning_rate": 4.652446946730186e-05, + "loss": 0.3595, + "step": 5136000 + }, + { + "epoch": 34.75868882633174, + "grad_norm": 0.3776843547821045, + "learning_rate": 4.6524131117366825e-05, + "loss": 0.3598, + "step": 5136500 + }, + { + "epoch": 34.76207232568211, + "grad_norm": 0.4049762189388275, + "learning_rate": 4.652379276743179e-05, + "loss": 0.3593, + "step": 5137000 + }, + { + "epoch": 34.76545582503248, + "grad_norm": 0.3922647535800934, + "learning_rate": 4.6523454417496756e-05, + "loss": 0.3593, + "step": 5137500 + }, + { + "epoch": 34.76883932438285, + "grad_norm": 0.3500220477581024, + "learning_rate": 4.652311606756172e-05, + "loss": 0.3594, + "step": 5138000 + }, + { + "epoch": 34.772222823733216, + "grad_norm": 0.36333972215652466, + "learning_rate": 4.652277771762668e-05, + "loss": 0.3601, + "step": 5138500 + }, + { + "epoch": 34.775606323083586, + "grad_norm": 0.3815446197986603, + "learning_rate": 4.652243936769164e-05, + "loss": 0.3593, + "step": 5139000 + }, + { + "epoch": 34.77898982243396, + "grad_norm": 0.387391060590744, + "learning_rate": 4.652210101775661e-05, + "loss": 0.3597, + "step": 5139500 + }, + { + "epoch": 34.78237332178432, + "grad_norm": 0.37375015020370483, + "learning_rate": 4.6521762667821574e-05, + "loss": 0.3584, + "step": 5140000 + }, + { + "epoch": 34.78575682113469, + "grad_norm": 0.335807204246521, + "learning_rate": 4.652142431788653e-05, + "loss": 0.3602, + "step": 5140500 + }, + { + "epoch": 34.78914032048506, + "grad_norm": 0.3519132137298584, + "learning_rate": 4.652108596795149e-05, + "loss": 0.3585, + "step": 5141000 + }, + { + "epoch": 34.792523819835424, + "grad_norm": 0.38648733496665955, + "learning_rate": 4.652074761801646e-05, + "loss": 0.3596, + "step": 5141500 + }, + { + "epoch": 34.795907319185794, + "grad_norm": 0.3896413743495941, + "learning_rate": 4.652040926808142e-05, + "loss": 0.3598, + "step": 5142000 + }, + { + "epoch": 34.799290818536164, + "grad_norm": 0.38711005449295044, + "learning_rate": 4.6520070918146384e-05, + "loss": 0.3593, + "step": 5142500 + }, + { + "epoch": 34.80267431788653, + "grad_norm": 0.41289591789245605, + "learning_rate": 4.6519732568211346e-05, + "loss": 0.3612, + "step": 5143000 + }, + { + "epoch": 34.8060578172369, + "grad_norm": 0.42670756578445435, + "learning_rate": 4.6519394218276315e-05, + "loss": 0.3585, + "step": 5143500 + }, + { + "epoch": 34.80944131658727, + "grad_norm": 0.36375993490219116, + "learning_rate": 4.651905586834128e-05, + "loss": 0.3629, + "step": 5144000 + }, + { + "epoch": 34.81282481593764, + "grad_norm": 0.348468154668808, + "learning_rate": 4.651871751840624e-05, + "loss": 0.3595, + "step": 5144500 + }, + { + "epoch": 34.816208315288, + "grad_norm": 0.37443798780441284, + "learning_rate": 4.65183791684712e-05, + "loss": 0.3594, + "step": 5145000 + }, + { + "epoch": 34.81959181463837, + "grad_norm": 0.3582093119621277, + "learning_rate": 4.6518040818536164e-05, + "loss": 0.3587, + "step": 5145500 + }, + { + "epoch": 34.82297531398874, + "grad_norm": 0.3682185411453247, + "learning_rate": 4.6517702468601126e-05, + "loss": 0.3602, + "step": 5146000 + }, + { + "epoch": 34.826358813339105, + "grad_norm": 0.33949360251426697, + "learning_rate": 4.651736411866609e-05, + "loss": 0.3597, + "step": 5146500 + }, + { + "epoch": 34.829742312689476, + "grad_norm": 0.3803558051586151, + "learning_rate": 4.651702576873106e-05, + "loss": 0.3611, + "step": 5147000 + }, + { + "epoch": 34.833125812039846, + "grad_norm": 0.4080631136894226, + "learning_rate": 4.651668741879602e-05, + "loss": 0.3607, + "step": 5147500 + }, + { + "epoch": 34.83650931139021, + "grad_norm": 0.3974284827709198, + "learning_rate": 4.651634906886098e-05, + "loss": 0.3599, + "step": 5148000 + }, + { + "epoch": 34.83989281074058, + "grad_norm": 0.38302257657051086, + "learning_rate": 4.651601071892594e-05, + "loss": 0.3583, + "step": 5148500 + }, + { + "epoch": 34.84327631009095, + "grad_norm": 0.3776800334453583, + "learning_rate": 4.651567236899091e-05, + "loss": 0.3596, + "step": 5149000 + }, + { + "epoch": 34.84665980944132, + "grad_norm": 0.44061967730522156, + "learning_rate": 4.6515334019055874e-05, + "loss": 0.3604, + "step": 5149500 + }, + { + "epoch": 34.85004330879168, + "grad_norm": 0.3747578561306, + "learning_rate": 4.651499566912083e-05, + "loss": 0.3595, + "step": 5150000 + }, + { + "epoch": 34.85342680814205, + "grad_norm": 0.36438217759132385, + "learning_rate": 4.651465731918579e-05, + "loss": 0.3591, + "step": 5150500 + }, + { + "epoch": 34.856810307492424, + "grad_norm": 0.41121163964271545, + "learning_rate": 4.651431896925076e-05, + "loss": 0.358, + "step": 5151000 + }, + { + "epoch": 34.86019380684279, + "grad_norm": 0.38191524147987366, + "learning_rate": 4.651398061931572e-05, + "loss": 0.3601, + "step": 5151500 + }, + { + "epoch": 34.86357730619316, + "grad_norm": 0.40387558937072754, + "learning_rate": 4.6513642269380685e-05, + "loss": 0.3595, + "step": 5152000 + }, + { + "epoch": 34.86696080554353, + "grad_norm": 0.35598960518836975, + "learning_rate": 4.651330391944565e-05, + "loss": 0.3591, + "step": 5152500 + }, + { + "epoch": 34.87034430489389, + "grad_norm": 0.33690690994262695, + "learning_rate": 4.6512965569510616e-05, + "loss": 0.36, + "step": 5153000 + }, + { + "epoch": 34.87372780424426, + "grad_norm": 0.3854426443576813, + "learning_rate": 4.651262721957558e-05, + "loss": 0.3597, + "step": 5153500 + }, + { + "epoch": 34.87711130359463, + "grad_norm": 0.3753717541694641, + "learning_rate": 4.651228886964054e-05, + "loss": 0.3597, + "step": 5154000 + }, + { + "epoch": 34.880494802944995, + "grad_norm": 0.35593491792678833, + "learning_rate": 4.65119505197055e-05, + "loss": 0.3592, + "step": 5154500 + }, + { + "epoch": 34.883878302295365, + "grad_norm": 0.37082576751708984, + "learning_rate": 4.6511612169770464e-05, + "loss": 0.3605, + "step": 5155000 + }, + { + "epoch": 34.887261801645735, + "grad_norm": 0.41517174243927, + "learning_rate": 4.6511273819835427e-05, + "loss": 0.3592, + "step": 5155500 + }, + { + "epoch": 34.890645300996106, + "grad_norm": 0.4043850600719452, + "learning_rate": 4.651093546990039e-05, + "loss": 0.3596, + "step": 5156000 + }, + { + "epoch": 34.89402880034647, + "grad_norm": 0.4053369462490082, + "learning_rate": 4.651059711996536e-05, + "loss": 0.359, + "step": 5156500 + }, + { + "epoch": 34.89741229969684, + "grad_norm": 0.40419548749923706, + "learning_rate": 4.651025877003032e-05, + "loss": 0.3585, + "step": 5157000 + }, + { + "epoch": 34.90079579904721, + "grad_norm": 0.3818973898887634, + "learning_rate": 4.650992042009528e-05, + "loss": 0.3586, + "step": 5157500 + }, + { + "epoch": 34.90417929839757, + "grad_norm": 0.41882139444351196, + "learning_rate": 4.6509582070160244e-05, + "loss": 0.3599, + "step": 5158000 + }, + { + "epoch": 34.90756279774794, + "grad_norm": 0.38736435770988464, + "learning_rate": 4.650924372022521e-05, + "loss": 0.3589, + "step": 5158500 + }, + { + "epoch": 34.91094629709831, + "grad_norm": 0.33506953716278076, + "learning_rate": 4.6508905370290175e-05, + "loss": 0.3598, + "step": 5159000 + }, + { + "epoch": 34.914329796448676, + "grad_norm": 0.3672519624233246, + "learning_rate": 4.650856702035513e-05, + "loss": 0.3604, + "step": 5159500 + }, + { + "epoch": 34.91771329579905, + "grad_norm": 0.3793613016605377, + "learning_rate": 4.650822867042009e-05, + "loss": 0.3593, + "step": 5160000 + }, + { + "epoch": 34.92109679514942, + "grad_norm": 0.3812962472438812, + "learning_rate": 4.650789032048506e-05, + "loss": 0.3604, + "step": 5160500 + }, + { + "epoch": 34.92448029449978, + "grad_norm": 0.4058850109577179, + "learning_rate": 4.6507551970550023e-05, + "loss": 0.3597, + "step": 5161000 + }, + { + "epoch": 34.92786379385015, + "grad_norm": 0.37779462337493896, + "learning_rate": 4.6507213620614986e-05, + "loss": 0.3606, + "step": 5161500 + }, + { + "epoch": 34.93124729320052, + "grad_norm": 0.4026508927345276, + "learning_rate": 4.650687527067995e-05, + "loss": 0.3602, + "step": 5162000 + }, + { + "epoch": 34.93463079255089, + "grad_norm": 0.40791353583335876, + "learning_rate": 4.650653692074492e-05, + "loss": 0.3592, + "step": 5162500 + }, + { + "epoch": 34.938014291901254, + "grad_norm": 0.3928754925727844, + "learning_rate": 4.650619857080988e-05, + "loss": 0.3604, + "step": 5163000 + }, + { + "epoch": 34.941397791251624, + "grad_norm": 0.37801092863082886, + "learning_rate": 4.650586022087484e-05, + "loss": 0.3593, + "step": 5163500 + }, + { + "epoch": 34.944781290601995, + "grad_norm": 0.34316352009773254, + "learning_rate": 4.65055218709398e-05, + "loss": 0.3584, + "step": 5164000 + }, + { + "epoch": 34.94816478995236, + "grad_norm": 0.39071527123451233, + "learning_rate": 4.6505183521004765e-05, + "loss": 0.36, + "step": 5164500 + }, + { + "epoch": 34.95154828930273, + "grad_norm": 0.34213265776634216, + "learning_rate": 4.650484517106973e-05, + "loss": 0.3603, + "step": 5165000 + }, + { + "epoch": 34.9549317886531, + "grad_norm": 0.3949839174747467, + "learning_rate": 4.650450682113469e-05, + "loss": 0.3585, + "step": 5165500 + }, + { + "epoch": 34.95831528800346, + "grad_norm": 0.384823203086853, + "learning_rate": 4.650416847119966e-05, + "loss": 0.3588, + "step": 5166000 + }, + { + "epoch": 34.96169878735383, + "grad_norm": 0.4190446734428406, + "learning_rate": 4.650383012126462e-05, + "loss": 0.3591, + "step": 5166500 + }, + { + "epoch": 34.9650822867042, + "grad_norm": 0.3400190472602844, + "learning_rate": 4.650349177132958e-05, + "loss": 0.3609, + "step": 5167000 + }, + { + "epoch": 34.968465786054566, + "grad_norm": 0.39202719926834106, + "learning_rate": 4.6503153421394545e-05, + "loss": 0.3596, + "step": 5167500 + }, + { + "epoch": 34.971849285404936, + "grad_norm": 0.3868553936481476, + "learning_rate": 4.650281507145951e-05, + "loss": 0.3588, + "step": 5168000 + }, + { + "epoch": 34.975232784755306, + "grad_norm": 0.3931337594985962, + "learning_rate": 4.6502476721524476e-05, + "loss": 0.3608, + "step": 5168500 + }, + { + "epoch": 34.97861628410568, + "grad_norm": 0.4102155268192291, + "learning_rate": 4.650213837158944e-05, + "loss": 0.3594, + "step": 5169000 + }, + { + "epoch": 34.98199978345604, + "grad_norm": 0.36841264367103577, + "learning_rate": 4.650180002165439e-05, + "loss": 0.3594, + "step": 5169500 + }, + { + "epoch": 34.98538328280641, + "grad_norm": 0.3891858458518982, + "learning_rate": 4.650146167171936e-05, + "loss": 0.3609, + "step": 5170000 + }, + { + "epoch": 34.98876678215678, + "grad_norm": 0.39834773540496826, + "learning_rate": 4.6501123321784324e-05, + "loss": 0.3601, + "step": 5170500 + }, + { + "epoch": 34.99215028150714, + "grad_norm": 0.36702215671539307, + "learning_rate": 4.6500784971849286e-05, + "loss": 0.3602, + "step": 5171000 + }, + { + "epoch": 34.995533780857514, + "grad_norm": 0.4127750098705292, + "learning_rate": 4.650044662191425e-05, + "loss": 0.3593, + "step": 5171500 + }, + { + "epoch": 34.998917280207884, + "grad_norm": 0.41186824440956116, + "learning_rate": 4.650010827197922e-05, + "loss": 0.3586, + "step": 5172000 + }, + { + "epoch": 35.0, + "eval_accuracy": 0.8629421222081165, + "eval_loss": 0.5557395219802856, + "eval_runtime": 3356.599, + "eval_samples_per_second": 86.619, + "eval_steps_per_second": 5.414, + "step": 5172160 + }, + { + "epoch": 35.00230077955825, + "grad_norm": 0.3806530833244324, + "learning_rate": 4.649976992204418e-05, + "loss": 0.3571, + "step": 5172500 + }, + { + "epoch": 35.00568427890862, + "grad_norm": 0.3965018093585968, + "learning_rate": 4.649943157210914e-05, + "loss": 0.3583, + "step": 5173000 + }, + { + "epoch": 35.00906777825899, + "grad_norm": 0.36404433846473694, + "learning_rate": 4.6499093222174104e-05, + "loss": 0.3569, + "step": 5173500 + }, + { + "epoch": 35.01245127760936, + "grad_norm": 0.38004666566848755, + "learning_rate": 4.6498754872239066e-05, + "loss": 0.3575, + "step": 5174000 + }, + { + "epoch": 35.01583477695972, + "grad_norm": 0.3895862400531769, + "learning_rate": 4.649841652230403e-05, + "loss": 0.3577, + "step": 5174500 + }, + { + "epoch": 35.01921827631009, + "grad_norm": 0.37043413519859314, + "learning_rate": 4.649807817236899e-05, + "loss": 0.3569, + "step": 5175000 + }, + { + "epoch": 35.02260177566046, + "grad_norm": 0.3911413550376892, + "learning_rate": 4.649773982243396e-05, + "loss": 0.3577, + "step": 5175500 + }, + { + "epoch": 35.025985275010825, + "grad_norm": 0.39181697368621826, + "learning_rate": 4.649740147249892e-05, + "loss": 0.3573, + "step": 5176000 + }, + { + "epoch": 35.029368774361195, + "grad_norm": 0.3618784546852112, + "learning_rate": 4.649706312256388e-05, + "loss": 0.3567, + "step": 5176500 + }, + { + "epoch": 35.032752273711566, + "grad_norm": 0.42894452810287476, + "learning_rate": 4.6496724772628845e-05, + "loss": 0.3589, + "step": 5177000 + }, + { + "epoch": 35.03613577306193, + "grad_norm": 0.41768378019332886, + "learning_rate": 4.649638642269381e-05, + "loss": 0.3573, + "step": 5177500 + }, + { + "epoch": 35.0395192724123, + "grad_norm": 0.4142279624938965, + "learning_rate": 4.6496048072758776e-05, + "loss": 0.3572, + "step": 5178000 + }, + { + "epoch": 35.04290277176267, + "grad_norm": 0.40129706263542175, + "learning_rate": 4.649570972282374e-05, + "loss": 0.3562, + "step": 5178500 + }, + { + "epoch": 35.04628627111303, + "grad_norm": 0.38917475938796997, + "learning_rate": 4.6495371372888694e-05, + "loss": 0.3571, + "step": 5179000 + }, + { + "epoch": 35.0496697704634, + "grad_norm": 0.3783824145793915, + "learning_rate": 4.649503302295366e-05, + "loss": 0.358, + "step": 5179500 + }, + { + "epoch": 35.05305326981377, + "grad_norm": 0.42363739013671875, + "learning_rate": 4.6494694673018625e-05, + "loss": 0.359, + "step": 5180000 + }, + { + "epoch": 35.056436769164144, + "grad_norm": 0.43193498253822327, + "learning_rate": 4.649435632308359e-05, + "loss": 0.357, + "step": 5180500 + }, + { + "epoch": 35.05982026851451, + "grad_norm": 0.366696834564209, + "learning_rate": 4.649401797314855e-05, + "loss": 0.3583, + "step": 5181000 + }, + { + "epoch": 35.06320376786488, + "grad_norm": 0.37471070885658264, + "learning_rate": 4.649367962321352e-05, + "loss": 0.3582, + "step": 5181500 + }, + { + "epoch": 35.06658726721525, + "grad_norm": 0.36143651604652405, + "learning_rate": 4.649334127327848e-05, + "loss": 0.3589, + "step": 5182000 + }, + { + "epoch": 35.06997076656561, + "grad_norm": 0.41785821318626404, + "learning_rate": 4.649300292334344e-05, + "loss": 0.3571, + "step": 5182500 + }, + { + "epoch": 35.07335426591598, + "grad_norm": 0.38460901379585266, + "learning_rate": 4.6492664573408404e-05, + "loss": 0.3568, + "step": 5183000 + }, + { + "epoch": 35.07673776526635, + "grad_norm": 0.3540903925895691, + "learning_rate": 4.6492326223473366e-05, + "loss": 0.3572, + "step": 5183500 + }, + { + "epoch": 35.080121264616714, + "grad_norm": 0.37059855461120605, + "learning_rate": 4.649198787353833e-05, + "loss": 0.3576, + "step": 5184000 + }, + { + "epoch": 35.083504763967085, + "grad_norm": 0.3962647616863251, + "learning_rate": 4.649164952360329e-05, + "loss": 0.3572, + "step": 5184500 + }, + { + "epoch": 35.086888263317455, + "grad_norm": 0.42337074875831604, + "learning_rate": 4.649131117366825e-05, + "loss": 0.359, + "step": 5185000 + }, + { + "epoch": 35.09027176266782, + "grad_norm": 0.36690419912338257, + "learning_rate": 4.649097282373322e-05, + "loss": 0.3571, + "step": 5185500 + }, + { + "epoch": 35.09365526201819, + "grad_norm": 0.3772687017917633, + "learning_rate": 4.6490634473798184e-05, + "loss": 0.3585, + "step": 5186000 + }, + { + "epoch": 35.09703876136856, + "grad_norm": 0.3888028860092163, + "learning_rate": 4.6490296123863146e-05, + "loss": 0.3575, + "step": 5186500 + }, + { + "epoch": 35.10042226071893, + "grad_norm": 0.38842588663101196, + "learning_rate": 4.648995777392811e-05, + "loss": 0.3587, + "step": 5187000 + }, + { + "epoch": 35.10380576006929, + "grad_norm": 0.40151742100715637, + "learning_rate": 4.648961942399308e-05, + "loss": 0.3597, + "step": 5187500 + }, + { + "epoch": 35.10718925941966, + "grad_norm": 0.3653225004673004, + "learning_rate": 4.648928107405804e-05, + "loss": 0.3588, + "step": 5188000 + }, + { + "epoch": 35.11057275877003, + "grad_norm": 0.42274415493011475, + "learning_rate": 4.6488942724122994e-05, + "loss": 0.3582, + "step": 5188500 + }, + { + "epoch": 35.113956258120396, + "grad_norm": 0.43063274025917053, + "learning_rate": 4.648860437418796e-05, + "loss": 0.3589, + "step": 5189000 + }, + { + "epoch": 35.117339757470766, + "grad_norm": 0.38887819647789, + "learning_rate": 4.6488266024252925e-05, + "loss": 0.3581, + "step": 5189500 + }, + { + "epoch": 35.12072325682114, + "grad_norm": 0.3567529022693634, + "learning_rate": 4.648792767431789e-05, + "loss": 0.3585, + "step": 5190000 + }, + { + "epoch": 35.1241067561715, + "grad_norm": 0.33756041526794434, + "learning_rate": 4.648758932438285e-05, + "loss": 0.3591, + "step": 5190500 + }, + { + "epoch": 35.12749025552187, + "grad_norm": 0.39104586839675903, + "learning_rate": 4.648725097444782e-05, + "loss": 0.3587, + "step": 5191000 + }, + { + "epoch": 35.13087375487224, + "grad_norm": 0.3683955669403076, + "learning_rate": 4.648691262451278e-05, + "loss": 0.3575, + "step": 5191500 + }, + { + "epoch": 35.134257254222604, + "grad_norm": 0.39823782444000244, + "learning_rate": 4.648657427457774e-05, + "loss": 0.3589, + "step": 5192000 + }, + { + "epoch": 35.137640753572974, + "grad_norm": 0.41600486636161804, + "learning_rate": 4.6486235924642705e-05, + "loss": 0.3577, + "step": 5192500 + }, + { + "epoch": 35.141024252923344, + "grad_norm": 0.37839174270629883, + "learning_rate": 4.648589757470767e-05, + "loss": 0.3596, + "step": 5193000 + }, + { + "epoch": 35.144407752273715, + "grad_norm": 0.3842908442020416, + "learning_rate": 4.648555922477263e-05, + "loss": 0.3592, + "step": 5193500 + }, + { + "epoch": 35.14779125162408, + "grad_norm": 0.4035647213459015, + "learning_rate": 4.648522087483759e-05, + "loss": 0.3581, + "step": 5194000 + }, + { + "epoch": 35.15117475097445, + "grad_norm": 0.43002358078956604, + "learning_rate": 4.648488252490255e-05, + "loss": 0.3577, + "step": 5194500 + }, + { + "epoch": 35.15455825032482, + "grad_norm": 0.4105346202850342, + "learning_rate": 4.648454417496752e-05, + "loss": 0.3588, + "step": 5195000 + }, + { + "epoch": 35.15794174967518, + "grad_norm": 0.36859622597694397, + "learning_rate": 4.6484205825032484e-05, + "loss": 0.3588, + "step": 5195500 + }, + { + "epoch": 35.16132524902555, + "grad_norm": 0.4235802888870239, + "learning_rate": 4.6483867475097447e-05, + "loss": 0.3587, + "step": 5196000 + }, + { + "epoch": 35.16470874837592, + "grad_norm": 0.3733595609664917, + "learning_rate": 4.648352912516241e-05, + "loss": 0.3573, + "step": 5196500 + }, + { + "epoch": 35.168092247726285, + "grad_norm": 0.38341930508613586, + "learning_rate": 4.648319077522738e-05, + "loss": 0.3582, + "step": 5197000 + }, + { + "epoch": 35.171475747076656, + "grad_norm": 0.4134353995323181, + "learning_rate": 4.648285242529234e-05, + "loss": 0.359, + "step": 5197500 + }, + { + "epoch": 35.174859246427026, + "grad_norm": 0.42130154371261597, + "learning_rate": 4.6482514075357295e-05, + "loss": 0.3572, + "step": 5198000 + }, + { + "epoch": 35.178242745777396, + "grad_norm": 0.4000984728336334, + "learning_rate": 4.6482175725422264e-05, + "loss": 0.3576, + "step": 5198500 + }, + { + "epoch": 35.18162624512776, + "grad_norm": 0.3766460418701172, + "learning_rate": 4.6481837375487226e-05, + "loss": 0.3585, + "step": 5199000 + }, + { + "epoch": 35.18500974447813, + "grad_norm": 0.39292386174201965, + "learning_rate": 4.648149902555219e-05, + "loss": 0.3575, + "step": 5199500 + }, + { + "epoch": 35.1883932438285, + "grad_norm": 0.39176398515701294, + "learning_rate": 4.648116067561715e-05, + "loss": 0.3589, + "step": 5200000 + }, + { + "epoch": 35.19177674317886, + "grad_norm": 0.3988284468650818, + "learning_rate": 4.648082232568212e-05, + "loss": 0.3579, + "step": 5200500 + }, + { + "epoch": 35.195160242529234, + "grad_norm": 0.3713776469230652, + "learning_rate": 4.648048397574708e-05, + "loss": 0.3584, + "step": 5201000 + }, + { + "epoch": 35.198543741879604, + "grad_norm": 0.3763924837112427, + "learning_rate": 4.6480145625812043e-05, + "loss": 0.3579, + "step": 5201500 + }, + { + "epoch": 35.20192724122997, + "grad_norm": 0.3911839425563812, + "learning_rate": 4.6479807275877006e-05, + "loss": 0.3583, + "step": 5202000 + }, + { + "epoch": 35.20531074058034, + "grad_norm": 0.4037778973579407, + "learning_rate": 4.647946892594197e-05, + "loss": 0.3601, + "step": 5202500 + }, + { + "epoch": 35.20869423993071, + "grad_norm": 0.37728357315063477, + "learning_rate": 4.647913057600693e-05, + "loss": 0.358, + "step": 5203000 + }, + { + "epoch": 35.21207773928107, + "grad_norm": 0.36691904067993164, + "learning_rate": 4.647879222607189e-05, + "loss": 0.3566, + "step": 5203500 + }, + { + "epoch": 35.21546123863144, + "grad_norm": 0.38442808389663696, + "learning_rate": 4.6478453876136854e-05, + "loss": 0.3593, + "step": 5204000 + }, + { + "epoch": 35.21884473798181, + "grad_norm": 0.41736212372779846, + "learning_rate": 4.647811552620182e-05, + "loss": 0.3584, + "step": 5204500 + }, + { + "epoch": 35.22222823733218, + "grad_norm": 0.3651248812675476, + "learning_rate": 4.6477777176266785e-05, + "loss": 0.3598, + "step": 5205000 + }, + { + "epoch": 35.225611736682545, + "grad_norm": 0.365179181098938, + "learning_rate": 4.647743882633175e-05, + "loss": 0.3592, + "step": 5205500 + }, + { + "epoch": 35.228995236032915, + "grad_norm": 0.3637058138847351, + "learning_rate": 4.647710047639671e-05, + "loss": 0.3596, + "step": 5206000 + }, + { + "epoch": 35.232378735383286, + "grad_norm": 0.3909095525741577, + "learning_rate": 4.647676212646168e-05, + "loss": 0.3596, + "step": 5206500 + }, + { + "epoch": 35.23576223473365, + "grad_norm": 0.390669584274292, + "learning_rate": 4.647642377652664e-05, + "loss": 0.3592, + "step": 5207000 + }, + { + "epoch": 35.23914573408402, + "grad_norm": 0.40053826570510864, + "learning_rate": 4.6476085426591596e-05, + "loss": 0.3594, + "step": 5207500 + }, + { + "epoch": 35.24252923343439, + "grad_norm": 0.4064285457134247, + "learning_rate": 4.6475747076656565e-05, + "loss": 0.3589, + "step": 5208000 + }, + { + "epoch": 35.24591273278475, + "grad_norm": 0.3962462246417999, + "learning_rate": 4.647540872672153e-05, + "loss": 0.3599, + "step": 5208500 + }, + { + "epoch": 35.24929623213512, + "grad_norm": 0.3796030282974243, + "learning_rate": 4.647507037678649e-05, + "loss": 0.3597, + "step": 5209000 + }, + { + "epoch": 35.25267973148549, + "grad_norm": 0.3501536250114441, + "learning_rate": 4.647473202685145e-05, + "loss": 0.3602, + "step": 5209500 + }, + { + "epoch": 35.256063230835856, + "grad_norm": 0.35535991191864014, + "learning_rate": 4.647439367691642e-05, + "loss": 0.3577, + "step": 5210000 + }, + { + "epoch": 35.25944673018623, + "grad_norm": 0.3918737769126892, + "learning_rate": 4.647405532698138e-05, + "loss": 0.3584, + "step": 5210500 + }, + { + "epoch": 35.2628302295366, + "grad_norm": 0.3632281422615051, + "learning_rate": 4.6473716977046344e-05, + "loss": 0.3585, + "step": 5211000 + }, + { + "epoch": 35.26621372888697, + "grad_norm": 0.3752191364765167, + "learning_rate": 4.6473378627111306e-05, + "loss": 0.3585, + "step": 5211500 + }, + { + "epoch": 35.26959722823733, + "grad_norm": 0.4171627461910248, + "learning_rate": 4.647304027717627e-05, + "loss": 0.358, + "step": 5212000 + }, + { + "epoch": 35.2729807275877, + "grad_norm": 0.3874201774597168, + "learning_rate": 4.647270192724123e-05, + "loss": 0.3589, + "step": 5212500 + }, + { + "epoch": 35.27636422693807, + "grad_norm": 0.36945509910583496, + "learning_rate": 4.647236357730619e-05, + "loss": 0.3597, + "step": 5213000 + }, + { + "epoch": 35.279747726288434, + "grad_norm": 0.38861581683158875, + "learning_rate": 4.6472025227371155e-05, + "loss": 0.3591, + "step": 5213500 + }, + { + "epoch": 35.283131225638805, + "grad_norm": 0.3578372895717621, + "learning_rate": 4.6471686877436124e-05, + "loss": 0.3587, + "step": 5214000 + }, + { + "epoch": 35.286514724989175, + "grad_norm": 0.3561571538448334, + "learning_rate": 4.6471348527501086e-05, + "loss": 0.358, + "step": 5214500 + }, + { + "epoch": 35.28989822433954, + "grad_norm": 0.39968201518058777, + "learning_rate": 4.647101017756605e-05, + "loss": 0.3588, + "step": 5215000 + }, + { + "epoch": 35.29328172368991, + "grad_norm": 0.42220667004585266, + "learning_rate": 4.647067182763101e-05, + "loss": 0.3607, + "step": 5215500 + }, + { + "epoch": 35.29666522304028, + "grad_norm": 0.41770124435424805, + "learning_rate": 4.647033347769598e-05, + "loss": 0.359, + "step": 5216000 + }, + { + "epoch": 35.30004872239064, + "grad_norm": 0.3415060043334961, + "learning_rate": 4.646999512776094e-05, + "loss": 0.3597, + "step": 5216500 + }, + { + "epoch": 35.30343222174101, + "grad_norm": 0.3768995404243469, + "learning_rate": 4.6469656777825896e-05, + "loss": 0.3587, + "step": 5217000 + }, + { + "epoch": 35.30681572109138, + "grad_norm": 0.368702232837677, + "learning_rate": 4.6469318427890865e-05, + "loss": 0.3599, + "step": 5217500 + }, + { + "epoch": 35.31019922044175, + "grad_norm": 0.3856721520423889, + "learning_rate": 4.646898007795583e-05, + "loss": 0.3584, + "step": 5218000 + }, + { + "epoch": 35.313582719792116, + "grad_norm": 0.3629095256328583, + "learning_rate": 4.646864172802079e-05, + "loss": 0.3582, + "step": 5218500 + }, + { + "epoch": 35.316966219142486, + "grad_norm": 0.3303627669811249, + "learning_rate": 4.646830337808575e-05, + "loss": 0.3582, + "step": 5219000 + }, + { + "epoch": 35.32034971849286, + "grad_norm": 0.4104846715927124, + "learning_rate": 4.646796502815072e-05, + "loss": 0.3597, + "step": 5219500 + }, + { + "epoch": 35.32373321784322, + "grad_norm": 0.3870714604854584, + "learning_rate": 4.646762667821568e-05, + "loss": 0.3591, + "step": 5220000 + }, + { + "epoch": 35.32711671719359, + "grad_norm": 0.38889339566230774, + "learning_rate": 4.6467288328280645e-05, + "loss": 0.3587, + "step": 5220500 + }, + { + "epoch": 35.33050021654396, + "grad_norm": 0.38608211278915405, + "learning_rate": 4.646694997834561e-05, + "loss": 0.36, + "step": 5221000 + }, + { + "epoch": 35.33388371589432, + "grad_norm": 0.38808754086494446, + "learning_rate": 4.646661162841057e-05, + "loss": 0.3586, + "step": 5221500 + }, + { + "epoch": 35.337267215244694, + "grad_norm": 0.39795616269111633, + "learning_rate": 4.646627327847553e-05, + "loss": 0.3598, + "step": 5222000 + }, + { + "epoch": 35.340650714595064, + "grad_norm": 0.4353151023387909, + "learning_rate": 4.646593492854049e-05, + "loss": 0.3586, + "step": 5222500 + }, + { + "epoch": 35.344034213945434, + "grad_norm": 0.36362794041633606, + "learning_rate": 4.6465596578605455e-05, + "loss": 0.3607, + "step": 5223000 + }, + { + "epoch": 35.3474177132958, + "grad_norm": 0.4323992431163788, + "learning_rate": 4.6465258228670424e-05, + "loss": 0.3567, + "step": 5223500 + }, + { + "epoch": 35.35080121264617, + "grad_norm": 0.3503703773021698, + "learning_rate": 4.6464919878735386e-05, + "loss": 0.3584, + "step": 5224000 + }, + { + "epoch": 35.35418471199654, + "grad_norm": 0.4126063585281372, + "learning_rate": 4.646458152880035e-05, + "loss": 0.3594, + "step": 5224500 + }, + { + "epoch": 35.3575682113469, + "grad_norm": 0.3662908375263214, + "learning_rate": 4.646424317886531e-05, + "loss": 0.3593, + "step": 5225000 + }, + { + "epoch": 35.36095171069727, + "grad_norm": 0.37647882103919983, + "learning_rate": 4.646390482893028e-05, + "loss": 0.3601, + "step": 5225500 + }, + { + "epoch": 35.36433521004764, + "grad_norm": 0.3930198550224304, + "learning_rate": 4.646356647899524e-05, + "loss": 0.3596, + "step": 5226000 + }, + { + "epoch": 35.367718709398005, + "grad_norm": 0.36636117100715637, + "learning_rate": 4.64632281290602e-05, + "loss": 0.359, + "step": 5226500 + }, + { + "epoch": 35.371102208748376, + "grad_norm": 0.37644514441490173, + "learning_rate": 4.6462889779125166e-05, + "loss": 0.3588, + "step": 5227000 + }, + { + "epoch": 35.374485708098746, + "grad_norm": 0.41436415910720825, + "learning_rate": 4.646255142919013e-05, + "loss": 0.3592, + "step": 5227500 + }, + { + "epoch": 35.37786920744911, + "grad_norm": 0.3898829221725464, + "learning_rate": 4.646221307925509e-05, + "loss": 0.3584, + "step": 5228000 + }, + { + "epoch": 35.38125270679948, + "grad_norm": 0.3516451120376587, + "learning_rate": 4.646187472932005e-05, + "loss": 0.36, + "step": 5228500 + }, + { + "epoch": 35.38463620614985, + "grad_norm": 0.35585659742355347, + "learning_rate": 4.646153637938502e-05, + "loss": 0.359, + "step": 5229000 + }, + { + "epoch": 35.38801970550022, + "grad_norm": 0.3721868097782135, + "learning_rate": 4.646119802944998e-05, + "loss": 0.3583, + "step": 5229500 + }, + { + "epoch": 35.39140320485058, + "grad_norm": 0.3831140100955963, + "learning_rate": 4.6460859679514945e-05, + "loss": 0.3576, + "step": 5230000 + }, + { + "epoch": 35.39478670420095, + "grad_norm": 0.36397331953048706, + "learning_rate": 4.646052132957991e-05, + "loss": 0.3586, + "step": 5230500 + }, + { + "epoch": 35.398170203551324, + "grad_norm": 0.40902072191238403, + "learning_rate": 4.646018297964487e-05, + "loss": 0.3595, + "step": 5231000 + }, + { + "epoch": 35.40155370290169, + "grad_norm": 0.38793542981147766, + "learning_rate": 4.645984462970983e-05, + "loss": 0.3594, + "step": 5231500 + }, + { + "epoch": 35.40493720225206, + "grad_norm": 0.37250202894210815, + "learning_rate": 4.6459506279774794e-05, + "loss": 0.3598, + "step": 5232000 + }, + { + "epoch": 35.40832070160243, + "grad_norm": 0.3919159471988678, + "learning_rate": 4.6459167929839756e-05, + "loss": 0.3595, + "step": 5232500 + }, + { + "epoch": 35.41170420095279, + "grad_norm": 0.4006628394126892, + "learning_rate": 4.6458829579904725e-05, + "loss": 0.3593, + "step": 5233000 + }, + { + "epoch": 35.41508770030316, + "grad_norm": 0.381758451461792, + "learning_rate": 4.645849122996969e-05, + "loss": 0.3578, + "step": 5233500 + }, + { + "epoch": 35.41847119965353, + "grad_norm": 0.3751373291015625, + "learning_rate": 4.645815288003465e-05, + "loss": 0.3591, + "step": 5234000 + }, + { + "epoch": 35.421854699003894, + "grad_norm": 0.3407689929008484, + "learning_rate": 4.645781453009961e-05, + "loss": 0.3588, + "step": 5234500 + }, + { + "epoch": 35.425238198354265, + "grad_norm": 0.4218127727508545, + "learning_rate": 4.645747618016458e-05, + "loss": 0.3574, + "step": 5235000 + }, + { + "epoch": 35.428621697704635, + "grad_norm": 0.4281761348247528, + "learning_rate": 4.645713783022954e-05, + "loss": 0.3586, + "step": 5235500 + }, + { + "epoch": 35.432005197055005, + "grad_norm": 0.41469109058380127, + "learning_rate": 4.64567994802945e-05, + "loss": 0.3564, + "step": 5236000 + }, + { + "epoch": 35.43538869640537, + "grad_norm": 0.38898664712905884, + "learning_rate": 4.6456461130359466e-05, + "loss": 0.3603, + "step": 5236500 + }, + { + "epoch": 35.43877219575574, + "grad_norm": 0.3658848702907562, + "learning_rate": 4.645612278042443e-05, + "loss": 0.3579, + "step": 5237000 + }, + { + "epoch": 35.44215569510611, + "grad_norm": 0.41348376870155334, + "learning_rate": 4.645578443048939e-05, + "loss": 0.3591, + "step": 5237500 + }, + { + "epoch": 35.44553919445647, + "grad_norm": 0.40406206250190735, + "learning_rate": 4.645544608055435e-05, + "loss": 0.3574, + "step": 5238000 + }, + { + "epoch": 35.44892269380684, + "grad_norm": 0.39499127864837646, + "learning_rate": 4.6455107730619315e-05, + "loss": 0.3583, + "step": 5238500 + }, + { + "epoch": 35.45230619315721, + "grad_norm": 0.36872249841690063, + "learning_rate": 4.6454769380684284e-05, + "loss": 0.3603, + "step": 5239000 + }, + { + "epoch": 35.455689692507576, + "grad_norm": 0.37547048926353455, + "learning_rate": 4.6454431030749246e-05, + "loss": 0.3595, + "step": 5239500 + }, + { + "epoch": 35.45907319185795, + "grad_norm": 0.4177234172821045, + "learning_rate": 4.645409268081421e-05, + "loss": 0.3585, + "step": 5240000 + }, + { + "epoch": 35.46245669120832, + "grad_norm": 0.3927750587463379, + "learning_rate": 4.645375433087917e-05, + "loss": 0.3606, + "step": 5240500 + }, + { + "epoch": 35.46584019055868, + "grad_norm": 0.3455972969532013, + "learning_rate": 4.645341598094413e-05, + "loss": 0.3582, + "step": 5241000 + }, + { + "epoch": 35.46922368990905, + "grad_norm": 0.3680725395679474, + "learning_rate": 4.6453077631009094e-05, + "loss": 0.3592, + "step": 5241500 + }, + { + "epoch": 35.47260718925942, + "grad_norm": 0.41897982358932495, + "learning_rate": 4.6452739281074057e-05, + "loss": 0.3595, + "step": 5242000 + }, + { + "epoch": 35.47599068860979, + "grad_norm": 0.3519628345966339, + "learning_rate": 4.6452400931139025e-05, + "loss": 0.3597, + "step": 5242500 + }, + { + "epoch": 35.479374187960154, + "grad_norm": 0.39874663949012756, + "learning_rate": 4.645206258120399e-05, + "loss": 0.3591, + "step": 5243000 + }, + { + "epoch": 35.482757687310524, + "grad_norm": 0.41478675603866577, + "learning_rate": 4.645172423126895e-05, + "loss": 0.3575, + "step": 5243500 + }, + { + "epoch": 35.486141186660895, + "grad_norm": 0.36475494503974915, + "learning_rate": 4.645138588133391e-05, + "loss": 0.3595, + "step": 5244000 + }, + { + "epoch": 35.48952468601126, + "grad_norm": 0.41342654824256897, + "learning_rate": 4.645104753139888e-05, + "loss": 0.3568, + "step": 5244500 + }, + { + "epoch": 35.49290818536163, + "grad_norm": 0.3389993906021118, + "learning_rate": 4.645070918146384e-05, + "loss": 0.3574, + "step": 5245000 + }, + { + "epoch": 35.496291684712, + "grad_norm": 0.3895016312599182, + "learning_rate": 4.64503708315288e-05, + "loss": 0.3582, + "step": 5245500 + }, + { + "epoch": 35.49967518406236, + "grad_norm": 0.4148198962211609, + "learning_rate": 4.645003248159377e-05, + "loss": 0.3615, + "step": 5246000 + }, + { + "epoch": 35.50305868341273, + "grad_norm": 0.36932939291000366, + "learning_rate": 4.644969413165873e-05, + "loss": 0.3585, + "step": 5246500 + }, + { + "epoch": 35.5064421827631, + "grad_norm": 0.38193395733833313, + "learning_rate": 4.644935578172369e-05, + "loss": 0.3591, + "step": 5247000 + }, + { + "epoch": 35.50982568211347, + "grad_norm": 0.3812614977359772, + "learning_rate": 4.6449017431788653e-05, + "loss": 0.358, + "step": 5247500 + }, + { + "epoch": 35.513209181463836, + "grad_norm": 0.38990533351898193, + "learning_rate": 4.6448679081853616e-05, + "loss": 0.3592, + "step": 5248000 + }, + { + "epoch": 35.516592680814206, + "grad_norm": 0.3911738693714142, + "learning_rate": 4.6448340731918584e-05, + "loss": 0.3587, + "step": 5248500 + }, + { + "epoch": 35.519976180164576, + "grad_norm": 0.3662065863609314, + "learning_rate": 4.6448002381983547e-05, + "loss": 0.3584, + "step": 5249000 + }, + { + "epoch": 35.52335967951494, + "grad_norm": 0.4016791880130768, + "learning_rate": 4.644766403204851e-05, + "loss": 0.3604, + "step": 5249500 + }, + { + "epoch": 35.52674317886531, + "grad_norm": 0.39107978343963623, + "learning_rate": 4.644732568211347e-05, + "loss": 0.3589, + "step": 5250000 + }, + { + "epoch": 35.53012667821568, + "grad_norm": 0.3511251211166382, + "learning_rate": 4.644698733217843e-05, + "loss": 0.359, + "step": 5250500 + }, + { + "epoch": 35.53351017756604, + "grad_norm": 0.34765365719795227, + "learning_rate": 4.6446648982243395e-05, + "loss": 0.3596, + "step": 5251000 + }, + { + "epoch": 35.536893676916414, + "grad_norm": 0.409365177154541, + "learning_rate": 4.644631063230836e-05, + "loss": 0.36, + "step": 5251500 + }, + { + "epoch": 35.540277176266784, + "grad_norm": 0.3806619644165039, + "learning_rate": 4.6445972282373326e-05, + "loss": 0.3586, + "step": 5252000 + }, + { + "epoch": 35.54366067561715, + "grad_norm": 0.3907391428947449, + "learning_rate": 4.644563393243829e-05, + "loss": 0.3592, + "step": 5252500 + }, + { + "epoch": 35.54704417496752, + "grad_norm": 0.3842097222805023, + "learning_rate": 4.644529558250325e-05, + "loss": 0.3582, + "step": 5253000 + }, + { + "epoch": 35.55042767431789, + "grad_norm": 0.35678067803382874, + "learning_rate": 4.644495723256821e-05, + "loss": 0.3587, + "step": 5253500 + }, + { + "epoch": 35.55381117366826, + "grad_norm": 0.4046052098274231, + "learning_rate": 4.644461888263318e-05, + "loss": 0.3587, + "step": 5254000 + }, + { + "epoch": 35.55719467301862, + "grad_norm": 0.369022011756897, + "learning_rate": 4.6444280532698143e-05, + "loss": 0.3589, + "step": 5254500 + }, + { + "epoch": 35.56057817236899, + "grad_norm": 0.44715407490730286, + "learning_rate": 4.64439421827631e-05, + "loss": 0.3593, + "step": 5255000 + }, + { + "epoch": 35.56396167171936, + "grad_norm": 0.4040326476097107, + "learning_rate": 4.644360383282806e-05, + "loss": 0.3587, + "step": 5255500 + }, + { + "epoch": 35.567345171069725, + "grad_norm": 0.41204652190208435, + "learning_rate": 4.644326548289303e-05, + "loss": 0.358, + "step": 5256000 + }, + { + "epoch": 35.570728670420095, + "grad_norm": 0.37181586027145386, + "learning_rate": 4.644292713295799e-05, + "loss": 0.3605, + "step": 5256500 + }, + { + "epoch": 35.574112169770466, + "grad_norm": 0.3742663562297821, + "learning_rate": 4.6442588783022954e-05, + "loss": 0.3584, + "step": 5257000 + }, + { + "epoch": 35.57749566912083, + "grad_norm": 0.3441641926765442, + "learning_rate": 4.6442250433087916e-05, + "loss": 0.3592, + "step": 5257500 + }, + { + "epoch": 35.5808791684712, + "grad_norm": 0.3913019895553589, + "learning_rate": 4.6441912083152885e-05, + "loss": 0.359, + "step": 5258000 + }, + { + "epoch": 35.58426266782157, + "grad_norm": 0.36842718720436096, + "learning_rate": 4.644157373321785e-05, + "loss": 0.3594, + "step": 5258500 + }, + { + "epoch": 35.58764616717193, + "grad_norm": 0.3861198425292969, + "learning_rate": 4.644123538328281e-05, + "loss": 0.3593, + "step": 5259000 + }, + { + "epoch": 35.5910296665223, + "grad_norm": 0.3628122806549072, + "learning_rate": 4.644089703334777e-05, + "loss": 0.359, + "step": 5259500 + }, + { + "epoch": 35.59441316587267, + "grad_norm": 0.3801637589931488, + "learning_rate": 4.6440558683412734e-05, + "loss": 0.3593, + "step": 5260000 + }, + { + "epoch": 35.59779666522304, + "grad_norm": 0.397336483001709, + "learning_rate": 4.6440220333477696e-05, + "loss": 0.3588, + "step": 5260500 + }, + { + "epoch": 35.60118016457341, + "grad_norm": 0.3856032192707062, + "learning_rate": 4.643988198354266e-05, + "loss": 0.3579, + "step": 5261000 + }, + { + "epoch": 35.60456366392378, + "grad_norm": 0.39578643441200256, + "learning_rate": 4.643954363360763e-05, + "loss": 0.3597, + "step": 5261500 + }, + { + "epoch": 35.60794716327415, + "grad_norm": 0.3347266614437103, + "learning_rate": 4.643920528367259e-05, + "loss": 0.3588, + "step": 5262000 + }, + { + "epoch": 35.61133066262451, + "grad_norm": 0.40181776881217957, + "learning_rate": 4.643886693373755e-05, + "loss": 0.3587, + "step": 5262500 + }, + { + "epoch": 35.61471416197488, + "grad_norm": 0.3796705901622772, + "learning_rate": 4.643852858380251e-05, + "loss": 0.3592, + "step": 5263000 + }, + { + "epoch": 35.61809766132525, + "grad_norm": 0.44177448749542236, + "learning_rate": 4.643819023386748e-05, + "loss": 0.3579, + "step": 5263500 + }, + { + "epoch": 35.621481160675614, + "grad_norm": 0.3485350012779236, + "learning_rate": 4.6437851883932444e-05, + "loss": 0.3593, + "step": 5264000 + }, + { + "epoch": 35.624864660025985, + "grad_norm": 0.37154629826545715, + "learning_rate": 4.64375135339974e-05, + "loss": 0.3594, + "step": 5264500 + }, + { + "epoch": 35.628248159376355, + "grad_norm": 0.3512772023677826, + "learning_rate": 4.643717518406236e-05, + "loss": 0.3592, + "step": 5265000 + }, + { + "epoch": 35.63163165872672, + "grad_norm": 0.38935092091560364, + "learning_rate": 4.643683683412733e-05, + "loss": 0.3589, + "step": 5265500 + }, + { + "epoch": 35.63501515807709, + "grad_norm": 0.37523671984672546, + "learning_rate": 4.643649848419229e-05, + "loss": 0.3604, + "step": 5266000 + }, + { + "epoch": 35.63839865742746, + "grad_norm": 0.380526065826416, + "learning_rate": 4.6436160134257255e-05, + "loss": 0.3591, + "step": 5266500 + }, + { + "epoch": 35.64178215677783, + "grad_norm": 0.39485201239585876, + "learning_rate": 4.643582178432222e-05, + "loss": 0.3587, + "step": 5267000 + }, + { + "epoch": 35.64516565612819, + "grad_norm": 0.3372666537761688, + "learning_rate": 4.6435483434387186e-05, + "loss": 0.359, + "step": 5267500 + }, + { + "epoch": 35.64854915547856, + "grad_norm": 0.4184510409832001, + "learning_rate": 4.643514508445215e-05, + "loss": 0.3593, + "step": 5268000 + }, + { + "epoch": 35.65193265482893, + "grad_norm": 0.4210392236709595, + "learning_rate": 4.643480673451711e-05, + "loss": 0.3602, + "step": 5268500 + }, + { + "epoch": 35.655316154179296, + "grad_norm": 0.361260324716568, + "learning_rate": 4.643446838458207e-05, + "loss": 0.3608, + "step": 5269000 + }, + { + "epoch": 35.658699653529666, + "grad_norm": 0.39221853017807007, + "learning_rate": 4.6434130034647034e-05, + "loss": 0.3585, + "step": 5269500 + }, + { + "epoch": 35.66208315288004, + "grad_norm": 0.39438000321388245, + "learning_rate": 4.6433791684711996e-05, + "loss": 0.3592, + "step": 5270000 + }, + { + "epoch": 35.6654666522304, + "grad_norm": 0.38537243008613586, + "learning_rate": 4.643345333477696e-05, + "loss": 0.3579, + "step": 5270500 + }, + { + "epoch": 35.66885015158077, + "grad_norm": 0.36047154664993286, + "learning_rate": 4.643311498484193e-05, + "loss": 0.359, + "step": 5271000 + }, + { + "epoch": 35.67223365093114, + "grad_norm": 0.37812140583992004, + "learning_rate": 4.643277663490689e-05, + "loss": 0.36, + "step": 5271500 + }, + { + "epoch": 35.67561715028151, + "grad_norm": 0.3501070439815521, + "learning_rate": 4.643243828497185e-05, + "loss": 0.3592, + "step": 5272000 + }, + { + "epoch": 35.679000649631874, + "grad_norm": 0.4167449474334717, + "learning_rate": 4.6432099935036814e-05, + "loss": 0.3594, + "step": 5272500 + }, + { + "epoch": 35.682384148982244, + "grad_norm": 0.3701885938644409, + "learning_rate": 4.643176158510178e-05, + "loss": 0.3586, + "step": 5273000 + }, + { + "epoch": 35.685767648332614, + "grad_norm": 0.3991261124610901, + "learning_rate": 4.6431423235166745e-05, + "loss": 0.3577, + "step": 5273500 + }, + { + "epoch": 35.68915114768298, + "grad_norm": 0.37794020771980286, + "learning_rate": 4.64310848852317e-05, + "loss": 0.3587, + "step": 5274000 + }, + { + "epoch": 35.69253464703335, + "grad_norm": 0.3513682782649994, + "learning_rate": 4.643074653529666e-05, + "loss": 0.3592, + "step": 5274500 + }, + { + "epoch": 35.69591814638372, + "grad_norm": 0.3790803551673889, + "learning_rate": 4.643040818536163e-05, + "loss": 0.359, + "step": 5275000 + }, + { + "epoch": 35.69930164573408, + "grad_norm": 0.40094253420829773, + "learning_rate": 4.643006983542659e-05, + "loss": 0.3589, + "step": 5275500 + }, + { + "epoch": 35.70268514508445, + "grad_norm": 0.3767148554325104, + "learning_rate": 4.6429731485491555e-05, + "loss": 0.3577, + "step": 5276000 + }, + { + "epoch": 35.70606864443482, + "grad_norm": 0.42464911937713623, + "learning_rate": 4.642939313555652e-05, + "loss": 0.3596, + "step": 5276500 + }, + { + "epoch": 35.709452143785185, + "grad_norm": 0.36401018500328064, + "learning_rate": 4.6429054785621486e-05, + "loss": 0.3598, + "step": 5277000 + }, + { + "epoch": 35.712835643135556, + "grad_norm": 0.3809833526611328, + "learning_rate": 4.642871643568645e-05, + "loss": 0.359, + "step": 5277500 + }, + { + "epoch": 35.716219142485926, + "grad_norm": 0.41787412762641907, + "learning_rate": 4.642837808575141e-05, + "loss": 0.3604, + "step": 5278000 + }, + { + "epoch": 35.719602641836296, + "grad_norm": 0.4240560233592987, + "learning_rate": 4.642803973581637e-05, + "loss": 0.3591, + "step": 5278500 + }, + { + "epoch": 35.72298614118666, + "grad_norm": 0.3682841658592224, + "learning_rate": 4.6427701385881335e-05, + "loss": 0.3585, + "step": 5279000 + }, + { + "epoch": 35.72636964053703, + "grad_norm": 0.34984299540519714, + "learning_rate": 4.64273630359463e-05, + "loss": 0.3585, + "step": 5279500 + }, + { + "epoch": 35.7297531398874, + "grad_norm": 0.3614426255226135, + "learning_rate": 4.642702468601126e-05, + "loss": 0.3577, + "step": 5280000 + }, + { + "epoch": 35.73313663923776, + "grad_norm": 0.3550345301628113, + "learning_rate": 4.642668633607623e-05, + "loss": 0.3587, + "step": 5280500 + }, + { + "epoch": 35.73652013858813, + "grad_norm": 0.43409860134124756, + "learning_rate": 4.642634798614119e-05, + "loss": 0.359, + "step": 5281000 + }, + { + "epoch": 35.739903637938504, + "grad_norm": 0.42521485686302185, + "learning_rate": 4.642600963620615e-05, + "loss": 0.3603, + "step": 5281500 + }, + { + "epoch": 35.74328713728887, + "grad_norm": 0.4124901592731476, + "learning_rate": 4.6425671286271114e-05, + "loss": 0.3591, + "step": 5282000 + }, + { + "epoch": 35.74667063663924, + "grad_norm": 0.38093918561935425, + "learning_rate": 4.642533293633608e-05, + "loss": 0.36, + "step": 5282500 + }, + { + "epoch": 35.75005413598961, + "grad_norm": 0.3848240077495575, + "learning_rate": 4.6424994586401045e-05, + "loss": 0.3587, + "step": 5283000 + }, + { + "epoch": 35.75343763533997, + "grad_norm": 0.3985755145549774, + "learning_rate": 4.642465623646601e-05, + "loss": 0.3608, + "step": 5283500 + }, + { + "epoch": 35.75682113469034, + "grad_norm": 0.37025782465934753, + "learning_rate": 4.642431788653096e-05, + "loss": 0.3596, + "step": 5284000 + }, + { + "epoch": 35.76020463404071, + "grad_norm": 0.33689063787460327, + "learning_rate": 4.642397953659593e-05, + "loss": 0.3589, + "step": 5284500 + }, + { + "epoch": 35.76358813339108, + "grad_norm": 0.3519691228866577, + "learning_rate": 4.6423641186660894e-05, + "loss": 0.3598, + "step": 5285000 + }, + { + "epoch": 35.766971632741445, + "grad_norm": 0.37017616629600525, + "learning_rate": 4.6423302836725856e-05, + "loss": 0.3591, + "step": 5285500 + }, + { + "epoch": 35.770355132091815, + "grad_norm": 0.3611086308956146, + "learning_rate": 4.642296448679082e-05, + "loss": 0.3571, + "step": 5286000 + }, + { + "epoch": 35.773738631442185, + "grad_norm": 0.3775934875011444, + "learning_rate": 4.642262613685579e-05, + "loss": 0.3588, + "step": 5286500 + }, + { + "epoch": 35.77712213079255, + "grad_norm": 0.4065438508987427, + "learning_rate": 4.642228778692075e-05, + "loss": 0.36, + "step": 5287000 + }, + { + "epoch": 35.78050563014292, + "grad_norm": 0.4096687436103821, + "learning_rate": 4.642194943698571e-05, + "loss": 0.3588, + "step": 5287500 + }, + { + "epoch": 35.78388912949329, + "grad_norm": 0.4001534879207611, + "learning_rate": 4.642161108705067e-05, + "loss": 0.3582, + "step": 5288000 + }, + { + "epoch": 35.78727262884365, + "grad_norm": 0.35695651173591614, + "learning_rate": 4.6421272737115635e-05, + "loss": 0.3597, + "step": 5288500 + }, + { + "epoch": 35.79065612819402, + "grad_norm": 0.3913804590702057, + "learning_rate": 4.64209343871806e-05, + "loss": 0.3607, + "step": 5289000 + }, + { + "epoch": 35.79403962754439, + "grad_norm": 0.36949577927589417, + "learning_rate": 4.642059603724556e-05, + "loss": 0.3586, + "step": 5289500 + }, + { + "epoch": 35.797423126894756, + "grad_norm": 0.3722244203090668, + "learning_rate": 4.642025768731053e-05, + "loss": 0.3581, + "step": 5290000 + }, + { + "epoch": 35.80080662624513, + "grad_norm": 0.37984737753868103, + "learning_rate": 4.641991933737549e-05, + "loss": 0.3597, + "step": 5290500 + }, + { + "epoch": 35.8041901255955, + "grad_norm": 0.4076242744922638, + "learning_rate": 4.641958098744045e-05, + "loss": 0.3593, + "step": 5291000 + }, + { + "epoch": 35.80757362494587, + "grad_norm": 0.4038795530796051, + "learning_rate": 4.6419242637505415e-05, + "loss": 0.3595, + "step": 5291500 + }, + { + "epoch": 35.81095712429623, + "grad_norm": 0.3721678853034973, + "learning_rate": 4.6418904287570384e-05, + "loss": 0.3597, + "step": 5292000 + }, + { + "epoch": 35.8143406236466, + "grad_norm": 0.4363827407360077, + "learning_rate": 4.6418565937635346e-05, + "loss": 0.3588, + "step": 5292500 + }, + { + "epoch": 35.81772412299697, + "grad_norm": 0.3978166878223419, + "learning_rate": 4.641822758770031e-05, + "loss": 0.358, + "step": 5293000 + }, + { + "epoch": 35.821107622347334, + "grad_norm": 0.39794260263442993, + "learning_rate": 4.6417889237765263e-05, + "loss": 0.3594, + "step": 5293500 + }, + { + "epoch": 35.824491121697704, + "grad_norm": 0.35563987493515015, + "learning_rate": 4.641755088783023e-05, + "loss": 0.3593, + "step": 5294000 + }, + { + "epoch": 35.827874621048075, + "grad_norm": 0.3575843572616577, + "learning_rate": 4.6417212537895194e-05, + "loss": 0.3598, + "step": 5294500 + }, + { + "epoch": 35.83125812039844, + "grad_norm": 0.42469942569732666, + "learning_rate": 4.6416874187960157e-05, + "loss": 0.3583, + "step": 5295000 + }, + { + "epoch": 35.83464161974881, + "grad_norm": 0.34596970677375793, + "learning_rate": 4.641653583802512e-05, + "loss": 0.3593, + "step": 5295500 + }, + { + "epoch": 35.83802511909918, + "grad_norm": 0.40347158908843994, + "learning_rate": 4.641619748809009e-05, + "loss": 0.3588, + "step": 5296000 + }, + { + "epoch": 35.84140861844955, + "grad_norm": 0.3695710599422455, + "learning_rate": 4.641585913815505e-05, + "loss": 0.3583, + "step": 5296500 + }, + { + "epoch": 35.84479211779991, + "grad_norm": 0.3977468013763428, + "learning_rate": 4.641552078822001e-05, + "loss": 0.3582, + "step": 5297000 + }, + { + "epoch": 35.84817561715028, + "grad_norm": 0.39114922285079956, + "learning_rate": 4.6415182438284974e-05, + "loss": 0.3597, + "step": 5297500 + }, + { + "epoch": 35.85155911650065, + "grad_norm": 0.38786453008651733, + "learning_rate": 4.6414844088349936e-05, + "loss": 0.3584, + "step": 5298000 + }, + { + "epoch": 35.854942615851016, + "grad_norm": 0.35438981652259827, + "learning_rate": 4.64145057384149e-05, + "loss": 0.3589, + "step": 5298500 + }, + { + "epoch": 35.858326115201386, + "grad_norm": 0.36184030771255493, + "learning_rate": 4.641416738847986e-05, + "loss": 0.3601, + "step": 5299000 + }, + { + "epoch": 35.861709614551756, + "grad_norm": 0.4410672187805176, + "learning_rate": 4.641382903854483e-05, + "loss": 0.3593, + "step": 5299500 + }, + { + "epoch": 35.86509311390212, + "grad_norm": 0.40411484241485596, + "learning_rate": 4.641349068860979e-05, + "loss": 0.3589, + "step": 5300000 + }, + { + "epoch": 35.86847661325249, + "grad_norm": 0.3632428050041199, + "learning_rate": 4.6413152338674753e-05, + "loss": 0.3584, + "step": 5300500 + }, + { + "epoch": 35.87186011260286, + "grad_norm": 0.39821600914001465, + "learning_rate": 4.6412813988739716e-05, + "loss": 0.3572, + "step": 5301000 + }, + { + "epoch": 35.87524361195322, + "grad_norm": 0.40248432755470276, + "learning_rate": 4.641247563880468e-05, + "loss": 0.3601, + "step": 5301500 + }, + { + "epoch": 35.878627111303594, + "grad_norm": 0.3401845097541809, + "learning_rate": 4.641213728886965e-05, + "loss": 0.3577, + "step": 5302000 + }, + { + "epoch": 35.882010610653964, + "grad_norm": 0.36157751083374023, + "learning_rate": 4.641179893893461e-05, + "loss": 0.3606, + "step": 5302500 + }, + { + "epoch": 35.885394110004334, + "grad_norm": 0.40573593974113464, + "learning_rate": 4.6411460588999564e-05, + "loss": 0.3584, + "step": 5303000 + }, + { + "epoch": 35.8887776093547, + "grad_norm": 0.38465699553489685, + "learning_rate": 4.641112223906453e-05, + "loss": 0.3598, + "step": 5303500 + }, + { + "epoch": 35.89216110870507, + "grad_norm": 0.38376665115356445, + "learning_rate": 4.6410783889129495e-05, + "loss": 0.3604, + "step": 5304000 + }, + { + "epoch": 35.89554460805544, + "grad_norm": 0.3362913727760315, + "learning_rate": 4.641044553919446e-05, + "loss": 0.3582, + "step": 5304500 + }, + { + "epoch": 35.8989281074058, + "grad_norm": 0.3604910373687744, + "learning_rate": 4.641010718925942e-05, + "loss": 0.3593, + "step": 5305000 + }, + { + "epoch": 35.90231160675617, + "grad_norm": 0.35981470346450806, + "learning_rate": 4.640976883932439e-05, + "loss": 0.3596, + "step": 5305500 + }, + { + "epoch": 35.90569510610654, + "grad_norm": 0.35935699939727783, + "learning_rate": 4.640943048938935e-05, + "loss": 0.3608, + "step": 5306000 + }, + { + "epoch": 35.909078605456905, + "grad_norm": 0.3535047173500061, + "learning_rate": 4.640909213945431e-05, + "loss": 0.3597, + "step": 5306500 + }, + { + "epoch": 35.912462104807275, + "grad_norm": 0.42171168327331543, + "learning_rate": 4.6408753789519275e-05, + "loss": 0.3607, + "step": 5307000 + }, + { + "epoch": 35.915845604157646, + "grad_norm": 0.4060705006122589, + "learning_rate": 4.640841543958424e-05, + "loss": 0.3605, + "step": 5307500 + }, + { + "epoch": 35.91922910350801, + "grad_norm": 0.3892277777194977, + "learning_rate": 4.64080770896492e-05, + "loss": 0.3601, + "step": 5308000 + }, + { + "epoch": 35.92261260285838, + "grad_norm": 0.43026089668273926, + "learning_rate": 4.640773873971416e-05, + "loss": 0.3593, + "step": 5308500 + }, + { + "epoch": 35.92599610220875, + "grad_norm": 0.3809777796268463, + "learning_rate": 4.640740038977913e-05, + "loss": 0.3587, + "step": 5309000 + }, + { + "epoch": 35.92937960155912, + "grad_norm": 0.36120370030403137, + "learning_rate": 4.640706203984409e-05, + "loss": 0.3588, + "step": 5309500 + }, + { + "epoch": 35.93276310090948, + "grad_norm": 0.376757949590683, + "learning_rate": 4.6406723689909054e-05, + "loss": 0.3593, + "step": 5310000 + }, + { + "epoch": 35.93614660025985, + "grad_norm": 0.329667866230011, + "learning_rate": 4.6406385339974016e-05, + "loss": 0.3583, + "step": 5310500 + }, + { + "epoch": 35.939530099610224, + "grad_norm": 0.3501999080181122, + "learning_rate": 4.640604699003898e-05, + "loss": 0.3589, + "step": 5311000 + }, + { + "epoch": 35.94291359896059, + "grad_norm": 0.3717506527900696, + "learning_rate": 4.640570864010395e-05, + "loss": 0.3592, + "step": 5311500 + }, + { + "epoch": 35.94629709831096, + "grad_norm": 0.39163535833358765, + "learning_rate": 4.640537029016891e-05, + "loss": 0.3596, + "step": 5312000 + }, + { + "epoch": 35.94968059766133, + "grad_norm": 0.3506573736667633, + "learning_rate": 4.6405031940233865e-05, + "loss": 0.3594, + "step": 5312500 + }, + { + "epoch": 35.95306409701169, + "grad_norm": 0.3836919367313385, + "learning_rate": 4.6404693590298834e-05, + "loss": 0.3584, + "step": 5313000 + }, + { + "epoch": 35.95644759636206, + "grad_norm": 0.41102489829063416, + "learning_rate": 4.6404355240363796e-05, + "loss": 0.3593, + "step": 5313500 + }, + { + "epoch": 35.95983109571243, + "grad_norm": 0.3786611557006836, + "learning_rate": 4.640401689042876e-05, + "loss": 0.3595, + "step": 5314000 + }, + { + "epoch": 35.963214595062794, + "grad_norm": 0.38707447052001953, + "learning_rate": 4.640367854049372e-05, + "loss": 0.3592, + "step": 5314500 + }, + { + "epoch": 35.966598094413165, + "grad_norm": 0.3671312928199768, + "learning_rate": 4.640334019055869e-05, + "loss": 0.3592, + "step": 5315000 + }, + { + "epoch": 35.969981593763535, + "grad_norm": 0.3961714208126068, + "learning_rate": 4.640300184062365e-05, + "loss": 0.361, + "step": 5315500 + }, + { + "epoch": 35.973365093113905, + "grad_norm": 0.38784390687942505, + "learning_rate": 4.640266349068861e-05, + "loss": 0.3603, + "step": 5316000 + }, + { + "epoch": 35.97674859246427, + "grad_norm": 0.3855289816856384, + "learning_rate": 4.6402325140753575e-05, + "loss": 0.3598, + "step": 5316500 + }, + { + "epoch": 35.98013209181464, + "grad_norm": 0.3643551170825958, + "learning_rate": 4.640198679081854e-05, + "loss": 0.3586, + "step": 5317000 + }, + { + "epoch": 35.98351559116501, + "grad_norm": 0.3573073744773865, + "learning_rate": 4.64016484408835e-05, + "loss": 0.3586, + "step": 5317500 + }, + { + "epoch": 35.98689909051537, + "grad_norm": 0.4077359139919281, + "learning_rate": 4.640131009094846e-05, + "loss": 0.3594, + "step": 5318000 + }, + { + "epoch": 35.99028258986574, + "grad_norm": 0.35591912269592285, + "learning_rate": 4.6400971741013424e-05, + "loss": 0.3591, + "step": 5318500 + }, + { + "epoch": 35.99366608921611, + "grad_norm": 0.3844730854034424, + "learning_rate": 4.640063339107839e-05, + "loss": 0.36, + "step": 5319000 + }, + { + "epoch": 35.997049588566476, + "grad_norm": 0.407355934381485, + "learning_rate": 4.6400295041143355e-05, + "loss": 0.3602, + "step": 5319500 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.8629955463435177, + "eval_loss": 0.5554865002632141, + "eval_runtime": 3359.5803, + "eval_samples_per_second": 86.542, + "eval_steps_per_second": 5.409, + "step": 5319936 + }, + { + "epoch": 36.000433087916846, + "grad_norm": 0.3714601695537567, + "learning_rate": 4.639995669120832e-05, + "loss": 0.359, + "step": 5320000 + }, + { + "epoch": 36.00381658726722, + "grad_norm": 0.3737226128578186, + "learning_rate": 4.639961834127328e-05, + "loss": 0.3562, + "step": 5320500 + }, + { + "epoch": 36.00720008661758, + "grad_norm": 0.37579116225242615, + "learning_rate": 4.639927999133825e-05, + "loss": 0.3573, + "step": 5321000 + }, + { + "epoch": 36.01058358596795, + "grad_norm": 0.3803044855594635, + "learning_rate": 4.639894164140321e-05, + "loss": 0.3565, + "step": 5321500 + }, + { + "epoch": 36.01396708531832, + "grad_norm": 0.34731969237327576, + "learning_rate": 4.6398603291468165e-05, + "loss": 0.358, + "step": 5322000 + }, + { + "epoch": 36.01735058466869, + "grad_norm": 0.35496339201927185, + "learning_rate": 4.6398264941533134e-05, + "loss": 0.3559, + "step": 5322500 + }, + { + "epoch": 36.020734084019054, + "grad_norm": 0.3641916513442993, + "learning_rate": 4.6397926591598096e-05, + "loss": 0.3561, + "step": 5323000 + }, + { + "epoch": 36.024117583369424, + "grad_norm": 0.40890011191368103, + "learning_rate": 4.639758824166306e-05, + "loss": 0.3579, + "step": 5323500 + }, + { + "epoch": 36.027501082719795, + "grad_norm": 0.36843806505203247, + "learning_rate": 4.639724989172802e-05, + "loss": 0.3576, + "step": 5324000 + }, + { + "epoch": 36.03088458207016, + "grad_norm": 0.35895708203315735, + "learning_rate": 4.639691154179299e-05, + "loss": 0.3556, + "step": 5324500 + }, + { + "epoch": 36.03426808142053, + "grad_norm": 0.42412081360816956, + "learning_rate": 4.639657319185795e-05, + "loss": 0.3579, + "step": 5325000 + }, + { + "epoch": 36.0376515807709, + "grad_norm": 0.3796720802783966, + "learning_rate": 4.6396234841922914e-05, + "loss": 0.3576, + "step": 5325500 + }, + { + "epoch": 36.04103508012126, + "grad_norm": 0.3830703794956207, + "learning_rate": 4.6395896491987876e-05, + "loss": 0.3578, + "step": 5326000 + }, + { + "epoch": 36.04441857947163, + "grad_norm": 0.3955300450325012, + "learning_rate": 4.639555814205284e-05, + "loss": 0.3571, + "step": 5326500 + }, + { + "epoch": 36.047802078822, + "grad_norm": 0.3940010964870453, + "learning_rate": 4.63952197921178e-05, + "loss": 0.3575, + "step": 5327000 + }, + { + "epoch": 36.05118557817237, + "grad_norm": 0.37831932306289673, + "learning_rate": 4.639488144218276e-05, + "loss": 0.3572, + "step": 5327500 + }, + { + "epoch": 36.054569077522736, + "grad_norm": 0.3522729277610779, + "learning_rate": 4.6394543092247724e-05, + "loss": 0.3579, + "step": 5328000 + }, + { + "epoch": 36.057952576873106, + "grad_norm": 0.37316861748695374, + "learning_rate": 4.639420474231269e-05, + "loss": 0.3567, + "step": 5328500 + }, + { + "epoch": 36.061336076223476, + "grad_norm": 0.3755626380443573, + "learning_rate": 4.6393866392377655e-05, + "loss": 0.3572, + "step": 5329000 + }, + { + "epoch": 36.06471957557384, + "grad_norm": 0.37687817215919495, + "learning_rate": 4.639352804244262e-05, + "loss": 0.3582, + "step": 5329500 + }, + { + "epoch": 36.06810307492421, + "grad_norm": 0.36007440090179443, + "learning_rate": 4.639318969250758e-05, + "loss": 0.3593, + "step": 5330000 + }, + { + "epoch": 36.07148657427458, + "grad_norm": 0.36718013882637024, + "learning_rate": 4.639285134257255e-05, + "loss": 0.3583, + "step": 5330500 + }, + { + "epoch": 36.07487007362494, + "grad_norm": 0.3536832928657532, + "learning_rate": 4.639251299263751e-05, + "loss": 0.3578, + "step": 5331000 + }, + { + "epoch": 36.07825357297531, + "grad_norm": 0.3785749673843384, + "learning_rate": 4.6392174642702466e-05, + "loss": 0.3584, + "step": 5331500 + }, + { + "epoch": 36.081637072325684, + "grad_norm": 0.366966187953949, + "learning_rate": 4.6391836292767435e-05, + "loss": 0.359, + "step": 5332000 + }, + { + "epoch": 36.08502057167605, + "grad_norm": 0.4064937233924866, + "learning_rate": 4.63914979428324e-05, + "loss": 0.3577, + "step": 5332500 + }, + { + "epoch": 36.08840407102642, + "grad_norm": 0.381213515996933, + "learning_rate": 4.639115959289736e-05, + "loss": 0.3583, + "step": 5333000 + }, + { + "epoch": 36.09178757037679, + "grad_norm": 0.39870741963386536, + "learning_rate": 4.639082124296232e-05, + "loss": 0.3579, + "step": 5333500 + }, + { + "epoch": 36.09517106972716, + "grad_norm": 0.37169355154037476, + "learning_rate": 4.639048289302729e-05, + "loss": 0.3597, + "step": 5334000 + }, + { + "epoch": 36.09855456907752, + "grad_norm": 0.4086398482322693, + "learning_rate": 4.639014454309225e-05, + "loss": 0.3578, + "step": 5334500 + }, + { + "epoch": 36.10193806842789, + "grad_norm": 0.4269033670425415, + "learning_rate": 4.6389806193157214e-05, + "loss": 0.3579, + "step": 5335000 + }, + { + "epoch": 36.10532156777826, + "grad_norm": 0.38613566756248474, + "learning_rate": 4.6389467843222177e-05, + "loss": 0.3586, + "step": 5335500 + }, + { + "epoch": 36.108705067128625, + "grad_norm": 0.3657166361808777, + "learning_rate": 4.638912949328714e-05, + "loss": 0.3575, + "step": 5336000 + }, + { + "epoch": 36.112088566478995, + "grad_norm": 0.4097485840320587, + "learning_rate": 4.63887911433521e-05, + "loss": 0.3568, + "step": 5336500 + }, + { + "epoch": 36.115472065829366, + "grad_norm": 0.3976365327835083, + "learning_rate": 4.638845279341706e-05, + "loss": 0.3592, + "step": 5337000 + }, + { + "epoch": 36.11885556517973, + "grad_norm": 0.3897380530834198, + "learning_rate": 4.6388114443482025e-05, + "loss": 0.3595, + "step": 5337500 + }, + { + "epoch": 36.1222390645301, + "grad_norm": 0.35844093561172485, + "learning_rate": 4.6387776093546994e-05, + "loss": 0.3566, + "step": 5338000 + }, + { + "epoch": 36.12562256388047, + "grad_norm": 0.4127728044986725, + "learning_rate": 4.6387437743611956e-05, + "loss": 0.3583, + "step": 5338500 + }, + { + "epoch": 36.12900606323083, + "grad_norm": 0.37532439827919006, + "learning_rate": 4.638709939367692e-05, + "loss": 0.3593, + "step": 5339000 + }, + { + "epoch": 36.1323895625812, + "grad_norm": 0.3638269901275635, + "learning_rate": 4.638676104374188e-05, + "loss": 0.3581, + "step": 5339500 + }, + { + "epoch": 36.13577306193157, + "grad_norm": 0.40364179015159607, + "learning_rate": 4.638642269380685e-05, + "loss": 0.3586, + "step": 5340000 + }, + { + "epoch": 36.13915656128194, + "grad_norm": 0.38234543800354004, + "learning_rate": 4.638608434387181e-05, + "loss": 0.3577, + "step": 5340500 + }, + { + "epoch": 36.14254006063231, + "grad_norm": 0.4303121566772461, + "learning_rate": 4.638574599393677e-05, + "loss": 0.3589, + "step": 5341000 + }, + { + "epoch": 36.14592355998268, + "grad_norm": 0.39878717064857483, + "learning_rate": 4.6385407644001736e-05, + "loss": 0.358, + "step": 5341500 + }, + { + "epoch": 36.14930705933305, + "grad_norm": 0.3977298438549042, + "learning_rate": 4.63850692940667e-05, + "loss": 0.358, + "step": 5342000 + }, + { + "epoch": 36.15269055868341, + "grad_norm": 0.3616574704647064, + "learning_rate": 4.638473094413166e-05, + "loss": 0.3591, + "step": 5342500 + }, + { + "epoch": 36.15607405803378, + "grad_norm": 0.34833091497421265, + "learning_rate": 4.638439259419662e-05, + "loss": 0.3583, + "step": 5343000 + }, + { + "epoch": 36.15945755738415, + "grad_norm": 0.415879487991333, + "learning_rate": 4.638405424426159e-05, + "loss": 0.3586, + "step": 5343500 + }, + { + "epoch": 36.162841056734514, + "grad_norm": 0.3834993541240692, + "learning_rate": 4.638371589432655e-05, + "loss": 0.3585, + "step": 5344000 + }, + { + "epoch": 36.166224556084885, + "grad_norm": 0.40970712900161743, + "learning_rate": 4.6383377544391515e-05, + "loss": 0.3573, + "step": 5344500 + }, + { + "epoch": 36.169608055435255, + "grad_norm": 0.3846546411514282, + "learning_rate": 4.638303919445648e-05, + "loss": 0.3588, + "step": 5345000 + }, + { + "epoch": 36.17299155478562, + "grad_norm": 0.39750853180885315, + "learning_rate": 4.6382700844521446e-05, + "loss": 0.3573, + "step": 5345500 + }, + { + "epoch": 36.17637505413599, + "grad_norm": 0.34784793853759766, + "learning_rate": 4.63823624945864e-05, + "loss": 0.3575, + "step": 5346000 + }, + { + "epoch": 36.17975855348636, + "grad_norm": 0.3635327219963074, + "learning_rate": 4.6382024144651364e-05, + "loss": 0.3559, + "step": 5346500 + }, + { + "epoch": 36.18314205283673, + "grad_norm": 0.3809336721897125, + "learning_rate": 4.6381685794716326e-05, + "loss": 0.3583, + "step": 5347000 + }, + { + "epoch": 36.18652555218709, + "grad_norm": 0.350963294506073, + "learning_rate": 4.6381347444781295e-05, + "loss": 0.3562, + "step": 5347500 + }, + { + "epoch": 36.18990905153746, + "grad_norm": 0.40162286162376404, + "learning_rate": 4.638100909484626e-05, + "loss": 0.3587, + "step": 5348000 + }, + { + "epoch": 36.19329255088783, + "grad_norm": 0.4086779057979584, + "learning_rate": 4.638067074491122e-05, + "loss": 0.3576, + "step": 5348500 + }, + { + "epoch": 36.196676050238196, + "grad_norm": 0.395887166261673, + "learning_rate": 4.638033239497618e-05, + "loss": 0.3594, + "step": 5349000 + }, + { + "epoch": 36.200059549588566, + "grad_norm": 0.36396047472953796, + "learning_rate": 4.637999404504115e-05, + "loss": 0.357, + "step": 5349500 + }, + { + "epoch": 36.20344304893894, + "grad_norm": 0.34778252243995667, + "learning_rate": 4.637965569510611e-05, + "loss": 0.3579, + "step": 5350000 + }, + { + "epoch": 36.2068265482893, + "grad_norm": 0.3774062395095825, + "learning_rate": 4.637931734517107e-05, + "loss": 0.3584, + "step": 5350500 + }, + { + "epoch": 36.21021004763967, + "grad_norm": 0.4170990586280823, + "learning_rate": 4.6378978995236036e-05, + "loss": 0.3574, + "step": 5351000 + }, + { + "epoch": 36.21359354699004, + "grad_norm": 0.3895938992500305, + "learning_rate": 4.6378640645301e-05, + "loss": 0.3597, + "step": 5351500 + }, + { + "epoch": 36.21697704634041, + "grad_norm": 0.38899892568588257, + "learning_rate": 4.637830229536596e-05, + "loss": 0.3598, + "step": 5352000 + }, + { + "epoch": 36.220360545690774, + "grad_norm": 0.40833351016044617, + "learning_rate": 4.637796394543092e-05, + "loss": 0.3577, + "step": 5352500 + }, + { + "epoch": 36.223744045041144, + "grad_norm": 0.4181547462940216, + "learning_rate": 4.637762559549589e-05, + "loss": 0.3593, + "step": 5353000 + }, + { + "epoch": 36.227127544391514, + "grad_norm": 0.3875177502632141, + "learning_rate": 4.6377287245560854e-05, + "loss": 0.3569, + "step": 5353500 + }, + { + "epoch": 36.23051104374188, + "grad_norm": 0.36812880635261536, + "learning_rate": 4.6376948895625816e-05, + "loss": 0.3586, + "step": 5354000 + }, + { + "epoch": 36.23389454309225, + "grad_norm": 0.42248624563217163, + "learning_rate": 4.637661054569078e-05, + "loss": 0.3568, + "step": 5354500 + }, + { + "epoch": 36.23727804244262, + "grad_norm": 0.4050443470478058, + "learning_rate": 4.637627219575575e-05, + "loss": 0.3587, + "step": 5355000 + }, + { + "epoch": 36.24066154179298, + "grad_norm": 0.41615429520606995, + "learning_rate": 4.63759338458207e-05, + "loss": 0.3582, + "step": 5355500 + }, + { + "epoch": 36.24404504114335, + "grad_norm": 0.36678463220596313, + "learning_rate": 4.6375595495885664e-05, + "loss": 0.3587, + "step": 5356000 + }, + { + "epoch": 36.24742854049372, + "grad_norm": 0.3542846739292145, + "learning_rate": 4.6375257145950626e-05, + "loss": 0.3577, + "step": 5356500 + }, + { + "epoch": 36.250812039844085, + "grad_norm": 0.42447608709335327, + "learning_rate": 4.6374918796015595e-05, + "loss": 0.3597, + "step": 5357000 + }, + { + "epoch": 36.254195539194455, + "grad_norm": 0.41700679063796997, + "learning_rate": 4.637458044608056e-05, + "loss": 0.3592, + "step": 5357500 + }, + { + "epoch": 36.257579038544826, + "grad_norm": 0.371595174074173, + "learning_rate": 4.637424209614552e-05, + "loss": 0.36, + "step": 5358000 + }, + { + "epoch": 36.260962537895196, + "grad_norm": 0.38792935013771057, + "learning_rate": 4.637390374621048e-05, + "loss": 0.3598, + "step": 5358500 + }, + { + "epoch": 36.26434603724556, + "grad_norm": 0.4075170159339905, + "learning_rate": 4.637356539627545e-05, + "loss": 0.3567, + "step": 5359000 + }, + { + "epoch": 36.26772953659593, + "grad_norm": 0.39475739002227783, + "learning_rate": 4.637322704634041e-05, + "loss": 0.3586, + "step": 5359500 + }, + { + "epoch": 36.2711130359463, + "grad_norm": 0.36673352122306824, + "learning_rate": 4.637288869640537e-05, + "loss": 0.3578, + "step": 5360000 + }, + { + "epoch": 36.27449653529666, + "grad_norm": 0.331930935382843, + "learning_rate": 4.637255034647034e-05, + "loss": 0.3584, + "step": 5360500 + }, + { + "epoch": 36.27788003464703, + "grad_norm": 0.3997202515602112, + "learning_rate": 4.63722119965353e-05, + "loss": 0.359, + "step": 5361000 + }, + { + "epoch": 36.281263533997404, + "grad_norm": 0.41310784220695496, + "learning_rate": 4.637187364660026e-05, + "loss": 0.3585, + "step": 5361500 + }, + { + "epoch": 36.28464703334777, + "grad_norm": 0.39765745401382446, + "learning_rate": 4.637153529666522e-05, + "loss": 0.3573, + "step": 5362000 + }, + { + "epoch": 36.28803053269814, + "grad_norm": 0.39464566111564636, + "learning_rate": 4.637119694673019e-05, + "loss": 0.3583, + "step": 5362500 + }, + { + "epoch": 36.29141403204851, + "grad_norm": 0.418169230222702, + "learning_rate": 4.6370858596795154e-05, + "loss": 0.3587, + "step": 5363000 + }, + { + "epoch": 36.29479753139887, + "grad_norm": 0.3595762848854065, + "learning_rate": 4.6370520246860116e-05, + "loss": 0.3591, + "step": 5363500 + }, + { + "epoch": 36.29818103074924, + "grad_norm": 0.3601153790950775, + "learning_rate": 4.637018189692508e-05, + "loss": 0.3592, + "step": 5364000 + }, + { + "epoch": 36.30156453009961, + "grad_norm": 0.3526724576950073, + "learning_rate": 4.636984354699004e-05, + "loss": 0.3587, + "step": 5364500 + }, + { + "epoch": 36.30494802944998, + "grad_norm": 0.38247859477996826, + "learning_rate": 4.6369505197055e-05, + "loss": 0.3593, + "step": 5365000 + }, + { + "epoch": 36.308331528800345, + "grad_norm": 0.3702358901500702, + "learning_rate": 4.6369166847119965e-05, + "loss": 0.3589, + "step": 5365500 + }, + { + "epoch": 36.311715028150715, + "grad_norm": 0.40158194303512573, + "learning_rate": 4.636882849718493e-05, + "loss": 0.3609, + "step": 5366000 + }, + { + "epoch": 36.315098527501085, + "grad_norm": 0.37560826539993286, + "learning_rate": 4.6368490147249896e-05, + "loss": 0.3591, + "step": 5366500 + }, + { + "epoch": 36.31848202685145, + "grad_norm": 0.3786207139492035, + "learning_rate": 4.636815179731486e-05, + "loss": 0.3595, + "step": 5367000 + }, + { + "epoch": 36.32186552620182, + "grad_norm": 0.4172016978263855, + "learning_rate": 4.636781344737982e-05, + "loss": 0.3574, + "step": 5367500 + }, + { + "epoch": 36.32524902555219, + "grad_norm": 0.40235501527786255, + "learning_rate": 4.636747509744478e-05, + "loss": 0.3575, + "step": 5368000 + }, + { + "epoch": 36.32863252490255, + "grad_norm": 0.3735620975494385, + "learning_rate": 4.636713674750975e-05, + "loss": 0.3586, + "step": 5368500 + }, + { + "epoch": 36.33201602425292, + "grad_norm": 0.38861343264579773, + "learning_rate": 4.636679839757471e-05, + "loss": 0.3595, + "step": 5369000 + }, + { + "epoch": 36.33539952360329, + "grad_norm": 0.3941441774368286, + "learning_rate": 4.636646004763967e-05, + "loss": 0.3569, + "step": 5369500 + }, + { + "epoch": 36.338783022953656, + "grad_norm": 0.36925792694091797, + "learning_rate": 4.636612169770464e-05, + "loss": 0.358, + "step": 5370000 + }, + { + "epoch": 36.342166522304026, + "grad_norm": 0.35760292410850525, + "learning_rate": 4.63657833477696e-05, + "loss": 0.3567, + "step": 5370500 + }, + { + "epoch": 36.3455500216544, + "grad_norm": 0.41832560300827026, + "learning_rate": 4.636544499783456e-05, + "loss": 0.3587, + "step": 5371000 + }, + { + "epoch": 36.34893352100477, + "grad_norm": 0.4099883735179901, + "learning_rate": 4.6365106647899524e-05, + "loss": 0.3595, + "step": 5371500 + }, + { + "epoch": 36.35231702035513, + "grad_norm": 0.37353208661079407, + "learning_rate": 4.6364768297964486e-05, + "loss": 0.3594, + "step": 5372000 + }, + { + "epoch": 36.3557005197055, + "grad_norm": 0.40817371010780334, + "learning_rate": 4.6364429948029455e-05, + "loss": 0.3593, + "step": 5372500 + }, + { + "epoch": 36.35908401905587, + "grad_norm": 0.39321252703666687, + "learning_rate": 4.636409159809442e-05, + "loss": 0.3587, + "step": 5373000 + }, + { + "epoch": 36.362467518406234, + "grad_norm": 0.3826882243156433, + "learning_rate": 4.636375324815938e-05, + "loss": 0.3596, + "step": 5373500 + }, + { + "epoch": 36.365851017756604, + "grad_norm": 0.3495653569698334, + "learning_rate": 4.636341489822434e-05, + "loss": 0.359, + "step": 5374000 + }, + { + "epoch": 36.369234517106975, + "grad_norm": 0.4428384304046631, + "learning_rate": 4.63630765482893e-05, + "loss": 0.359, + "step": 5374500 + }, + { + "epoch": 36.37261801645734, + "grad_norm": 0.39965730905532837, + "learning_rate": 4.6362738198354265e-05, + "loss": 0.3575, + "step": 5375000 + }, + { + "epoch": 36.37600151580771, + "grad_norm": 0.3903505802154541, + "learning_rate": 4.636239984841923e-05, + "loss": 0.3573, + "step": 5375500 + }, + { + "epoch": 36.37938501515808, + "grad_norm": 0.359244704246521, + "learning_rate": 4.6362061498484196e-05, + "loss": 0.3582, + "step": 5376000 + }, + { + "epoch": 36.38276851450845, + "grad_norm": 0.39628976583480835, + "learning_rate": 4.636172314854916e-05, + "loss": 0.3588, + "step": 5376500 + }, + { + "epoch": 36.38615201385881, + "grad_norm": 0.38931041955947876, + "learning_rate": 4.636138479861412e-05, + "loss": 0.357, + "step": 5377000 + }, + { + "epoch": 36.38953551320918, + "grad_norm": 0.3588677644729614, + "learning_rate": 4.636104644867908e-05, + "loss": 0.3578, + "step": 5377500 + }, + { + "epoch": 36.39291901255955, + "grad_norm": 0.369165301322937, + "learning_rate": 4.636070809874405e-05, + "loss": 0.3599, + "step": 5378000 + }, + { + "epoch": 36.396302511909916, + "grad_norm": 0.36009520292282104, + "learning_rate": 4.6360369748809014e-05, + "loss": 0.3589, + "step": 5378500 + }, + { + "epoch": 36.399686011260286, + "grad_norm": 0.37979286909103394, + "learning_rate": 4.636003139887397e-05, + "loss": 0.3572, + "step": 5379000 + }, + { + "epoch": 36.403069510610656, + "grad_norm": 0.3631051182746887, + "learning_rate": 4.635969304893894e-05, + "loss": 0.359, + "step": 5379500 + }, + { + "epoch": 36.40645300996102, + "grad_norm": 0.4344273507595062, + "learning_rate": 4.63593546990039e-05, + "loss": 0.3582, + "step": 5380000 + }, + { + "epoch": 36.40983650931139, + "grad_norm": 0.3770763874053955, + "learning_rate": 4.635901634906886e-05, + "loss": 0.3581, + "step": 5380500 + }, + { + "epoch": 36.41322000866176, + "grad_norm": 0.3626430332660675, + "learning_rate": 4.6358677999133824e-05, + "loss": 0.3589, + "step": 5381000 + }, + { + "epoch": 36.41660350801212, + "grad_norm": 0.3731286823749542, + "learning_rate": 4.6358339649198787e-05, + "loss": 0.3587, + "step": 5381500 + }, + { + "epoch": 36.419987007362494, + "grad_norm": 0.3749610185623169, + "learning_rate": 4.6358001299263755e-05, + "loss": 0.3599, + "step": 5382000 + }, + { + "epoch": 36.423370506712864, + "grad_norm": 0.37450677156448364, + "learning_rate": 4.635766294932872e-05, + "loss": 0.357, + "step": 5382500 + }, + { + "epoch": 36.426754006063234, + "grad_norm": 0.412520170211792, + "learning_rate": 4.635732459939368e-05, + "loss": 0.3576, + "step": 5383000 + }, + { + "epoch": 36.4301375054136, + "grad_norm": 0.3579409122467041, + "learning_rate": 4.635698624945864e-05, + "loss": 0.3577, + "step": 5383500 + }, + { + "epoch": 36.43352100476397, + "grad_norm": 0.3862817883491516, + "learning_rate": 4.6356647899523604e-05, + "loss": 0.3593, + "step": 5384000 + }, + { + "epoch": 36.43690450411434, + "grad_norm": 0.3743864893913269, + "learning_rate": 4.6356309549588566e-05, + "loss": 0.3587, + "step": 5384500 + }, + { + "epoch": 36.4402880034647, + "grad_norm": 0.40367433428764343, + "learning_rate": 4.635597119965353e-05, + "loss": 0.3584, + "step": 5385000 + }, + { + "epoch": 36.44367150281507, + "grad_norm": 0.3912498354911804, + "learning_rate": 4.63556328497185e-05, + "loss": 0.3595, + "step": 5385500 + }, + { + "epoch": 36.44705500216544, + "grad_norm": 0.394877552986145, + "learning_rate": 4.635529449978346e-05, + "loss": 0.359, + "step": 5386000 + }, + { + "epoch": 36.450438501515805, + "grad_norm": 0.4186530113220215, + "learning_rate": 4.635495614984842e-05, + "loss": 0.3586, + "step": 5386500 + }, + { + "epoch": 36.453822000866175, + "grad_norm": 0.42265915870666504, + "learning_rate": 4.6354617799913383e-05, + "loss": 0.3582, + "step": 5387000 + }, + { + "epoch": 36.457205500216546, + "grad_norm": 0.3434792757034302, + "learning_rate": 4.635427944997835e-05, + "loss": 0.3571, + "step": 5387500 + }, + { + "epoch": 36.46058899956691, + "grad_norm": 0.36982664465904236, + "learning_rate": 4.6353941100043314e-05, + "loss": 0.3619, + "step": 5388000 + }, + { + "epoch": 36.46397249891728, + "grad_norm": 0.3465140461921692, + "learning_rate": 4.635360275010827e-05, + "loss": 0.3572, + "step": 5388500 + }, + { + "epoch": 36.46735599826765, + "grad_norm": 0.3905229866504669, + "learning_rate": 4.635326440017323e-05, + "loss": 0.3587, + "step": 5389000 + }, + { + "epoch": 36.47073949761802, + "grad_norm": 0.4007827639579773, + "learning_rate": 4.63529260502382e-05, + "loss": 0.3587, + "step": 5389500 + }, + { + "epoch": 36.47412299696838, + "grad_norm": 0.3601161241531372, + "learning_rate": 4.635258770030316e-05, + "loss": 0.3579, + "step": 5390000 + }, + { + "epoch": 36.47750649631875, + "grad_norm": 0.3981015086174011, + "learning_rate": 4.6352249350368125e-05, + "loss": 0.3593, + "step": 5390500 + }, + { + "epoch": 36.48088999566912, + "grad_norm": 0.3994007706642151, + "learning_rate": 4.635191100043309e-05, + "loss": 0.358, + "step": 5391000 + }, + { + "epoch": 36.48427349501949, + "grad_norm": 0.3872138559818268, + "learning_rate": 4.6351572650498056e-05, + "loss": 0.3585, + "step": 5391500 + }, + { + "epoch": 36.48765699436986, + "grad_norm": 0.38818129897117615, + "learning_rate": 4.635123430056302e-05, + "loss": 0.3605, + "step": 5392000 + }, + { + "epoch": 36.49104049372023, + "grad_norm": 0.3747335970401764, + "learning_rate": 4.635089595062798e-05, + "loss": 0.3589, + "step": 5392500 + }, + { + "epoch": 36.49442399307059, + "grad_norm": 0.36242780089378357, + "learning_rate": 4.635055760069294e-05, + "loss": 0.3595, + "step": 5393000 + }, + { + "epoch": 36.49780749242096, + "grad_norm": 0.41383475065231323, + "learning_rate": 4.6350219250757905e-05, + "loss": 0.3586, + "step": 5393500 + }, + { + "epoch": 36.50119099177133, + "grad_norm": 0.3369709253311157, + "learning_rate": 4.634988090082287e-05, + "loss": 0.3588, + "step": 5394000 + }, + { + "epoch": 36.504574491121694, + "grad_norm": 0.3563244044780731, + "learning_rate": 4.634954255088783e-05, + "loss": 0.3571, + "step": 5394500 + }, + { + "epoch": 36.507957990472065, + "grad_norm": 0.3913503885269165, + "learning_rate": 4.63492042009528e-05, + "loss": 0.3586, + "step": 5395000 + }, + { + "epoch": 36.511341489822435, + "grad_norm": 0.368977814912796, + "learning_rate": 4.634886585101776e-05, + "loss": 0.3586, + "step": 5395500 + }, + { + "epoch": 36.514724989172805, + "grad_norm": 0.3941846489906311, + "learning_rate": 4.634852750108272e-05, + "loss": 0.3584, + "step": 5396000 + }, + { + "epoch": 36.51810848852317, + "grad_norm": 0.4536517858505249, + "learning_rate": 4.6348189151147684e-05, + "loss": 0.3577, + "step": 5396500 + }, + { + "epoch": 36.52149198787354, + "grad_norm": 0.3874954879283905, + "learning_rate": 4.634785080121265e-05, + "loss": 0.3591, + "step": 5397000 + }, + { + "epoch": 36.52487548722391, + "grad_norm": 0.4522276818752289, + "learning_rate": 4.6347512451277615e-05, + "loss": 0.3594, + "step": 5397500 + }, + { + "epoch": 36.52825898657427, + "grad_norm": 0.39889538288116455, + "learning_rate": 4.634717410134258e-05, + "loss": 0.3586, + "step": 5398000 + }, + { + "epoch": 36.53164248592464, + "grad_norm": 0.36961063742637634, + "learning_rate": 4.634683575140753e-05, + "loss": 0.3576, + "step": 5398500 + }, + { + "epoch": 36.53502598527501, + "grad_norm": 0.3706519901752472, + "learning_rate": 4.63464974014725e-05, + "loss": 0.3564, + "step": 5399000 + }, + { + "epoch": 36.538409484625376, + "grad_norm": 0.3952397108078003, + "learning_rate": 4.6346159051537464e-05, + "loss": 0.3595, + "step": 5399500 + }, + { + "epoch": 36.541792983975746, + "grad_norm": 0.37313124537467957, + "learning_rate": 4.6345820701602426e-05, + "loss": 0.3592, + "step": 5400000 + }, + { + "epoch": 36.54517648332612, + "grad_norm": 0.37953048944473267, + "learning_rate": 4.634548235166739e-05, + "loss": 0.3592, + "step": 5400500 + }, + { + "epoch": 36.54855998267648, + "grad_norm": 0.3729240596294403, + "learning_rate": 4.634514400173236e-05, + "loss": 0.3586, + "step": 5401000 + }, + { + "epoch": 36.55194348202685, + "grad_norm": 0.3639374077320099, + "learning_rate": 4.634480565179732e-05, + "loss": 0.3595, + "step": 5401500 + }, + { + "epoch": 36.55532698137722, + "grad_norm": 0.37958860397338867, + "learning_rate": 4.634446730186228e-05, + "loss": 0.3595, + "step": 5402000 + }, + { + "epoch": 36.55871048072759, + "grad_norm": 0.3735257685184479, + "learning_rate": 4.634412895192724e-05, + "loss": 0.3598, + "step": 5402500 + }, + { + "epoch": 36.562093980077954, + "grad_norm": 0.38983970880508423, + "learning_rate": 4.6343790601992205e-05, + "loss": 0.3574, + "step": 5403000 + }, + { + "epoch": 36.565477479428324, + "grad_norm": 0.3864821791648865, + "learning_rate": 4.634345225205717e-05, + "loss": 0.3597, + "step": 5403500 + }, + { + "epoch": 36.568860978778694, + "grad_norm": 0.37265393137931824, + "learning_rate": 4.634311390212213e-05, + "loss": 0.3581, + "step": 5404000 + }, + { + "epoch": 36.57224447812906, + "grad_norm": 0.3908233344554901, + "learning_rate": 4.63427755521871e-05, + "loss": 0.3593, + "step": 5404500 + }, + { + "epoch": 36.57562797747943, + "grad_norm": 0.3419293165206909, + "learning_rate": 4.634243720225206e-05, + "loss": 0.3585, + "step": 5405000 + }, + { + "epoch": 36.5790114768298, + "grad_norm": 0.36108455061912537, + "learning_rate": 4.634209885231702e-05, + "loss": 0.358, + "step": 5405500 + }, + { + "epoch": 36.58239497618016, + "grad_norm": 0.37847715616226196, + "learning_rate": 4.6341760502381985e-05, + "loss": 0.3591, + "step": 5406000 + }, + { + "epoch": 36.58577847553053, + "grad_norm": 0.3705810308456421, + "learning_rate": 4.6341422152446954e-05, + "loss": 0.3582, + "step": 5406500 + }, + { + "epoch": 36.5891619748809, + "grad_norm": 0.38005268573760986, + "learning_rate": 4.6341083802511916e-05, + "loss": 0.3562, + "step": 5407000 + }, + { + "epoch": 36.59254547423127, + "grad_norm": 0.3952135443687439, + "learning_rate": 4.634074545257688e-05, + "loss": 0.3586, + "step": 5407500 + }, + { + "epoch": 36.595928973581636, + "grad_norm": 0.40303125977516174, + "learning_rate": 4.634040710264183e-05, + "loss": 0.3577, + "step": 5408000 + }, + { + "epoch": 36.599312472932006, + "grad_norm": 0.38825419545173645, + "learning_rate": 4.63400687527068e-05, + "loss": 0.3588, + "step": 5408500 + }, + { + "epoch": 36.602695972282376, + "grad_norm": 0.37603962421417236, + "learning_rate": 4.6339730402771764e-05, + "loss": 0.3586, + "step": 5409000 + }, + { + "epoch": 36.60607947163274, + "grad_norm": 0.39547544717788696, + "learning_rate": 4.6339392052836726e-05, + "loss": 0.3579, + "step": 5409500 + }, + { + "epoch": 36.60946297098311, + "grad_norm": 0.38806581497192383, + "learning_rate": 4.633905370290169e-05, + "loss": 0.3591, + "step": 5410000 + }, + { + "epoch": 36.61284647033348, + "grad_norm": 0.39700913429260254, + "learning_rate": 4.633871535296666e-05, + "loss": 0.3583, + "step": 5410500 + }, + { + "epoch": 36.61622996968384, + "grad_norm": 0.39626410603523254, + "learning_rate": 4.633837700303162e-05, + "loss": 0.3579, + "step": 5411000 + }, + { + "epoch": 36.61961346903421, + "grad_norm": 0.3509100675582886, + "learning_rate": 4.633803865309658e-05, + "loss": 0.3596, + "step": 5411500 + }, + { + "epoch": 36.622996968384584, + "grad_norm": 0.3858174979686737, + "learning_rate": 4.6337700303161544e-05, + "loss": 0.3593, + "step": 5412000 + }, + { + "epoch": 36.62638046773495, + "grad_norm": 0.39701759815216064, + "learning_rate": 4.6337361953226506e-05, + "loss": 0.3611, + "step": 5412500 + }, + { + "epoch": 36.62976396708532, + "grad_norm": 0.362775593996048, + "learning_rate": 4.633702360329147e-05, + "loss": 0.3596, + "step": 5413000 + }, + { + "epoch": 36.63314746643569, + "grad_norm": 0.39234185218811035, + "learning_rate": 4.633668525335643e-05, + "loss": 0.3569, + "step": 5413500 + }, + { + "epoch": 36.63653096578606, + "grad_norm": 0.4118044078350067, + "learning_rate": 4.63363469034214e-05, + "loss": 0.3584, + "step": 5414000 + }, + { + "epoch": 36.63991446513642, + "grad_norm": 0.38820743560791016, + "learning_rate": 4.633600855348636e-05, + "loss": 0.3596, + "step": 5414500 + }, + { + "epoch": 36.64329796448679, + "grad_norm": 0.37806764245033264, + "learning_rate": 4.633567020355132e-05, + "loss": 0.3599, + "step": 5415000 + }, + { + "epoch": 36.64668146383716, + "grad_norm": 0.3902973532676697, + "learning_rate": 4.6335331853616285e-05, + "loss": 0.3592, + "step": 5415500 + }, + { + "epoch": 36.650064963187525, + "grad_norm": 0.3700363039970398, + "learning_rate": 4.6334993503681254e-05, + "loss": 0.3604, + "step": 5416000 + }, + { + "epoch": 36.653448462537895, + "grad_norm": 0.38539978861808777, + "learning_rate": 4.6334655153746216e-05, + "loss": 0.3608, + "step": 5416500 + }, + { + "epoch": 36.656831961888265, + "grad_norm": 0.37120288610458374, + "learning_rate": 4.633431680381118e-05, + "loss": 0.3595, + "step": 5417000 + }, + { + "epoch": 36.66021546123863, + "grad_norm": 0.37381964921951294, + "learning_rate": 4.6333978453876134e-05, + "loss": 0.3591, + "step": 5417500 + }, + { + "epoch": 36.663598960589, + "grad_norm": 0.36754944920539856, + "learning_rate": 4.63336401039411e-05, + "loss": 0.3588, + "step": 5418000 + }, + { + "epoch": 36.66698245993937, + "grad_norm": 0.4037177860736847, + "learning_rate": 4.6333301754006065e-05, + "loss": 0.3587, + "step": 5418500 + }, + { + "epoch": 36.67036595928973, + "grad_norm": 0.3826404809951782, + "learning_rate": 4.633296340407103e-05, + "loss": 0.3592, + "step": 5419000 + }, + { + "epoch": 36.6737494586401, + "grad_norm": 0.4238000214099884, + "learning_rate": 4.633262505413599e-05, + "loss": 0.3593, + "step": 5419500 + }, + { + "epoch": 36.67713295799047, + "grad_norm": 0.40022116899490356, + "learning_rate": 4.633228670420096e-05, + "loss": 0.3598, + "step": 5420000 + }, + { + "epoch": 36.68051645734084, + "grad_norm": 0.3914811909198761, + "learning_rate": 4.633194835426592e-05, + "loss": 0.3589, + "step": 5420500 + }, + { + "epoch": 36.68389995669121, + "grad_norm": 0.37830302119255066, + "learning_rate": 4.633161000433088e-05, + "loss": 0.3586, + "step": 5421000 + }, + { + "epoch": 36.68728345604158, + "grad_norm": 0.3562361001968384, + "learning_rate": 4.6331271654395844e-05, + "loss": 0.3587, + "step": 5421500 + }, + { + "epoch": 36.69066695539195, + "grad_norm": 0.3969062566757202, + "learning_rate": 4.6330933304460806e-05, + "loss": 0.3602, + "step": 5422000 + }, + { + "epoch": 36.69405045474231, + "grad_norm": 0.37047722935676575, + "learning_rate": 4.633059495452577e-05, + "loss": 0.3597, + "step": 5422500 + }, + { + "epoch": 36.69743395409268, + "grad_norm": 0.4213216006755829, + "learning_rate": 4.633025660459073e-05, + "loss": 0.3569, + "step": 5423000 + }, + { + "epoch": 36.70081745344305, + "grad_norm": 0.3517361879348755, + "learning_rate": 4.63299182546557e-05, + "loss": 0.3575, + "step": 5423500 + }, + { + "epoch": 36.704200952793414, + "grad_norm": 0.3819379210472107, + "learning_rate": 4.632957990472066e-05, + "loss": 0.3592, + "step": 5424000 + }, + { + "epoch": 36.707584452143784, + "grad_norm": 0.40440550446510315, + "learning_rate": 4.6329241554785624e-05, + "loss": 0.3581, + "step": 5424500 + }, + { + "epoch": 36.710967951494155, + "grad_norm": 0.38853567838668823, + "learning_rate": 4.6328903204850586e-05, + "loss": 0.3603, + "step": 5425000 + }, + { + "epoch": 36.71435145084452, + "grad_norm": 0.38550257682800293, + "learning_rate": 4.6328564854915555e-05, + "loss": 0.3599, + "step": 5425500 + }, + { + "epoch": 36.71773495019489, + "grad_norm": 0.379503071308136, + "learning_rate": 4.632822650498052e-05, + "loss": 0.3592, + "step": 5426000 + }, + { + "epoch": 36.72111844954526, + "grad_norm": 0.3605881929397583, + "learning_rate": 4.632788815504548e-05, + "loss": 0.3586, + "step": 5426500 + }, + { + "epoch": 36.72450194889563, + "grad_norm": 0.3627432584762573, + "learning_rate": 4.6327549805110434e-05, + "loss": 0.3591, + "step": 5427000 + }, + { + "epoch": 36.72788544824599, + "grad_norm": 0.4143041968345642, + "learning_rate": 4.63272114551754e-05, + "loss": 0.3593, + "step": 5427500 + }, + { + "epoch": 36.73126894759636, + "grad_norm": 0.4168412685394287, + "learning_rate": 4.6326873105240365e-05, + "loss": 0.3584, + "step": 5428000 + }, + { + "epoch": 36.73465244694673, + "grad_norm": 0.3902731239795685, + "learning_rate": 4.632653475530533e-05, + "loss": 0.3595, + "step": 5428500 + }, + { + "epoch": 36.738035946297096, + "grad_norm": 0.40431979298591614, + "learning_rate": 4.632619640537029e-05, + "loss": 0.3598, + "step": 5429000 + }, + { + "epoch": 36.741419445647466, + "grad_norm": 0.3976406753063202, + "learning_rate": 4.632585805543526e-05, + "loss": 0.3582, + "step": 5429500 + }, + { + "epoch": 36.744802944997836, + "grad_norm": 0.3488816022872925, + "learning_rate": 4.632551970550022e-05, + "loss": 0.3588, + "step": 5430000 + }, + { + "epoch": 36.7481864443482, + "grad_norm": 0.36659806966781616, + "learning_rate": 4.632518135556518e-05, + "loss": 0.357, + "step": 5430500 + }, + { + "epoch": 36.75156994369857, + "grad_norm": 0.39805367588996887, + "learning_rate": 4.6324843005630145e-05, + "loss": 0.3597, + "step": 5431000 + }, + { + "epoch": 36.75495344304894, + "grad_norm": 0.365629106760025, + "learning_rate": 4.632450465569511e-05, + "loss": 0.3589, + "step": 5431500 + }, + { + "epoch": 36.75833694239931, + "grad_norm": 0.3875758647918701, + "learning_rate": 4.632416630576007e-05, + "loss": 0.3597, + "step": 5432000 + }, + { + "epoch": 36.761720441749674, + "grad_norm": 0.4101535379886627, + "learning_rate": 4.632382795582503e-05, + "loss": 0.3589, + "step": 5432500 + }, + { + "epoch": 36.765103941100044, + "grad_norm": 0.37793266773223877, + "learning_rate": 4.632348960589e-05, + "loss": 0.3584, + "step": 5433000 + }, + { + "epoch": 36.768487440450414, + "grad_norm": 0.35704052448272705, + "learning_rate": 4.632315125595496e-05, + "loss": 0.3581, + "step": 5433500 + }, + { + "epoch": 36.77187093980078, + "grad_norm": 0.4402250647544861, + "learning_rate": 4.6322812906019924e-05, + "loss": 0.3594, + "step": 5434000 + }, + { + "epoch": 36.77525443915115, + "grad_norm": 0.4011518359184265, + "learning_rate": 4.632247455608489e-05, + "loss": 0.3583, + "step": 5434500 + }, + { + "epoch": 36.77863793850152, + "grad_norm": 0.3994705379009247, + "learning_rate": 4.632213620614985e-05, + "loss": 0.3592, + "step": 5435000 + }, + { + "epoch": 36.78202143785188, + "grad_norm": 0.3601782023906708, + "learning_rate": 4.632179785621482e-05, + "loss": 0.3581, + "step": 5435500 + }, + { + "epoch": 36.78540493720225, + "grad_norm": 0.3508903682231903, + "learning_rate": 4.632145950627978e-05, + "loss": 0.3599, + "step": 5436000 + }, + { + "epoch": 36.78878843655262, + "grad_norm": 0.3926507234573364, + "learning_rate": 4.6321121156344735e-05, + "loss": 0.3596, + "step": 5436500 + }, + { + "epoch": 36.792171935902985, + "grad_norm": 0.39061203598976135, + "learning_rate": 4.6320782806409704e-05, + "loss": 0.3604, + "step": 5437000 + }, + { + "epoch": 36.795555435253355, + "grad_norm": 0.37601664662361145, + "learning_rate": 4.6320444456474666e-05, + "loss": 0.3589, + "step": 5437500 + }, + { + "epoch": 36.798938934603726, + "grad_norm": 0.3568149209022522, + "learning_rate": 4.632010610653963e-05, + "loss": 0.3593, + "step": 5438000 + }, + { + "epoch": 36.802322433954096, + "grad_norm": 0.406227707862854, + "learning_rate": 4.631976775660459e-05, + "loss": 0.3595, + "step": 5438500 + }, + { + "epoch": 36.80570593330446, + "grad_norm": 0.3645710051059723, + "learning_rate": 4.631942940666956e-05, + "loss": 0.358, + "step": 5439000 + }, + { + "epoch": 36.80908943265483, + "grad_norm": 0.36030206084251404, + "learning_rate": 4.631909105673452e-05, + "loss": 0.3586, + "step": 5439500 + }, + { + "epoch": 36.8124729320052, + "grad_norm": 0.38765445351600647, + "learning_rate": 4.6318752706799484e-05, + "loss": 0.3591, + "step": 5440000 + }, + { + "epoch": 36.81585643135556, + "grad_norm": 0.35829150676727295, + "learning_rate": 4.6318414356864446e-05, + "loss": 0.359, + "step": 5440500 + }, + { + "epoch": 36.81923993070593, + "grad_norm": 0.3841919004917145, + "learning_rate": 4.631807600692941e-05, + "loss": 0.3587, + "step": 5441000 + }, + { + "epoch": 36.822623430056304, + "grad_norm": 0.39539119601249695, + "learning_rate": 4.631773765699437e-05, + "loss": 0.3586, + "step": 5441500 + }, + { + "epoch": 36.82600692940667, + "grad_norm": 0.34584274888038635, + "learning_rate": 4.631739930705933e-05, + "loss": 0.3596, + "step": 5442000 + }, + { + "epoch": 36.82939042875704, + "grad_norm": 0.3979288935661316, + "learning_rate": 4.6317060957124294e-05, + "loss": 0.3601, + "step": 5442500 + }, + { + "epoch": 36.83277392810741, + "grad_norm": 0.40058383345603943, + "learning_rate": 4.631672260718926e-05, + "loss": 0.3591, + "step": 5443000 + }, + { + "epoch": 36.83615742745777, + "grad_norm": 0.35918498039245605, + "learning_rate": 4.6316384257254225e-05, + "loss": 0.3582, + "step": 5443500 + }, + { + "epoch": 36.83954092680814, + "grad_norm": 0.3729562759399414, + "learning_rate": 4.631604590731919e-05, + "loss": 0.3588, + "step": 5444000 + }, + { + "epoch": 36.84292442615851, + "grad_norm": 0.3985956907272339, + "learning_rate": 4.631570755738415e-05, + "loss": 0.3601, + "step": 5444500 + }, + { + "epoch": 36.84630792550888, + "grad_norm": 0.3987262547016144, + "learning_rate": 4.631536920744912e-05, + "loss": 0.3597, + "step": 5445000 + }, + { + "epoch": 36.849691424859245, + "grad_norm": 0.36801108717918396, + "learning_rate": 4.631503085751408e-05, + "loss": 0.3582, + "step": 5445500 + }, + { + "epoch": 36.853074924209615, + "grad_norm": 0.38933831453323364, + "learning_rate": 4.6314692507579036e-05, + "loss": 0.3597, + "step": 5446000 + }, + { + "epoch": 36.856458423559985, + "grad_norm": 0.38778480887413025, + "learning_rate": 4.6314354157644005e-05, + "loss": 0.3585, + "step": 5446500 + }, + { + "epoch": 36.85984192291035, + "grad_norm": 0.3846574127674103, + "learning_rate": 4.631401580770897e-05, + "loss": 0.357, + "step": 5447000 + }, + { + "epoch": 36.86322542226072, + "grad_norm": 0.41694244742393494, + "learning_rate": 4.631367745777393e-05, + "loss": 0.3589, + "step": 5447500 + }, + { + "epoch": 36.86660892161109, + "grad_norm": 0.37676724791526794, + "learning_rate": 4.631333910783889e-05, + "loss": 0.3574, + "step": 5448000 + }, + { + "epoch": 36.86999242096145, + "grad_norm": 0.38469696044921875, + "learning_rate": 4.631300075790386e-05, + "loss": 0.3591, + "step": 5448500 + }, + { + "epoch": 36.87337592031182, + "grad_norm": 0.35767194628715515, + "learning_rate": 4.631266240796882e-05, + "loss": 0.3599, + "step": 5449000 + }, + { + "epoch": 36.87675941966219, + "grad_norm": 0.35038360953330994, + "learning_rate": 4.6312324058033784e-05, + "loss": 0.3594, + "step": 5449500 + }, + { + "epoch": 36.880142919012556, + "grad_norm": 0.3759296238422394, + "learning_rate": 4.6311985708098746e-05, + "loss": 0.3597, + "step": 5450000 + }, + { + "epoch": 36.883526418362926, + "grad_norm": 0.3757367432117462, + "learning_rate": 4.631164735816371e-05, + "loss": 0.3586, + "step": 5450500 + }, + { + "epoch": 36.8869099177133, + "grad_norm": 0.3534103333950043, + "learning_rate": 4.631130900822867e-05, + "loss": 0.3597, + "step": 5451000 + }, + { + "epoch": 36.89029341706367, + "grad_norm": 0.4127538204193115, + "learning_rate": 4.631097065829363e-05, + "loss": 0.3597, + "step": 5451500 + }, + { + "epoch": 36.89367691641403, + "grad_norm": 0.4183761775493622, + "learning_rate": 4.6310632308358595e-05, + "loss": 0.3601, + "step": 5452000 + }, + { + "epoch": 36.8970604157644, + "grad_norm": 0.3787117004394531, + "learning_rate": 4.6310293958423564e-05, + "loss": 0.3577, + "step": 5452500 + }, + { + "epoch": 36.90044391511477, + "grad_norm": 0.38860374689102173, + "learning_rate": 4.6309955608488526e-05, + "loss": 0.3583, + "step": 5453000 + }, + { + "epoch": 36.903827414465134, + "grad_norm": 0.37418508529663086, + "learning_rate": 4.630961725855349e-05, + "loss": 0.3603, + "step": 5453500 + }, + { + "epoch": 36.907210913815504, + "grad_norm": 0.37635117769241333, + "learning_rate": 4.630927890861845e-05, + "loss": 0.3601, + "step": 5454000 + }, + { + "epoch": 36.910594413165875, + "grad_norm": 0.3899887204170227, + "learning_rate": 4.630894055868342e-05, + "loss": 0.3597, + "step": 5454500 + }, + { + "epoch": 36.91397791251624, + "grad_norm": 0.3659631013870239, + "learning_rate": 4.630860220874838e-05, + "loss": 0.3581, + "step": 5455000 + }, + { + "epoch": 36.91736141186661, + "grad_norm": 0.36870715022087097, + "learning_rate": 4.6308263858813336e-05, + "loss": 0.3607, + "step": 5455500 + }, + { + "epoch": 36.92074491121698, + "grad_norm": 0.37624937295913696, + "learning_rate": 4.6307925508878305e-05, + "loss": 0.3574, + "step": 5456000 + }, + { + "epoch": 36.92412841056735, + "grad_norm": 0.40849369764328003, + "learning_rate": 4.630758715894327e-05, + "loss": 0.3578, + "step": 5456500 + }, + { + "epoch": 36.92751190991771, + "grad_norm": 0.37514176964759827, + "learning_rate": 4.630724880900823e-05, + "loss": 0.359, + "step": 5457000 + }, + { + "epoch": 36.93089540926808, + "grad_norm": 0.358591228723526, + "learning_rate": 4.630691045907319e-05, + "loss": 0.359, + "step": 5457500 + }, + { + "epoch": 36.93427890861845, + "grad_norm": 0.37737929821014404, + "learning_rate": 4.630657210913816e-05, + "loss": 0.3594, + "step": 5458000 + }, + { + "epoch": 36.937662407968816, + "grad_norm": 0.3692334294319153, + "learning_rate": 4.630623375920312e-05, + "loss": 0.3587, + "step": 5458500 + }, + { + "epoch": 36.941045907319186, + "grad_norm": 0.3860073685646057, + "learning_rate": 4.6305895409268085e-05, + "loss": 0.3592, + "step": 5459000 + }, + { + "epoch": 36.944429406669556, + "grad_norm": 0.3795725405216217, + "learning_rate": 4.630555705933305e-05, + "loss": 0.3586, + "step": 5459500 + }, + { + "epoch": 36.94781290601992, + "grad_norm": 0.3706900477409363, + "learning_rate": 4.6305218709398016e-05, + "loss": 0.3596, + "step": 5460000 + }, + { + "epoch": 36.95119640537029, + "grad_norm": 0.40297240018844604, + "learning_rate": 4.630488035946297e-05, + "loss": 0.3581, + "step": 5460500 + }, + { + "epoch": 36.95457990472066, + "grad_norm": 0.3621428608894348, + "learning_rate": 4.630454200952793e-05, + "loss": 0.3606, + "step": 5461000 + }, + { + "epoch": 36.95796340407102, + "grad_norm": 0.42571696639060974, + "learning_rate": 4.6304203659592895e-05, + "loss": 0.3589, + "step": 5461500 + }, + { + "epoch": 36.96134690342139, + "grad_norm": 0.3712451457977295, + "learning_rate": 4.6303865309657864e-05, + "loss": 0.3579, + "step": 5462000 + }, + { + "epoch": 36.964730402771764, + "grad_norm": 0.3637605309486389, + "learning_rate": 4.6303526959722826e-05, + "loss": 0.358, + "step": 5462500 + }, + { + "epoch": 36.968113902122134, + "grad_norm": 0.40731701254844666, + "learning_rate": 4.630318860978779e-05, + "loss": 0.359, + "step": 5463000 + }, + { + "epoch": 36.9714974014725, + "grad_norm": 0.3971612751483917, + "learning_rate": 4.630285025985275e-05, + "loss": 0.3582, + "step": 5463500 + }, + { + "epoch": 36.97488090082287, + "grad_norm": 0.3789381682872772, + "learning_rate": 4.630251190991772e-05, + "loss": 0.3588, + "step": 5464000 + }, + { + "epoch": 36.97826440017324, + "grad_norm": 0.39527857303619385, + "learning_rate": 4.630217355998268e-05, + "loss": 0.3584, + "step": 5464500 + }, + { + "epoch": 36.9816478995236, + "grad_norm": 0.386687695980072, + "learning_rate": 4.630183521004764e-05, + "loss": 0.3584, + "step": 5465000 + }, + { + "epoch": 36.98503139887397, + "grad_norm": 0.35649698972702026, + "learning_rate": 4.6301496860112606e-05, + "loss": 0.3588, + "step": 5465500 + }, + { + "epoch": 36.98841489822434, + "grad_norm": 0.3558090329170227, + "learning_rate": 4.630115851017757e-05, + "loss": 0.3588, + "step": 5466000 + }, + { + "epoch": 36.991798397574705, + "grad_norm": 0.3773846924304962, + "learning_rate": 4.630082016024253e-05, + "loss": 0.3578, + "step": 5466500 + }, + { + "epoch": 36.995181896925075, + "grad_norm": 0.4010309875011444, + "learning_rate": 4.630048181030749e-05, + "loss": 0.3609, + "step": 5467000 + }, + { + "epoch": 36.998565396275445, + "grad_norm": 0.39403846859931946, + "learning_rate": 4.630014346037246e-05, + "loss": 0.3594, + "step": 5467500 + }, + { + "epoch": 37.0, + "eval_accuracy": 0.8629837217567206, + "eval_loss": 0.5562915802001953, + "eval_runtime": 3362.5317, + "eval_samples_per_second": 86.466, + "eval_steps_per_second": 5.404, + "step": 5467712 + }, + { + "epoch": 37.00194889562581, + "grad_norm": 0.3730051815509796, + "learning_rate": 4.629980511043742e-05, + "loss": 0.3589, + "step": 5468000 + }, + { + "epoch": 37.00533239497618, + "grad_norm": 0.38916271924972534, + "learning_rate": 4.6299466760502385e-05, + "loss": 0.3568, + "step": 5468500 + }, + { + "epoch": 37.00871589432655, + "grad_norm": 0.40832391381263733, + "learning_rate": 4.629912841056735e-05, + "loss": 0.3564, + "step": 5469000 + }, + { + "epoch": 37.01209939367692, + "grad_norm": 0.37710896134376526, + "learning_rate": 4.6298790060632316e-05, + "loss": 0.3557, + "step": 5469500 + }, + { + "epoch": 37.01548289302728, + "grad_norm": 0.3445221185684204, + "learning_rate": 4.629845171069727e-05, + "loss": 0.3573, + "step": 5470000 + }, + { + "epoch": 37.01886639237765, + "grad_norm": 0.37882137298583984, + "learning_rate": 4.6298113360762234e-05, + "loss": 0.3558, + "step": 5470500 + }, + { + "epoch": 37.02224989172802, + "grad_norm": 0.3661237061023712, + "learning_rate": 4.6297775010827196e-05, + "loss": 0.357, + "step": 5471000 + }, + { + "epoch": 37.02563339107839, + "grad_norm": 0.36156991124153137, + "learning_rate": 4.6297436660892165e-05, + "loss": 0.3575, + "step": 5471500 + }, + { + "epoch": 37.02901689042876, + "grad_norm": 0.4158290922641754, + "learning_rate": 4.629709831095713e-05, + "loss": 0.3583, + "step": 5472000 + }, + { + "epoch": 37.03240038977913, + "grad_norm": 0.35936787724494934, + "learning_rate": 4.629675996102209e-05, + "loss": 0.3563, + "step": 5472500 + }, + { + "epoch": 37.03578388912949, + "grad_norm": 0.3601103723049164, + "learning_rate": 4.629642161108705e-05, + "loss": 0.3584, + "step": 5473000 + }, + { + "epoch": 37.03916738847986, + "grad_norm": 0.39076584577560425, + "learning_rate": 4.629608326115202e-05, + "loss": 0.3581, + "step": 5473500 + }, + { + "epoch": 37.04255088783023, + "grad_norm": 0.38857847452163696, + "learning_rate": 4.629574491121698e-05, + "loss": 0.357, + "step": 5474000 + }, + { + "epoch": 37.045934387180594, + "grad_norm": 0.42331647872924805, + "learning_rate": 4.629540656128194e-05, + "loss": 0.3558, + "step": 5474500 + }, + { + "epoch": 37.049317886530964, + "grad_norm": 0.3687676191329956, + "learning_rate": 4.6295068211346907e-05, + "loss": 0.3574, + "step": 5475000 + }, + { + "epoch": 37.052701385881335, + "grad_norm": 0.41535496711730957, + "learning_rate": 4.629472986141187e-05, + "loss": 0.3568, + "step": 5475500 + }, + { + "epoch": 37.056084885231705, + "grad_norm": 0.43465059995651245, + "learning_rate": 4.629439151147683e-05, + "loss": 0.357, + "step": 5476000 + }, + { + "epoch": 37.05946838458207, + "grad_norm": 0.4017110764980316, + "learning_rate": 4.629405316154179e-05, + "loss": 0.3567, + "step": 5476500 + }, + { + "epoch": 37.06285188393244, + "grad_norm": 0.3725665211677551, + "learning_rate": 4.629371481160676e-05, + "loss": 0.357, + "step": 5477000 + }, + { + "epoch": 37.06623538328281, + "grad_norm": 0.408597469329834, + "learning_rate": 4.6293376461671724e-05, + "loss": 0.3583, + "step": 5477500 + }, + { + "epoch": 37.06961888263317, + "grad_norm": 0.3889630138874054, + "learning_rate": 4.6293038111736686e-05, + "loss": 0.3574, + "step": 5478000 + }, + { + "epoch": 37.07300238198354, + "grad_norm": 0.4078400135040283, + "learning_rate": 4.629269976180165e-05, + "loss": 0.3574, + "step": 5478500 + }, + { + "epoch": 37.07638588133391, + "grad_norm": 0.38494017720222473, + "learning_rate": 4.629236141186662e-05, + "loss": 0.3595, + "step": 5479000 + }, + { + "epoch": 37.079769380684276, + "grad_norm": 0.38983720541000366, + "learning_rate": 4.629202306193157e-05, + "loss": 0.357, + "step": 5479500 + }, + { + "epoch": 37.083152880034646, + "grad_norm": 0.3703463077545166, + "learning_rate": 4.6291684711996535e-05, + "loss": 0.3566, + "step": 5480000 + }, + { + "epoch": 37.08653637938502, + "grad_norm": 0.42000865936279297, + "learning_rate": 4.62913463620615e-05, + "loss": 0.3565, + "step": 5480500 + }, + { + "epoch": 37.08991987873539, + "grad_norm": 0.3849279284477234, + "learning_rate": 4.6291008012126466e-05, + "loss": 0.357, + "step": 5481000 + }, + { + "epoch": 37.09330337808575, + "grad_norm": 0.37425053119659424, + "learning_rate": 4.629066966219143e-05, + "loss": 0.3573, + "step": 5481500 + }, + { + "epoch": 37.09668687743612, + "grad_norm": 0.4025816023349762, + "learning_rate": 4.629033131225639e-05, + "loss": 0.3576, + "step": 5482000 + }, + { + "epoch": 37.10007037678649, + "grad_norm": 0.3557578921318054, + "learning_rate": 4.628999296232135e-05, + "loss": 0.3579, + "step": 5482500 + }, + { + "epoch": 37.103453876136854, + "grad_norm": 0.3866497278213501, + "learning_rate": 4.628965461238632e-05, + "loss": 0.3591, + "step": 5483000 + }, + { + "epoch": 37.106837375487224, + "grad_norm": 0.3914271295070648, + "learning_rate": 4.628931626245128e-05, + "loss": 0.3577, + "step": 5483500 + }, + { + "epoch": 37.110220874837594, + "grad_norm": 0.3865836560726166, + "learning_rate": 4.628897791251624e-05, + "loss": 0.3564, + "step": 5484000 + }, + { + "epoch": 37.11360437418796, + "grad_norm": 0.4411778450012207, + "learning_rate": 4.628863956258121e-05, + "loss": 0.3573, + "step": 5484500 + }, + { + "epoch": 37.11698787353833, + "grad_norm": 0.40927067399024963, + "learning_rate": 4.628830121264617e-05, + "loss": 0.3588, + "step": 5485000 + }, + { + "epoch": 37.1203713728887, + "grad_norm": 0.39079582691192627, + "learning_rate": 4.628796286271113e-05, + "loss": 0.3572, + "step": 5485500 + }, + { + "epoch": 37.12375487223906, + "grad_norm": 0.4265403747558594, + "learning_rate": 4.6287624512776094e-05, + "loss": 0.3567, + "step": 5486000 + }, + { + "epoch": 37.12713837158943, + "grad_norm": 0.34536078572273254, + "learning_rate": 4.628728616284106e-05, + "loss": 0.3578, + "step": 5486500 + }, + { + "epoch": 37.1305218709398, + "grad_norm": 0.4132966697216034, + "learning_rate": 4.6286947812906025e-05, + "loss": 0.3572, + "step": 5487000 + }, + { + "epoch": 37.13390537029017, + "grad_norm": 0.3569653630256653, + "learning_rate": 4.628660946297099e-05, + "loss": 0.3575, + "step": 5487500 + }, + { + "epoch": 37.137288869640535, + "grad_norm": 0.3779239058494568, + "learning_rate": 4.628627111303595e-05, + "loss": 0.3574, + "step": 5488000 + }, + { + "epoch": 37.140672368990906, + "grad_norm": 0.3855503797531128, + "learning_rate": 4.628593276310092e-05, + "loss": 0.3577, + "step": 5488500 + }, + { + "epoch": 37.144055868341276, + "grad_norm": 0.3920680284500122, + "learning_rate": 4.628559441316587e-05, + "loss": 0.3575, + "step": 5489000 + }, + { + "epoch": 37.14743936769164, + "grad_norm": 0.40385711193084717, + "learning_rate": 4.6285256063230835e-05, + "loss": 0.358, + "step": 5489500 + }, + { + "epoch": 37.15082286704201, + "grad_norm": 0.3753313720226288, + "learning_rate": 4.62849177132958e-05, + "loss": 0.3591, + "step": 5490000 + }, + { + "epoch": 37.15420636639238, + "grad_norm": 0.38031160831451416, + "learning_rate": 4.6284579363360766e-05, + "loss": 0.3584, + "step": 5490500 + }, + { + "epoch": 37.15758986574274, + "grad_norm": 0.35980871319770813, + "learning_rate": 4.628424101342573e-05, + "loss": 0.3582, + "step": 5491000 + }, + { + "epoch": 37.16097336509311, + "grad_norm": 0.3997298777103424, + "learning_rate": 4.628390266349069e-05, + "loss": 0.3572, + "step": 5491500 + }, + { + "epoch": 37.164356864443484, + "grad_norm": 0.4069244861602783, + "learning_rate": 4.628356431355565e-05, + "loss": 0.3574, + "step": 5492000 + }, + { + "epoch": 37.16774036379385, + "grad_norm": 0.37065088748931885, + "learning_rate": 4.628322596362062e-05, + "loss": 0.3573, + "step": 5492500 + }, + { + "epoch": 37.17112386314422, + "grad_norm": 0.40082502365112305, + "learning_rate": 4.6282887613685584e-05, + "loss": 0.3579, + "step": 5493000 + }, + { + "epoch": 37.17450736249459, + "grad_norm": 0.3844318687915802, + "learning_rate": 4.628254926375054e-05, + "loss": 0.3588, + "step": 5493500 + }, + { + "epoch": 37.17789086184496, + "grad_norm": 0.3889318108558655, + "learning_rate": 4.628221091381551e-05, + "loss": 0.36, + "step": 5494000 + }, + { + "epoch": 37.18127436119532, + "grad_norm": 0.34067457914352417, + "learning_rate": 4.628187256388047e-05, + "loss": 0.3578, + "step": 5494500 + }, + { + "epoch": 37.18465786054569, + "grad_norm": 0.3923149108886719, + "learning_rate": 4.628153421394543e-05, + "loss": 0.3589, + "step": 5495000 + }, + { + "epoch": 37.18804135989606, + "grad_norm": 0.420271098613739, + "learning_rate": 4.6281195864010394e-05, + "loss": 0.3592, + "step": 5495500 + }, + { + "epoch": 37.191424859246425, + "grad_norm": 0.42455965280532837, + "learning_rate": 4.628085751407536e-05, + "loss": 0.3601, + "step": 5496000 + }, + { + "epoch": 37.194808358596795, + "grad_norm": 0.4452683925628662, + "learning_rate": 4.6280519164140325e-05, + "loss": 0.3582, + "step": 5496500 + }, + { + "epoch": 37.198191857947165, + "grad_norm": 0.41060131788253784, + "learning_rate": 4.628018081420529e-05, + "loss": 0.3581, + "step": 5497000 + }, + { + "epoch": 37.20157535729753, + "grad_norm": 0.396625280380249, + "learning_rate": 4.627984246427025e-05, + "loss": 0.3594, + "step": 5497500 + }, + { + "epoch": 37.2049588566479, + "grad_norm": 0.4013853073120117, + "learning_rate": 4.627950411433521e-05, + "loss": 0.3585, + "step": 5498000 + }, + { + "epoch": 37.20834235599827, + "grad_norm": 0.3770159184932709, + "learning_rate": 4.6279165764400174e-05, + "loss": 0.3554, + "step": 5498500 + }, + { + "epoch": 37.21172585534863, + "grad_norm": 0.38657766580581665, + "learning_rate": 4.6278827414465136e-05, + "loss": 0.3588, + "step": 5499000 + }, + { + "epoch": 37.215109354699, + "grad_norm": 0.3623330593109131, + "learning_rate": 4.62784890645301e-05, + "loss": 0.3573, + "step": 5499500 + }, + { + "epoch": 37.21849285404937, + "grad_norm": 0.35294994711875916, + "learning_rate": 4.627815071459507e-05, + "loss": 0.3589, + "step": 5500000 + }, + { + "epoch": 37.22187635339974, + "grad_norm": 0.408532977104187, + "learning_rate": 4.627781236466003e-05, + "loss": 0.3588, + "step": 5500500 + }, + { + "epoch": 37.225259852750106, + "grad_norm": 0.38643956184387207, + "learning_rate": 4.627747401472499e-05, + "loss": 0.3575, + "step": 5501000 + }, + { + "epoch": 37.22864335210048, + "grad_norm": 0.4474964737892151, + "learning_rate": 4.627713566478995e-05, + "loss": 0.3568, + "step": 5501500 + }, + { + "epoch": 37.23202685145085, + "grad_norm": 0.3891274631023407, + "learning_rate": 4.627679731485492e-05, + "loss": 0.3588, + "step": 5502000 + }, + { + "epoch": 37.23541035080121, + "grad_norm": 0.3855583369731903, + "learning_rate": 4.6276458964919884e-05, + "loss": 0.3585, + "step": 5502500 + }, + { + "epoch": 37.23879385015158, + "grad_norm": 0.3802156448364258, + "learning_rate": 4.627612061498484e-05, + "loss": 0.3568, + "step": 5503000 + }, + { + "epoch": 37.24217734950195, + "grad_norm": 0.3620854616165161, + "learning_rate": 4.627578226504981e-05, + "loss": 0.3593, + "step": 5503500 + }, + { + "epoch": 37.245560848852314, + "grad_norm": 0.36845290660858154, + "learning_rate": 4.627544391511477e-05, + "loss": 0.3579, + "step": 5504000 + }, + { + "epoch": 37.248944348202684, + "grad_norm": 0.417395681142807, + "learning_rate": 4.627510556517973e-05, + "loss": 0.3588, + "step": 5504500 + }, + { + "epoch": 37.252327847553055, + "grad_norm": 0.35247254371643066, + "learning_rate": 4.6274767215244695e-05, + "loss": 0.3569, + "step": 5505000 + }, + { + "epoch": 37.255711346903425, + "grad_norm": 0.3863687813282013, + "learning_rate": 4.627442886530966e-05, + "loss": 0.3572, + "step": 5505500 + }, + { + "epoch": 37.25909484625379, + "grad_norm": 0.4075227379798889, + "learning_rate": 4.6274090515374626e-05, + "loss": 0.3567, + "step": 5506000 + }, + { + "epoch": 37.26247834560416, + "grad_norm": 0.3664039075374603, + "learning_rate": 4.627375216543959e-05, + "loss": 0.3578, + "step": 5506500 + }, + { + "epoch": 37.26586184495453, + "grad_norm": 0.4030807316303253, + "learning_rate": 4.627341381550455e-05, + "loss": 0.3579, + "step": 5507000 + }, + { + "epoch": 37.26924534430489, + "grad_norm": 0.37312668561935425, + "learning_rate": 4.627307546556951e-05, + "loss": 0.3569, + "step": 5507500 + }, + { + "epoch": 37.27262884365526, + "grad_norm": 0.4228224456310272, + "learning_rate": 4.6272737115634474e-05, + "loss": 0.3574, + "step": 5508000 + }, + { + "epoch": 37.27601234300563, + "grad_norm": 0.36767372488975525, + "learning_rate": 4.6272398765699436e-05, + "loss": 0.3588, + "step": 5508500 + }, + { + "epoch": 37.279395842355996, + "grad_norm": 0.399533212184906, + "learning_rate": 4.62720604157644e-05, + "loss": 0.3591, + "step": 5509000 + }, + { + "epoch": 37.282779341706366, + "grad_norm": 0.37349948287010193, + "learning_rate": 4.627172206582937e-05, + "loss": 0.3564, + "step": 5509500 + }, + { + "epoch": 37.286162841056736, + "grad_norm": 0.3685149550437927, + "learning_rate": 4.627138371589433e-05, + "loss": 0.3582, + "step": 5510000 + }, + { + "epoch": 37.2895463404071, + "grad_norm": 0.3562529683113098, + "learning_rate": 4.627104536595929e-05, + "loss": 0.3584, + "step": 5510500 + }, + { + "epoch": 37.29292983975747, + "grad_norm": 0.39345937967300415, + "learning_rate": 4.6270707016024254e-05, + "loss": 0.3565, + "step": 5511000 + }, + { + "epoch": 37.29631333910784, + "grad_norm": 0.3297092020511627, + "learning_rate": 4.627036866608922e-05, + "loss": 0.3571, + "step": 5511500 + }, + { + "epoch": 37.29969683845821, + "grad_norm": 0.3960650563240051, + "learning_rate": 4.6270030316154185e-05, + "loss": 0.3594, + "step": 5512000 + }, + { + "epoch": 37.303080337808574, + "grad_norm": 0.3693292438983917, + "learning_rate": 4.626969196621914e-05, + "loss": 0.359, + "step": 5512500 + }, + { + "epoch": 37.306463837158944, + "grad_norm": 0.38844481110572815, + "learning_rate": 4.626935361628411e-05, + "loss": 0.3588, + "step": 5513000 + }, + { + "epoch": 37.309847336509314, + "grad_norm": 0.3629488945007324, + "learning_rate": 4.626901526634907e-05, + "loss": 0.3596, + "step": 5513500 + }, + { + "epoch": 37.31323083585968, + "grad_norm": 0.3812672197818756, + "learning_rate": 4.626867691641403e-05, + "loss": 0.3587, + "step": 5514000 + }, + { + "epoch": 37.31661433521005, + "grad_norm": 0.393673837184906, + "learning_rate": 4.6268338566478995e-05, + "loss": 0.3588, + "step": 5514500 + }, + { + "epoch": 37.31999783456042, + "grad_norm": 0.39718562364578247, + "learning_rate": 4.626800021654396e-05, + "loss": 0.3595, + "step": 5515000 + }, + { + "epoch": 37.32338133391078, + "grad_norm": 0.3568868637084961, + "learning_rate": 4.6267661866608926e-05, + "loss": 0.3595, + "step": 5515500 + }, + { + "epoch": 37.32676483326115, + "grad_norm": 0.3998781740665436, + "learning_rate": 4.626732351667389e-05, + "loss": 0.3572, + "step": 5516000 + }, + { + "epoch": 37.33014833261152, + "grad_norm": 0.3915063142776489, + "learning_rate": 4.626698516673885e-05, + "loss": 0.3596, + "step": 5516500 + }, + { + "epoch": 37.333531831961885, + "grad_norm": 0.39350441098213196, + "learning_rate": 4.626664681680381e-05, + "loss": 0.3588, + "step": 5517000 + }, + { + "epoch": 37.336915331312255, + "grad_norm": 0.3588891625404358, + "learning_rate": 4.6266308466868775e-05, + "loss": 0.3593, + "step": 5517500 + }, + { + "epoch": 37.340298830662626, + "grad_norm": 0.44149383902549744, + "learning_rate": 4.626597011693374e-05, + "loss": 0.3585, + "step": 5518000 + }, + { + "epoch": 37.343682330012996, + "grad_norm": 0.3888978958129883, + "learning_rate": 4.62656317669987e-05, + "loss": 0.3582, + "step": 5518500 + }, + { + "epoch": 37.34706582936336, + "grad_norm": 0.35463154315948486, + "learning_rate": 4.626529341706367e-05, + "loss": 0.3573, + "step": 5519000 + }, + { + "epoch": 37.35044932871373, + "grad_norm": 0.4043683409690857, + "learning_rate": 4.626495506712863e-05, + "loss": 0.3585, + "step": 5519500 + }, + { + "epoch": 37.3538328280641, + "grad_norm": 0.3934733271598816, + "learning_rate": 4.626461671719359e-05, + "loss": 0.359, + "step": 5520000 + }, + { + "epoch": 37.35721632741446, + "grad_norm": 0.41046079993247986, + "learning_rate": 4.6264278367258554e-05, + "loss": 0.3547, + "step": 5520500 + }, + { + "epoch": 37.36059982676483, + "grad_norm": 0.3889351487159729, + "learning_rate": 4.626394001732352e-05, + "loss": 0.3581, + "step": 5521000 + }, + { + "epoch": 37.3639833261152, + "grad_norm": 0.38513049483299255, + "learning_rate": 4.6263601667388485e-05, + "loss": 0.3601, + "step": 5521500 + }, + { + "epoch": 37.36736682546557, + "grad_norm": 0.37006130814552307, + "learning_rate": 4.626326331745345e-05, + "loss": 0.3587, + "step": 5522000 + }, + { + "epoch": 37.37075032481594, + "grad_norm": 0.4113992154598236, + "learning_rate": 4.62629249675184e-05, + "loss": 0.3586, + "step": 5522500 + }, + { + "epoch": 37.37413382416631, + "grad_norm": 0.39319366216659546, + "learning_rate": 4.626258661758337e-05, + "loss": 0.3593, + "step": 5523000 + }, + { + "epoch": 37.37751732351667, + "grad_norm": 0.38613831996917725, + "learning_rate": 4.6262248267648334e-05, + "loss": 0.358, + "step": 5523500 + }, + { + "epoch": 37.38090082286704, + "grad_norm": 0.38404855132102966, + "learning_rate": 4.6261909917713296e-05, + "loss": 0.3591, + "step": 5524000 + }, + { + "epoch": 37.38428432221741, + "grad_norm": 0.34263017773628235, + "learning_rate": 4.626157156777826e-05, + "loss": 0.3576, + "step": 5524500 + }, + { + "epoch": 37.38766782156778, + "grad_norm": 0.3522478938102722, + "learning_rate": 4.626123321784323e-05, + "loss": 0.3589, + "step": 5525000 + }, + { + "epoch": 37.391051320918145, + "grad_norm": 0.3875865936279297, + "learning_rate": 4.626089486790819e-05, + "loss": 0.3578, + "step": 5525500 + }, + { + "epoch": 37.394434820268515, + "grad_norm": 0.38345375657081604, + "learning_rate": 4.626055651797315e-05, + "loss": 0.3596, + "step": 5526000 + }, + { + "epoch": 37.397818319618885, + "grad_norm": 0.3821273744106293, + "learning_rate": 4.6260218168038113e-05, + "loss": 0.3584, + "step": 5526500 + }, + { + "epoch": 37.40120181896925, + "grad_norm": 0.35929813981056213, + "learning_rate": 4.6259879818103076e-05, + "loss": 0.359, + "step": 5527000 + }, + { + "epoch": 37.40458531831962, + "grad_norm": 0.4246380627155304, + "learning_rate": 4.625954146816804e-05, + "loss": 0.3601, + "step": 5527500 + }, + { + "epoch": 37.40796881766999, + "grad_norm": 0.36928990483283997, + "learning_rate": 4.6259203118233e-05, + "loss": 0.3584, + "step": 5528000 + }, + { + "epoch": 37.41135231702035, + "grad_norm": 0.3711598515510559, + "learning_rate": 4.625886476829797e-05, + "loss": 0.3566, + "step": 5528500 + }, + { + "epoch": 37.41473581637072, + "grad_norm": 0.36917001008987427, + "learning_rate": 4.625852641836293e-05, + "loss": 0.3585, + "step": 5529000 + }, + { + "epoch": 37.41811931572109, + "grad_norm": 0.4055008590221405, + "learning_rate": 4.625818806842789e-05, + "loss": 0.3576, + "step": 5529500 + }, + { + "epoch": 37.42150281507146, + "grad_norm": 0.3711182773113251, + "learning_rate": 4.6257849718492855e-05, + "loss": 0.3583, + "step": 5530000 + }, + { + "epoch": 37.424886314421826, + "grad_norm": 0.354253888130188, + "learning_rate": 4.6257511368557824e-05, + "loss": 0.3593, + "step": 5530500 + }, + { + "epoch": 37.4282698137722, + "grad_norm": 0.42840898036956787, + "learning_rate": 4.6257173018622786e-05, + "loss": 0.3596, + "step": 5531000 + }, + { + "epoch": 37.43165331312257, + "grad_norm": 0.3781728744506836, + "learning_rate": 4.625683466868775e-05, + "loss": 0.3582, + "step": 5531500 + }, + { + "epoch": 37.43503681247293, + "grad_norm": 0.3387506604194641, + "learning_rate": 4.6256496318752704e-05, + "loss": 0.3588, + "step": 5532000 + }, + { + "epoch": 37.4384203118233, + "grad_norm": 0.41057971119880676, + "learning_rate": 4.625615796881767e-05, + "loss": 0.3578, + "step": 5532500 + }, + { + "epoch": 37.44180381117367, + "grad_norm": 0.42533671855926514, + "learning_rate": 4.6255819618882635e-05, + "loss": 0.3574, + "step": 5533000 + }, + { + "epoch": 37.445187310524034, + "grad_norm": 0.40534380078315735, + "learning_rate": 4.62554812689476e-05, + "loss": 0.3592, + "step": 5533500 + }, + { + "epoch": 37.448570809874404, + "grad_norm": 0.383487343788147, + "learning_rate": 4.625514291901256e-05, + "loss": 0.3575, + "step": 5534000 + }, + { + "epoch": 37.451954309224774, + "grad_norm": 0.3784342110157013, + "learning_rate": 4.625480456907753e-05, + "loss": 0.3605, + "step": 5534500 + }, + { + "epoch": 37.45533780857514, + "grad_norm": 0.43512246012687683, + "learning_rate": 4.625446621914249e-05, + "loss": 0.3586, + "step": 5535000 + }, + { + "epoch": 37.45872130792551, + "grad_norm": 0.3840678036212921, + "learning_rate": 4.625412786920745e-05, + "loss": 0.3586, + "step": 5535500 + }, + { + "epoch": 37.46210480727588, + "grad_norm": 0.3802643120288849, + "learning_rate": 4.6253789519272414e-05, + "loss": 0.3581, + "step": 5536000 + }, + { + "epoch": 37.46548830662625, + "grad_norm": 0.3913117051124573, + "learning_rate": 4.6253451169337376e-05, + "loss": 0.3592, + "step": 5536500 + }, + { + "epoch": 37.46887180597661, + "grad_norm": 0.3788554072380066, + "learning_rate": 4.625311281940234e-05, + "loss": 0.3595, + "step": 5537000 + }, + { + "epoch": 37.47225530532698, + "grad_norm": 0.36486807465553284, + "learning_rate": 4.62527744694673e-05, + "loss": 0.3591, + "step": 5537500 + }, + { + "epoch": 37.47563880467735, + "grad_norm": 0.40812861919403076, + "learning_rate": 4.625243611953227e-05, + "loss": 0.3581, + "step": 5538000 + }, + { + "epoch": 37.479022304027716, + "grad_norm": 0.4144463837146759, + "learning_rate": 4.625209776959723e-05, + "loss": 0.3577, + "step": 5538500 + }, + { + "epoch": 37.482405803378086, + "grad_norm": 0.3697092831134796, + "learning_rate": 4.6251759419662194e-05, + "loss": 0.3579, + "step": 5539000 + }, + { + "epoch": 37.485789302728456, + "grad_norm": 0.35993775725364685, + "learning_rate": 4.6251421069727156e-05, + "loss": 0.3603, + "step": 5539500 + }, + { + "epoch": 37.48917280207882, + "grad_norm": 0.4362202286720276, + "learning_rate": 4.6251082719792125e-05, + "loss": 0.3577, + "step": 5540000 + }, + { + "epoch": 37.49255630142919, + "grad_norm": 0.3813544511795044, + "learning_rate": 4.625074436985709e-05, + "loss": 0.3581, + "step": 5540500 + }, + { + "epoch": 37.49593980077956, + "grad_norm": 0.42652133107185364, + "learning_rate": 4.625040601992205e-05, + "loss": 0.3588, + "step": 5541000 + }, + { + "epoch": 37.49932330012992, + "grad_norm": 0.38198423385620117, + "learning_rate": 4.6250067669987004e-05, + "loss": 0.3574, + "step": 5541500 + }, + { + "epoch": 37.50270679948029, + "grad_norm": 0.36867615580558777, + "learning_rate": 4.624972932005197e-05, + "loss": 0.3584, + "step": 5542000 + }, + { + "epoch": 37.506090298830664, + "grad_norm": 0.39017850160598755, + "learning_rate": 4.6249390970116935e-05, + "loss": 0.3577, + "step": 5542500 + }, + { + "epoch": 37.509473798181034, + "grad_norm": 0.39716485142707825, + "learning_rate": 4.62490526201819e-05, + "loss": 0.3582, + "step": 5543000 + }, + { + "epoch": 37.5128572975314, + "grad_norm": 0.3819413185119629, + "learning_rate": 4.624871427024686e-05, + "loss": 0.3577, + "step": 5543500 + }, + { + "epoch": 37.51624079688177, + "grad_norm": 0.38019052147865295, + "learning_rate": 4.624837592031183e-05, + "loss": 0.3591, + "step": 5544000 + }, + { + "epoch": 37.51962429623214, + "grad_norm": 0.4178309440612793, + "learning_rate": 4.624803757037679e-05, + "loss": 0.3591, + "step": 5544500 + }, + { + "epoch": 37.5230077955825, + "grad_norm": 0.40726998448371887, + "learning_rate": 4.624769922044175e-05, + "loss": 0.3598, + "step": 5545000 + }, + { + "epoch": 37.52639129493287, + "grad_norm": 0.3306938707828522, + "learning_rate": 4.6247360870506715e-05, + "loss": 0.3591, + "step": 5545500 + }, + { + "epoch": 37.52977479428324, + "grad_norm": 0.412689208984375, + "learning_rate": 4.624702252057168e-05, + "loss": 0.3584, + "step": 5546000 + }, + { + "epoch": 37.533158293633605, + "grad_norm": 0.38152435421943665, + "learning_rate": 4.624668417063664e-05, + "loss": 0.3586, + "step": 5546500 + }, + { + "epoch": 37.536541792983975, + "grad_norm": 0.3947616219520569, + "learning_rate": 4.62463458207016e-05, + "loss": 0.3595, + "step": 5547000 + }, + { + "epoch": 37.539925292334345, + "grad_norm": 0.3962426483631134, + "learning_rate": 4.624600747076657e-05, + "loss": 0.3577, + "step": 5547500 + }, + { + "epoch": 37.54330879168471, + "grad_norm": 0.3332233130931854, + "learning_rate": 4.624566912083153e-05, + "loss": 0.3577, + "step": 5548000 + }, + { + "epoch": 37.54669229103508, + "grad_norm": 0.37939968705177307, + "learning_rate": 4.6245330770896494e-05, + "loss": 0.3598, + "step": 5548500 + }, + { + "epoch": 37.55007579038545, + "grad_norm": 0.3684322237968445, + "learning_rate": 4.6244992420961456e-05, + "loss": 0.3572, + "step": 5549000 + }, + { + "epoch": 37.55345928973582, + "grad_norm": 0.38885024189949036, + "learning_rate": 4.6244654071026425e-05, + "loss": 0.3599, + "step": 5549500 + }, + { + "epoch": 37.55684278908618, + "grad_norm": 0.38561055064201355, + "learning_rate": 4.624431572109139e-05, + "loss": 0.3592, + "step": 5550000 + }, + { + "epoch": 37.56022628843655, + "grad_norm": 0.3819288909435272, + "learning_rate": 4.624397737115635e-05, + "loss": 0.356, + "step": 5550500 + }, + { + "epoch": 37.56360978778692, + "grad_norm": 0.4337511658668518, + "learning_rate": 4.6243639021221305e-05, + "loss": 0.3597, + "step": 5551000 + }, + { + "epoch": 37.56699328713729, + "grad_norm": 0.33733001351356506, + "learning_rate": 4.6243300671286274e-05, + "loss": 0.3588, + "step": 5551500 + }, + { + "epoch": 37.57037678648766, + "grad_norm": 0.42798560857772827, + "learning_rate": 4.6242962321351236e-05, + "loss": 0.3576, + "step": 5552000 + }, + { + "epoch": 37.57376028583803, + "grad_norm": 0.3651011288166046, + "learning_rate": 4.62426239714162e-05, + "loss": 0.3592, + "step": 5552500 + }, + { + "epoch": 37.57714378518839, + "grad_norm": 0.36789029836654663, + "learning_rate": 4.624228562148116e-05, + "loss": 0.3594, + "step": 5553000 + }, + { + "epoch": 37.58052728453876, + "grad_norm": 0.36014118790626526, + "learning_rate": 4.624194727154613e-05, + "loss": 0.3586, + "step": 5553500 + }, + { + "epoch": 37.58391078388913, + "grad_norm": 0.4079189598560333, + "learning_rate": 4.624160892161109e-05, + "loss": 0.3582, + "step": 5554000 + }, + { + "epoch": 37.5872942832395, + "grad_norm": 0.3701742887496948, + "learning_rate": 4.624127057167605e-05, + "loss": 0.3583, + "step": 5554500 + }, + { + "epoch": 37.590677782589864, + "grad_norm": 0.39185646176338196, + "learning_rate": 4.6240932221741015e-05, + "loss": 0.3589, + "step": 5555000 + }, + { + "epoch": 37.594061281940235, + "grad_norm": 0.3799331486225128, + "learning_rate": 4.624059387180598e-05, + "loss": 0.3592, + "step": 5555500 + }, + { + "epoch": 37.597444781290605, + "grad_norm": 0.4082535207271576, + "learning_rate": 4.624025552187094e-05, + "loss": 0.3575, + "step": 5556000 + }, + { + "epoch": 37.60082828064097, + "grad_norm": 0.40764573216438293, + "learning_rate": 4.62399171719359e-05, + "loss": 0.3592, + "step": 5556500 + }, + { + "epoch": 37.60421177999134, + "grad_norm": 0.41705572605133057, + "learning_rate": 4.623957882200087e-05, + "loss": 0.3583, + "step": 5557000 + }, + { + "epoch": 37.60759527934171, + "grad_norm": 0.35448992252349854, + "learning_rate": 4.623924047206583e-05, + "loss": 0.358, + "step": 5557500 + }, + { + "epoch": 37.61097877869207, + "grad_norm": 0.4184199869632721, + "learning_rate": 4.6238902122130795e-05, + "loss": 0.3587, + "step": 5558000 + }, + { + "epoch": 37.61436227804244, + "grad_norm": 0.3703767657279968, + "learning_rate": 4.623856377219576e-05, + "loss": 0.3581, + "step": 5558500 + }, + { + "epoch": 37.61774577739281, + "grad_norm": 0.3824697732925415, + "learning_rate": 4.6238225422260726e-05, + "loss": 0.3578, + "step": 5559000 + }, + { + "epoch": 37.621129276743176, + "grad_norm": 0.41411226987838745, + "learning_rate": 4.623788707232569e-05, + "loss": 0.3595, + "step": 5559500 + }, + { + "epoch": 37.624512776093546, + "grad_norm": 0.3943541944026947, + "learning_rate": 4.623754872239065e-05, + "loss": 0.3596, + "step": 5560000 + }, + { + "epoch": 37.627896275443916, + "grad_norm": 0.37810245156288147, + "learning_rate": 4.6237210372455605e-05, + "loss": 0.3583, + "step": 5560500 + }, + { + "epoch": 37.63127977479429, + "grad_norm": 0.40703824162483215, + "learning_rate": 4.6236872022520574e-05, + "loss": 0.359, + "step": 5561000 + }, + { + "epoch": 37.63466327414465, + "grad_norm": 0.3871955871582031, + "learning_rate": 4.6236533672585536e-05, + "loss": 0.3562, + "step": 5561500 + }, + { + "epoch": 37.63804677349502, + "grad_norm": 0.3976595103740692, + "learning_rate": 4.62361953226505e-05, + "loss": 0.3599, + "step": 5562000 + }, + { + "epoch": 37.64143027284539, + "grad_norm": 0.33061501383781433, + "learning_rate": 4.623585697271546e-05, + "loss": 0.3575, + "step": 5562500 + }, + { + "epoch": 37.644813772195754, + "grad_norm": 0.36976704001426697, + "learning_rate": 4.623551862278043e-05, + "loss": 0.3592, + "step": 5563000 + }, + { + "epoch": 37.648197271546124, + "grad_norm": 0.3802747428417206, + "learning_rate": 4.623518027284539e-05, + "loss": 0.3567, + "step": 5563500 + }, + { + "epoch": 37.651580770896494, + "grad_norm": 0.4013938009738922, + "learning_rate": 4.6234841922910354e-05, + "loss": 0.3573, + "step": 5564000 + }, + { + "epoch": 37.65496427024686, + "grad_norm": 0.4191991686820984, + "learning_rate": 4.6234503572975316e-05, + "loss": 0.3586, + "step": 5564500 + }, + { + "epoch": 37.65834776959723, + "grad_norm": 0.3883405327796936, + "learning_rate": 4.623416522304028e-05, + "loss": 0.3596, + "step": 5565000 + }, + { + "epoch": 37.6617312689476, + "grad_norm": 0.41135701537132263, + "learning_rate": 4.623382687310524e-05, + "loss": 0.3597, + "step": 5565500 + }, + { + "epoch": 37.66511476829796, + "grad_norm": 0.41621407866477966, + "learning_rate": 4.62334885231702e-05, + "loss": 0.3583, + "step": 5566000 + }, + { + "epoch": 37.66849826764833, + "grad_norm": 0.3869550824165344, + "learning_rate": 4.623315017323517e-05, + "loss": 0.3587, + "step": 5566500 + }, + { + "epoch": 37.6718817669987, + "grad_norm": 0.39791762828826904, + "learning_rate": 4.623281182330013e-05, + "loss": 0.3586, + "step": 5567000 + }, + { + "epoch": 37.67526526634907, + "grad_norm": 0.3453007638454437, + "learning_rate": 4.6232473473365095e-05, + "loss": 0.3584, + "step": 5567500 + }, + { + "epoch": 37.678648765699435, + "grad_norm": 0.4014468193054199, + "learning_rate": 4.623213512343006e-05, + "loss": 0.3577, + "step": 5568000 + }, + { + "epoch": 37.682032265049806, + "grad_norm": 0.32720428705215454, + "learning_rate": 4.623179677349502e-05, + "loss": 0.3581, + "step": 5568500 + }, + { + "epoch": 37.685415764400176, + "grad_norm": 0.4049336612224579, + "learning_rate": 4.623145842355999e-05, + "loss": 0.3575, + "step": 5569000 + }, + { + "epoch": 37.68879926375054, + "grad_norm": 0.3947456479072571, + "learning_rate": 4.623112007362495e-05, + "loss": 0.3583, + "step": 5569500 + }, + { + "epoch": 37.69218276310091, + "grad_norm": 0.403266966342926, + "learning_rate": 4.6230781723689906e-05, + "loss": 0.3585, + "step": 5570000 + }, + { + "epoch": 37.69556626245128, + "grad_norm": 0.36558789014816284, + "learning_rate": 4.6230443373754875e-05, + "loss": 0.3574, + "step": 5570500 + }, + { + "epoch": 37.69894976180164, + "grad_norm": 0.3818999230861664, + "learning_rate": 4.623010502381984e-05, + "loss": 0.3587, + "step": 5571000 + }, + { + "epoch": 37.70233326115201, + "grad_norm": 0.4195767641067505, + "learning_rate": 4.62297666738848e-05, + "loss": 0.3588, + "step": 5571500 + }, + { + "epoch": 37.70571676050238, + "grad_norm": 0.37867316603660583, + "learning_rate": 4.622942832394976e-05, + "loss": 0.3558, + "step": 5572000 + }, + { + "epoch": 37.70910025985275, + "grad_norm": 0.38056012988090515, + "learning_rate": 4.622908997401473e-05, + "loss": 0.3581, + "step": 5572500 + }, + { + "epoch": 37.71248375920312, + "grad_norm": 0.3594655990600586, + "learning_rate": 4.622875162407969e-05, + "loss": 0.358, + "step": 5573000 + }, + { + "epoch": 37.71586725855349, + "grad_norm": 0.3999258279800415, + "learning_rate": 4.6228413274144655e-05, + "loss": 0.3604, + "step": 5573500 + }, + { + "epoch": 37.71925075790386, + "grad_norm": 0.4410228729248047, + "learning_rate": 4.622807492420962e-05, + "loss": 0.359, + "step": 5574000 + }, + { + "epoch": 37.72263425725422, + "grad_norm": 0.3825768232345581, + "learning_rate": 4.6227736574274586e-05, + "loss": 0.3585, + "step": 5574500 + }, + { + "epoch": 37.72601775660459, + "grad_norm": 0.3880165219306946, + "learning_rate": 4.622739822433954e-05, + "loss": 0.3581, + "step": 5575000 + }, + { + "epoch": 37.72940125595496, + "grad_norm": 0.4235299229621887, + "learning_rate": 4.62270598744045e-05, + "loss": 0.3585, + "step": 5575500 + }, + { + "epoch": 37.732784755305325, + "grad_norm": 0.376895934343338, + "learning_rate": 4.6226721524469465e-05, + "loss": 0.3585, + "step": 5576000 + }, + { + "epoch": 37.736168254655695, + "grad_norm": 0.4097041189670563, + "learning_rate": 4.6226383174534434e-05, + "loss": 0.3581, + "step": 5576500 + }, + { + "epoch": 37.739551754006065, + "grad_norm": 0.4131355285644531, + "learning_rate": 4.6226044824599396e-05, + "loss": 0.3591, + "step": 5577000 + }, + { + "epoch": 37.74293525335643, + "grad_norm": 0.42813947796821594, + "learning_rate": 4.622570647466436e-05, + "loss": 0.3601, + "step": 5577500 + }, + { + "epoch": 37.7463187527068, + "grad_norm": 0.38428157567977905, + "learning_rate": 4.622536812472932e-05, + "loss": 0.3588, + "step": 5578000 + }, + { + "epoch": 37.74970225205717, + "grad_norm": 0.3647453486919403, + "learning_rate": 4.622502977479429e-05, + "loss": 0.3563, + "step": 5578500 + }, + { + "epoch": 37.75308575140754, + "grad_norm": 0.3345412611961365, + "learning_rate": 4.622469142485925e-05, + "loss": 0.3594, + "step": 5579000 + }, + { + "epoch": 37.7564692507579, + "grad_norm": 0.4206676483154297, + "learning_rate": 4.622435307492421e-05, + "loss": 0.3593, + "step": 5579500 + }, + { + "epoch": 37.75985275010827, + "grad_norm": 0.38542622327804565, + "learning_rate": 4.6224014724989176e-05, + "loss": 0.3584, + "step": 5580000 + }, + { + "epoch": 37.76323624945864, + "grad_norm": 0.34646499156951904, + "learning_rate": 4.622367637505414e-05, + "loss": 0.3587, + "step": 5580500 + }, + { + "epoch": 37.766619748809006, + "grad_norm": 0.40047022700309753, + "learning_rate": 4.62233380251191e-05, + "loss": 0.3575, + "step": 5581000 + }, + { + "epoch": 37.77000324815938, + "grad_norm": 0.4088267385959625, + "learning_rate": 4.622299967518406e-05, + "loss": 0.3586, + "step": 5581500 + }, + { + "epoch": 37.77338674750975, + "grad_norm": 0.36530613899230957, + "learning_rate": 4.622266132524903e-05, + "loss": 0.3598, + "step": 5582000 + }, + { + "epoch": 37.77677024686011, + "grad_norm": 0.36611422896385193, + "learning_rate": 4.622232297531399e-05, + "loss": 0.3572, + "step": 5582500 + }, + { + "epoch": 37.78015374621048, + "grad_norm": 0.4519994854927063, + "learning_rate": 4.6221984625378955e-05, + "loss": 0.3588, + "step": 5583000 + }, + { + "epoch": 37.78353724556085, + "grad_norm": 0.37126612663269043, + "learning_rate": 4.622164627544392e-05, + "loss": 0.3577, + "step": 5583500 + }, + { + "epoch": 37.786920744911214, + "grad_norm": 0.44382140040397644, + "learning_rate": 4.6221307925508886e-05, + "loss": 0.3591, + "step": 5584000 + }, + { + "epoch": 37.790304244261584, + "grad_norm": 0.3470163345336914, + "learning_rate": 4.622096957557384e-05, + "loss": 0.3584, + "step": 5584500 + }, + { + "epoch": 37.793687743611954, + "grad_norm": 0.4182018041610718, + "learning_rate": 4.6220631225638804e-05, + "loss": 0.3598, + "step": 5585000 + }, + { + "epoch": 37.797071242962325, + "grad_norm": 0.3888076841831207, + "learning_rate": 4.6220292875703766e-05, + "loss": 0.3588, + "step": 5585500 + }, + { + "epoch": 37.80045474231269, + "grad_norm": 0.36507245898246765, + "learning_rate": 4.6219954525768735e-05, + "loss": 0.3601, + "step": 5586000 + }, + { + "epoch": 37.80383824166306, + "grad_norm": 0.3805876672267914, + "learning_rate": 4.62196161758337e-05, + "loss": 0.3605, + "step": 5586500 + }, + { + "epoch": 37.80722174101343, + "grad_norm": 0.42710334062576294, + "learning_rate": 4.621927782589866e-05, + "loss": 0.3604, + "step": 5587000 + }, + { + "epoch": 37.81060524036379, + "grad_norm": 0.38477030396461487, + "learning_rate": 4.621893947596362e-05, + "loss": 0.3581, + "step": 5587500 + }, + { + "epoch": 37.81398873971416, + "grad_norm": 0.3720863461494446, + "learning_rate": 4.621860112602859e-05, + "loss": 0.3572, + "step": 5588000 + }, + { + "epoch": 37.81737223906453, + "grad_norm": 0.3941510319709778, + "learning_rate": 4.621826277609355e-05, + "loss": 0.3594, + "step": 5588500 + }, + { + "epoch": 37.820755738414896, + "grad_norm": 0.4163818955421448, + "learning_rate": 4.621792442615851e-05, + "loss": 0.3605, + "step": 5589000 + }, + { + "epoch": 37.824139237765266, + "grad_norm": 0.39274489879608154, + "learning_rate": 4.6217586076223476e-05, + "loss": 0.358, + "step": 5589500 + }, + { + "epoch": 37.827522737115636, + "grad_norm": 0.3928956687450409, + "learning_rate": 4.621724772628844e-05, + "loss": 0.3596, + "step": 5590000 + }, + { + "epoch": 37.830906236466, + "grad_norm": 0.3751726448535919, + "learning_rate": 4.62169093763534e-05, + "loss": 0.3608, + "step": 5590500 + }, + { + "epoch": 37.83428973581637, + "grad_norm": 0.35046321153640747, + "learning_rate": 4.621657102641836e-05, + "loss": 0.3591, + "step": 5591000 + }, + { + "epoch": 37.83767323516674, + "grad_norm": 0.401225745677948, + "learning_rate": 4.621623267648333e-05, + "loss": 0.3592, + "step": 5591500 + }, + { + "epoch": 37.84105673451711, + "grad_norm": 0.42670589685440063, + "learning_rate": 4.6215894326548294e-05, + "loss": 0.3587, + "step": 5592000 + }, + { + "epoch": 37.84444023386747, + "grad_norm": 0.3757880628108978, + "learning_rate": 4.6215555976613256e-05, + "loss": 0.3582, + "step": 5592500 + }, + { + "epoch": 37.847823733217844, + "grad_norm": 0.3731699287891388, + "learning_rate": 4.621521762667822e-05, + "loss": 0.3593, + "step": 5593000 + }, + { + "epoch": 37.851207232568214, + "grad_norm": 0.38897013664245605, + "learning_rate": 4.621487927674319e-05, + "loss": 0.3591, + "step": 5593500 + }, + { + "epoch": 37.85459073191858, + "grad_norm": 0.38821354508399963, + "learning_rate": 4.621454092680814e-05, + "loss": 0.3593, + "step": 5594000 + }, + { + "epoch": 37.85797423126895, + "grad_norm": 0.3750055134296417, + "learning_rate": 4.6214202576873104e-05, + "loss": 0.3611, + "step": 5594500 + }, + { + "epoch": 37.86135773061932, + "grad_norm": 0.4040793180465698, + "learning_rate": 4.6213864226938066e-05, + "loss": 0.3577, + "step": 5595000 + }, + { + "epoch": 37.86474122996968, + "grad_norm": 0.3993184566497803, + "learning_rate": 4.6213525877003035e-05, + "loss": 0.3609, + "step": 5595500 + }, + { + "epoch": 37.86812472932005, + "grad_norm": 0.39517033100128174, + "learning_rate": 4.6213187527068e-05, + "loss": 0.359, + "step": 5596000 + }, + { + "epoch": 37.87150822867042, + "grad_norm": 0.38171622157096863, + "learning_rate": 4.621284917713296e-05, + "loss": 0.3583, + "step": 5596500 + }, + { + "epoch": 37.874891728020785, + "grad_norm": 0.39077261090278625, + "learning_rate": 4.621251082719792e-05, + "loss": 0.3592, + "step": 5597000 + }, + { + "epoch": 37.878275227371155, + "grad_norm": 0.3833240270614624, + "learning_rate": 4.621217247726289e-05, + "loss": 0.3582, + "step": 5597500 + }, + { + "epoch": 37.881658726721525, + "grad_norm": 0.37280595302581787, + "learning_rate": 4.621183412732785e-05, + "loss": 0.3586, + "step": 5598000 + }, + { + "epoch": 37.885042226071896, + "grad_norm": 0.3744910657405853, + "learning_rate": 4.621149577739281e-05, + "loss": 0.3582, + "step": 5598500 + }, + { + "epoch": 37.88842572542226, + "grad_norm": 0.38365522027015686, + "learning_rate": 4.621115742745778e-05, + "loss": 0.3587, + "step": 5599000 + }, + { + "epoch": 37.89180922477263, + "grad_norm": 0.3723243772983551, + "learning_rate": 4.621081907752274e-05, + "loss": 0.3587, + "step": 5599500 + }, + { + "epoch": 37.895192724123, + "grad_norm": 0.39694342017173767, + "learning_rate": 4.62104807275877e-05, + "loss": 0.3589, + "step": 5600000 + }, + { + "epoch": 37.89857622347336, + "grad_norm": 0.4003523290157318, + "learning_rate": 4.621014237765266e-05, + "loss": 0.3595, + "step": 5600500 + }, + { + "epoch": 37.90195972282373, + "grad_norm": 0.3916511535644531, + "learning_rate": 4.620980402771763e-05, + "loss": 0.3588, + "step": 5601000 + }, + { + "epoch": 37.9053432221741, + "grad_norm": 0.39069026708602905, + "learning_rate": 4.6209465677782594e-05, + "loss": 0.3605, + "step": 5601500 + }, + { + "epoch": 37.90872672152447, + "grad_norm": 0.3300125300884247, + "learning_rate": 4.6209127327847556e-05, + "loss": 0.3592, + "step": 5602000 + }, + { + "epoch": 37.91211022087484, + "grad_norm": 0.3769150972366333, + "learning_rate": 4.620878897791252e-05, + "loss": 0.358, + "step": 5602500 + }, + { + "epoch": 37.91549372022521, + "grad_norm": 0.406015008687973, + "learning_rate": 4.620845062797749e-05, + "loss": 0.3595, + "step": 5603000 + }, + { + "epoch": 37.91887721957558, + "grad_norm": 0.35252100229263306, + "learning_rate": 4.620811227804244e-05, + "loss": 0.358, + "step": 5603500 + }, + { + "epoch": 37.92226071892594, + "grad_norm": 0.4023178219795227, + "learning_rate": 4.6207773928107405e-05, + "loss": 0.3584, + "step": 5604000 + }, + { + "epoch": 37.92564421827631, + "grad_norm": 0.3610759675502777, + "learning_rate": 4.620743557817237e-05, + "loss": 0.3579, + "step": 5604500 + }, + { + "epoch": 37.92902771762668, + "grad_norm": 0.3983677923679352, + "learning_rate": 4.6207097228237336e-05, + "loss": 0.36, + "step": 5605000 + }, + { + "epoch": 37.932411216977044, + "grad_norm": 0.40776327252388, + "learning_rate": 4.62067588783023e-05, + "loss": 0.3603, + "step": 5605500 + }, + { + "epoch": 37.935794716327415, + "grad_norm": 0.3881959021091461, + "learning_rate": 4.620642052836726e-05, + "loss": 0.359, + "step": 5606000 + }, + { + "epoch": 37.939178215677785, + "grad_norm": 0.5547685623168945, + "learning_rate": 4.620608217843222e-05, + "loss": 0.3591, + "step": 5606500 + }, + { + "epoch": 37.94256171502815, + "grad_norm": 0.36712491512298584, + "learning_rate": 4.620574382849719e-05, + "loss": 0.3586, + "step": 5607000 + }, + { + "epoch": 37.94594521437852, + "grad_norm": 0.403956800699234, + "learning_rate": 4.620540547856215e-05, + "loss": 0.359, + "step": 5607500 + }, + { + "epoch": 37.94932871372889, + "grad_norm": 0.38546839356422424, + "learning_rate": 4.620506712862711e-05, + "loss": 0.3583, + "step": 5608000 + }, + { + "epoch": 37.95271221307925, + "grad_norm": 0.40115487575531006, + "learning_rate": 4.620472877869208e-05, + "loss": 0.3576, + "step": 5608500 + }, + { + "epoch": 37.95609571242962, + "grad_norm": 0.3921334445476532, + "learning_rate": 4.620439042875704e-05, + "loss": 0.3575, + "step": 5609000 + }, + { + "epoch": 37.95947921177999, + "grad_norm": 0.3957189917564392, + "learning_rate": 4.6204052078822e-05, + "loss": 0.3595, + "step": 5609500 + }, + { + "epoch": 37.96286271113036, + "grad_norm": 0.40347981452941895, + "learning_rate": 4.6203713728886964e-05, + "loss": 0.3575, + "step": 5610000 + }, + { + "epoch": 37.966246210480726, + "grad_norm": 0.3771745562553406, + "learning_rate": 4.620337537895193e-05, + "loss": 0.3584, + "step": 5610500 + }, + { + "epoch": 37.969629709831096, + "grad_norm": 0.36563006043434143, + "learning_rate": 4.6203037029016895e-05, + "loss": 0.3587, + "step": 5611000 + }, + { + "epoch": 37.97301320918147, + "grad_norm": 0.38988545536994934, + "learning_rate": 4.620269867908186e-05, + "loss": 0.359, + "step": 5611500 + }, + { + "epoch": 37.97639670853183, + "grad_norm": 0.39158713817596436, + "learning_rate": 4.620236032914682e-05, + "loss": 0.3576, + "step": 5612000 + }, + { + "epoch": 37.9797802078822, + "grad_norm": 0.4202806353569031, + "learning_rate": 4.620202197921179e-05, + "loss": 0.3583, + "step": 5612500 + }, + { + "epoch": 37.98316370723257, + "grad_norm": 0.3770996332168579, + "learning_rate": 4.6201683629276743e-05, + "loss": 0.3593, + "step": 5613000 + }, + { + "epoch": 37.986547206582934, + "grad_norm": 0.3654348850250244, + "learning_rate": 4.6201345279341706e-05, + "loss": 0.3579, + "step": 5613500 + }, + { + "epoch": 37.989930705933304, + "grad_norm": 0.3695790469646454, + "learning_rate": 4.620100692940667e-05, + "loss": 0.3588, + "step": 5614000 + }, + { + "epoch": 37.993314205283674, + "grad_norm": 0.4236592650413513, + "learning_rate": 4.6200668579471637e-05, + "loss": 0.3594, + "step": 5614500 + }, + { + "epoch": 37.99669770463404, + "grad_norm": 0.3716669976711273, + "learning_rate": 4.62003302295366e-05, + "loss": 0.3599, + "step": 5615000 + }, + { + "epoch": 38.0, + "eval_accuracy": 0.8631016885684777, + "eval_loss": 0.555016815662384, + "eval_runtime": 3366.5592, + "eval_samples_per_second": 86.362, + "eval_steps_per_second": 5.398, + "step": 5615488 + }, + { + "epoch": 38.00008120398441, + "grad_norm": 0.3898286521434784, + "learning_rate": 4.619999187960156e-05, + "loss": 0.3583, + "step": 5615500 + }, + { + "epoch": 38.00346470333478, + "grad_norm": 0.40544092655181885, + "learning_rate": 4.619965352966652e-05, + "loss": 0.3573, + "step": 5616000 + }, + { + "epoch": 38.00684820268515, + "grad_norm": 0.4436746835708618, + "learning_rate": 4.619931517973149e-05, + "loss": 0.3564, + "step": 5616500 + }, + { + "epoch": 38.01023170203551, + "grad_norm": 0.36663416028022766, + "learning_rate": 4.6198976829796454e-05, + "loss": 0.3571, + "step": 5617000 + }, + { + "epoch": 38.01361520138588, + "grad_norm": 0.36764028668403625, + "learning_rate": 4.619863847986141e-05, + "loss": 0.3577, + "step": 5617500 + }, + { + "epoch": 38.01699870073625, + "grad_norm": 0.40741071105003357, + "learning_rate": 4.619830012992638e-05, + "loss": 0.3573, + "step": 5618000 + }, + { + "epoch": 38.020382200086615, + "grad_norm": 0.40826502442359924, + "learning_rate": 4.619796177999134e-05, + "loss": 0.3565, + "step": 5618500 + }, + { + "epoch": 38.023765699436986, + "grad_norm": 0.4195602536201477, + "learning_rate": 4.61976234300563e-05, + "loss": 0.3555, + "step": 5619000 + }, + { + "epoch": 38.027149198787356, + "grad_norm": 0.37044769525527954, + "learning_rate": 4.6197285080121265e-05, + "loss": 0.3564, + "step": 5619500 + }, + { + "epoch": 38.03053269813772, + "grad_norm": 0.41229209303855896, + "learning_rate": 4.6196946730186233e-05, + "loss": 0.3561, + "step": 5620000 + }, + { + "epoch": 38.03391619748809, + "grad_norm": 0.38138049840927124, + "learning_rate": 4.6196608380251196e-05, + "loss": 0.3558, + "step": 5620500 + }, + { + "epoch": 38.03729969683846, + "grad_norm": 0.3792511522769928, + "learning_rate": 4.619627003031616e-05, + "loss": 0.3556, + "step": 5621000 + }, + { + "epoch": 38.04068319618882, + "grad_norm": 0.3787059783935547, + "learning_rate": 4.619593168038112e-05, + "loss": 0.3558, + "step": 5621500 + }, + { + "epoch": 38.04406669553919, + "grad_norm": 0.4135395288467407, + "learning_rate": 4.619559333044608e-05, + "loss": 0.356, + "step": 5622000 + }, + { + "epoch": 38.047450194889564, + "grad_norm": 0.36780989170074463, + "learning_rate": 4.6195254980511044e-05, + "loss": 0.3572, + "step": 5622500 + }, + { + "epoch": 38.050833694239934, + "grad_norm": 0.41230207681655884, + "learning_rate": 4.6194916630576006e-05, + "loss": 0.3574, + "step": 5623000 + }, + { + "epoch": 38.0542171935903, + "grad_norm": 0.3845711350440979, + "learning_rate": 4.619457828064097e-05, + "loss": 0.3574, + "step": 5623500 + }, + { + "epoch": 38.05760069294067, + "grad_norm": 0.3832910358905792, + "learning_rate": 4.619423993070594e-05, + "loss": 0.3569, + "step": 5624000 + }, + { + "epoch": 38.06098419229104, + "grad_norm": 0.4200763702392578, + "learning_rate": 4.61939015807709e-05, + "loss": 0.3566, + "step": 5624500 + }, + { + "epoch": 38.0643676916414, + "grad_norm": 0.38283076882362366, + "learning_rate": 4.619356323083586e-05, + "loss": 0.3561, + "step": 5625000 + }, + { + "epoch": 38.06775119099177, + "grad_norm": 0.4039447605609894, + "learning_rate": 4.6193224880900824e-05, + "loss": 0.3575, + "step": 5625500 + }, + { + "epoch": 38.07113469034214, + "grad_norm": 0.38887515664100647, + "learning_rate": 4.619288653096579e-05, + "loss": 0.356, + "step": 5626000 + }, + { + "epoch": 38.074518189692505, + "grad_norm": 0.3814372420310974, + "learning_rate": 4.6192548181030755e-05, + "loss": 0.3568, + "step": 5626500 + }, + { + "epoch": 38.077901689042875, + "grad_norm": 0.38250312209129333, + "learning_rate": 4.619220983109571e-05, + "loss": 0.3583, + "step": 5627000 + }, + { + "epoch": 38.081285188393245, + "grad_norm": 0.38772666454315186, + "learning_rate": 4.619187148116068e-05, + "loss": 0.3569, + "step": 5627500 + }, + { + "epoch": 38.08466868774361, + "grad_norm": 0.3952089250087738, + "learning_rate": 4.619153313122564e-05, + "loss": 0.3574, + "step": 5628000 + }, + { + "epoch": 38.08805218709398, + "grad_norm": 0.3905099630355835, + "learning_rate": 4.61911947812906e-05, + "loss": 0.357, + "step": 5628500 + }, + { + "epoch": 38.09143568644435, + "grad_norm": 0.3963543176651001, + "learning_rate": 4.6190856431355565e-05, + "loss": 0.3582, + "step": 5629000 + }, + { + "epoch": 38.09481918579472, + "grad_norm": 0.35863327980041504, + "learning_rate": 4.6190518081420534e-05, + "loss": 0.357, + "step": 5629500 + }, + { + "epoch": 38.09820268514508, + "grad_norm": 0.40665847063064575, + "learning_rate": 4.6190179731485496e-05, + "loss": 0.3588, + "step": 5630000 + }, + { + "epoch": 38.10158618449545, + "grad_norm": 0.4140339195728302, + "learning_rate": 4.618984138155046e-05, + "loss": 0.3569, + "step": 5630500 + }, + { + "epoch": 38.10496968384582, + "grad_norm": 0.3830562233924866, + "learning_rate": 4.618950303161542e-05, + "loss": 0.3578, + "step": 5631000 + }, + { + "epoch": 38.108353183196186, + "grad_norm": 0.40696558356285095, + "learning_rate": 4.618916468168038e-05, + "loss": 0.357, + "step": 5631500 + }, + { + "epoch": 38.11173668254656, + "grad_norm": 0.36331936717033386, + "learning_rate": 4.6188826331745345e-05, + "loss": 0.3578, + "step": 5632000 + }, + { + "epoch": 38.11512018189693, + "grad_norm": 0.3589300215244293, + "learning_rate": 4.618848798181031e-05, + "loss": 0.3576, + "step": 5632500 + }, + { + "epoch": 38.11850368124729, + "grad_norm": 0.4246826469898224, + "learning_rate": 4.618814963187527e-05, + "loss": 0.3567, + "step": 5633000 + }, + { + "epoch": 38.12188718059766, + "grad_norm": 0.3818008601665497, + "learning_rate": 4.618781128194024e-05, + "loss": 0.3595, + "step": 5633500 + }, + { + "epoch": 38.12527067994803, + "grad_norm": 0.35223817825317383, + "learning_rate": 4.61874729320052e-05, + "loss": 0.3587, + "step": 5634000 + }, + { + "epoch": 38.1286541792984, + "grad_norm": 0.38550788164138794, + "learning_rate": 4.618713458207016e-05, + "loss": 0.3569, + "step": 5634500 + }, + { + "epoch": 38.132037678648764, + "grad_norm": 0.41353002190589905, + "learning_rate": 4.6186796232135124e-05, + "loss": 0.3584, + "step": 5635000 + }, + { + "epoch": 38.135421177999135, + "grad_norm": 0.38820987939834595, + "learning_rate": 4.618645788220009e-05, + "loss": 0.3587, + "step": 5635500 + }, + { + "epoch": 38.138804677349505, + "grad_norm": 0.37804484367370605, + "learning_rate": 4.6186119532265055e-05, + "loss": 0.3543, + "step": 5636000 + }, + { + "epoch": 38.14218817669987, + "grad_norm": 0.417277067899704, + "learning_rate": 4.618578118233002e-05, + "loss": 0.3578, + "step": 5636500 + }, + { + "epoch": 38.14557167605024, + "grad_norm": 0.3772396147251129, + "learning_rate": 4.618544283239498e-05, + "loss": 0.358, + "step": 5637000 + }, + { + "epoch": 38.14895517540061, + "grad_norm": 0.37377408146858215, + "learning_rate": 4.618510448245994e-05, + "loss": 0.3583, + "step": 5637500 + }, + { + "epoch": 38.15233867475097, + "grad_norm": 0.3786788284778595, + "learning_rate": 4.6184766132524904e-05, + "loss": 0.3574, + "step": 5638000 + }, + { + "epoch": 38.15572217410134, + "grad_norm": 0.37169507145881653, + "learning_rate": 4.6184427782589866e-05, + "loss": 0.3566, + "step": 5638500 + }, + { + "epoch": 38.15910567345171, + "grad_norm": 0.4147394597530365, + "learning_rate": 4.618408943265483e-05, + "loss": 0.3591, + "step": 5639000 + }, + { + "epoch": 38.162489172802076, + "grad_norm": 0.41069290041923523, + "learning_rate": 4.61837510827198e-05, + "loss": 0.3589, + "step": 5639500 + }, + { + "epoch": 38.165872672152446, + "grad_norm": 0.41666343808174133, + "learning_rate": 4.618341273278476e-05, + "loss": 0.357, + "step": 5640000 + }, + { + "epoch": 38.169256171502816, + "grad_norm": 0.40683019161224365, + "learning_rate": 4.618307438284972e-05, + "loss": 0.3577, + "step": 5640500 + }, + { + "epoch": 38.17263967085319, + "grad_norm": 0.4103277921676636, + "learning_rate": 4.618273603291468e-05, + "loss": 0.3571, + "step": 5641000 + }, + { + "epoch": 38.17602317020355, + "grad_norm": 0.43209654092788696, + "learning_rate": 4.6182397682979645e-05, + "loss": 0.3574, + "step": 5641500 + }, + { + "epoch": 38.17940666955392, + "grad_norm": 0.3668835461139679, + "learning_rate": 4.618205933304461e-05, + "loss": 0.3572, + "step": 5642000 + }, + { + "epoch": 38.18279016890429, + "grad_norm": 0.39525651931762695, + "learning_rate": 4.618172098310957e-05, + "loss": 0.3568, + "step": 5642500 + }, + { + "epoch": 38.18617366825465, + "grad_norm": 0.42401039600372314, + "learning_rate": 4.618138263317454e-05, + "loss": 0.3569, + "step": 5643000 + }, + { + "epoch": 38.189557167605024, + "grad_norm": 0.38974061608314514, + "learning_rate": 4.61810442832395e-05, + "loss": 0.3587, + "step": 5643500 + }, + { + "epoch": 38.192940666955394, + "grad_norm": 0.39442285895347595, + "learning_rate": 4.618070593330446e-05, + "loss": 0.3578, + "step": 5644000 + }, + { + "epoch": 38.19632416630576, + "grad_norm": 0.38561609387397766, + "learning_rate": 4.6180367583369425e-05, + "loss": 0.3584, + "step": 5644500 + }, + { + "epoch": 38.19970766565613, + "grad_norm": 0.34974971413612366, + "learning_rate": 4.6180029233434394e-05, + "loss": 0.3586, + "step": 5645000 + }, + { + "epoch": 38.2030911650065, + "grad_norm": 0.418800950050354, + "learning_rate": 4.6179690883499356e-05, + "loss": 0.3595, + "step": 5645500 + }, + { + "epoch": 38.20647466435686, + "grad_norm": 0.3942648470401764, + "learning_rate": 4.617935253356432e-05, + "loss": 0.3571, + "step": 5646000 + }, + { + "epoch": 38.20985816370723, + "grad_norm": 0.4242751896381378, + "learning_rate": 4.617901418362927e-05, + "loss": 0.357, + "step": 5646500 + }, + { + "epoch": 38.2132416630576, + "grad_norm": 0.3806268572807312, + "learning_rate": 4.617867583369424e-05, + "loss": 0.3572, + "step": 5647000 + }, + { + "epoch": 38.21662516240797, + "grad_norm": 0.3610764443874359, + "learning_rate": 4.6178337483759204e-05, + "loss": 0.3581, + "step": 5647500 + }, + { + "epoch": 38.220008661758335, + "grad_norm": 0.41190919280052185, + "learning_rate": 4.6177999133824166e-05, + "loss": 0.3581, + "step": 5648000 + }, + { + "epoch": 38.223392161108706, + "grad_norm": 0.3944419026374817, + "learning_rate": 4.617766078388913e-05, + "loss": 0.3578, + "step": 5648500 + }, + { + "epoch": 38.226775660459076, + "grad_norm": 0.3749035596847534, + "learning_rate": 4.61773224339541e-05, + "loss": 0.358, + "step": 5649000 + }, + { + "epoch": 38.23015915980944, + "grad_norm": 0.3959389925003052, + "learning_rate": 4.617698408401906e-05, + "loss": 0.3575, + "step": 5649500 + }, + { + "epoch": 38.23354265915981, + "grad_norm": 0.3524419367313385, + "learning_rate": 4.617664573408402e-05, + "loss": 0.358, + "step": 5650000 + }, + { + "epoch": 38.23692615851018, + "grad_norm": 0.3562764525413513, + "learning_rate": 4.6176307384148984e-05, + "loss": 0.3579, + "step": 5650500 + }, + { + "epoch": 38.24030965786054, + "grad_norm": 0.3809186816215515, + "learning_rate": 4.6175969034213946e-05, + "loss": 0.3574, + "step": 5651000 + }, + { + "epoch": 38.24369315721091, + "grad_norm": 0.35034775733947754, + "learning_rate": 4.617563068427891e-05, + "loss": 0.3579, + "step": 5651500 + }, + { + "epoch": 38.24707665656128, + "grad_norm": 0.4001205563545227, + "learning_rate": 4.617529233434387e-05, + "loss": 0.3593, + "step": 5652000 + }, + { + "epoch": 38.25046015591165, + "grad_norm": 0.39731815457344055, + "learning_rate": 4.617495398440884e-05, + "loss": 0.3583, + "step": 5652500 + }, + { + "epoch": 38.25384365526202, + "grad_norm": 0.4180459976196289, + "learning_rate": 4.61746156344738e-05, + "loss": 0.356, + "step": 5653000 + }, + { + "epoch": 38.25722715461239, + "grad_norm": 0.37674039602279663, + "learning_rate": 4.617427728453876e-05, + "loss": 0.3577, + "step": 5653500 + }, + { + "epoch": 38.26061065396276, + "grad_norm": 0.39728453755378723, + "learning_rate": 4.6173938934603725e-05, + "loss": 0.357, + "step": 5654000 + }, + { + "epoch": 38.26399415331312, + "grad_norm": 0.3983825147151947, + "learning_rate": 4.6173600584668694e-05, + "loss": 0.3581, + "step": 5654500 + }, + { + "epoch": 38.26737765266349, + "grad_norm": 0.4299536943435669, + "learning_rate": 4.6173262234733656e-05, + "loss": 0.3582, + "step": 5655000 + }, + { + "epoch": 38.27076115201386, + "grad_norm": 0.3940950632095337, + "learning_rate": 4.617292388479862e-05, + "loss": 0.3586, + "step": 5655500 + }, + { + "epoch": 38.274144651364225, + "grad_norm": 0.3991442620754242, + "learning_rate": 4.6172585534863574e-05, + "loss": 0.3569, + "step": 5656000 + }, + { + "epoch": 38.277528150714595, + "grad_norm": 0.38127321004867554, + "learning_rate": 4.617224718492854e-05, + "loss": 0.3594, + "step": 5656500 + }, + { + "epoch": 38.280911650064965, + "grad_norm": 0.3947940766811371, + "learning_rate": 4.6171908834993505e-05, + "loss": 0.3593, + "step": 5657000 + }, + { + "epoch": 38.28429514941533, + "grad_norm": 0.3717426359653473, + "learning_rate": 4.617157048505847e-05, + "loss": 0.3558, + "step": 5657500 + }, + { + "epoch": 38.2876786487657, + "grad_norm": 0.3773467540740967, + "learning_rate": 4.617123213512343e-05, + "loss": 0.3566, + "step": 5658000 + }, + { + "epoch": 38.29106214811607, + "grad_norm": 0.4220793843269348, + "learning_rate": 4.61708937851884e-05, + "loss": 0.3576, + "step": 5658500 + }, + { + "epoch": 38.29444564746644, + "grad_norm": 0.35687562823295593, + "learning_rate": 4.617055543525336e-05, + "loss": 0.3586, + "step": 5659000 + }, + { + "epoch": 38.2978291468168, + "grad_norm": 0.3728513717651367, + "learning_rate": 4.617021708531832e-05, + "loss": 0.3564, + "step": 5659500 + }, + { + "epoch": 38.30121264616717, + "grad_norm": 0.38042473793029785, + "learning_rate": 4.6169878735383284e-05, + "loss": 0.3594, + "step": 5660000 + }, + { + "epoch": 38.30459614551754, + "grad_norm": 0.37338787317276, + "learning_rate": 4.6169540385448247e-05, + "loss": 0.3576, + "step": 5660500 + }, + { + "epoch": 38.307979644867906, + "grad_norm": 0.415351539850235, + "learning_rate": 4.616920203551321e-05, + "loss": 0.3584, + "step": 5661000 + }, + { + "epoch": 38.31136314421828, + "grad_norm": 0.37263545393943787, + "learning_rate": 4.616886368557817e-05, + "loss": 0.3596, + "step": 5661500 + }, + { + "epoch": 38.31474664356865, + "grad_norm": 0.38477614521980286, + "learning_rate": 4.616852533564314e-05, + "loss": 0.3592, + "step": 5662000 + }, + { + "epoch": 38.31813014291901, + "grad_norm": 0.4276152551174164, + "learning_rate": 4.61681869857081e-05, + "loss": 0.3585, + "step": 5662500 + }, + { + "epoch": 38.32151364226938, + "grad_norm": 0.39438101649284363, + "learning_rate": 4.6167848635773064e-05, + "loss": 0.3593, + "step": 5663000 + }, + { + "epoch": 38.32489714161975, + "grad_norm": 0.37863606214523315, + "learning_rate": 4.6167510285838026e-05, + "loss": 0.3587, + "step": 5663500 + }, + { + "epoch": 38.328280640970114, + "grad_norm": 0.3537881672382355, + "learning_rate": 4.6167171935902995e-05, + "loss": 0.3571, + "step": 5664000 + }, + { + "epoch": 38.331664140320484, + "grad_norm": 0.3838760256767273, + "learning_rate": 4.616683358596796e-05, + "loss": 0.3577, + "step": 5664500 + }, + { + "epoch": 38.335047639670854, + "grad_norm": 0.37645915150642395, + "learning_rate": 4.616649523603292e-05, + "loss": 0.3585, + "step": 5665000 + }, + { + "epoch": 38.338431139021225, + "grad_norm": 0.4026859700679779, + "learning_rate": 4.6166156886097875e-05, + "loss": 0.3574, + "step": 5665500 + }, + { + "epoch": 38.34181463837159, + "grad_norm": 0.3878239393234253, + "learning_rate": 4.6165818536162843e-05, + "loss": 0.3569, + "step": 5666000 + }, + { + "epoch": 38.34519813772196, + "grad_norm": 0.39249539375305176, + "learning_rate": 4.6165480186227806e-05, + "loss": 0.3577, + "step": 5666500 + }, + { + "epoch": 38.34858163707233, + "grad_norm": 0.3605285584926605, + "learning_rate": 4.616514183629277e-05, + "loss": 0.3588, + "step": 5667000 + }, + { + "epoch": 38.35196513642269, + "grad_norm": 0.3994273841381073, + "learning_rate": 4.616480348635773e-05, + "loss": 0.357, + "step": 5667500 + }, + { + "epoch": 38.35534863577306, + "grad_norm": 0.37711766362190247, + "learning_rate": 4.61644651364227e-05, + "loss": 0.3578, + "step": 5668000 + }, + { + "epoch": 38.35873213512343, + "grad_norm": 0.40630966424942017, + "learning_rate": 4.616412678648766e-05, + "loss": 0.3572, + "step": 5668500 + }, + { + "epoch": 38.362115634473795, + "grad_norm": 0.3620378375053406, + "learning_rate": 4.616378843655262e-05, + "loss": 0.3581, + "step": 5669000 + }, + { + "epoch": 38.365499133824166, + "grad_norm": 0.393038809299469, + "learning_rate": 4.6163450086617585e-05, + "loss": 0.3582, + "step": 5669500 + }, + { + "epoch": 38.368882633174536, + "grad_norm": 0.39178842306137085, + "learning_rate": 4.616311173668255e-05, + "loss": 0.3585, + "step": 5670000 + }, + { + "epoch": 38.3722661325249, + "grad_norm": 0.34970539808273315, + "learning_rate": 4.616277338674751e-05, + "loss": 0.3586, + "step": 5670500 + }, + { + "epoch": 38.37564963187527, + "grad_norm": 0.36897364258766174, + "learning_rate": 4.616243503681247e-05, + "loss": 0.3582, + "step": 5671000 + }, + { + "epoch": 38.37903313122564, + "grad_norm": 0.41955244541168213, + "learning_rate": 4.616209668687744e-05, + "loss": 0.3583, + "step": 5671500 + }, + { + "epoch": 38.38241663057601, + "grad_norm": 0.42534422874450684, + "learning_rate": 4.61617583369424e-05, + "loss": 0.358, + "step": 5672000 + }, + { + "epoch": 38.38580012992637, + "grad_norm": 0.3807324469089508, + "learning_rate": 4.6161419987007365e-05, + "loss": 0.3576, + "step": 5672500 + }, + { + "epoch": 38.389183629276744, + "grad_norm": 0.3953002691268921, + "learning_rate": 4.616108163707233e-05, + "loss": 0.3572, + "step": 5673000 + }, + { + "epoch": 38.392567128627114, + "grad_norm": 0.40438657999038696, + "learning_rate": 4.6160743287137296e-05, + "loss": 0.3572, + "step": 5673500 + }, + { + "epoch": 38.39595062797748, + "grad_norm": 0.3939213454723358, + "learning_rate": 4.616040493720226e-05, + "loss": 0.3587, + "step": 5674000 + }, + { + "epoch": 38.39933412732785, + "grad_norm": 0.4456295669078827, + "learning_rate": 4.616006658726722e-05, + "loss": 0.3577, + "step": 5674500 + }, + { + "epoch": 38.40271762667822, + "grad_norm": 0.40989065170288086, + "learning_rate": 4.6159728237332175e-05, + "loss": 0.3587, + "step": 5675000 + }, + { + "epoch": 38.40610112602858, + "grad_norm": 0.38084176182746887, + "learning_rate": 4.6159389887397144e-05, + "loss": 0.3581, + "step": 5675500 + }, + { + "epoch": 38.40948462537895, + "grad_norm": 0.4279657006263733, + "learning_rate": 4.6159051537462106e-05, + "loss": 0.3579, + "step": 5676000 + }, + { + "epoch": 38.41286812472932, + "grad_norm": 0.3909102976322174, + "learning_rate": 4.615871318752707e-05, + "loss": 0.3588, + "step": 5676500 + }, + { + "epoch": 38.416251624079685, + "grad_norm": 0.3725705146789551, + "learning_rate": 4.615837483759203e-05, + "loss": 0.3583, + "step": 5677000 + }, + { + "epoch": 38.419635123430055, + "grad_norm": 0.35240438580513, + "learning_rate": 4.6158036487657e-05, + "loss": 0.3591, + "step": 5677500 + }, + { + "epoch": 38.423018622780425, + "grad_norm": 0.36960190534591675, + "learning_rate": 4.615769813772196e-05, + "loss": 0.3589, + "step": 5678000 + }, + { + "epoch": 38.426402122130796, + "grad_norm": 0.3883328437805176, + "learning_rate": 4.6157359787786924e-05, + "loss": 0.356, + "step": 5678500 + }, + { + "epoch": 38.42978562148116, + "grad_norm": 0.3890812397003174, + "learning_rate": 4.6157021437851886e-05, + "loss": 0.3582, + "step": 5679000 + }, + { + "epoch": 38.43316912083153, + "grad_norm": 0.39880093932151794, + "learning_rate": 4.615668308791685e-05, + "loss": 0.3573, + "step": 5679500 + }, + { + "epoch": 38.4365526201819, + "grad_norm": 0.41489291191101074, + "learning_rate": 4.615634473798181e-05, + "loss": 0.3578, + "step": 5680000 + }, + { + "epoch": 38.43993611953226, + "grad_norm": 0.3967892527580261, + "learning_rate": 4.615600638804677e-05, + "loss": 0.3572, + "step": 5680500 + }, + { + "epoch": 38.44331961888263, + "grad_norm": 0.40296727418899536, + "learning_rate": 4.615566803811174e-05, + "loss": 0.3582, + "step": 5681000 + }, + { + "epoch": 38.446703118233, + "grad_norm": 0.41405507922172546, + "learning_rate": 4.61553296881767e-05, + "loss": 0.3592, + "step": 5681500 + }, + { + "epoch": 38.450086617583366, + "grad_norm": 0.4172917604446411, + "learning_rate": 4.6154991338241665e-05, + "loss": 0.3579, + "step": 5682000 + }, + { + "epoch": 38.45347011693374, + "grad_norm": 0.4117705821990967, + "learning_rate": 4.615465298830663e-05, + "loss": 0.3582, + "step": 5682500 + }, + { + "epoch": 38.45685361628411, + "grad_norm": 0.33144837617874146, + "learning_rate": 4.6154314638371596e-05, + "loss": 0.3583, + "step": 5683000 + }, + { + "epoch": 38.46023711563447, + "grad_norm": 0.3817470371723175, + "learning_rate": 4.615397628843656e-05, + "loss": 0.3581, + "step": 5683500 + }, + { + "epoch": 38.46362061498484, + "grad_norm": 0.3916853964328766, + "learning_rate": 4.615363793850152e-05, + "loss": 0.3586, + "step": 5684000 + }, + { + "epoch": 38.46700411433521, + "grad_norm": 0.3973696827888489, + "learning_rate": 4.6153299588566476e-05, + "loss": 0.3584, + "step": 5684500 + }, + { + "epoch": 38.47038761368558, + "grad_norm": 0.34616851806640625, + "learning_rate": 4.6152961238631445e-05, + "loss": 0.3589, + "step": 5685000 + }, + { + "epoch": 38.473771113035944, + "grad_norm": 0.41244304180145264, + "learning_rate": 4.615262288869641e-05, + "loss": 0.3575, + "step": 5685500 + }, + { + "epoch": 38.477154612386315, + "grad_norm": 0.37640896439552307, + "learning_rate": 4.615228453876137e-05, + "loss": 0.357, + "step": 5686000 + }, + { + "epoch": 38.480538111736685, + "grad_norm": 0.3661516606807709, + "learning_rate": 4.615194618882633e-05, + "loss": 0.3585, + "step": 5686500 + }, + { + "epoch": 38.48392161108705, + "grad_norm": 0.42485180497169495, + "learning_rate": 4.61516078388913e-05, + "loss": 0.3578, + "step": 5687000 + }, + { + "epoch": 38.48730511043742, + "grad_norm": 0.42335501313209534, + "learning_rate": 4.615126948895626e-05, + "loss": 0.3579, + "step": 5687500 + }, + { + "epoch": 38.49068860978779, + "grad_norm": 0.4134356677532196, + "learning_rate": 4.6150931139021224e-05, + "loss": 0.3581, + "step": 5688000 + }, + { + "epoch": 38.49407210913815, + "grad_norm": 0.40912774205207825, + "learning_rate": 4.6150592789086186e-05, + "loss": 0.3569, + "step": 5688500 + }, + { + "epoch": 38.49745560848852, + "grad_norm": 0.36274468898773193, + "learning_rate": 4.6150254439151155e-05, + "loss": 0.358, + "step": 5689000 + }, + { + "epoch": 38.50083910783889, + "grad_norm": 0.3722778558731079, + "learning_rate": 4.614991608921611e-05, + "loss": 0.3583, + "step": 5689500 + }, + { + "epoch": 38.50422260718926, + "grad_norm": 0.3648729622364044, + "learning_rate": 4.614957773928107e-05, + "loss": 0.3582, + "step": 5690000 + }, + { + "epoch": 38.507606106539626, + "grad_norm": 0.3988613486289978, + "learning_rate": 4.614923938934604e-05, + "loss": 0.3578, + "step": 5690500 + }, + { + "epoch": 38.510989605889996, + "grad_norm": 0.381133109331131, + "learning_rate": 4.6148901039411004e-05, + "loss": 0.3601, + "step": 5691000 + }, + { + "epoch": 38.51437310524037, + "grad_norm": 0.3979582190513611, + "learning_rate": 4.6148562689475966e-05, + "loss": 0.3596, + "step": 5691500 + }, + { + "epoch": 38.51775660459073, + "grad_norm": 0.388432115316391, + "learning_rate": 4.614822433954093e-05, + "loss": 0.3594, + "step": 5692000 + }, + { + "epoch": 38.5211401039411, + "grad_norm": 0.33916813135147095, + "learning_rate": 4.614788598960589e-05, + "loss": 0.3587, + "step": 5692500 + }, + { + "epoch": 38.52452360329147, + "grad_norm": 0.3934556841850281, + "learning_rate": 4.614754763967086e-05, + "loss": 0.3562, + "step": 5693000 + }, + { + "epoch": 38.527907102641834, + "grad_norm": 0.35447025299072266, + "learning_rate": 4.614720928973582e-05, + "loss": 0.3577, + "step": 5693500 + }, + { + "epoch": 38.531290601992204, + "grad_norm": 0.3884115517139435, + "learning_rate": 4.6146870939800776e-05, + "loss": 0.3573, + "step": 5694000 + }, + { + "epoch": 38.534674101342574, + "grad_norm": 0.4255782961845398, + "learning_rate": 4.6146532589865745e-05, + "loss": 0.3586, + "step": 5694500 + }, + { + "epoch": 38.53805760069294, + "grad_norm": 0.3719644546508789, + "learning_rate": 4.614619423993071e-05, + "loss": 0.359, + "step": 5695000 + }, + { + "epoch": 38.54144110004331, + "grad_norm": 0.3874768316745758, + "learning_rate": 4.614585588999567e-05, + "loss": 0.3575, + "step": 5695500 + }, + { + "epoch": 38.54482459939368, + "grad_norm": 0.368914395570755, + "learning_rate": 4.614551754006063e-05, + "loss": 0.3582, + "step": 5696000 + }, + { + "epoch": 38.54820809874405, + "grad_norm": 0.37411215901374817, + "learning_rate": 4.61451791901256e-05, + "loss": 0.3602, + "step": 5696500 + }, + { + "epoch": 38.55159159809441, + "grad_norm": 0.36497390270233154, + "learning_rate": 4.614484084019056e-05, + "loss": 0.3589, + "step": 5697000 + }, + { + "epoch": 38.55497509744478, + "grad_norm": 0.37230151891708374, + "learning_rate": 4.6144502490255525e-05, + "loss": 0.3587, + "step": 5697500 + }, + { + "epoch": 38.55835859679515, + "grad_norm": 0.4010688066482544, + "learning_rate": 4.614416414032049e-05, + "loss": 0.357, + "step": 5698000 + }, + { + "epoch": 38.561742096145515, + "grad_norm": 0.4175252914428711, + "learning_rate": 4.6143825790385456e-05, + "loss": 0.3591, + "step": 5698500 + }, + { + "epoch": 38.565125595495886, + "grad_norm": 0.3658173680305481, + "learning_rate": 4.614348744045041e-05, + "loss": 0.3575, + "step": 5699000 + }, + { + "epoch": 38.568509094846256, + "grad_norm": 0.4055701792240143, + "learning_rate": 4.614314909051537e-05, + "loss": 0.3583, + "step": 5699500 + }, + { + "epoch": 38.57189259419662, + "grad_norm": 0.414473295211792, + "learning_rate": 4.614281074058034e-05, + "loss": 0.3573, + "step": 5700000 + }, + { + "epoch": 38.57527609354699, + "grad_norm": 0.373854398727417, + "learning_rate": 4.6142472390645304e-05, + "loss": 0.3588, + "step": 5700500 + }, + { + "epoch": 38.57865959289736, + "grad_norm": 0.3989484906196594, + "learning_rate": 4.6142134040710266e-05, + "loss": 0.3584, + "step": 5701000 + }, + { + "epoch": 38.58204309224772, + "grad_norm": 0.41586586833000183, + "learning_rate": 4.614179569077523e-05, + "loss": 0.3563, + "step": 5701500 + }, + { + "epoch": 38.58542659159809, + "grad_norm": 0.3961929678916931, + "learning_rate": 4.614145734084019e-05, + "loss": 0.3586, + "step": 5702000 + }, + { + "epoch": 38.58881009094846, + "grad_norm": 0.39267802238464355, + "learning_rate": 4.614111899090516e-05, + "loss": 0.3565, + "step": 5702500 + }, + { + "epoch": 38.592193590298834, + "grad_norm": 0.3943745791912079, + "learning_rate": 4.614078064097012e-05, + "loss": 0.3583, + "step": 5703000 + }, + { + "epoch": 38.5955770896492, + "grad_norm": 0.4008505344390869, + "learning_rate": 4.614044229103508e-05, + "loss": 0.3589, + "step": 5703500 + }, + { + "epoch": 38.59896058899957, + "grad_norm": 0.382214218378067, + "learning_rate": 4.6140103941100046e-05, + "loss": 0.359, + "step": 5704000 + }, + { + "epoch": 38.60234408834994, + "grad_norm": 0.38153693079948425, + "learning_rate": 4.613976559116501e-05, + "loss": 0.3587, + "step": 5704500 + }, + { + "epoch": 38.6057275877003, + "grad_norm": 0.3921079635620117, + "learning_rate": 4.613942724122997e-05, + "loss": 0.3576, + "step": 5705000 + }, + { + "epoch": 38.60911108705067, + "grad_norm": 0.36310869455337524, + "learning_rate": 4.613908889129493e-05, + "loss": 0.3576, + "step": 5705500 + }, + { + "epoch": 38.61249458640104, + "grad_norm": 0.4206840395927429, + "learning_rate": 4.61387505413599e-05, + "loss": 0.3601, + "step": 5706000 + }, + { + "epoch": 38.615878085751405, + "grad_norm": 0.3657309412956238, + "learning_rate": 4.613841219142486e-05, + "loss": 0.3588, + "step": 5706500 + }, + { + "epoch": 38.619261585101775, + "grad_norm": 0.3499012291431427, + "learning_rate": 4.6138073841489826e-05, + "loss": 0.359, + "step": 5707000 + }, + { + "epoch": 38.622645084452145, + "grad_norm": 0.38575655221939087, + "learning_rate": 4.613773549155479e-05, + "loss": 0.3586, + "step": 5707500 + }, + { + "epoch": 38.62602858380251, + "grad_norm": 0.42723017930984497, + "learning_rate": 4.6137397141619757e-05, + "loss": 0.3573, + "step": 5708000 + }, + { + "epoch": 38.62941208315288, + "grad_norm": 0.4009384214878082, + "learning_rate": 4.613705879168471e-05, + "loss": 0.3576, + "step": 5708500 + }, + { + "epoch": 38.63279558250325, + "grad_norm": 0.38194090127944946, + "learning_rate": 4.6136720441749674e-05, + "loss": 0.36, + "step": 5709000 + }, + { + "epoch": 38.63617908185362, + "grad_norm": 0.4206542670726776, + "learning_rate": 4.6136382091814636e-05, + "loss": 0.3581, + "step": 5709500 + }, + { + "epoch": 38.63956258120398, + "grad_norm": 0.38274261355400085, + "learning_rate": 4.6136043741879605e-05, + "loss": 0.3574, + "step": 5710000 + }, + { + "epoch": 38.64294608055435, + "grad_norm": 0.36390283703804016, + "learning_rate": 4.613570539194457e-05, + "loss": 0.3581, + "step": 5710500 + }, + { + "epoch": 38.64632957990472, + "grad_norm": 0.35540375113487244, + "learning_rate": 4.613536704200953e-05, + "loss": 0.3573, + "step": 5711000 + }, + { + "epoch": 38.649713079255086, + "grad_norm": 0.39186784625053406, + "learning_rate": 4.613502869207449e-05, + "loss": 0.3583, + "step": 5711500 + }, + { + "epoch": 38.65309657860546, + "grad_norm": 0.40506142377853394, + "learning_rate": 4.613469034213946e-05, + "loss": 0.3592, + "step": 5712000 + }, + { + "epoch": 38.65648007795583, + "grad_norm": 0.4294401705265045, + "learning_rate": 4.613435199220442e-05, + "loss": 0.3582, + "step": 5712500 + }, + { + "epoch": 38.65986357730619, + "grad_norm": 0.35989508032798767, + "learning_rate": 4.613401364226938e-05, + "loss": 0.3594, + "step": 5713000 + }, + { + "epoch": 38.66324707665656, + "grad_norm": 0.38508445024490356, + "learning_rate": 4.613367529233435e-05, + "loss": 0.3596, + "step": 5713500 + }, + { + "epoch": 38.66663057600693, + "grad_norm": 0.3540355861186981, + "learning_rate": 4.613333694239931e-05, + "loss": 0.3583, + "step": 5714000 + }, + { + "epoch": 38.6700140753573, + "grad_norm": 0.39032605290412903, + "learning_rate": 4.613299859246427e-05, + "loss": 0.3589, + "step": 5714500 + }, + { + "epoch": 38.673397574707664, + "grad_norm": 0.38501599431037903, + "learning_rate": 4.613266024252923e-05, + "loss": 0.359, + "step": 5715000 + }, + { + "epoch": 38.676781074058034, + "grad_norm": 0.3840806484222412, + "learning_rate": 4.61323218925942e-05, + "loss": 0.3594, + "step": 5715500 + }, + { + "epoch": 38.680164573408405, + "grad_norm": 0.39163729548454285, + "learning_rate": 4.6131983542659164e-05, + "loss": 0.3584, + "step": 5716000 + }, + { + "epoch": 38.68354807275877, + "grad_norm": 0.36240720748901367, + "learning_rate": 4.6131645192724126e-05, + "loss": 0.3586, + "step": 5716500 + }, + { + "epoch": 38.68693157210914, + "grad_norm": 0.38391172885894775, + "learning_rate": 4.613130684278909e-05, + "loss": 0.3573, + "step": 5717000 + }, + { + "epoch": 38.69031507145951, + "grad_norm": 0.3868270218372345, + "learning_rate": 4.613096849285406e-05, + "loss": 0.3579, + "step": 5717500 + }, + { + "epoch": 38.69369857080987, + "grad_norm": 0.38391754031181335, + "learning_rate": 4.613063014291901e-05, + "loss": 0.3573, + "step": 5718000 + }, + { + "epoch": 38.69708207016024, + "grad_norm": 0.4172001779079437, + "learning_rate": 4.6130291792983975e-05, + "loss": 0.3598, + "step": 5718500 + }, + { + "epoch": 38.70046556951061, + "grad_norm": 0.3733766973018646, + "learning_rate": 4.612995344304894e-05, + "loss": 0.3575, + "step": 5719000 + }, + { + "epoch": 38.703849068860976, + "grad_norm": 0.39495787024497986, + "learning_rate": 4.6129615093113906e-05, + "loss": 0.36, + "step": 5719500 + }, + { + "epoch": 38.707232568211346, + "grad_norm": 0.42287978529930115, + "learning_rate": 4.612927674317887e-05, + "loss": 0.359, + "step": 5720000 + }, + { + "epoch": 38.710616067561716, + "grad_norm": 0.40965384244918823, + "learning_rate": 4.612893839324383e-05, + "loss": 0.3586, + "step": 5720500 + }, + { + "epoch": 38.713999566912086, + "grad_norm": 0.370764821767807, + "learning_rate": 4.612860004330879e-05, + "loss": 0.3592, + "step": 5721000 + }, + { + "epoch": 38.71738306626245, + "grad_norm": 0.3937366008758545, + "learning_rate": 4.612826169337376e-05, + "loss": 0.359, + "step": 5721500 + }, + { + "epoch": 38.72076656561282, + "grad_norm": 0.3713039457798004, + "learning_rate": 4.612792334343872e-05, + "loss": 0.3586, + "step": 5722000 + }, + { + "epoch": 38.72415006496319, + "grad_norm": 0.4095219075679779, + "learning_rate": 4.612758499350368e-05, + "loss": 0.3594, + "step": 5722500 + }, + { + "epoch": 38.72753356431355, + "grad_norm": 0.3711714744567871, + "learning_rate": 4.612724664356865e-05, + "loss": 0.359, + "step": 5723000 + }, + { + "epoch": 38.730917063663924, + "grad_norm": 0.3650723695755005, + "learning_rate": 4.612690829363361e-05, + "loss": 0.3575, + "step": 5723500 + }, + { + "epoch": 38.734300563014294, + "grad_norm": 0.3415977954864502, + "learning_rate": 4.612656994369857e-05, + "loss": 0.3586, + "step": 5724000 + }, + { + "epoch": 38.73768406236466, + "grad_norm": 0.4030567407608032, + "learning_rate": 4.6126231593763534e-05, + "loss": 0.3585, + "step": 5724500 + }, + { + "epoch": 38.74106756171503, + "grad_norm": 0.41347163915634155, + "learning_rate": 4.61258932438285e-05, + "loss": 0.3577, + "step": 5725000 + }, + { + "epoch": 38.7444510610654, + "grad_norm": 0.37313735485076904, + "learning_rate": 4.6125554893893465e-05, + "loss": 0.3579, + "step": 5725500 + }, + { + "epoch": 38.74783456041576, + "grad_norm": 0.37139520049095154, + "learning_rate": 4.612521654395843e-05, + "loss": 0.3573, + "step": 5726000 + }, + { + "epoch": 38.75121805976613, + "grad_norm": 0.41949698328971863, + "learning_rate": 4.612487819402339e-05, + "loss": 0.3581, + "step": 5726500 + }, + { + "epoch": 38.7546015591165, + "grad_norm": 0.3581236004829407, + "learning_rate": 4.612453984408836e-05, + "loss": 0.3581, + "step": 5727000 + }, + { + "epoch": 38.75798505846687, + "grad_norm": 0.37782368063926697, + "learning_rate": 4.612420149415331e-05, + "loss": 0.3572, + "step": 5727500 + }, + { + "epoch": 38.761368557817235, + "grad_norm": 0.36826446652412415, + "learning_rate": 4.6123863144218275e-05, + "loss": 0.3588, + "step": 5728000 + }, + { + "epoch": 38.764752057167605, + "grad_norm": 0.3888064920902252, + "learning_rate": 4.612352479428324e-05, + "loss": 0.3573, + "step": 5728500 + }, + { + "epoch": 38.768135556517976, + "grad_norm": 0.3726211488246918, + "learning_rate": 4.6123186444348206e-05, + "loss": 0.359, + "step": 5729000 + }, + { + "epoch": 38.77151905586834, + "grad_norm": 0.3911852240562439, + "learning_rate": 4.612284809441317e-05, + "loss": 0.3588, + "step": 5729500 + }, + { + "epoch": 38.77490255521871, + "grad_norm": 0.386538028717041, + "learning_rate": 4.612250974447813e-05, + "loss": 0.3571, + "step": 5730000 + }, + { + "epoch": 38.77828605456908, + "grad_norm": 0.35132524371147156, + "learning_rate": 4.612217139454309e-05, + "loss": 0.36, + "step": 5730500 + }, + { + "epoch": 38.78166955391944, + "grad_norm": 0.33423352241516113, + "learning_rate": 4.612183304460806e-05, + "loss": 0.3577, + "step": 5731000 + }, + { + "epoch": 38.78505305326981, + "grad_norm": 0.40488508343696594, + "learning_rate": 4.6121494694673024e-05, + "loss": 0.3598, + "step": 5731500 + }, + { + "epoch": 38.78843655262018, + "grad_norm": 0.35855478048324585, + "learning_rate": 4.612115634473798e-05, + "loss": 0.3574, + "step": 5732000 + }, + { + "epoch": 38.79182005197055, + "grad_norm": 0.4208734333515167, + "learning_rate": 4.612081799480295e-05, + "loss": 0.3575, + "step": 5732500 + }, + { + "epoch": 38.79520355132092, + "grad_norm": 0.3655511140823364, + "learning_rate": 4.612047964486791e-05, + "loss": 0.3593, + "step": 5733000 + }, + { + "epoch": 38.79858705067129, + "grad_norm": 0.43602752685546875, + "learning_rate": 4.612014129493287e-05, + "loss": 0.3574, + "step": 5733500 + }, + { + "epoch": 38.80197055002166, + "grad_norm": 0.4224923849105835, + "learning_rate": 4.6119802944997834e-05, + "loss": 0.359, + "step": 5734000 + }, + { + "epoch": 38.80535404937202, + "grad_norm": 0.3702527582645416, + "learning_rate": 4.61194645950628e-05, + "loss": 0.3573, + "step": 5734500 + }, + { + "epoch": 38.80873754872239, + "grad_norm": 0.4083997905254364, + "learning_rate": 4.6119126245127765e-05, + "loss": 0.3607, + "step": 5735000 + }, + { + "epoch": 38.81212104807276, + "grad_norm": 0.37937113642692566, + "learning_rate": 4.611878789519273e-05, + "loss": 0.3583, + "step": 5735500 + }, + { + "epoch": 38.815504547423124, + "grad_norm": 0.39841771125793457, + "learning_rate": 4.611844954525769e-05, + "loss": 0.3573, + "step": 5736000 + }, + { + "epoch": 38.818888046773495, + "grad_norm": 0.38441547751426697, + "learning_rate": 4.611811119532266e-05, + "loss": 0.3579, + "step": 5736500 + }, + { + "epoch": 38.822271546123865, + "grad_norm": 0.4236910939216614, + "learning_rate": 4.6117772845387614e-05, + "loss": 0.3577, + "step": 5737000 + }, + { + "epoch": 38.82565504547423, + "grad_norm": 0.38515132665634155, + "learning_rate": 4.6117434495452576e-05, + "loss": 0.3575, + "step": 5737500 + }, + { + "epoch": 38.8290385448246, + "grad_norm": 0.39090225100517273, + "learning_rate": 4.611709614551754e-05, + "loss": 0.3588, + "step": 5738000 + }, + { + "epoch": 38.83242204417497, + "grad_norm": 0.3756837844848633, + "learning_rate": 4.611675779558251e-05, + "loss": 0.3586, + "step": 5738500 + }, + { + "epoch": 38.83580554352534, + "grad_norm": 0.37347716093063354, + "learning_rate": 4.611641944564747e-05, + "loss": 0.3578, + "step": 5739000 + }, + { + "epoch": 38.8391890428757, + "grad_norm": 0.3697563707828522, + "learning_rate": 4.611608109571243e-05, + "loss": 0.3582, + "step": 5739500 + }, + { + "epoch": 38.84257254222607, + "grad_norm": 0.41285571455955505, + "learning_rate": 4.611574274577739e-05, + "loss": 0.3592, + "step": 5740000 + }, + { + "epoch": 38.84595604157644, + "grad_norm": 0.39796262979507446, + "learning_rate": 4.611540439584236e-05, + "loss": 0.3592, + "step": 5740500 + }, + { + "epoch": 38.849339540926806, + "grad_norm": 0.3566901981830597, + "learning_rate": 4.6115066045907324e-05, + "loss": 0.3574, + "step": 5741000 + }, + { + "epoch": 38.852723040277176, + "grad_norm": 0.3584422469139099, + "learning_rate": 4.611472769597228e-05, + "loss": 0.3589, + "step": 5741500 + }, + { + "epoch": 38.85610653962755, + "grad_norm": 0.3689061999320984, + "learning_rate": 4.611438934603725e-05, + "loss": 0.3588, + "step": 5742000 + }, + { + "epoch": 38.85949003897791, + "grad_norm": 0.3805975914001465, + "learning_rate": 4.611405099610221e-05, + "loss": 0.3581, + "step": 5742500 + }, + { + "epoch": 38.86287353832828, + "grad_norm": 0.3651244342327118, + "learning_rate": 4.611371264616717e-05, + "loss": 0.3589, + "step": 5743000 + }, + { + "epoch": 38.86625703767865, + "grad_norm": 0.3986314833164215, + "learning_rate": 4.6113374296232135e-05, + "loss": 0.3586, + "step": 5743500 + }, + { + "epoch": 38.869640537029014, + "grad_norm": 0.352322518825531, + "learning_rate": 4.6113035946297104e-05, + "loss": 0.3601, + "step": 5744000 + }, + { + "epoch": 38.873024036379384, + "grad_norm": 0.4206475615501404, + "learning_rate": 4.6112697596362066e-05, + "loss": 0.3575, + "step": 5744500 + }, + { + "epoch": 38.876407535729754, + "grad_norm": 0.4022883474826813, + "learning_rate": 4.611235924642703e-05, + "loss": 0.3598, + "step": 5745000 + }, + { + "epoch": 38.879791035080125, + "grad_norm": 0.40589722990989685, + "learning_rate": 4.611202089649199e-05, + "loss": 0.3579, + "step": 5745500 + }, + { + "epoch": 38.88317453443049, + "grad_norm": 0.35741114616394043, + "learning_rate": 4.611168254655696e-05, + "loss": 0.3579, + "step": 5746000 + }, + { + "epoch": 38.88655803378086, + "grad_norm": 0.41959348320961, + "learning_rate": 4.6111344196621914e-05, + "loss": 0.3571, + "step": 5746500 + }, + { + "epoch": 38.88994153313123, + "grad_norm": 0.37600529193878174, + "learning_rate": 4.6111005846686877e-05, + "loss": 0.3576, + "step": 5747000 + }, + { + "epoch": 38.89332503248159, + "grad_norm": 0.40771156549453735, + "learning_rate": 4.611066749675184e-05, + "loss": 0.3597, + "step": 5747500 + }, + { + "epoch": 38.89670853183196, + "grad_norm": 0.3621875047683716, + "learning_rate": 4.611032914681681e-05, + "loss": 0.3575, + "step": 5748000 + }, + { + "epoch": 38.90009203118233, + "grad_norm": 0.38153213262557983, + "learning_rate": 4.610999079688177e-05, + "loss": 0.3581, + "step": 5748500 + }, + { + "epoch": 38.903475530532695, + "grad_norm": 0.36397770047187805, + "learning_rate": 4.610965244694673e-05, + "loss": 0.3581, + "step": 5749000 + }, + { + "epoch": 38.906859029883066, + "grad_norm": 0.3855028748512268, + "learning_rate": 4.6109314097011694e-05, + "loss": 0.3587, + "step": 5749500 + }, + { + "epoch": 38.910242529233436, + "grad_norm": 0.41015130281448364, + "learning_rate": 4.610897574707666e-05, + "loss": 0.3593, + "step": 5750000 + }, + { + "epoch": 38.9136260285838, + "grad_norm": 0.3674156069755554, + "learning_rate": 4.6108637397141625e-05, + "loss": 0.3582, + "step": 5750500 + }, + { + "epoch": 38.91700952793417, + "grad_norm": 0.35390201210975647, + "learning_rate": 4.610829904720659e-05, + "loss": 0.3593, + "step": 5751000 + }, + { + "epoch": 38.92039302728454, + "grad_norm": 0.3748086094856262, + "learning_rate": 4.610796069727155e-05, + "loss": 0.3585, + "step": 5751500 + }, + { + "epoch": 38.92377652663491, + "grad_norm": 0.4166073501110077, + "learning_rate": 4.610762234733651e-05, + "loss": 0.3599, + "step": 5752000 + }, + { + "epoch": 38.92716002598527, + "grad_norm": 0.3876727223396301, + "learning_rate": 4.6107283997401473e-05, + "loss": 0.3587, + "step": 5752500 + }, + { + "epoch": 38.93054352533564, + "grad_norm": 0.34974271059036255, + "learning_rate": 4.6106945647466436e-05, + "loss": 0.3578, + "step": 5753000 + }, + { + "epoch": 38.933927024686014, + "grad_norm": 0.3858926594257355, + "learning_rate": 4.6106607297531404e-05, + "loss": 0.3587, + "step": 5753500 + }, + { + "epoch": 38.93731052403638, + "grad_norm": 0.38629648089408875, + "learning_rate": 4.6106268947596367e-05, + "loss": 0.3597, + "step": 5754000 + }, + { + "epoch": 38.94069402338675, + "grad_norm": 0.3409755229949951, + "learning_rate": 4.610593059766133e-05, + "loss": 0.3584, + "step": 5754500 + }, + { + "epoch": 38.94407752273712, + "grad_norm": 0.3903469443321228, + "learning_rate": 4.610559224772629e-05, + "loss": 0.3588, + "step": 5755000 + }, + { + "epoch": 38.94746102208748, + "grad_norm": 0.36905232071876526, + "learning_rate": 4.610525389779125e-05, + "loss": 0.3577, + "step": 5755500 + }, + { + "epoch": 38.95084452143785, + "grad_norm": 0.3966832458972931, + "learning_rate": 4.6104915547856215e-05, + "loss": 0.359, + "step": 5756000 + }, + { + "epoch": 38.95422802078822, + "grad_norm": 0.3584676682949066, + "learning_rate": 4.610457719792118e-05, + "loss": 0.3601, + "step": 5756500 + }, + { + "epoch": 38.957611520138585, + "grad_norm": 0.4345291554927826, + "learning_rate": 4.610423884798614e-05, + "loss": 0.3582, + "step": 5757000 + }, + { + "epoch": 38.960995019488955, + "grad_norm": 0.4365847706794739, + "learning_rate": 4.610390049805111e-05, + "loss": 0.3587, + "step": 5757500 + }, + { + "epoch": 38.964378518839325, + "grad_norm": 0.38714271783828735, + "learning_rate": 4.610356214811607e-05, + "loss": 0.3589, + "step": 5758000 + }, + { + "epoch": 38.967762018189696, + "grad_norm": 0.39306461811065674, + "learning_rate": 4.610322379818103e-05, + "loss": 0.3588, + "step": 5758500 + }, + { + "epoch": 38.97114551754006, + "grad_norm": 0.37263697385787964, + "learning_rate": 4.6102885448245995e-05, + "loss": 0.3599, + "step": 5759000 + }, + { + "epoch": 38.97452901689043, + "grad_norm": 0.40572571754455566, + "learning_rate": 4.6102547098310963e-05, + "loss": 0.3576, + "step": 5759500 + }, + { + "epoch": 38.9779125162408, + "grad_norm": 0.33693259954452515, + "learning_rate": 4.6102208748375926e-05, + "loss": 0.3587, + "step": 5760000 + }, + { + "epoch": 38.98129601559116, + "grad_norm": 0.39590874314308167, + "learning_rate": 4.610187039844089e-05, + "loss": 0.3579, + "step": 5760500 + }, + { + "epoch": 38.98467951494153, + "grad_norm": 0.3590654730796814, + "learning_rate": 4.610153204850585e-05, + "loss": 0.3581, + "step": 5761000 + }, + { + "epoch": 38.9880630142919, + "grad_norm": 0.41421273350715637, + "learning_rate": 4.610119369857081e-05, + "loss": 0.3576, + "step": 5761500 + }, + { + "epoch": 38.991446513642266, + "grad_norm": 0.3868899345397949, + "learning_rate": 4.6100855348635774e-05, + "loss": 0.3597, + "step": 5762000 + }, + { + "epoch": 38.99483001299264, + "grad_norm": 0.38397252559661865, + "learning_rate": 4.6100516998700736e-05, + "loss": 0.359, + "step": 5762500 + }, + { + "epoch": 38.99821351234301, + "grad_norm": 0.36752620339393616, + "learning_rate": 4.6100178648765705e-05, + "loss": 0.3576, + "step": 5763000 + }, + { + "epoch": 39.0, + "eval_accuracy": 0.8632947080532718, + "eval_loss": 0.5549707412719727, + "eval_runtime": 3389.5211, + "eval_samples_per_second": 85.777, + "eval_steps_per_second": 5.361, + "step": 5763264 + }, + { + "epoch": 39.00159701169338, + "grad_norm": 0.41049081087112427, + "learning_rate": 4.609984029883067e-05, + "loss": 0.3554, + "step": 5763500 + }, + { + "epoch": 39.00498051104374, + "grad_norm": 0.38443803787231445, + "learning_rate": 4.609950194889563e-05, + "loss": 0.3558, + "step": 5764000 + }, + { + "epoch": 39.00836401039411, + "grad_norm": 0.40071505308151245, + "learning_rate": 4.609916359896059e-05, + "loss": 0.354, + "step": 5764500 + }, + { + "epoch": 39.01174750974448, + "grad_norm": 0.3943445682525635, + "learning_rate": 4.6098825249025554e-05, + "loss": 0.3558, + "step": 5765000 + }, + { + "epoch": 39.015131009094844, + "grad_norm": 0.3997240662574768, + "learning_rate": 4.6098486899090516e-05, + "loss": 0.3554, + "step": 5765500 + }, + { + "epoch": 39.018514508445215, + "grad_norm": 0.40480175614356995, + "learning_rate": 4.609814854915548e-05, + "loss": 0.3559, + "step": 5766000 + }, + { + "epoch": 39.021898007795585, + "grad_norm": 0.3974194824695587, + "learning_rate": 4.609781019922044e-05, + "loss": 0.3558, + "step": 5766500 + }, + { + "epoch": 39.02528150714595, + "grad_norm": 0.3885852098464966, + "learning_rate": 4.609747184928541e-05, + "loss": 0.3565, + "step": 5767000 + }, + { + "epoch": 39.02866500649632, + "grad_norm": 0.3391273617744446, + "learning_rate": 4.609713349935037e-05, + "loss": 0.3549, + "step": 5767500 + }, + { + "epoch": 39.03204850584669, + "grad_norm": 0.3764244616031647, + "learning_rate": 4.609679514941533e-05, + "loss": 0.3564, + "step": 5768000 + }, + { + "epoch": 39.03543200519705, + "grad_norm": 0.3896447420120239, + "learning_rate": 4.6096456799480295e-05, + "loss": 0.356, + "step": 5768500 + }, + { + "epoch": 39.03881550454742, + "grad_norm": 0.3836119472980499, + "learning_rate": 4.6096118449545264e-05, + "loss": 0.3571, + "step": 5769000 + }, + { + "epoch": 39.04219900389779, + "grad_norm": 0.37785929441452026, + "learning_rate": 4.6095780099610226e-05, + "loss": 0.3555, + "step": 5769500 + }, + { + "epoch": 39.04558250324816, + "grad_norm": 0.3991580605506897, + "learning_rate": 4.609544174967519e-05, + "loss": 0.3567, + "step": 5770000 + }, + { + "epoch": 39.048966002598526, + "grad_norm": 0.3892557919025421, + "learning_rate": 4.609510339974015e-05, + "loss": 0.3566, + "step": 5770500 + }, + { + "epoch": 39.052349501948896, + "grad_norm": 0.3755117356777191, + "learning_rate": 4.609476504980511e-05, + "loss": 0.3554, + "step": 5771000 + }, + { + "epoch": 39.05573300129927, + "grad_norm": 0.3512061834335327, + "learning_rate": 4.6094426699870075e-05, + "loss": 0.3573, + "step": 5771500 + }, + { + "epoch": 39.05911650064963, + "grad_norm": 0.3926057517528534, + "learning_rate": 4.609408834993504e-05, + "loss": 0.3567, + "step": 5772000 + }, + { + "epoch": 39.0625, + "grad_norm": 0.37486153841018677, + "learning_rate": 4.609375e-05, + "loss": 0.356, + "step": 5772500 + }, + { + "epoch": 39.06588349935037, + "grad_norm": 0.37479740381240845, + "learning_rate": 4.609341165006497e-05, + "loss": 0.3582, + "step": 5773000 + }, + { + "epoch": 39.06926699870073, + "grad_norm": 0.3782919645309448, + "learning_rate": 4.609307330012993e-05, + "loss": 0.3559, + "step": 5773500 + }, + { + "epoch": 39.072650498051104, + "grad_norm": 0.3841499090194702, + "learning_rate": 4.609273495019489e-05, + "loss": 0.358, + "step": 5774000 + }, + { + "epoch": 39.076033997401474, + "grad_norm": 0.40607038140296936, + "learning_rate": 4.6092396600259854e-05, + "loss": 0.3562, + "step": 5774500 + }, + { + "epoch": 39.07941749675184, + "grad_norm": 0.4377508759498596, + "learning_rate": 4.6092058250324816e-05, + "loss": 0.3588, + "step": 5775000 + }, + { + "epoch": 39.08280099610221, + "grad_norm": 0.3485512435436249, + "learning_rate": 4.609171990038978e-05, + "loss": 0.3562, + "step": 5775500 + }, + { + "epoch": 39.08618449545258, + "grad_norm": 0.39141741394996643, + "learning_rate": 4.609138155045474e-05, + "loss": 0.3563, + "step": 5776000 + }, + { + "epoch": 39.08956799480295, + "grad_norm": 0.3922073245048523, + "learning_rate": 4.609104320051971e-05, + "loss": 0.3566, + "step": 5776500 + }, + { + "epoch": 39.09295149415331, + "grad_norm": 0.3608970642089844, + "learning_rate": 4.609070485058467e-05, + "loss": 0.3575, + "step": 5777000 + }, + { + "epoch": 39.09633499350368, + "grad_norm": 0.4330408275127411, + "learning_rate": 4.6090366500649634e-05, + "loss": 0.3566, + "step": 5777500 + }, + { + "epoch": 39.09971849285405, + "grad_norm": 0.3976117670536041, + "learning_rate": 4.6090028150714596e-05, + "loss": 0.3569, + "step": 5778000 + }, + { + "epoch": 39.103101992204415, + "grad_norm": 0.4137636721134186, + "learning_rate": 4.6089689800779565e-05, + "loss": 0.3559, + "step": 5778500 + }, + { + "epoch": 39.106485491554785, + "grad_norm": 0.39320191740989685, + "learning_rate": 4.608935145084453e-05, + "loss": 0.3584, + "step": 5779000 + }, + { + "epoch": 39.109868990905156, + "grad_norm": 0.3872518539428711, + "learning_rate": 4.608901310090949e-05, + "loss": 0.3577, + "step": 5779500 + }, + { + "epoch": 39.11325249025552, + "grad_norm": 0.38965508341789246, + "learning_rate": 4.6088674750974444e-05, + "loss": 0.3557, + "step": 5780000 + }, + { + "epoch": 39.11663598960589, + "grad_norm": 0.4170781672000885, + "learning_rate": 4.608833640103941e-05, + "loss": 0.3561, + "step": 5780500 + }, + { + "epoch": 39.12001948895626, + "grad_norm": 0.40871477127075195, + "learning_rate": 4.6087998051104375e-05, + "loss": 0.356, + "step": 5781000 + }, + { + "epoch": 39.12340298830662, + "grad_norm": 0.36769962310791016, + "learning_rate": 4.608765970116934e-05, + "loss": 0.3587, + "step": 5781500 + }, + { + "epoch": 39.12678648765699, + "grad_norm": 0.3955308794975281, + "learning_rate": 4.60873213512343e-05, + "loss": 0.3587, + "step": 5782000 + }, + { + "epoch": 39.13016998700736, + "grad_norm": 0.4196022152900696, + "learning_rate": 4.608698300129927e-05, + "loss": 0.3581, + "step": 5782500 + }, + { + "epoch": 39.133553486357734, + "grad_norm": 0.37169861793518066, + "learning_rate": 4.608664465136423e-05, + "loss": 0.3588, + "step": 5783000 + }, + { + "epoch": 39.1369369857081, + "grad_norm": 0.3570674657821655, + "learning_rate": 4.608630630142919e-05, + "loss": 0.3591, + "step": 5783500 + }, + { + "epoch": 39.14032048505847, + "grad_norm": 0.37060561776161194, + "learning_rate": 4.6085967951494155e-05, + "loss": 0.3563, + "step": 5784000 + }, + { + "epoch": 39.14370398440884, + "grad_norm": 0.36501386761665344, + "learning_rate": 4.608562960155912e-05, + "loss": 0.356, + "step": 5784500 + }, + { + "epoch": 39.1470874837592, + "grad_norm": 0.4009703993797302, + "learning_rate": 4.608529125162408e-05, + "loss": 0.3573, + "step": 5785000 + }, + { + "epoch": 39.15047098310957, + "grad_norm": 0.38712480664253235, + "learning_rate": 4.608495290168904e-05, + "loss": 0.3549, + "step": 5785500 + }, + { + "epoch": 39.15385448245994, + "grad_norm": 0.3608624041080475, + "learning_rate": 4.608461455175401e-05, + "loss": 0.3567, + "step": 5786000 + }, + { + "epoch": 39.157237981810304, + "grad_norm": 0.41465529799461365, + "learning_rate": 4.608427620181897e-05, + "loss": 0.3581, + "step": 5786500 + }, + { + "epoch": 39.160621481160675, + "grad_norm": 0.38023123145103455, + "learning_rate": 4.6083937851883934e-05, + "loss": 0.3575, + "step": 5787000 + }, + { + "epoch": 39.164004980511045, + "grad_norm": 0.3878611922264099, + "learning_rate": 4.6083599501948896e-05, + "loss": 0.3588, + "step": 5787500 + }, + { + "epoch": 39.167388479861415, + "grad_norm": 0.3601565361022949, + "learning_rate": 4.6083261152013865e-05, + "loss": 0.3565, + "step": 5788000 + }, + { + "epoch": 39.17077197921178, + "grad_norm": 0.4064596891403198, + "learning_rate": 4.608292280207883e-05, + "loss": 0.3578, + "step": 5788500 + }, + { + "epoch": 39.17415547856215, + "grad_norm": 0.35656580328941345, + "learning_rate": 4.608258445214379e-05, + "loss": 0.3582, + "step": 5789000 + }, + { + "epoch": 39.17753897791252, + "grad_norm": 0.42319318652153015, + "learning_rate": 4.6082246102208745e-05, + "loss": 0.3579, + "step": 5789500 + }, + { + "epoch": 39.18092247726288, + "grad_norm": 0.3610454499721527, + "learning_rate": 4.6081907752273714e-05, + "loss": 0.3564, + "step": 5790000 + }, + { + "epoch": 39.18430597661325, + "grad_norm": 0.38486212491989136, + "learning_rate": 4.6081569402338676e-05, + "loss": 0.3584, + "step": 5790500 + }, + { + "epoch": 39.18768947596362, + "grad_norm": 0.4174489378929138, + "learning_rate": 4.608123105240364e-05, + "loss": 0.3564, + "step": 5791000 + }, + { + "epoch": 39.191072975313986, + "grad_norm": 0.42792096734046936, + "learning_rate": 4.60808927024686e-05, + "loss": 0.3574, + "step": 5791500 + }, + { + "epoch": 39.19445647466436, + "grad_norm": 0.3785710632801056, + "learning_rate": 4.608055435253357e-05, + "loss": 0.3582, + "step": 5792000 + }, + { + "epoch": 39.19783997401473, + "grad_norm": 0.41874465346336365, + "learning_rate": 4.608021600259853e-05, + "loss": 0.3571, + "step": 5792500 + }, + { + "epoch": 39.20122347336509, + "grad_norm": 0.4343903660774231, + "learning_rate": 4.607987765266349e-05, + "loss": 0.3566, + "step": 5793000 + }, + { + "epoch": 39.20460697271546, + "grad_norm": 0.37548828125, + "learning_rate": 4.6079539302728455e-05, + "loss": 0.3576, + "step": 5793500 + }, + { + "epoch": 39.20799047206583, + "grad_norm": 0.3644872009754181, + "learning_rate": 4.607920095279342e-05, + "loss": 0.3578, + "step": 5794000 + }, + { + "epoch": 39.2113739714162, + "grad_norm": 0.4181846082210541, + "learning_rate": 4.607886260285838e-05, + "loss": 0.3579, + "step": 5794500 + }, + { + "epoch": 39.214757470766564, + "grad_norm": 0.43744346499443054, + "learning_rate": 4.607852425292334e-05, + "loss": 0.3556, + "step": 5795000 + }, + { + "epoch": 39.218140970116934, + "grad_norm": 0.38154008984565735, + "learning_rate": 4.607818590298831e-05, + "loss": 0.3584, + "step": 5795500 + }, + { + "epoch": 39.221524469467305, + "grad_norm": 0.42544493079185486, + "learning_rate": 4.607784755305327e-05, + "loss": 0.3571, + "step": 5796000 + }, + { + "epoch": 39.22490796881767, + "grad_norm": 0.3827608525753021, + "learning_rate": 4.6077509203118235e-05, + "loss": 0.3581, + "step": 5796500 + }, + { + "epoch": 39.22829146816804, + "grad_norm": 0.3791741728782654, + "learning_rate": 4.60771708531832e-05, + "loss": 0.3574, + "step": 5797000 + }, + { + "epoch": 39.23167496751841, + "grad_norm": 0.37400010228157043, + "learning_rate": 4.6076832503248166e-05, + "loss": 0.3568, + "step": 5797500 + }, + { + "epoch": 39.23505846686877, + "grad_norm": 0.399890273809433, + "learning_rate": 4.607649415331313e-05, + "loss": 0.3587, + "step": 5798000 + }, + { + "epoch": 39.23844196621914, + "grad_norm": 0.3848799467086792, + "learning_rate": 4.607615580337809e-05, + "loss": 0.3586, + "step": 5798500 + }, + { + "epoch": 39.24182546556951, + "grad_norm": 0.4184007942676544, + "learning_rate": 4.6075817453443046e-05, + "loss": 0.3565, + "step": 5799000 + }, + { + "epoch": 39.245208964919875, + "grad_norm": 0.350588858127594, + "learning_rate": 4.6075479103508014e-05, + "loss": 0.3583, + "step": 5799500 + }, + { + "epoch": 39.248592464270246, + "grad_norm": 0.4084034860134125, + "learning_rate": 4.6075140753572977e-05, + "loss": 0.3575, + "step": 5800000 + }, + { + "epoch": 39.251975963620616, + "grad_norm": 0.4222847819328308, + "learning_rate": 4.607480240363794e-05, + "loss": 0.3574, + "step": 5800500 + }, + { + "epoch": 39.255359462970986, + "grad_norm": 0.3579876124858856, + "learning_rate": 4.60744640537029e-05, + "loss": 0.3587, + "step": 5801000 + }, + { + "epoch": 39.25874296232135, + "grad_norm": 0.3947008550167084, + "learning_rate": 4.607412570376787e-05, + "loss": 0.3569, + "step": 5801500 + }, + { + "epoch": 39.26212646167172, + "grad_norm": 0.3536688983440399, + "learning_rate": 4.607378735383283e-05, + "loss": 0.3588, + "step": 5802000 + }, + { + "epoch": 39.26550996102209, + "grad_norm": 0.39786744117736816, + "learning_rate": 4.6073449003897794e-05, + "loss": 0.359, + "step": 5802500 + }, + { + "epoch": 39.26889346037245, + "grad_norm": 0.3595603406429291, + "learning_rate": 4.6073110653962756e-05, + "loss": 0.3583, + "step": 5803000 + }, + { + "epoch": 39.272276959722824, + "grad_norm": 0.4054912030696869, + "learning_rate": 4.6072772304027725e-05, + "loss": 0.3577, + "step": 5803500 + }, + { + "epoch": 39.275660459073194, + "grad_norm": 0.4205889105796814, + "learning_rate": 4.607243395409268e-05, + "loss": 0.3578, + "step": 5804000 + }, + { + "epoch": 39.27904395842356, + "grad_norm": 0.40288498997688293, + "learning_rate": 4.607209560415764e-05, + "loss": 0.357, + "step": 5804500 + }, + { + "epoch": 39.28242745777393, + "grad_norm": 0.38287627696990967, + "learning_rate": 4.607175725422261e-05, + "loss": 0.3571, + "step": 5805000 + }, + { + "epoch": 39.2858109571243, + "grad_norm": 0.3729548752307892, + "learning_rate": 4.6071418904287573e-05, + "loss": 0.3582, + "step": 5805500 + }, + { + "epoch": 39.28919445647466, + "grad_norm": 0.3424685299396515, + "learning_rate": 4.6071080554352536e-05, + "loss": 0.3589, + "step": 5806000 + }, + { + "epoch": 39.29257795582503, + "grad_norm": 0.37398502230644226, + "learning_rate": 4.60707422044175e-05, + "loss": 0.3584, + "step": 5806500 + }, + { + "epoch": 39.2959614551754, + "grad_norm": 0.3575172424316406, + "learning_rate": 4.607040385448247e-05, + "loss": 0.3567, + "step": 5807000 + }, + { + "epoch": 39.29934495452577, + "grad_norm": 0.38726699352264404, + "learning_rate": 4.607006550454743e-05, + "loss": 0.3593, + "step": 5807500 + }, + { + "epoch": 39.302728453876135, + "grad_norm": 0.401279091835022, + "learning_rate": 4.606972715461239e-05, + "loss": 0.3595, + "step": 5808000 + }, + { + "epoch": 39.306111953226505, + "grad_norm": 0.39637458324432373, + "learning_rate": 4.6069388804677346e-05, + "loss": 0.3583, + "step": 5808500 + }, + { + "epoch": 39.309495452576876, + "grad_norm": 0.3764081597328186, + "learning_rate": 4.6069050454742315e-05, + "loss": 0.3587, + "step": 5809000 + }, + { + "epoch": 39.31287895192724, + "grad_norm": 0.3873443007469177, + "learning_rate": 4.606871210480728e-05, + "loss": 0.3574, + "step": 5809500 + }, + { + "epoch": 39.31626245127761, + "grad_norm": 0.40507134795188904, + "learning_rate": 4.606837375487224e-05, + "loss": 0.3576, + "step": 5810000 + }, + { + "epoch": 39.31964595062798, + "grad_norm": 0.43546003103256226, + "learning_rate": 4.60680354049372e-05, + "loss": 0.359, + "step": 5810500 + }, + { + "epoch": 39.32302944997834, + "grad_norm": 0.41429951786994934, + "learning_rate": 4.606769705500217e-05, + "loss": 0.3571, + "step": 5811000 + }, + { + "epoch": 39.32641294932871, + "grad_norm": 0.4261474013328552, + "learning_rate": 4.606735870506713e-05, + "loss": 0.3575, + "step": 5811500 + }, + { + "epoch": 39.32979644867908, + "grad_norm": 0.3997849225997925, + "learning_rate": 4.6067020355132095e-05, + "loss": 0.3575, + "step": 5812000 + }, + { + "epoch": 39.33317994802945, + "grad_norm": 0.35773953795433044, + "learning_rate": 4.606668200519706e-05, + "loss": 0.3589, + "step": 5812500 + }, + { + "epoch": 39.33656344737982, + "grad_norm": 0.38859155774116516, + "learning_rate": 4.6066343655262026e-05, + "loss": 0.3576, + "step": 5813000 + }, + { + "epoch": 39.33994694673019, + "grad_norm": 0.37268543243408203, + "learning_rate": 4.606600530532698e-05, + "loss": 0.3561, + "step": 5813500 + }, + { + "epoch": 39.34333044608056, + "grad_norm": 0.40817004442214966, + "learning_rate": 4.606566695539194e-05, + "loss": 0.3598, + "step": 5814000 + }, + { + "epoch": 39.34671394543092, + "grad_norm": 0.34740012884140015, + "learning_rate": 4.606532860545691e-05, + "loss": 0.3596, + "step": 5814500 + }, + { + "epoch": 39.35009744478129, + "grad_norm": 0.42565348744392395, + "learning_rate": 4.6064990255521874e-05, + "loss": 0.3594, + "step": 5815000 + }, + { + "epoch": 39.35348094413166, + "grad_norm": 0.40188512206077576, + "learning_rate": 4.6064651905586836e-05, + "loss": 0.3578, + "step": 5815500 + }, + { + "epoch": 39.356864443482024, + "grad_norm": 0.4051068127155304, + "learning_rate": 4.60643135556518e-05, + "loss": 0.3587, + "step": 5816000 + }, + { + "epoch": 39.360247942832395, + "grad_norm": 0.3960541784763336, + "learning_rate": 4.606397520571677e-05, + "loss": 0.3595, + "step": 5816500 + }, + { + "epoch": 39.363631442182765, + "grad_norm": 0.41114065051078796, + "learning_rate": 4.606363685578173e-05, + "loss": 0.3588, + "step": 5817000 + }, + { + "epoch": 39.36701494153313, + "grad_norm": 0.3999212384223938, + "learning_rate": 4.606329850584669e-05, + "loss": 0.3571, + "step": 5817500 + }, + { + "epoch": 39.3703984408835, + "grad_norm": 0.34364742040634155, + "learning_rate": 4.606296015591165e-05, + "loss": 0.3587, + "step": 5818000 + }, + { + "epoch": 39.37378194023387, + "grad_norm": 0.3538402020931244, + "learning_rate": 4.6062621805976616e-05, + "loss": 0.3583, + "step": 5818500 + }, + { + "epoch": 39.37716543958424, + "grad_norm": 0.3612450957298279, + "learning_rate": 4.606228345604158e-05, + "loss": 0.3591, + "step": 5819000 + }, + { + "epoch": 39.3805489389346, + "grad_norm": 0.366875022649765, + "learning_rate": 4.606194510610654e-05, + "loss": 0.3573, + "step": 5819500 + }, + { + "epoch": 39.38393243828497, + "grad_norm": 0.39192113280296326, + "learning_rate": 4.60616067561715e-05, + "loss": 0.3569, + "step": 5820000 + }, + { + "epoch": 39.38731593763534, + "grad_norm": 0.37817490100860596, + "learning_rate": 4.606126840623647e-05, + "loss": 0.3604, + "step": 5820500 + }, + { + "epoch": 39.390699436985706, + "grad_norm": 0.35173866152763367, + "learning_rate": 4.606093005630143e-05, + "loss": 0.3584, + "step": 5821000 + }, + { + "epoch": 39.394082936336076, + "grad_norm": 0.37890276312828064, + "learning_rate": 4.6060591706366395e-05, + "loss": 0.3584, + "step": 5821500 + }, + { + "epoch": 39.39746643568645, + "grad_norm": 0.3983881175518036, + "learning_rate": 4.606025335643136e-05, + "loss": 0.3563, + "step": 5822000 + }, + { + "epoch": 39.40084993503681, + "grad_norm": 0.3656408190727234, + "learning_rate": 4.6059915006496326e-05, + "loss": 0.3579, + "step": 5822500 + }, + { + "epoch": 39.40423343438718, + "grad_norm": 0.40919533371925354, + "learning_rate": 4.605957665656128e-05, + "loss": 0.3581, + "step": 5823000 + }, + { + "epoch": 39.40761693373755, + "grad_norm": 0.3984890580177307, + "learning_rate": 4.6059238306626244e-05, + "loss": 0.3574, + "step": 5823500 + }, + { + "epoch": 39.411000433087914, + "grad_norm": 0.3947007954120636, + "learning_rate": 4.605889995669121e-05, + "loss": 0.3586, + "step": 5824000 + }, + { + "epoch": 39.414383932438284, + "grad_norm": 0.4393731653690338, + "learning_rate": 4.6058561606756175e-05, + "loss": 0.3568, + "step": 5824500 + }, + { + "epoch": 39.417767431788654, + "grad_norm": 0.36653101444244385, + "learning_rate": 4.605822325682114e-05, + "loss": 0.3573, + "step": 5825000 + }, + { + "epoch": 39.421150931139024, + "grad_norm": 0.37711697816848755, + "learning_rate": 4.60578849068861e-05, + "loss": 0.3583, + "step": 5825500 + }, + { + "epoch": 39.42453443048939, + "grad_norm": 0.40406227111816406, + "learning_rate": 4.605754655695106e-05, + "loss": 0.3575, + "step": 5826000 + }, + { + "epoch": 39.42791792983976, + "grad_norm": 0.43455639481544495, + "learning_rate": 4.605720820701603e-05, + "loss": 0.3598, + "step": 5826500 + }, + { + "epoch": 39.43130142919013, + "grad_norm": 0.3846614360809326, + "learning_rate": 4.605686985708099e-05, + "loss": 0.3564, + "step": 5827000 + }, + { + "epoch": 39.43468492854049, + "grad_norm": 0.3586784601211548, + "learning_rate": 4.605653150714595e-05, + "loss": 0.3582, + "step": 5827500 + }, + { + "epoch": 39.43806842789086, + "grad_norm": 0.4095541536808014, + "learning_rate": 4.6056193157210916e-05, + "loss": 0.3582, + "step": 5828000 + }, + { + "epoch": 39.44145192724123, + "grad_norm": 0.390663206577301, + "learning_rate": 4.605585480727588e-05, + "loss": 0.3564, + "step": 5828500 + }, + { + "epoch": 39.444835426591595, + "grad_norm": 0.3315221667289734, + "learning_rate": 4.605551645734084e-05, + "loss": 0.3577, + "step": 5829000 + }, + { + "epoch": 39.448218925941966, + "grad_norm": 0.4073394238948822, + "learning_rate": 4.60551781074058e-05, + "loss": 0.3575, + "step": 5829500 + }, + { + "epoch": 39.451602425292336, + "grad_norm": 0.41086679697036743, + "learning_rate": 4.605483975747077e-05, + "loss": 0.3581, + "step": 5830000 + }, + { + "epoch": 39.4549859246427, + "grad_norm": 0.37248337268829346, + "learning_rate": 4.6054501407535734e-05, + "loss": 0.357, + "step": 5830500 + }, + { + "epoch": 39.45836942399307, + "grad_norm": 0.42306119203567505, + "learning_rate": 4.6054163057600696e-05, + "loss": 0.3563, + "step": 5831000 + }, + { + "epoch": 39.46175292334344, + "grad_norm": 0.3646513521671295, + "learning_rate": 4.605382470766566e-05, + "loss": 0.3583, + "step": 5831500 + }, + { + "epoch": 39.46513642269381, + "grad_norm": 0.33797353506088257, + "learning_rate": 4.605348635773063e-05, + "loss": 0.3594, + "step": 5832000 + }, + { + "epoch": 39.46851992204417, + "grad_norm": 0.3744948208332062, + "learning_rate": 4.605314800779558e-05, + "loss": 0.3579, + "step": 5832500 + }, + { + "epoch": 39.47190342139454, + "grad_norm": 0.35115155577659607, + "learning_rate": 4.6052809657860544e-05, + "loss": 0.3574, + "step": 5833000 + }, + { + "epoch": 39.475286920744914, + "grad_norm": 0.39874720573425293, + "learning_rate": 4.605247130792551e-05, + "loss": 0.3583, + "step": 5833500 + }, + { + "epoch": 39.47867042009528, + "grad_norm": 0.3970976769924164, + "learning_rate": 4.6052132957990475e-05, + "loss": 0.3576, + "step": 5834000 + }, + { + "epoch": 39.48205391944565, + "grad_norm": 0.4019801914691925, + "learning_rate": 4.605179460805544e-05, + "loss": 0.3562, + "step": 5834500 + }, + { + "epoch": 39.48543741879602, + "grad_norm": 0.39601776003837585, + "learning_rate": 4.60514562581204e-05, + "loss": 0.3571, + "step": 5835000 + }, + { + "epoch": 39.48882091814638, + "grad_norm": 0.36481037735939026, + "learning_rate": 4.605111790818536e-05, + "loss": 0.3581, + "step": 5835500 + }, + { + "epoch": 39.49220441749675, + "grad_norm": 0.39808598160743713, + "learning_rate": 4.605077955825033e-05, + "loss": 0.3582, + "step": 5836000 + }, + { + "epoch": 39.49558791684712, + "grad_norm": 0.3782365918159485, + "learning_rate": 4.605044120831529e-05, + "loss": 0.3591, + "step": 5836500 + }, + { + "epoch": 39.49897141619749, + "grad_norm": 0.4156523644924164, + "learning_rate": 4.605010285838025e-05, + "loss": 0.3588, + "step": 5837000 + }, + { + "epoch": 39.502354915547855, + "grad_norm": 0.33446115255355835, + "learning_rate": 4.604976450844522e-05, + "loss": 0.3572, + "step": 5837500 + }, + { + "epoch": 39.505738414898225, + "grad_norm": 0.38550758361816406, + "learning_rate": 4.604942615851018e-05, + "loss": 0.3572, + "step": 5838000 + }, + { + "epoch": 39.509121914248595, + "grad_norm": 0.37014317512512207, + "learning_rate": 4.604908780857514e-05, + "loss": 0.3572, + "step": 5838500 + }, + { + "epoch": 39.51250541359896, + "grad_norm": 0.39192500710487366, + "learning_rate": 4.60487494586401e-05, + "loss": 0.3587, + "step": 5839000 + }, + { + "epoch": 39.51588891294933, + "grad_norm": 0.36324289441108704, + "learning_rate": 4.604841110870507e-05, + "loss": 0.3582, + "step": 5839500 + }, + { + "epoch": 39.5192724122997, + "grad_norm": 0.3271288275718689, + "learning_rate": 4.6048072758770034e-05, + "loss": 0.3581, + "step": 5840000 + }, + { + "epoch": 39.52265591165006, + "grad_norm": 0.41794437170028687, + "learning_rate": 4.6047734408834997e-05, + "loss": 0.358, + "step": 5840500 + }, + { + "epoch": 39.52603941100043, + "grad_norm": 0.391206294298172, + "learning_rate": 4.604739605889996e-05, + "loss": 0.3582, + "step": 5841000 + }, + { + "epoch": 39.5294229103508, + "grad_norm": 0.40756654739379883, + "learning_rate": 4.604705770896493e-05, + "loss": 0.356, + "step": 5841500 + }, + { + "epoch": 39.532806409701166, + "grad_norm": 0.4058922231197357, + "learning_rate": 4.604671935902988e-05, + "loss": 0.3588, + "step": 5842000 + }, + { + "epoch": 39.53618990905154, + "grad_norm": 0.39431220293045044, + "learning_rate": 4.6046381009094845e-05, + "loss": 0.3574, + "step": 5842500 + }, + { + "epoch": 39.53957340840191, + "grad_norm": 0.413131445646286, + "learning_rate": 4.604604265915981e-05, + "loss": 0.3579, + "step": 5843000 + }, + { + "epoch": 39.54295690775228, + "grad_norm": 0.40306076407432556, + "learning_rate": 4.6045704309224776e-05, + "loss": 0.3579, + "step": 5843500 + }, + { + "epoch": 39.54634040710264, + "grad_norm": 0.3776503801345825, + "learning_rate": 4.604536595928974e-05, + "loss": 0.3572, + "step": 5844000 + }, + { + "epoch": 39.54972390645301, + "grad_norm": 0.39396798610687256, + "learning_rate": 4.60450276093547e-05, + "loss": 0.3574, + "step": 5844500 + }, + { + "epoch": 39.55310740580338, + "grad_norm": 0.38373348116874695, + "learning_rate": 4.604468925941966e-05, + "loss": 0.359, + "step": 5845000 + }, + { + "epoch": 39.556490905153744, + "grad_norm": 0.38188958168029785, + "learning_rate": 4.604435090948463e-05, + "loss": 0.3573, + "step": 5845500 + }, + { + "epoch": 39.559874404504114, + "grad_norm": 0.36791303753852844, + "learning_rate": 4.6044012559549593e-05, + "loss": 0.3589, + "step": 5846000 + }, + { + "epoch": 39.563257903854485, + "grad_norm": 0.39912042021751404, + "learning_rate": 4.604367420961455e-05, + "loss": 0.3586, + "step": 5846500 + }, + { + "epoch": 39.56664140320485, + "grad_norm": 0.4015141427516937, + "learning_rate": 4.604333585967952e-05, + "loss": 0.3575, + "step": 5847000 + }, + { + "epoch": 39.57002490255522, + "grad_norm": 0.3737492561340332, + "learning_rate": 4.604299750974448e-05, + "loss": 0.3585, + "step": 5847500 + }, + { + "epoch": 39.57340840190559, + "grad_norm": 0.3960467278957367, + "learning_rate": 4.604265915980944e-05, + "loss": 0.3572, + "step": 5848000 + }, + { + "epoch": 39.57679190125595, + "grad_norm": 0.3899024426937103, + "learning_rate": 4.6042320809874404e-05, + "loss": 0.359, + "step": 5848500 + }, + { + "epoch": 39.58017540060632, + "grad_norm": 0.3939670920372009, + "learning_rate": 4.604198245993937e-05, + "loss": 0.3588, + "step": 5849000 + }, + { + "epoch": 39.58355889995669, + "grad_norm": 0.3528555631637573, + "learning_rate": 4.6041644110004335e-05, + "loss": 0.3569, + "step": 5849500 + }, + { + "epoch": 39.58694239930706, + "grad_norm": 0.3714154362678528, + "learning_rate": 4.60413057600693e-05, + "loss": 0.3577, + "step": 5850000 + }, + { + "epoch": 39.590325898657426, + "grad_norm": 0.3541982173919678, + "learning_rate": 4.604096741013426e-05, + "loss": 0.3575, + "step": 5850500 + }, + { + "epoch": 39.593709398007796, + "grad_norm": 0.37014955282211304, + "learning_rate": 4.604062906019923e-05, + "loss": 0.3589, + "step": 5851000 + }, + { + "epoch": 39.597092897358166, + "grad_norm": 0.3928954303264618, + "learning_rate": 4.6040290710264184e-05, + "loss": 0.3574, + "step": 5851500 + }, + { + "epoch": 39.60047639670853, + "grad_norm": 0.42399078607559204, + "learning_rate": 4.6039952360329146e-05, + "loss": 0.3581, + "step": 5852000 + }, + { + "epoch": 39.6038598960589, + "grad_norm": 0.42243319749832153, + "learning_rate": 4.603961401039411e-05, + "loss": 0.3593, + "step": 5852500 + }, + { + "epoch": 39.60724339540927, + "grad_norm": 0.33981195092201233, + "learning_rate": 4.603927566045908e-05, + "loss": 0.3581, + "step": 5853000 + }, + { + "epoch": 39.61062689475963, + "grad_norm": 0.3757496178150177, + "learning_rate": 4.603893731052404e-05, + "loss": 0.359, + "step": 5853500 + }, + { + "epoch": 39.614010394110004, + "grad_norm": 0.3754291534423828, + "learning_rate": 4.6038598960589e-05, + "loss": 0.3585, + "step": 5854000 + }, + { + "epoch": 39.617393893460374, + "grad_norm": 0.3909532427787781, + "learning_rate": 4.603826061065396e-05, + "loss": 0.3572, + "step": 5854500 + }, + { + "epoch": 39.62077739281074, + "grad_norm": 0.39325129985809326, + "learning_rate": 4.603792226071893e-05, + "loss": 0.3589, + "step": 5855000 + }, + { + "epoch": 39.62416089216111, + "grad_norm": 0.36456966400146484, + "learning_rate": 4.6037583910783894e-05, + "loss": 0.3593, + "step": 5855500 + }, + { + "epoch": 39.62754439151148, + "grad_norm": 0.3960898518562317, + "learning_rate": 4.603724556084885e-05, + "loss": 0.3565, + "step": 5856000 + }, + { + "epoch": 39.63092789086185, + "grad_norm": 0.38639023900032043, + "learning_rate": 4.603690721091382e-05, + "loss": 0.3591, + "step": 5856500 + }, + { + "epoch": 39.63431139021221, + "grad_norm": 0.35006192326545715, + "learning_rate": 4.603656886097878e-05, + "loss": 0.3584, + "step": 5857000 + }, + { + "epoch": 39.63769488956258, + "grad_norm": 0.3980305790901184, + "learning_rate": 4.603623051104374e-05, + "loss": 0.3571, + "step": 5857500 + }, + { + "epoch": 39.64107838891295, + "grad_norm": 0.34487035870552063, + "learning_rate": 4.6035892161108705e-05, + "loss": 0.3575, + "step": 5858000 + }, + { + "epoch": 39.644461888263315, + "grad_norm": 0.3834744691848755, + "learning_rate": 4.6035553811173674e-05, + "loss": 0.3568, + "step": 5858500 + }, + { + "epoch": 39.647845387613685, + "grad_norm": 0.38060876727104187, + "learning_rate": 4.6035215461238636e-05, + "loss": 0.3582, + "step": 5859000 + }, + { + "epoch": 39.651228886964056, + "grad_norm": 0.41033247113227844, + "learning_rate": 4.60348771113036e-05, + "loss": 0.3558, + "step": 5859500 + }, + { + "epoch": 39.65461238631442, + "grad_norm": 0.37758803367614746, + "learning_rate": 4.603453876136856e-05, + "loss": 0.3577, + "step": 5860000 + }, + { + "epoch": 39.65799588566479, + "grad_norm": 0.40897002816200256, + "learning_rate": 4.603420041143353e-05, + "loss": 0.3582, + "step": 5860500 + }, + { + "epoch": 39.66137938501516, + "grad_norm": 0.4154495298862457, + "learning_rate": 4.6033862061498484e-05, + "loss": 0.3566, + "step": 5861000 + }, + { + "epoch": 39.66476288436553, + "grad_norm": 0.3547229766845703, + "learning_rate": 4.6033523711563446e-05, + "loss": 0.3573, + "step": 5861500 + }, + { + "epoch": 39.66814638371589, + "grad_norm": 0.3655273914337158, + "learning_rate": 4.603318536162841e-05, + "loss": 0.3564, + "step": 5862000 + }, + { + "epoch": 39.67152988306626, + "grad_norm": 0.36885225772857666, + "learning_rate": 4.603284701169338e-05, + "loss": 0.3578, + "step": 5862500 + }, + { + "epoch": 39.674913382416634, + "grad_norm": 0.40368983149528503, + "learning_rate": 4.603250866175834e-05, + "loss": 0.3582, + "step": 5863000 + }, + { + "epoch": 39.678296881767, + "grad_norm": 0.4202343225479126, + "learning_rate": 4.60321703118233e-05, + "loss": 0.3591, + "step": 5863500 + }, + { + "epoch": 39.68168038111737, + "grad_norm": 0.43563011288642883, + "learning_rate": 4.6031831961888264e-05, + "loss": 0.3589, + "step": 5864000 + }, + { + "epoch": 39.68506388046774, + "grad_norm": 0.3813161551952362, + "learning_rate": 4.603149361195323e-05, + "loss": 0.3573, + "step": 5864500 + }, + { + "epoch": 39.6884473798181, + "grad_norm": 0.38616743683815, + "learning_rate": 4.6031155262018195e-05, + "loss": 0.3568, + "step": 5865000 + }, + { + "epoch": 39.69183087916847, + "grad_norm": 0.4203009605407715, + "learning_rate": 4.603081691208316e-05, + "loss": 0.3577, + "step": 5865500 + }, + { + "epoch": 39.69521437851884, + "grad_norm": 0.3902522027492523, + "learning_rate": 4.603047856214812e-05, + "loss": 0.3584, + "step": 5866000 + }, + { + "epoch": 39.698597877869204, + "grad_norm": 0.3417580723762512, + "learning_rate": 4.603014021221308e-05, + "loss": 0.357, + "step": 5866500 + }, + { + "epoch": 39.701981377219575, + "grad_norm": 0.3956816792488098, + "learning_rate": 4.602980186227804e-05, + "loss": 0.3575, + "step": 5867000 + }, + { + "epoch": 39.705364876569945, + "grad_norm": 0.35489580035209656, + "learning_rate": 4.6029463512343005e-05, + "loss": 0.3579, + "step": 5867500 + }, + { + "epoch": 39.708748375920315, + "grad_norm": 0.4248609244823456, + "learning_rate": 4.6029125162407974e-05, + "loss": 0.3583, + "step": 5868000 + }, + { + "epoch": 39.71213187527068, + "grad_norm": 0.4011983871459961, + "learning_rate": 4.6028786812472936e-05, + "loss": 0.3573, + "step": 5868500 + }, + { + "epoch": 39.71551537462105, + "grad_norm": 0.35149234533309937, + "learning_rate": 4.60284484625379e-05, + "loss": 0.3587, + "step": 5869000 + }, + { + "epoch": 39.71889887397142, + "grad_norm": 0.4021625220775604, + "learning_rate": 4.602811011260286e-05, + "loss": 0.3574, + "step": 5869500 + }, + { + "epoch": 39.72228237332178, + "grad_norm": 0.36163243651390076, + "learning_rate": 4.602777176266783e-05, + "loss": 0.3574, + "step": 5870000 + }, + { + "epoch": 39.72566587267215, + "grad_norm": 0.35306787490844727, + "learning_rate": 4.6027433412732785e-05, + "loss": 0.3554, + "step": 5870500 + }, + { + "epoch": 39.72904937202252, + "grad_norm": 0.39128831028938293, + "learning_rate": 4.602709506279775e-05, + "loss": 0.3591, + "step": 5871000 + }, + { + "epoch": 39.732432871372886, + "grad_norm": 0.4118329882621765, + "learning_rate": 4.602675671286271e-05, + "loss": 0.3592, + "step": 5871500 + }, + { + "epoch": 39.735816370723256, + "grad_norm": 0.3582545220851898, + "learning_rate": 4.602641836292768e-05, + "loss": 0.357, + "step": 5872000 + }, + { + "epoch": 39.73919987007363, + "grad_norm": 0.38006383180618286, + "learning_rate": 4.602608001299264e-05, + "loss": 0.3593, + "step": 5872500 + }, + { + "epoch": 39.74258336942399, + "grad_norm": 0.3745099604129791, + "learning_rate": 4.60257416630576e-05, + "loss": 0.3572, + "step": 5873000 + }, + { + "epoch": 39.74596686877436, + "grad_norm": 0.4437626898288727, + "learning_rate": 4.6025403313122564e-05, + "loss": 0.3579, + "step": 5873500 + }, + { + "epoch": 39.74935036812473, + "grad_norm": 0.4171718657016754, + "learning_rate": 4.602506496318753e-05, + "loss": 0.3578, + "step": 5874000 + }, + { + "epoch": 39.7527338674751, + "grad_norm": 0.3632790446281433, + "learning_rate": 4.6024726613252495e-05, + "loss": 0.359, + "step": 5874500 + }, + { + "epoch": 39.756117366825464, + "grad_norm": 0.3919455409049988, + "learning_rate": 4.602438826331746e-05, + "loss": 0.3597, + "step": 5875000 + }, + { + "epoch": 39.759500866175834, + "grad_norm": 0.3734685778617859, + "learning_rate": 4.602404991338242e-05, + "loss": 0.3582, + "step": 5875500 + }, + { + "epoch": 39.762884365526205, + "grad_norm": 0.39074575901031494, + "learning_rate": 4.602371156344738e-05, + "loss": 0.3575, + "step": 5876000 + }, + { + "epoch": 39.76626786487657, + "grad_norm": 0.40093353390693665, + "learning_rate": 4.6023373213512344e-05, + "loss": 0.3578, + "step": 5876500 + }, + { + "epoch": 39.76965136422694, + "grad_norm": 0.36939117312431335, + "learning_rate": 4.6023034863577306e-05, + "loss": 0.3587, + "step": 5877000 + }, + { + "epoch": 39.77303486357731, + "grad_norm": 0.38103702664375305, + "learning_rate": 4.6022696513642275e-05, + "loss": 0.3577, + "step": 5877500 + }, + { + "epoch": 39.77641836292767, + "grad_norm": 0.41622504591941833, + "learning_rate": 4.602235816370724e-05, + "loss": 0.3577, + "step": 5878000 + }, + { + "epoch": 39.77980186227804, + "grad_norm": 0.3693433403968811, + "learning_rate": 4.60220198137722e-05, + "loss": 0.3567, + "step": 5878500 + }, + { + "epoch": 39.78318536162841, + "grad_norm": 0.38066574931144714, + "learning_rate": 4.602168146383716e-05, + "loss": 0.3599, + "step": 5879000 + }, + { + "epoch": 39.786568860978775, + "grad_norm": 0.3881836533546448, + "learning_rate": 4.602134311390213e-05, + "loss": 0.3581, + "step": 5879500 + }, + { + "epoch": 39.789952360329146, + "grad_norm": 0.4086085855960846, + "learning_rate": 4.6021004763967085e-05, + "loss": 0.3588, + "step": 5880000 + }, + { + "epoch": 39.793335859679516, + "grad_norm": 0.3807792663574219, + "learning_rate": 4.602066641403205e-05, + "loss": 0.357, + "step": 5880500 + }, + { + "epoch": 39.796719359029886, + "grad_norm": 0.4019613265991211, + "learning_rate": 4.602032806409701e-05, + "loss": 0.3576, + "step": 5881000 + }, + { + "epoch": 39.80010285838025, + "grad_norm": 0.3932480812072754, + "learning_rate": 4.601998971416198e-05, + "loss": 0.3586, + "step": 5881500 + }, + { + "epoch": 39.80348635773062, + "grad_norm": 0.37599310278892517, + "learning_rate": 4.601965136422694e-05, + "loss": 0.3572, + "step": 5882000 + }, + { + "epoch": 39.80686985708099, + "grad_norm": 0.39159277081489563, + "learning_rate": 4.60193130142919e-05, + "loss": 0.3583, + "step": 5882500 + }, + { + "epoch": 39.81025335643135, + "grad_norm": 0.4587567448616028, + "learning_rate": 4.6018974664356865e-05, + "loss": 0.3584, + "step": 5883000 + }, + { + "epoch": 39.81363685578172, + "grad_norm": 0.39030855894088745, + "learning_rate": 4.6018636314421834e-05, + "loss": 0.3587, + "step": 5883500 + }, + { + "epoch": 39.817020355132094, + "grad_norm": 0.4061824381351471, + "learning_rate": 4.6018297964486796e-05, + "loss": 0.3577, + "step": 5884000 + }, + { + "epoch": 39.82040385448246, + "grad_norm": 0.42569318413734436, + "learning_rate": 4.601795961455176e-05, + "loss": 0.3584, + "step": 5884500 + }, + { + "epoch": 39.82378735383283, + "grad_norm": 0.3514672517776489, + "learning_rate": 4.601762126461672e-05, + "loss": 0.3574, + "step": 5885000 + }, + { + "epoch": 39.8271708531832, + "grad_norm": 0.3994062542915344, + "learning_rate": 4.601728291468168e-05, + "loss": 0.3577, + "step": 5885500 + }, + { + "epoch": 39.83055435253357, + "grad_norm": 0.4034961760044098, + "learning_rate": 4.6016944564746644e-05, + "loss": 0.3597, + "step": 5886000 + }, + { + "epoch": 39.83393785188393, + "grad_norm": 0.3779377043247223, + "learning_rate": 4.6016606214811607e-05, + "loss": 0.3581, + "step": 5886500 + }, + { + "epoch": 39.8373213512343, + "grad_norm": 0.3766578435897827, + "learning_rate": 4.6016267864876575e-05, + "loss": 0.3566, + "step": 5887000 + }, + { + "epoch": 39.84070485058467, + "grad_norm": 0.3871261775493622, + "learning_rate": 4.601592951494154e-05, + "loss": 0.358, + "step": 5887500 + }, + { + "epoch": 39.844088349935035, + "grad_norm": 0.3668980896472931, + "learning_rate": 4.60155911650065e-05, + "loss": 0.3587, + "step": 5888000 + }, + { + "epoch": 39.847471849285405, + "grad_norm": 0.3791254460811615, + "learning_rate": 4.601525281507146e-05, + "loss": 0.3595, + "step": 5888500 + }, + { + "epoch": 39.850855348635775, + "grad_norm": 0.36987370252609253, + "learning_rate": 4.6014914465136424e-05, + "loss": 0.3596, + "step": 5889000 + }, + { + "epoch": 39.85423884798614, + "grad_norm": 0.40789470076560974, + "learning_rate": 4.6014576115201386e-05, + "loss": 0.359, + "step": 5889500 + }, + { + "epoch": 39.85762234733651, + "grad_norm": 0.37992024421691895, + "learning_rate": 4.601423776526635e-05, + "loss": 0.3591, + "step": 5890000 + }, + { + "epoch": 39.86100584668688, + "grad_norm": 0.4102227985858917, + "learning_rate": 4.601389941533131e-05, + "loss": 0.3583, + "step": 5890500 + }, + { + "epoch": 39.86438934603724, + "grad_norm": 0.3742344379425049, + "learning_rate": 4.601356106539628e-05, + "loss": 0.3578, + "step": 5891000 + }, + { + "epoch": 39.86777284538761, + "grad_norm": 0.3808158040046692, + "learning_rate": 4.601322271546124e-05, + "loss": 0.3584, + "step": 5891500 + }, + { + "epoch": 39.87115634473798, + "grad_norm": 0.4250912368297577, + "learning_rate": 4.6012884365526203e-05, + "loss": 0.3565, + "step": 5892000 + }, + { + "epoch": 39.87453984408835, + "grad_norm": 0.4150397777557373, + "learning_rate": 4.6012546015591166e-05, + "loss": 0.3582, + "step": 5892500 + }, + { + "epoch": 39.87792334343872, + "grad_norm": 0.3577214181423187, + "learning_rate": 4.6012207665656134e-05, + "loss": 0.3573, + "step": 5893000 + }, + { + "epoch": 39.88130684278909, + "grad_norm": 0.34287115931510925, + "learning_rate": 4.6011869315721097e-05, + "loss": 0.3598, + "step": 5893500 + }, + { + "epoch": 39.88469034213946, + "grad_norm": 0.376375287771225, + "learning_rate": 4.601153096578606e-05, + "loss": 0.3564, + "step": 5894000 + }, + { + "epoch": 39.88807384148982, + "grad_norm": 0.41835808753967285, + "learning_rate": 4.601119261585102e-05, + "loss": 0.3583, + "step": 5894500 + }, + { + "epoch": 39.89145734084019, + "grad_norm": 0.41685935854911804, + "learning_rate": 4.601085426591598e-05, + "loss": 0.3583, + "step": 5895000 + }, + { + "epoch": 39.89484084019056, + "grad_norm": 0.40916863083839417, + "learning_rate": 4.6010515915980945e-05, + "loss": 0.3566, + "step": 5895500 + }, + { + "epoch": 39.898224339540924, + "grad_norm": 0.3984031677246094, + "learning_rate": 4.601017756604591e-05, + "loss": 0.3588, + "step": 5896000 + }, + { + "epoch": 39.901607838891294, + "grad_norm": 0.40788960456848145, + "learning_rate": 4.600983921611087e-05, + "loss": 0.3585, + "step": 5896500 + }, + { + "epoch": 39.904991338241665, + "grad_norm": 0.37437763810157776, + "learning_rate": 4.600950086617584e-05, + "loss": 0.3578, + "step": 5897000 + }, + { + "epoch": 39.90837483759203, + "grad_norm": 0.4370589554309845, + "learning_rate": 4.60091625162408e-05, + "loss": 0.3579, + "step": 5897500 + }, + { + "epoch": 39.9117583369424, + "grad_norm": 0.38210275769233704, + "learning_rate": 4.600882416630576e-05, + "loss": 0.3576, + "step": 5898000 + }, + { + "epoch": 39.91514183629277, + "grad_norm": 0.37423208355903625, + "learning_rate": 4.6008485816370725e-05, + "loss": 0.3585, + "step": 5898500 + }, + { + "epoch": 39.91852533564314, + "grad_norm": 0.42241284251213074, + "learning_rate": 4.600814746643569e-05, + "loss": 0.3578, + "step": 5899000 + }, + { + "epoch": 39.9219088349935, + "grad_norm": 0.39159727096557617, + "learning_rate": 4.600780911650065e-05, + "loss": 0.3573, + "step": 5899500 + }, + { + "epoch": 39.92529233434387, + "grad_norm": 0.3790798485279083, + "learning_rate": 4.600747076656561e-05, + "loss": 0.3588, + "step": 5900000 + }, + { + "epoch": 39.92867583369424, + "grad_norm": 0.3681636452674866, + "learning_rate": 4.600713241663058e-05, + "loss": 0.359, + "step": 5900500 + }, + { + "epoch": 39.932059333044606, + "grad_norm": 0.38505762815475464, + "learning_rate": 4.600679406669554e-05, + "loss": 0.3593, + "step": 5901000 + }, + { + "epoch": 39.935442832394976, + "grad_norm": 0.35872504115104675, + "learning_rate": 4.6006455716760504e-05, + "loss": 0.3578, + "step": 5901500 + }, + { + "epoch": 39.93882633174535, + "grad_norm": 0.415998250246048, + "learning_rate": 4.6006117366825466e-05, + "loss": 0.3584, + "step": 5902000 + }, + { + "epoch": 39.94220983109571, + "grad_norm": 0.33244240283966064, + "learning_rate": 4.6005779016890435e-05, + "loss": 0.3567, + "step": 5902500 + }, + { + "epoch": 39.94559333044608, + "grad_norm": 0.36787545680999756, + "learning_rate": 4.60054406669554e-05, + "loss": 0.3571, + "step": 5903000 + }, + { + "epoch": 39.94897682979645, + "grad_norm": 0.4008660614490509, + "learning_rate": 4.600510231702036e-05, + "loss": 0.3595, + "step": 5903500 + }, + { + "epoch": 39.95236032914681, + "grad_norm": 0.36828315258026123, + "learning_rate": 4.600476396708532e-05, + "loss": 0.359, + "step": 5904000 + }, + { + "epoch": 39.955743828497184, + "grad_norm": 0.4211144745349884, + "learning_rate": 4.6004425617150284e-05, + "loss": 0.3586, + "step": 5904500 + }, + { + "epoch": 39.959127327847554, + "grad_norm": 0.39619413018226624, + "learning_rate": 4.6004087267215246e-05, + "loss": 0.3592, + "step": 5905000 + }, + { + "epoch": 39.962510827197924, + "grad_norm": 0.39225244522094727, + "learning_rate": 4.600374891728021e-05, + "loss": 0.3595, + "step": 5905500 + }, + { + "epoch": 39.96589432654829, + "grad_norm": 0.3748459815979004, + "learning_rate": 4.600341056734517e-05, + "loss": 0.3582, + "step": 5906000 + }, + { + "epoch": 39.96927782589866, + "grad_norm": 0.427299439907074, + "learning_rate": 4.600307221741014e-05, + "loss": 0.3594, + "step": 5906500 + }, + { + "epoch": 39.97266132524903, + "grad_norm": 0.3732565641403198, + "learning_rate": 4.60027338674751e-05, + "loss": 0.3567, + "step": 5907000 + }, + { + "epoch": 39.97604482459939, + "grad_norm": 0.4486788809299469, + "learning_rate": 4.600239551754006e-05, + "loss": 0.3587, + "step": 5907500 + }, + { + "epoch": 39.97942832394976, + "grad_norm": 0.40454939007759094, + "learning_rate": 4.6002057167605025e-05, + "loss": 0.3593, + "step": 5908000 + }, + { + "epoch": 39.98281182330013, + "grad_norm": 0.4098987579345703, + "learning_rate": 4.600171881766999e-05, + "loss": 0.3564, + "step": 5908500 + }, + { + "epoch": 39.986195322650495, + "grad_norm": 0.4133622348308563, + "learning_rate": 4.600138046773495e-05, + "loss": 0.3586, + "step": 5909000 + }, + { + "epoch": 39.989578822000865, + "grad_norm": 0.3992837369441986, + "learning_rate": 4.600104211779991e-05, + "loss": 0.3566, + "step": 5909500 + }, + { + "epoch": 39.992962321351236, + "grad_norm": 0.35246866941452026, + "learning_rate": 4.600070376786488e-05, + "loss": 0.3573, + "step": 5910000 + }, + { + "epoch": 39.9963458207016, + "grad_norm": 0.3770292103290558, + "learning_rate": 4.600036541792984e-05, + "loss": 0.359, + "step": 5910500 + }, + { + "epoch": 39.99972932005197, + "grad_norm": 0.39107683300971985, + "learning_rate": 4.6000027067994805e-05, + "loss": 0.3583, + "step": 5911000 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.8631902569762103, + "eval_loss": 0.5551746487617493, + "eval_runtime": 3370.1943, + "eval_samples_per_second": 86.269, + "eval_steps_per_second": 5.392, + "step": 5911040 + }, + { + "epoch": 40.00311281940234, + "grad_norm": 0.3805144429206848, + "learning_rate": 4.599968871805977e-05, + "loss": 0.3567, + "step": 5911500 + }, + { + "epoch": 40.00649631875271, + "grad_norm": 0.3924298882484436, + "learning_rate": 4.5999350368124736e-05, + "loss": 0.3553, + "step": 5912000 + }, + { + "epoch": 40.00987981810307, + "grad_norm": 0.39365607500076294, + "learning_rate": 4.59990120181897e-05, + "loss": 0.3537, + "step": 5912500 + }, + { + "epoch": 40.01326331745344, + "grad_norm": 0.4074643850326538, + "learning_rate": 4.599867366825466e-05, + "loss": 0.3564, + "step": 5913000 + }, + { + "epoch": 40.016646816803814, + "grad_norm": 0.41037800908088684, + "learning_rate": 4.5998335318319615e-05, + "loss": 0.3563, + "step": 5913500 + }, + { + "epoch": 40.02003031615418, + "grad_norm": 0.4109076261520386, + "learning_rate": 4.5997996968384584e-05, + "loss": 0.3548, + "step": 5914000 + }, + { + "epoch": 40.02341381550455, + "grad_norm": 0.40716245770454407, + "learning_rate": 4.5997658618449546e-05, + "loss": 0.3569, + "step": 5914500 + }, + { + "epoch": 40.02679731485492, + "grad_norm": 0.38791415095329285, + "learning_rate": 4.599732026851451e-05, + "loss": 0.3559, + "step": 5915000 + }, + { + "epoch": 40.03018081420528, + "grad_norm": 0.37908267974853516, + "learning_rate": 4.599698191857947e-05, + "loss": 0.3556, + "step": 5915500 + }, + { + "epoch": 40.03356431355565, + "grad_norm": 0.35474398732185364, + "learning_rate": 4.599664356864444e-05, + "loss": 0.357, + "step": 5916000 + }, + { + "epoch": 40.03694781290602, + "grad_norm": 0.3792150616645813, + "learning_rate": 4.59963052187094e-05, + "loss": 0.3563, + "step": 5916500 + }, + { + "epoch": 40.04033131225639, + "grad_norm": 0.41410353779792786, + "learning_rate": 4.5995966868774364e-05, + "loss": 0.3554, + "step": 5917000 + }, + { + "epoch": 40.043714811606755, + "grad_norm": 0.3760049641132355, + "learning_rate": 4.5995628518839326e-05, + "loss": 0.3557, + "step": 5917500 + }, + { + "epoch": 40.047098310957125, + "grad_norm": 0.3935118317604065, + "learning_rate": 4.5995290168904295e-05, + "loss": 0.3557, + "step": 5918000 + }, + { + "epoch": 40.050481810307495, + "grad_norm": 0.40363773703575134, + "learning_rate": 4.599495181896925e-05, + "loss": 0.3558, + "step": 5918500 + }, + { + "epoch": 40.05386530965786, + "grad_norm": 0.39092475175857544, + "learning_rate": 4.599461346903421e-05, + "loss": 0.3575, + "step": 5919000 + }, + { + "epoch": 40.05724880900823, + "grad_norm": 0.41542938351631165, + "learning_rate": 4.599427511909918e-05, + "loss": 0.3577, + "step": 5919500 + }, + { + "epoch": 40.0606323083586, + "grad_norm": 0.40621158480644226, + "learning_rate": 4.599393676916414e-05, + "loss": 0.3566, + "step": 5920000 + }, + { + "epoch": 40.06401580770896, + "grad_norm": 0.3823866844177246, + "learning_rate": 4.5993598419229105e-05, + "loss": 0.3558, + "step": 5920500 + }, + { + "epoch": 40.06739930705933, + "grad_norm": 0.38658443093299866, + "learning_rate": 4.599326006929407e-05, + "loss": 0.3573, + "step": 5921000 + }, + { + "epoch": 40.0707828064097, + "grad_norm": 0.45442578196525574, + "learning_rate": 4.5992921719359036e-05, + "loss": 0.3562, + "step": 5921500 + }, + { + "epoch": 40.074166305760066, + "grad_norm": 0.3641699552536011, + "learning_rate": 4.5992583369424e-05, + "loss": 0.3565, + "step": 5922000 + }, + { + "epoch": 40.077549805110436, + "grad_norm": 0.41012468934059143, + "learning_rate": 4.599224501948896e-05, + "loss": 0.3564, + "step": 5922500 + }, + { + "epoch": 40.08093330446081, + "grad_norm": 0.37911295890808105, + "learning_rate": 4.5991906669553916e-05, + "loss": 0.3568, + "step": 5923000 + }, + { + "epoch": 40.08431680381118, + "grad_norm": 0.36601924896240234, + "learning_rate": 4.5991568319618885e-05, + "loss": 0.356, + "step": 5923500 + }, + { + "epoch": 40.08770030316154, + "grad_norm": 0.3938429057598114, + "learning_rate": 4.599122996968385e-05, + "loss": 0.3571, + "step": 5924000 + }, + { + "epoch": 40.09108380251191, + "grad_norm": 0.3910832703113556, + "learning_rate": 4.599089161974881e-05, + "loss": 0.3561, + "step": 5924500 + }, + { + "epoch": 40.09446730186228, + "grad_norm": 0.41138482093811035, + "learning_rate": 4.599055326981377e-05, + "loss": 0.3568, + "step": 5925000 + }, + { + "epoch": 40.097850801212644, + "grad_norm": 0.3618259131908417, + "learning_rate": 4.599021491987874e-05, + "loss": 0.3556, + "step": 5925500 + }, + { + "epoch": 40.101234300563014, + "grad_norm": 0.39352527260780334, + "learning_rate": 4.59898765699437e-05, + "loss": 0.3564, + "step": 5926000 + }, + { + "epoch": 40.104617799913385, + "grad_norm": 0.3567906320095062, + "learning_rate": 4.5989538220008664e-05, + "loss": 0.3577, + "step": 5926500 + }, + { + "epoch": 40.10800129926375, + "grad_norm": 0.3813663423061371, + "learning_rate": 4.5989199870073626e-05, + "loss": 0.3568, + "step": 5927000 + }, + { + "epoch": 40.11138479861412, + "grad_norm": 0.3681528866291046, + "learning_rate": 4.5988861520138595e-05, + "loss": 0.3551, + "step": 5927500 + }, + { + "epoch": 40.11476829796449, + "grad_norm": 0.3969293534755707, + "learning_rate": 4.598852317020355e-05, + "loss": 0.3569, + "step": 5928000 + }, + { + "epoch": 40.11815179731485, + "grad_norm": 0.38515737652778625, + "learning_rate": 4.598818482026851e-05, + "loss": 0.3562, + "step": 5928500 + }, + { + "epoch": 40.12153529666522, + "grad_norm": 0.4067007601261139, + "learning_rate": 4.598784647033348e-05, + "loss": 0.3563, + "step": 5929000 + }, + { + "epoch": 40.12491879601559, + "grad_norm": 0.36965522170066833, + "learning_rate": 4.5987508120398444e-05, + "loss": 0.3554, + "step": 5929500 + }, + { + "epoch": 40.12830229536596, + "grad_norm": 0.40654975175857544, + "learning_rate": 4.5987169770463406e-05, + "loss": 0.3548, + "step": 5930000 + }, + { + "epoch": 40.131685794716326, + "grad_norm": 0.4154300391674042, + "learning_rate": 4.598683142052837e-05, + "loss": 0.3564, + "step": 5930500 + }, + { + "epoch": 40.135069294066696, + "grad_norm": 0.37950339913368225, + "learning_rate": 4.598649307059334e-05, + "loss": 0.357, + "step": 5931000 + }, + { + "epoch": 40.138452793417066, + "grad_norm": 0.36879271268844604, + "learning_rate": 4.59861547206583e-05, + "loss": 0.3584, + "step": 5931500 + }, + { + "epoch": 40.14183629276743, + "grad_norm": 0.4255102574825287, + "learning_rate": 4.598581637072326e-05, + "loss": 0.3574, + "step": 5932000 + }, + { + "epoch": 40.1452197921178, + "grad_norm": 0.362420916557312, + "learning_rate": 4.5985478020788217e-05, + "loss": 0.3572, + "step": 5932500 + }, + { + "epoch": 40.14860329146817, + "grad_norm": 0.4359753131866455, + "learning_rate": 4.5985139670853185e-05, + "loss": 0.3557, + "step": 5933000 + }, + { + "epoch": 40.15198679081853, + "grad_norm": 0.41759398579597473, + "learning_rate": 4.598480132091815e-05, + "loss": 0.3578, + "step": 5933500 + }, + { + "epoch": 40.155370290168904, + "grad_norm": 0.3773077726364136, + "learning_rate": 4.598446297098311e-05, + "loss": 0.3557, + "step": 5934000 + }, + { + "epoch": 40.158753789519274, + "grad_norm": 0.3422764539718628, + "learning_rate": 4.598412462104807e-05, + "loss": 0.3578, + "step": 5934500 + }, + { + "epoch": 40.16213728886964, + "grad_norm": 0.3767971992492676, + "learning_rate": 4.598378627111304e-05, + "loss": 0.3569, + "step": 5935000 + }, + { + "epoch": 40.16552078822001, + "grad_norm": 0.3736751675605774, + "learning_rate": 4.5983447921178e-05, + "loss": 0.3577, + "step": 5935500 + }, + { + "epoch": 40.16890428757038, + "grad_norm": 0.4079616367816925, + "learning_rate": 4.5983109571242965e-05, + "loss": 0.3569, + "step": 5936000 + }, + { + "epoch": 40.17228778692075, + "grad_norm": 0.4015907645225525, + "learning_rate": 4.598277122130793e-05, + "loss": 0.3581, + "step": 5936500 + }, + { + "epoch": 40.17567128627111, + "grad_norm": 0.3792710602283478, + "learning_rate": 4.5982432871372896e-05, + "loss": 0.3583, + "step": 5937000 + }, + { + "epoch": 40.17905478562148, + "grad_norm": 0.44401755928993225, + "learning_rate": 4.598209452143785e-05, + "loss": 0.3567, + "step": 5937500 + }, + { + "epoch": 40.18243828497185, + "grad_norm": 0.404691219329834, + "learning_rate": 4.5981756171502813e-05, + "loss": 0.3573, + "step": 5938000 + }, + { + "epoch": 40.185821784322215, + "grad_norm": 0.3993310034275055, + "learning_rate": 4.598141782156778e-05, + "loss": 0.3577, + "step": 5938500 + }, + { + "epoch": 40.189205283672585, + "grad_norm": 0.3728111684322357, + "learning_rate": 4.5981079471632744e-05, + "loss": 0.356, + "step": 5939000 + }, + { + "epoch": 40.192588783022956, + "grad_norm": 0.37582162022590637, + "learning_rate": 4.5980741121697707e-05, + "loss": 0.3568, + "step": 5939500 + }, + { + "epoch": 40.19597228237332, + "grad_norm": 0.3851401209831238, + "learning_rate": 4.598040277176267e-05, + "loss": 0.3568, + "step": 5940000 + }, + { + "epoch": 40.19935578172369, + "grad_norm": 0.3895336985588074, + "learning_rate": 4.598006442182764e-05, + "loss": 0.3581, + "step": 5940500 + }, + { + "epoch": 40.20273928107406, + "grad_norm": 0.39405518770217896, + "learning_rate": 4.59797260718926e-05, + "loss": 0.3572, + "step": 5941000 + }, + { + "epoch": 40.20612278042443, + "grad_norm": 0.4513786733150482, + "learning_rate": 4.597938772195756e-05, + "loss": 0.3574, + "step": 5941500 + }, + { + "epoch": 40.20950627977479, + "grad_norm": 0.36259347200393677, + "learning_rate": 4.597904937202252e-05, + "loss": 0.3565, + "step": 5942000 + }, + { + "epoch": 40.21288977912516, + "grad_norm": 0.4183095693588257, + "learning_rate": 4.5978711022087486e-05, + "loss": 0.3562, + "step": 5942500 + }, + { + "epoch": 40.21627327847553, + "grad_norm": 0.40985429286956787, + "learning_rate": 4.597837267215245e-05, + "loss": 0.3575, + "step": 5943000 + }, + { + "epoch": 40.2196567778259, + "grad_norm": 0.3730085790157318, + "learning_rate": 4.597803432221741e-05, + "loss": 0.3571, + "step": 5943500 + }, + { + "epoch": 40.22304027717627, + "grad_norm": 0.3655538260936737, + "learning_rate": 4.597769597228237e-05, + "loss": 0.3563, + "step": 5944000 + }, + { + "epoch": 40.22642377652664, + "grad_norm": 0.3684774935245514, + "learning_rate": 4.597735762234734e-05, + "loss": 0.3582, + "step": 5944500 + }, + { + "epoch": 40.229807275877, + "grad_norm": 0.3919091522693634, + "learning_rate": 4.5977019272412303e-05, + "loss": 0.3575, + "step": 5945000 + }, + { + "epoch": 40.23319077522737, + "grad_norm": 0.37911686301231384, + "learning_rate": 4.5976680922477266e-05, + "loss": 0.3576, + "step": 5945500 + }, + { + "epoch": 40.23657427457774, + "grad_norm": 0.36616331338882446, + "learning_rate": 4.597634257254223e-05, + "loss": 0.3588, + "step": 5946000 + }, + { + "epoch": 40.239957773928104, + "grad_norm": 0.38871854543685913, + "learning_rate": 4.59760042226072e-05, + "loss": 0.3574, + "step": 5946500 + }, + { + "epoch": 40.243341273278475, + "grad_norm": 0.40605083107948303, + "learning_rate": 4.597566587267215e-05, + "loss": 0.3576, + "step": 5947000 + }, + { + "epoch": 40.246724772628845, + "grad_norm": 0.3829292058944702, + "learning_rate": 4.5975327522737114e-05, + "loss": 0.3564, + "step": 5947500 + }, + { + "epoch": 40.250108271979215, + "grad_norm": 0.3808296322822571, + "learning_rate": 4.597498917280208e-05, + "loss": 0.3577, + "step": 5948000 + }, + { + "epoch": 40.25349177132958, + "grad_norm": 0.3695087134838104, + "learning_rate": 4.5974650822867045e-05, + "loss": 0.3587, + "step": 5948500 + }, + { + "epoch": 40.25687527067995, + "grad_norm": 0.3775908946990967, + "learning_rate": 4.597431247293201e-05, + "loss": 0.3573, + "step": 5949000 + }, + { + "epoch": 40.26025877003032, + "grad_norm": 0.3715585172176361, + "learning_rate": 4.597397412299697e-05, + "loss": 0.3583, + "step": 5949500 + }, + { + "epoch": 40.26364226938068, + "grad_norm": 0.3579839766025543, + "learning_rate": 4.597363577306194e-05, + "loss": 0.3568, + "step": 5950000 + }, + { + "epoch": 40.26702576873105, + "grad_norm": 0.39917808771133423, + "learning_rate": 4.59732974231269e-05, + "loss": 0.358, + "step": 5950500 + }, + { + "epoch": 40.27040926808142, + "grad_norm": 0.37289804220199585, + "learning_rate": 4.597295907319186e-05, + "loss": 0.3562, + "step": 5951000 + }, + { + "epoch": 40.273792767431786, + "grad_norm": 0.39005935192108154, + "learning_rate": 4.597262072325682e-05, + "loss": 0.3594, + "step": 5951500 + }, + { + "epoch": 40.277176266782156, + "grad_norm": 0.41513052582740784, + "learning_rate": 4.597228237332179e-05, + "loss": 0.3579, + "step": 5952000 + }, + { + "epoch": 40.28055976613253, + "grad_norm": 0.35334670543670654, + "learning_rate": 4.597194402338675e-05, + "loss": 0.3571, + "step": 5952500 + }, + { + "epoch": 40.28394326548289, + "grad_norm": 0.4273497760295868, + "learning_rate": 4.597160567345171e-05, + "loss": 0.3568, + "step": 5953000 + }, + { + "epoch": 40.28732676483326, + "grad_norm": 0.35260331630706787, + "learning_rate": 4.597126732351667e-05, + "loss": 0.3584, + "step": 5953500 + }, + { + "epoch": 40.29071026418363, + "grad_norm": 0.4094509482383728, + "learning_rate": 4.597092897358164e-05, + "loss": 0.357, + "step": 5954000 + }, + { + "epoch": 40.294093763534, + "grad_norm": 0.3877350389957428, + "learning_rate": 4.5970590623646604e-05, + "loss": 0.3589, + "step": 5954500 + }, + { + "epoch": 40.297477262884364, + "grad_norm": 0.35621377825737, + "learning_rate": 4.5970252273711566e-05, + "loss": 0.3567, + "step": 5955000 + }, + { + "epoch": 40.300860762234734, + "grad_norm": 0.4190492331981659, + "learning_rate": 4.596991392377653e-05, + "loss": 0.358, + "step": 5955500 + }, + { + "epoch": 40.304244261585104, + "grad_norm": 0.3819047212600708, + "learning_rate": 4.59695755738415e-05, + "loss": 0.3559, + "step": 5956000 + }, + { + "epoch": 40.30762776093547, + "grad_norm": 0.3595086634159088, + "learning_rate": 4.596923722390645e-05, + "loss": 0.3569, + "step": 5956500 + }, + { + "epoch": 40.31101126028584, + "grad_norm": 0.41916951537132263, + "learning_rate": 4.5968898873971415e-05, + "loss": 0.3573, + "step": 5957000 + }, + { + "epoch": 40.31439475963621, + "grad_norm": 0.41138339042663574, + "learning_rate": 4.5968560524036384e-05, + "loss": 0.3587, + "step": 5957500 + }, + { + "epoch": 40.31777825898657, + "grad_norm": 0.4318738579750061, + "learning_rate": 4.5968222174101346e-05, + "loss": 0.3554, + "step": 5958000 + }, + { + "epoch": 40.32116175833694, + "grad_norm": 0.36461934447288513, + "learning_rate": 4.596788382416631e-05, + "loss": 0.3569, + "step": 5958500 + }, + { + "epoch": 40.32454525768731, + "grad_norm": 0.3716467022895813, + "learning_rate": 4.596754547423127e-05, + "loss": 0.3572, + "step": 5959000 + }, + { + "epoch": 40.327928757037675, + "grad_norm": 0.3580436408519745, + "learning_rate": 4.596720712429623e-05, + "loss": 0.3573, + "step": 5959500 + }, + { + "epoch": 40.331312256388046, + "grad_norm": 0.414146363735199, + "learning_rate": 4.59668687743612e-05, + "loss": 0.3581, + "step": 5960000 + }, + { + "epoch": 40.334695755738416, + "grad_norm": 0.42792972922325134, + "learning_rate": 4.596653042442616e-05, + "loss": 0.3579, + "step": 5960500 + }, + { + "epoch": 40.338079255088786, + "grad_norm": 0.36569127440452576, + "learning_rate": 4.596619207449112e-05, + "loss": 0.3567, + "step": 5961000 + }, + { + "epoch": 40.34146275443915, + "grad_norm": 0.3546809256076813, + "learning_rate": 4.596585372455609e-05, + "loss": 0.3582, + "step": 5961500 + }, + { + "epoch": 40.34484625378952, + "grad_norm": 0.35243067145347595, + "learning_rate": 4.596551537462105e-05, + "loss": 0.3569, + "step": 5962000 + }, + { + "epoch": 40.34822975313989, + "grad_norm": 0.39889785647392273, + "learning_rate": 4.596517702468601e-05, + "loss": 0.3569, + "step": 5962500 + }, + { + "epoch": 40.35161325249025, + "grad_norm": 0.3862617611885071, + "learning_rate": 4.5964838674750974e-05, + "loss": 0.3576, + "step": 5963000 + }, + { + "epoch": 40.35499675184062, + "grad_norm": 0.4139243960380554, + "learning_rate": 4.596450032481594e-05, + "loss": 0.3552, + "step": 5963500 + }, + { + "epoch": 40.358380251190994, + "grad_norm": 0.414829283952713, + "learning_rate": 4.5964161974880905e-05, + "loss": 0.3565, + "step": 5964000 + }, + { + "epoch": 40.36176375054136, + "grad_norm": 0.3791615068912506, + "learning_rate": 4.596382362494587e-05, + "loss": 0.356, + "step": 5964500 + }, + { + "epoch": 40.36514724989173, + "grad_norm": 0.36732369661331177, + "learning_rate": 4.596348527501083e-05, + "loss": 0.3587, + "step": 5965000 + }, + { + "epoch": 40.3685307492421, + "grad_norm": 0.4129006862640381, + "learning_rate": 4.59631469250758e-05, + "loss": 0.3588, + "step": 5965500 + }, + { + "epoch": 40.37191424859246, + "grad_norm": 0.3940274715423584, + "learning_rate": 4.596280857514075e-05, + "loss": 0.3575, + "step": 5966000 + }, + { + "epoch": 40.37529774794283, + "grad_norm": 0.3739898204803467, + "learning_rate": 4.5962470225205715e-05, + "loss": 0.3587, + "step": 5966500 + }, + { + "epoch": 40.3786812472932, + "grad_norm": 0.39497530460357666, + "learning_rate": 4.5962131875270684e-05, + "loss": 0.3559, + "step": 5967000 + }, + { + "epoch": 40.38206474664357, + "grad_norm": 0.40286335349082947, + "learning_rate": 4.5961793525335646e-05, + "loss": 0.3584, + "step": 5967500 + }, + { + "epoch": 40.385448245993935, + "grad_norm": 0.36392825841903687, + "learning_rate": 4.596145517540061e-05, + "loss": 0.3581, + "step": 5968000 + }, + { + "epoch": 40.388831745344305, + "grad_norm": 0.37698015570640564, + "learning_rate": 4.596111682546557e-05, + "loss": 0.3574, + "step": 5968500 + }, + { + "epoch": 40.392215244694675, + "grad_norm": 0.4125801622867584, + "learning_rate": 4.596077847553053e-05, + "loss": 0.3584, + "step": 5969000 + }, + { + "epoch": 40.39559874404504, + "grad_norm": 0.39844027161598206, + "learning_rate": 4.59604401255955e-05, + "loss": 0.3559, + "step": 5969500 + }, + { + "epoch": 40.39898224339541, + "grad_norm": 0.42070603370666504, + "learning_rate": 4.5960101775660464e-05, + "loss": 0.3569, + "step": 5970000 + }, + { + "epoch": 40.40236574274578, + "grad_norm": 0.38819003105163574, + "learning_rate": 4.595976342572542e-05, + "loss": 0.3571, + "step": 5970500 + }, + { + "epoch": 40.40574924209614, + "grad_norm": 0.370553582906723, + "learning_rate": 4.595942507579039e-05, + "loss": 0.358, + "step": 5971000 + }, + { + "epoch": 40.40913274144651, + "grad_norm": 0.4015754461288452, + "learning_rate": 4.595908672585535e-05, + "loss": 0.3579, + "step": 5971500 + }, + { + "epoch": 40.41251624079688, + "grad_norm": 0.399566650390625, + "learning_rate": 4.595874837592031e-05, + "loss": 0.3571, + "step": 5972000 + }, + { + "epoch": 40.41589974014725, + "grad_norm": 0.3622363805770874, + "learning_rate": 4.5958410025985274e-05, + "loss": 0.3583, + "step": 5972500 + }, + { + "epoch": 40.41928323949762, + "grad_norm": 0.39378848671913147, + "learning_rate": 4.595807167605024e-05, + "loss": 0.3576, + "step": 5973000 + }, + { + "epoch": 40.42266673884799, + "grad_norm": 0.39654675126075745, + "learning_rate": 4.5957733326115205e-05, + "loss": 0.3556, + "step": 5973500 + }, + { + "epoch": 40.42605023819836, + "grad_norm": 0.404291570186615, + "learning_rate": 4.595739497618017e-05, + "loss": 0.3596, + "step": 5974000 + }, + { + "epoch": 40.42943373754872, + "grad_norm": 0.39705920219421387, + "learning_rate": 4.595705662624513e-05, + "loss": 0.3579, + "step": 5974500 + }, + { + "epoch": 40.43281723689909, + "grad_norm": 0.3815435767173767, + "learning_rate": 4.59567182763101e-05, + "loss": 0.3597, + "step": 5975000 + }, + { + "epoch": 40.43620073624946, + "grad_norm": 0.3721045255661011, + "learning_rate": 4.5956379926375054e-05, + "loss": 0.358, + "step": 5975500 + }, + { + "epoch": 40.439584235599824, + "grad_norm": 0.4162366986274719, + "learning_rate": 4.5956041576440016e-05, + "loss": 0.359, + "step": 5976000 + }, + { + "epoch": 40.442967734950194, + "grad_norm": 0.4168601930141449, + "learning_rate": 4.595570322650498e-05, + "loss": 0.3573, + "step": 5976500 + }, + { + "epoch": 40.446351234300565, + "grad_norm": 0.4040071666240692, + "learning_rate": 4.595536487656995e-05, + "loss": 0.3572, + "step": 5977000 + }, + { + "epoch": 40.44973473365093, + "grad_norm": 0.3881406784057617, + "learning_rate": 4.595502652663491e-05, + "loss": 0.3575, + "step": 5977500 + }, + { + "epoch": 40.4531182330013, + "grad_norm": 0.3883194029331207, + "learning_rate": 4.595468817669987e-05, + "loss": 0.3575, + "step": 5978000 + }, + { + "epoch": 40.45650173235167, + "grad_norm": 0.3823226988315582, + "learning_rate": 4.595434982676483e-05, + "loss": 0.3573, + "step": 5978500 + }, + { + "epoch": 40.45988523170204, + "grad_norm": 0.42660027742385864, + "learning_rate": 4.59540114768298e-05, + "loss": 0.3583, + "step": 5979000 + }, + { + "epoch": 40.4632687310524, + "grad_norm": 0.3672007620334625, + "learning_rate": 4.5953673126894764e-05, + "loss": 0.3589, + "step": 5979500 + }, + { + "epoch": 40.46665223040277, + "grad_norm": 0.4127280116081238, + "learning_rate": 4.5953334776959727e-05, + "loss": 0.3568, + "step": 5980000 + }, + { + "epoch": 40.47003572975314, + "grad_norm": 0.39377361536026, + "learning_rate": 4.595299642702469e-05, + "loss": 0.3573, + "step": 5980500 + }, + { + "epoch": 40.473419229103506, + "grad_norm": 0.3723832964897156, + "learning_rate": 4.595265807708965e-05, + "loss": 0.3572, + "step": 5981000 + }, + { + "epoch": 40.476802728453876, + "grad_norm": 0.3649671673774719, + "learning_rate": 4.595231972715461e-05, + "loss": 0.3591, + "step": 5981500 + }, + { + "epoch": 40.480186227804246, + "grad_norm": 0.38867637515068054, + "learning_rate": 4.5951981377219575e-05, + "loss": 0.3578, + "step": 5982000 + }, + { + "epoch": 40.48356972715461, + "grad_norm": 0.4012649953365326, + "learning_rate": 4.5951643027284544e-05, + "loss": 0.3571, + "step": 5982500 + }, + { + "epoch": 40.48695322650498, + "grad_norm": 0.38399559259414673, + "learning_rate": 4.5951304677349506e-05, + "loss": 0.3583, + "step": 5983000 + }, + { + "epoch": 40.49033672585535, + "grad_norm": 0.4250212013721466, + "learning_rate": 4.595096632741447e-05, + "loss": 0.3578, + "step": 5983500 + }, + { + "epoch": 40.49372022520571, + "grad_norm": 0.3713824152946472, + "learning_rate": 4.595062797747943e-05, + "loss": 0.3585, + "step": 5984000 + }, + { + "epoch": 40.497103724556084, + "grad_norm": 0.4231489300727844, + "learning_rate": 4.59502896275444e-05, + "loss": 0.3586, + "step": 5984500 + }, + { + "epoch": 40.500487223906454, + "grad_norm": 0.3801126778125763, + "learning_rate": 4.5949951277609355e-05, + "loss": 0.3569, + "step": 5985000 + }, + { + "epoch": 40.503870723256824, + "grad_norm": 0.42874547839164734, + "learning_rate": 4.594961292767432e-05, + "loss": 0.3587, + "step": 5985500 + }, + { + "epoch": 40.50725422260719, + "grad_norm": 0.3551200032234192, + "learning_rate": 4.594927457773928e-05, + "loss": 0.3578, + "step": 5986000 + }, + { + "epoch": 40.51063772195756, + "grad_norm": 0.3460412323474884, + "learning_rate": 4.594893622780425e-05, + "loss": 0.3579, + "step": 5986500 + }, + { + "epoch": 40.51402122130793, + "grad_norm": 0.3623094856739044, + "learning_rate": 4.594859787786921e-05, + "loss": 0.3586, + "step": 5987000 + }, + { + "epoch": 40.51740472065829, + "grad_norm": 0.38421979546546936, + "learning_rate": 4.594825952793417e-05, + "loss": 0.3576, + "step": 5987500 + }, + { + "epoch": 40.52078822000866, + "grad_norm": 0.36018890142440796, + "learning_rate": 4.5947921177999134e-05, + "loss": 0.3579, + "step": 5988000 + }, + { + "epoch": 40.52417171935903, + "grad_norm": 0.4077075719833374, + "learning_rate": 4.59475828280641e-05, + "loss": 0.3588, + "step": 5988500 + }, + { + "epoch": 40.527555218709395, + "grad_norm": 0.3461393415927887, + "learning_rate": 4.5947244478129065e-05, + "loss": 0.3592, + "step": 5989000 + }, + { + "epoch": 40.530938718059765, + "grad_norm": 0.3685201108455658, + "learning_rate": 4.594690612819403e-05, + "loss": 0.3584, + "step": 5989500 + }, + { + "epoch": 40.534322217410136, + "grad_norm": 0.37963569164276123, + "learning_rate": 4.594656777825899e-05, + "loss": 0.3567, + "step": 5990000 + }, + { + "epoch": 40.5377057167605, + "grad_norm": 0.39100098609924316, + "learning_rate": 4.594622942832395e-05, + "loss": 0.3591, + "step": 5990500 + }, + { + "epoch": 40.54108921611087, + "grad_norm": 0.40807783603668213, + "learning_rate": 4.5945891078388914e-05, + "loss": 0.3596, + "step": 5991000 + }, + { + "epoch": 40.54447271546124, + "grad_norm": 0.3754607141017914, + "learning_rate": 4.5945552728453876e-05, + "loss": 0.3566, + "step": 5991500 + }, + { + "epoch": 40.54785621481161, + "grad_norm": 0.38255998492240906, + "learning_rate": 4.5945214378518845e-05, + "loss": 0.3584, + "step": 5992000 + }, + { + "epoch": 40.55123971416197, + "grad_norm": 0.3817797899246216, + "learning_rate": 4.594487602858381e-05, + "loss": 0.3567, + "step": 5992500 + }, + { + "epoch": 40.55462321351234, + "grad_norm": 0.3684269189834595, + "learning_rate": 4.594453767864877e-05, + "loss": 0.3593, + "step": 5993000 + }, + { + "epoch": 40.55800671286271, + "grad_norm": 0.3826868236064911, + "learning_rate": 4.594419932871373e-05, + "loss": 0.3575, + "step": 5993500 + }, + { + "epoch": 40.56139021221308, + "grad_norm": 0.4091789126396179, + "learning_rate": 4.59438609787787e-05, + "loss": 0.3576, + "step": 5994000 + }, + { + "epoch": 40.56477371156345, + "grad_norm": 0.41384512186050415, + "learning_rate": 4.5943522628843655e-05, + "loss": 0.3585, + "step": 5994500 + }, + { + "epoch": 40.56815721091382, + "grad_norm": 0.38715091347694397, + "learning_rate": 4.594318427890862e-05, + "loss": 0.3583, + "step": 5995000 + }, + { + "epoch": 40.57154071026418, + "grad_norm": 0.3721652925014496, + "learning_rate": 4.594284592897358e-05, + "loss": 0.3581, + "step": 5995500 + }, + { + "epoch": 40.57492420961455, + "grad_norm": 0.4144269824028015, + "learning_rate": 4.594250757903855e-05, + "loss": 0.3577, + "step": 5996000 + }, + { + "epoch": 40.57830770896492, + "grad_norm": 0.36702612042427063, + "learning_rate": 4.594216922910351e-05, + "loss": 0.3584, + "step": 5996500 + }, + { + "epoch": 40.58169120831529, + "grad_norm": 0.41641584038734436, + "learning_rate": 4.594183087916847e-05, + "loss": 0.3571, + "step": 5997000 + }, + { + "epoch": 40.585074707665655, + "grad_norm": 0.3495043218135834, + "learning_rate": 4.5941492529233435e-05, + "loss": 0.3571, + "step": 5997500 + }, + { + "epoch": 40.588458207016025, + "grad_norm": 0.40029117465019226, + "learning_rate": 4.5941154179298404e-05, + "loss": 0.3572, + "step": 5998000 + }, + { + "epoch": 40.591841706366395, + "grad_norm": 0.3930448293685913, + "learning_rate": 4.5940815829363366e-05, + "loss": 0.3572, + "step": 5998500 + }, + { + "epoch": 40.59522520571676, + "grad_norm": 0.3824836015701294, + "learning_rate": 4.594047747942833e-05, + "loss": 0.3583, + "step": 5999000 + }, + { + "epoch": 40.59860870506713, + "grad_norm": 0.36394089460372925, + "learning_rate": 4.594013912949329e-05, + "loss": 0.3584, + "step": 5999500 + }, + { + "epoch": 40.6019922044175, + "grad_norm": 0.3454444408416748, + "learning_rate": 4.593980077955825e-05, + "loss": 0.3585, + "step": 6000000 + }, + { + "epoch": 40.60537570376786, + "grad_norm": 0.3821978271007538, + "learning_rate": 4.5939462429623214e-05, + "loss": 0.3583, + "step": 6000500 + }, + { + "epoch": 40.60875920311823, + "grad_norm": 0.38868358731269836, + "learning_rate": 4.5939124079688176e-05, + "loss": 0.358, + "step": 6001000 + }, + { + "epoch": 40.6121427024686, + "grad_norm": 0.4017002582550049, + "learning_rate": 4.5938785729753145e-05, + "loss": 0.3577, + "step": 6001500 + }, + { + "epoch": 40.615526201818966, + "grad_norm": 0.3536180853843689, + "learning_rate": 4.593844737981811e-05, + "loss": 0.358, + "step": 6002000 + }, + { + "epoch": 40.618909701169336, + "grad_norm": 0.4217749238014221, + "learning_rate": 4.593810902988307e-05, + "loss": 0.3569, + "step": 6002500 + }, + { + "epoch": 40.62229320051971, + "grad_norm": 0.3951268792152405, + "learning_rate": 4.593777067994803e-05, + "loss": 0.3565, + "step": 6003000 + }, + { + "epoch": 40.62567669987008, + "grad_norm": 0.4119824469089508, + "learning_rate": 4.5937432330013e-05, + "loss": 0.3592, + "step": 6003500 + }, + { + "epoch": 40.62906019922044, + "grad_norm": 0.3959159255027771, + "learning_rate": 4.5937093980077956e-05, + "loss": 0.3564, + "step": 6004000 + }, + { + "epoch": 40.63244369857081, + "grad_norm": 0.41242820024490356, + "learning_rate": 4.593675563014292e-05, + "loss": 0.3597, + "step": 6004500 + }, + { + "epoch": 40.63582719792118, + "grad_norm": 0.3618526756763458, + "learning_rate": 4.593641728020788e-05, + "loss": 0.3582, + "step": 6005000 + }, + { + "epoch": 40.639210697271544, + "grad_norm": 0.3952378034591675, + "learning_rate": 4.593607893027285e-05, + "loss": 0.3558, + "step": 6005500 + }, + { + "epoch": 40.642594196621914, + "grad_norm": 0.3555503785610199, + "learning_rate": 4.593574058033781e-05, + "loss": 0.3584, + "step": 6006000 + }, + { + "epoch": 40.645977695972284, + "grad_norm": 0.41898027062416077, + "learning_rate": 4.593540223040277e-05, + "loss": 0.358, + "step": 6006500 + }, + { + "epoch": 40.64936119532265, + "grad_norm": 0.3822486400604248, + "learning_rate": 4.5935063880467735e-05, + "loss": 0.3584, + "step": 6007000 + }, + { + "epoch": 40.65274469467302, + "grad_norm": 0.42330479621887207, + "learning_rate": 4.5934725530532704e-05, + "loss": 0.3572, + "step": 6007500 + }, + { + "epoch": 40.65612819402339, + "grad_norm": 0.42337852716445923, + "learning_rate": 4.5934387180597666e-05, + "loss": 0.358, + "step": 6008000 + }, + { + "epoch": 40.65951169337375, + "grad_norm": 0.3900694251060486, + "learning_rate": 4.593404883066263e-05, + "loss": 0.3602, + "step": 6008500 + }, + { + "epoch": 40.66289519272412, + "grad_norm": 0.38055336475372314, + "learning_rate": 4.593371048072759e-05, + "loss": 0.3565, + "step": 6009000 + }, + { + "epoch": 40.66627869207449, + "grad_norm": 0.406038761138916, + "learning_rate": 4.593337213079255e-05, + "loss": 0.3581, + "step": 6009500 + }, + { + "epoch": 40.66966219142486, + "grad_norm": 0.34239327907562256, + "learning_rate": 4.5933033780857515e-05, + "loss": 0.3581, + "step": 6010000 + }, + { + "epoch": 40.673045690775226, + "grad_norm": 0.374510794878006, + "learning_rate": 4.593269543092248e-05, + "loss": 0.3576, + "step": 6010500 + }, + { + "epoch": 40.676429190125596, + "grad_norm": 0.38726770877838135, + "learning_rate": 4.5932357080987446e-05, + "loss": 0.3565, + "step": 6011000 + }, + { + "epoch": 40.679812689475966, + "grad_norm": 0.4068380296230316, + "learning_rate": 4.593201873105241e-05, + "loss": 0.3567, + "step": 6011500 + }, + { + "epoch": 40.68319618882633, + "grad_norm": 0.42458081245422363, + "learning_rate": 4.593168038111737e-05, + "loss": 0.3585, + "step": 6012000 + }, + { + "epoch": 40.6865796881767, + "grad_norm": 0.45075562596321106, + "learning_rate": 4.593134203118233e-05, + "loss": 0.3572, + "step": 6012500 + }, + { + "epoch": 40.68996318752707, + "grad_norm": 0.39937084913253784, + "learning_rate": 4.59310036812473e-05, + "loss": 0.3569, + "step": 6013000 + }, + { + "epoch": 40.69334668687743, + "grad_norm": 0.372530996799469, + "learning_rate": 4.5930665331312256e-05, + "loss": 0.3583, + "step": 6013500 + }, + { + "epoch": 40.6967301862278, + "grad_norm": 0.3779999315738678, + "learning_rate": 4.593032698137722e-05, + "loss": 0.3575, + "step": 6014000 + }, + { + "epoch": 40.700113685578174, + "grad_norm": 0.3948569595813751, + "learning_rate": 4.592998863144218e-05, + "loss": 0.3567, + "step": 6014500 + }, + { + "epoch": 40.70349718492854, + "grad_norm": 0.4100547134876251, + "learning_rate": 4.592965028150715e-05, + "loss": 0.3575, + "step": 6015000 + }, + { + "epoch": 40.70688068427891, + "grad_norm": 0.3982258439064026, + "learning_rate": 4.592931193157211e-05, + "loss": 0.3559, + "step": 6015500 + }, + { + "epoch": 40.71026418362928, + "grad_norm": 0.41315528750419617, + "learning_rate": 4.5928973581637074e-05, + "loss": 0.3591, + "step": 6016000 + }, + { + "epoch": 40.71364768297965, + "grad_norm": 0.3640074133872986, + "learning_rate": 4.5928635231702036e-05, + "loss": 0.3586, + "step": 6016500 + }, + { + "epoch": 40.71703118233001, + "grad_norm": 0.3865445852279663, + "learning_rate": 4.5928296881767005e-05, + "loss": 0.3578, + "step": 6017000 + }, + { + "epoch": 40.72041468168038, + "grad_norm": 0.40857845544815063, + "learning_rate": 4.592795853183197e-05, + "loss": 0.358, + "step": 6017500 + }, + { + "epoch": 40.72379818103075, + "grad_norm": 0.45894935727119446, + "learning_rate": 4.592762018189693e-05, + "loss": 0.3594, + "step": 6018000 + }, + { + "epoch": 40.727181680381115, + "grad_norm": 0.4257006347179413, + "learning_rate": 4.592728183196189e-05, + "loss": 0.357, + "step": 6018500 + }, + { + "epoch": 40.730565179731485, + "grad_norm": 0.37430503964424133, + "learning_rate": 4.592694348202685e-05, + "loss": 0.3586, + "step": 6019000 + }, + { + "epoch": 40.733948679081855, + "grad_norm": 0.3944646716117859, + "learning_rate": 4.5926605132091815e-05, + "loss": 0.3561, + "step": 6019500 + }, + { + "epoch": 40.73733217843222, + "grad_norm": 0.3656405508518219, + "learning_rate": 4.592626678215678e-05, + "loss": 0.3596, + "step": 6020000 + }, + { + "epoch": 40.74071567778259, + "grad_norm": 0.41020286083221436, + "learning_rate": 4.5925928432221746e-05, + "loss": 0.3579, + "step": 6020500 + }, + { + "epoch": 40.74409917713296, + "grad_norm": 0.38027000427246094, + "learning_rate": 4.592559008228671e-05, + "loss": 0.36, + "step": 6021000 + }, + { + "epoch": 40.74748267648333, + "grad_norm": 0.36818185448646545, + "learning_rate": 4.592525173235167e-05, + "loss": 0.3573, + "step": 6021500 + }, + { + "epoch": 40.75086617583369, + "grad_norm": 0.38438427448272705, + "learning_rate": 4.592491338241663e-05, + "loss": 0.3564, + "step": 6022000 + }, + { + "epoch": 40.75424967518406, + "grad_norm": 0.3644617795944214, + "learning_rate": 4.5924575032481595e-05, + "loss": 0.3586, + "step": 6022500 + }, + { + "epoch": 40.75763317453443, + "grad_norm": 0.37247687578201294, + "learning_rate": 4.592423668254656e-05, + "loss": 0.3558, + "step": 6023000 + }, + { + "epoch": 40.7610166738848, + "grad_norm": 0.35467490553855896, + "learning_rate": 4.592389833261152e-05, + "loss": 0.3568, + "step": 6023500 + }, + { + "epoch": 40.76440017323517, + "grad_norm": 0.37191349267959595, + "learning_rate": 4.592355998267648e-05, + "loss": 0.3574, + "step": 6024000 + }, + { + "epoch": 40.76778367258554, + "grad_norm": 0.37874293327331543, + "learning_rate": 4.592322163274145e-05, + "loss": 0.356, + "step": 6024500 + }, + { + "epoch": 40.7711671719359, + "grad_norm": 0.37583380937576294, + "learning_rate": 4.592288328280641e-05, + "loss": 0.3575, + "step": 6025000 + }, + { + "epoch": 40.77455067128627, + "grad_norm": 0.37207546830177307, + "learning_rate": 4.5922544932871374e-05, + "loss": 0.3574, + "step": 6025500 + }, + { + "epoch": 40.77793417063664, + "grad_norm": 0.3862629532814026, + "learning_rate": 4.5922206582936337e-05, + "loss": 0.3574, + "step": 6026000 + }, + { + "epoch": 40.781317669987004, + "grad_norm": 0.36246976256370544, + "learning_rate": 4.5921868233001305e-05, + "loss": 0.3579, + "step": 6026500 + }, + { + "epoch": 40.784701169337374, + "grad_norm": 0.3949558734893799, + "learning_rate": 4.592152988306627e-05, + "loss": 0.3584, + "step": 6027000 + }, + { + "epoch": 40.788084668687745, + "grad_norm": 0.4112015664577484, + "learning_rate": 4.592119153313123e-05, + "loss": 0.3582, + "step": 6027500 + }, + { + "epoch": 40.791468168038115, + "grad_norm": 0.3892301917076111, + "learning_rate": 4.592085318319619e-05, + "loss": 0.3593, + "step": 6028000 + }, + { + "epoch": 40.79485166738848, + "grad_norm": 0.3758805990219116, + "learning_rate": 4.5920514833261154e-05, + "loss": 0.3579, + "step": 6028500 + }, + { + "epoch": 40.79823516673885, + "grad_norm": 0.40026286244392395, + "learning_rate": 4.5920176483326116e-05, + "loss": 0.3591, + "step": 6029000 + }, + { + "epoch": 40.80161866608922, + "grad_norm": 0.37214037775993347, + "learning_rate": 4.591983813339108e-05, + "loss": 0.3591, + "step": 6029500 + }, + { + "epoch": 40.80500216543958, + "grad_norm": 0.4223296642303467, + "learning_rate": 4.591949978345604e-05, + "loss": 0.3576, + "step": 6030000 + }, + { + "epoch": 40.80838566478995, + "grad_norm": 0.3842328190803528, + "learning_rate": 4.591916143352101e-05, + "loss": 0.3572, + "step": 6030500 + }, + { + "epoch": 40.81176916414032, + "grad_norm": 0.43461838364601135, + "learning_rate": 4.591882308358597e-05, + "loss": 0.358, + "step": 6031000 + }, + { + "epoch": 40.815152663490686, + "grad_norm": 0.35343775153160095, + "learning_rate": 4.5918484733650933e-05, + "loss": 0.3571, + "step": 6031500 + }, + { + "epoch": 40.818536162841056, + "grad_norm": 0.3892674744129181, + "learning_rate": 4.5918146383715896e-05, + "loss": 0.3584, + "step": 6032000 + }, + { + "epoch": 40.821919662191426, + "grad_norm": 0.45603224635124207, + "learning_rate": 4.5917808033780864e-05, + "loss": 0.3582, + "step": 6032500 + }, + { + "epoch": 40.82530316154179, + "grad_norm": 0.4169098138809204, + "learning_rate": 4.591746968384582e-05, + "loss": 0.3585, + "step": 6033000 + }, + { + "epoch": 40.82868666089216, + "grad_norm": 0.3868551254272461, + "learning_rate": 4.591713133391078e-05, + "loss": 0.358, + "step": 6033500 + }, + { + "epoch": 40.83207016024253, + "grad_norm": 0.40169456601142883, + "learning_rate": 4.591679298397575e-05, + "loss": 0.3567, + "step": 6034000 + }, + { + "epoch": 40.8354536595929, + "grad_norm": 0.35913214087486267, + "learning_rate": 4.591645463404071e-05, + "loss": 0.3591, + "step": 6034500 + }, + { + "epoch": 40.838837158943264, + "grad_norm": 0.38202446699142456, + "learning_rate": 4.5916116284105675e-05, + "loss": 0.3574, + "step": 6035000 + }, + { + "epoch": 40.842220658293634, + "grad_norm": 0.3530464470386505, + "learning_rate": 4.591577793417064e-05, + "loss": 0.3592, + "step": 6035500 + }, + { + "epoch": 40.845604157644004, + "grad_norm": 0.37851080298423767, + "learning_rate": 4.5915439584235606e-05, + "loss": 0.3578, + "step": 6036000 + }, + { + "epoch": 40.84898765699437, + "grad_norm": 0.38370686769485474, + "learning_rate": 4.591510123430057e-05, + "loss": 0.3588, + "step": 6036500 + }, + { + "epoch": 40.85237115634474, + "grad_norm": 0.382432758808136, + "learning_rate": 4.591476288436553e-05, + "loss": 0.3584, + "step": 6037000 + }, + { + "epoch": 40.85575465569511, + "grad_norm": 0.37672939896583557, + "learning_rate": 4.591442453443049e-05, + "loss": 0.3587, + "step": 6037500 + }, + { + "epoch": 40.85913815504547, + "grad_norm": 0.3865501582622528, + "learning_rate": 4.5914086184495455e-05, + "loss": 0.3568, + "step": 6038000 + }, + { + "epoch": 40.86252165439584, + "grad_norm": 0.36493822932243347, + "learning_rate": 4.591374783456042e-05, + "loss": 0.3576, + "step": 6038500 + }, + { + "epoch": 40.86590515374621, + "grad_norm": 0.38755717873573303, + "learning_rate": 4.591340948462538e-05, + "loss": 0.3582, + "step": 6039000 + }, + { + "epoch": 40.869288653096575, + "grad_norm": 0.420865923166275, + "learning_rate": 4.591307113469034e-05, + "loss": 0.3578, + "step": 6039500 + }, + { + "epoch": 40.872672152446945, + "grad_norm": 0.39818307757377625, + "learning_rate": 4.591273278475531e-05, + "loss": 0.3582, + "step": 6040000 + }, + { + "epoch": 40.876055651797316, + "grad_norm": 0.39191457629203796, + "learning_rate": 4.591239443482027e-05, + "loss": 0.3581, + "step": 6040500 + }, + { + "epoch": 40.879439151147686, + "grad_norm": 0.4268248975276947, + "learning_rate": 4.5912056084885234e-05, + "loss": 0.3585, + "step": 6041000 + }, + { + "epoch": 40.88282265049805, + "grad_norm": 0.4153226315975189, + "learning_rate": 4.5911717734950196e-05, + "loss": 0.3571, + "step": 6041500 + }, + { + "epoch": 40.88620614984842, + "grad_norm": 0.37966054677963257, + "learning_rate": 4.5911379385015165e-05, + "loss": 0.3587, + "step": 6042000 + }, + { + "epoch": 40.88958964919879, + "grad_norm": 0.38498005270957947, + "learning_rate": 4.591104103508012e-05, + "loss": 0.3571, + "step": 6042500 + }, + { + "epoch": 40.89297314854915, + "grad_norm": 0.3869180679321289, + "learning_rate": 4.591070268514508e-05, + "loss": 0.3583, + "step": 6043000 + }, + { + "epoch": 40.89635664789952, + "grad_norm": 0.3785061836242676, + "learning_rate": 4.591036433521005e-05, + "loss": 0.3583, + "step": 6043500 + }, + { + "epoch": 40.899740147249894, + "grad_norm": 0.42890506982803345, + "learning_rate": 4.5910025985275014e-05, + "loss": 0.3581, + "step": 6044000 + }, + { + "epoch": 40.90312364660026, + "grad_norm": 0.3925100564956665, + "learning_rate": 4.5909687635339976e-05, + "loss": 0.3582, + "step": 6044500 + }, + { + "epoch": 40.90650714595063, + "grad_norm": 0.35613325238227844, + "learning_rate": 4.590934928540494e-05, + "loss": 0.3583, + "step": 6045000 + }, + { + "epoch": 40.909890645301, + "grad_norm": 0.3742819130420685, + "learning_rate": 4.590901093546991e-05, + "loss": 0.3591, + "step": 6045500 + }, + { + "epoch": 40.91327414465137, + "grad_norm": 0.38186126947402954, + "learning_rate": 4.590867258553487e-05, + "loss": 0.358, + "step": 6046000 + }, + { + "epoch": 40.91665764400173, + "grad_norm": 0.3733210861682892, + "learning_rate": 4.590833423559983e-05, + "loss": 0.3575, + "step": 6046500 + }, + { + "epoch": 40.9200411433521, + "grad_norm": 0.36450719833374023, + "learning_rate": 4.5907995885664786e-05, + "loss": 0.3583, + "step": 6047000 + }, + { + "epoch": 40.92342464270247, + "grad_norm": 0.34866660833358765, + "learning_rate": 4.5907657535729755e-05, + "loss": 0.3581, + "step": 6047500 + }, + { + "epoch": 40.926808142052835, + "grad_norm": 0.42558640241622925, + "learning_rate": 4.590731918579472e-05, + "loss": 0.3573, + "step": 6048000 + }, + { + "epoch": 40.930191641403205, + "grad_norm": 0.4213411211967468, + "learning_rate": 4.590698083585968e-05, + "loss": 0.3578, + "step": 6048500 + }, + { + "epoch": 40.933575140753575, + "grad_norm": 0.4106290638446808, + "learning_rate": 4.590664248592464e-05, + "loss": 0.3568, + "step": 6049000 + }, + { + "epoch": 40.93695864010394, + "grad_norm": 0.39277195930480957, + "learning_rate": 4.590630413598961e-05, + "loss": 0.3575, + "step": 6049500 + }, + { + "epoch": 40.94034213945431, + "grad_norm": 0.42475879192352295, + "learning_rate": 4.590596578605457e-05, + "loss": 0.3567, + "step": 6050000 + }, + { + "epoch": 40.94372563880468, + "grad_norm": 0.40933966636657715, + "learning_rate": 4.5905627436119535e-05, + "loss": 0.3582, + "step": 6050500 + }, + { + "epoch": 40.94710913815504, + "grad_norm": 0.391244500875473, + "learning_rate": 4.59052890861845e-05, + "loss": 0.3585, + "step": 6051000 + }, + { + "epoch": 40.95049263750541, + "grad_norm": 0.38254618644714355, + "learning_rate": 4.5904950736249466e-05, + "loss": 0.3585, + "step": 6051500 + }, + { + "epoch": 40.95387613685578, + "grad_norm": 0.3931604027748108, + "learning_rate": 4.590461238631442e-05, + "loss": 0.3587, + "step": 6052000 + }, + { + "epoch": 40.95725963620615, + "grad_norm": 0.42496395111083984, + "learning_rate": 4.590427403637938e-05, + "loss": 0.3561, + "step": 6052500 + }, + { + "epoch": 40.960643135556516, + "grad_norm": 0.4158755838871002, + "learning_rate": 4.590393568644435e-05, + "loss": 0.3579, + "step": 6053000 + }, + { + "epoch": 40.96402663490689, + "grad_norm": 0.36279532313346863, + "learning_rate": 4.5903597336509314e-05, + "loss": 0.3579, + "step": 6053500 + }, + { + "epoch": 40.96741013425726, + "grad_norm": 0.3945808708667755, + "learning_rate": 4.5903258986574276e-05, + "loss": 0.3572, + "step": 6054000 + }, + { + "epoch": 40.97079363360762, + "grad_norm": 0.3513178527355194, + "learning_rate": 4.590292063663924e-05, + "loss": 0.3591, + "step": 6054500 + }, + { + "epoch": 40.97417713295799, + "grad_norm": 0.38702842593193054, + "learning_rate": 4.590258228670421e-05, + "loss": 0.3578, + "step": 6055000 + }, + { + "epoch": 40.97756063230836, + "grad_norm": 0.40155449509620667, + "learning_rate": 4.590224393676917e-05, + "loss": 0.3575, + "step": 6055500 + }, + { + "epoch": 40.980944131658724, + "grad_norm": 0.3826092481613159, + "learning_rate": 4.590190558683413e-05, + "loss": 0.3585, + "step": 6056000 + }, + { + "epoch": 40.984327631009094, + "grad_norm": 0.3890673816204071, + "learning_rate": 4.590156723689909e-05, + "loss": 0.3573, + "step": 6056500 + }, + { + "epoch": 40.987711130359465, + "grad_norm": 0.4273022711277008, + "learning_rate": 4.5901228886964056e-05, + "loss": 0.359, + "step": 6057000 + }, + { + "epoch": 40.99109462970983, + "grad_norm": 0.35231855511665344, + "learning_rate": 4.590089053702902e-05, + "loss": 0.3567, + "step": 6057500 + }, + { + "epoch": 40.9944781290602, + "grad_norm": 0.3643796443939209, + "learning_rate": 4.590055218709398e-05, + "loss": 0.3573, + "step": 6058000 + }, + { + "epoch": 40.99786162841057, + "grad_norm": 0.39727750420570374, + "learning_rate": 4.590021383715894e-05, + "loss": 0.3579, + "step": 6058500 + }, + { + "epoch": 41.0, + "eval_accuracy": 0.8634818976166824, + "eval_loss": 0.5537328124046326, + "eval_runtime": 3396.4168, + "eval_samples_per_second": 85.603, + "eval_steps_per_second": 5.35, + "step": 6058816 + } + ], + "logging_steps": 500, + "max_steps": 73888000, + "num_input_tokens_seen": 0, + "num_train_epochs": 500, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.065980816103834e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}