diff --git "a/checkpoint-30000/trainer_state.json" "b/checkpoint-30000/trainer_state.json"
--- "a/checkpoint-30000/trainer_state.json"
+++ "b/checkpoint-30000/trainer_state.json"
@@ -1,7 +1,7 @@
 {
-  "best_metric": 3.4922268390655518,
-  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_8397/checkpoint-30000",
-  "epoch": 3.228931223764934,
+  "best_metric": 3.4946022033691406,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_high_2000_8397/checkpoint-30000",
+  "epoch": 3.234501347708895,
   "eval_steps": 1000,
   "global_step": 30000,
   "is_hyper_param_search": false,
@@ -9,4478 +9,4478 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.005381552039608223,
-      "grad_norm": 1.3847299814224243,
-      "learning_rate": 0.0003,
-      "loss": 8.4655,
+      "epoch": 0.005390835579514825,
+      "grad_norm": 1.6020084619522095,
+      "learning_rate": 0.00028199999999999997,
+      "loss": 8.5757,
       "step": 50
     },
     {
-      "epoch": 0.010763104079216447,
-      "grad_norm": 1.0632425546646118,
-      "learning_rate": 0.0006,
-      "loss": 6.8249,
+      "epoch": 0.01078167115902965,
+      "grad_norm": 1.8397823572158813,
+      "learning_rate": 0.0005819999999999999,
+      "loss": 6.94,
       "step": 100
     },
     {
-      "epoch": 0.01614465611882467,
-      "grad_norm": 2.088347911834717,
-      "learning_rate": 0.0005996767589699385,
-      "loss": 6.4221,
+      "epoch": 0.016172506738544475,
+      "grad_norm": 2.0749616622924805,
+      "learning_rate": 0.0005996956287101997,
+      "loss": 6.498,
       "step": 150
     },
     {
-      "epoch": 0.021526208158432893,
-      "grad_norm": 1.0710954666137695,
-      "learning_rate": 0.0005993535179398771,
-      "loss": 6.1909,
+      "epoch": 0.0215633423180593,
+      "grad_norm": 2.3127713203430176,
+      "learning_rate": 0.0005993718294657311,
+      "loss": 6.2576,
       "step": 200
     },
     {
-      "epoch": 0.026907760198041114,
-      "grad_norm": 1.5236896276474,
-      "learning_rate": 0.0005990302769098158,
-      "loss": 6.0442,
+      "epoch": 0.026954177897574125,
+      "grad_norm": 0.8723729848861694,
+      "learning_rate": 0.0005990480302212627,
+      "loss": 6.1064,
       "step": 250
     },
     {
-      "epoch": 0.03228931223764934,
-      "grad_norm": 1.8247500658035278,
-      "learning_rate": 0.0005987070358797543,
-      "loss": 5.9513,
+      "epoch": 0.03234501347708895,
+      "grad_norm": 1.36876380443573,
+      "learning_rate": 0.0005987242309767944,
+      "loss": 6.0069,
       "step": 300
     },
     {
-      "epoch": 0.03767086427725756,
-      "grad_norm": 1.2985302209854126,
-      "learning_rate": 0.0005983837948496929,
-      "loss": 5.8643,
+      "epoch": 0.03773584905660377,
+      "grad_norm": 1.1116160154342651,
+      "learning_rate": 0.0005984004317323259,
+      "loss": 5.8938,
       "step": 350
     },
     {
-      "epoch": 0.04305241631686579,
-      "grad_norm": 1.256401538848877,
-      "learning_rate": 0.0005980605538196314,
-      "loss": 5.8018,
+      "epoch": 0.0431266846361186,
+      "grad_norm": 1.7246553897857666,
+      "learning_rate": 0.0005980766324878575,
+      "loss": 5.8482,
       "step": 400
     },
     {
-      "epoch": 0.048433968356474004,
-      "grad_norm": 0.8569295406341553,
-      "learning_rate": 0.0005977373127895701,
-      "loss": 5.7053,
+      "epoch": 0.04851752021563342,
+      "grad_norm": 1.5949625968933105,
+      "learning_rate": 0.000597752833243389,
+      "loss": 5.7717,
       "step": 450
     },
     {
-      "epoch": 0.05381552039608223,
-      "grad_norm": 0.860461413860321,
-      "learning_rate": 0.0005974140717595086,
-      "loss": 5.6413,
+      "epoch": 0.05390835579514825,
+      "grad_norm": 1.2913446426391602,
+      "learning_rate": 0.0005974290339989207,
+      "loss": 5.6782,
       "step": 500
     },
     {
-      "epoch": 0.05919707243569045,
-      "grad_norm": 1.6800360679626465,
-      "learning_rate": 0.0005970908307294472,
-      "loss": 5.5789,
+      "epoch": 0.05929919137466307,
+      "grad_norm": 1.3474897146224976,
+      "learning_rate": 0.0005971052347544522,
+      "loss": 5.6001,
       "step": 550
     },
     {
-      "epoch": 0.06457862447529868,
-      "grad_norm": 1.408766508102417,
-      "learning_rate": 0.0005967675896993858,
-      "loss": 5.4957,
+      "epoch": 0.0646900269541779,
+      "grad_norm": 1.230842113494873,
+      "learning_rate": 0.0005967814355099838,
+      "loss": 5.5233,
       "step": 600
     },
     {
-      "epoch": 0.0699601765149069,
-      "grad_norm": 1.263355016708374,
-      "learning_rate": 0.0005964443486693243,
-      "loss": 5.421,
+      "epoch": 0.07008086253369272,
+      "grad_norm": 1.3157235383987427,
+      "learning_rate": 0.0005964576362655153,
+      "loss": 5.4574,
       "step": 650
     },
     {
-      "epoch": 0.07534172855451512,
-      "grad_norm": 1.617833137512207,
-      "learning_rate": 0.000596121107639263,
-      "loss": 5.334,
+      "epoch": 0.07547169811320754,
+      "grad_norm": 1.1696827411651611,
+      "learning_rate": 0.0005961338370210469,
+      "loss": 5.4063,
       "step": 700
     },
     {
-      "epoch": 0.08072328059412334,
-      "grad_norm": 1.2163816690444946,
-      "learning_rate": 0.0005957978666092015,
-      "loss": 5.2925,
+      "epoch": 0.08086253369272237,
+      "grad_norm": 1.3056632280349731,
+      "learning_rate": 0.0005958100377765785,
+      "loss": 5.3331,
       "step": 750
     },
     {
-      "epoch": 0.08610483263373157,
-      "grad_norm": 1.4054591655731201,
-      "learning_rate": 0.0005954746255791401,
-      "loss": 5.222,
+      "epoch": 0.0862533692722372,
+      "grad_norm": 1.6241542100906372,
+      "learning_rate": 0.00059548623853211,
+      "loss": 5.2849,
       "step": 800
     },
     {
-      "epoch": 0.09148638467333979,
-      "grad_norm": 1.0674018859863281,
-      "learning_rate": 0.0005951513845490787,
-      "loss": 5.2071,
+      "epoch": 0.09164420485175202,
+      "grad_norm": 1.0650733709335327,
+      "learning_rate": 0.0005951624392876416,
+      "loss": 5.2456,
       "step": 850
     },
     {
-      "epoch": 0.09686793671294801,
-      "grad_norm": 1.4989951848983765,
-      "learning_rate": 0.0005948281435190174,
-      "loss": 5.1526,
+      "epoch": 0.09703504043126684,
+      "grad_norm": 1.5372745990753174,
+      "learning_rate": 0.0005948386400431732,
+      "loss": 5.191,
       "step": 900
     },
     {
-      "epoch": 0.10224948875255624,
-      "grad_norm": 1.028799057006836,
-      "learning_rate": 0.0005945049024889559,
-      "loss": 5.1098,
+      "epoch": 0.10242587601078167,
+      "grad_norm": 1.4312268495559692,
+      "learning_rate": 0.0005945148407987047,
+      "loss": 5.1659,
       "step": 950
     },
     {
-      "epoch": 0.10763104079216446,
-      "grad_norm": 0.955634355545044,
-      "learning_rate": 0.0005941816614588944,
-      "loss": 5.0633,
+      "epoch": 0.1078167115902965,
+      "grad_norm": 0.9435037970542908,
+      "learning_rate": 0.0005941910415542363,
+      "loss": 5.1266,
       "step": 1000
     },
     {
-      "epoch": 0.10763104079216446,
-      "eval_accuracy": 0.22915491630956988,
-      "eval_loss": 4.996466636657715,
-      "eval_runtime": 202.5064,
-      "eval_samples_per_second": 88.94,
-      "eval_steps_per_second": 5.56,
+      "epoch": 0.1078167115902965,
+      "eval_accuracy": 0.22553308094716198,
+      "eval_loss": 5.041633605957031,
+      "eval_runtime": 152.6311,
+      "eval_samples_per_second": 118.004,
+      "eval_steps_per_second": 7.377,
       "step": 1000
     },
     {
-      "epoch": 0.11301259283177269,
-      "grad_norm": 1.2216582298278809,
-      "learning_rate": 0.000593858420428833,
-      "loss": 5.0219,
+      "epoch": 0.11320754716981132,
+      "grad_norm": 1.088536024093628,
+      "learning_rate": 0.0005938672423097679,
+      "loss": 5.0725,
       "step": 1050
     },
     {
-      "epoch": 0.1183941448713809,
-      "grad_norm": 1.2919893264770508,
-      "learning_rate": 0.0005935351793987716,
-      "loss": 4.9962,
+      "epoch": 0.11859838274932614,
+      "grad_norm": 0.9872896075248718,
+      "learning_rate": 0.0005935434430652995,
+      "loss": 5.0443,
       "step": 1100
     },
     {
-      "epoch": 0.12377569691098914,
-      "grad_norm": 1.842529058456421,
-      "learning_rate": 0.0005932119383687103,
-      "loss": 4.9736,
+      "epoch": 0.12398921832884097,
+      "grad_norm": 0.9727984070777893,
+      "learning_rate": 0.000593219643820831,
+      "loss": 5.0164,
       "step": 1150
     },
     {
-      "epoch": 0.12915724895059735,
-      "grad_norm": 0.9416630268096924,
-      "learning_rate": 0.0005928886973386488,
-      "loss": 4.9687,
+      "epoch": 0.1293800539083558,
+      "grad_norm": 1.0641847848892212,
+      "learning_rate": 0.0005928958445763626,
+      "loss": 4.978,
       "step": 1200
     },
     {
-      "epoch": 0.13453880099020557,
-      "grad_norm": 1.340468406677246,
-      "learning_rate": 0.0005925654563085874,
-      "loss": 4.9251,
+      "epoch": 0.1347708894878706,
+      "grad_norm": 0.8890565633773804,
+      "learning_rate": 0.0005925720453318941,
+      "loss": 4.9292,
       "step": 1250
     },
     {
-      "epoch": 0.1399203530298138,
-      "grad_norm": 1.4057867527008057,
-      "learning_rate": 0.000592242215278526,
-      "loss": 4.8782,
+      "epoch": 0.14016172506738545,
+      "grad_norm": 1.0303993225097656,
+      "learning_rate": 0.0005922482460874258,
+      "loss": 4.9,
       "step": 1300
     },
     {
-      "epoch": 0.14530190506942203,
-      "grad_norm": 1.3462789058685303,
-      "learning_rate": 0.0005919189742484645,
-      "loss": 4.8431,
+      "epoch": 0.14555256064690028,
+      "grad_norm": 1.0149179697036743,
+      "learning_rate": 0.0005919244468429573,
+      "loss": 4.8846,
       "step": 1350
     },
     {
-      "epoch": 0.15068345710903025,
-      "grad_norm": 0.9291279911994934,
-      "learning_rate": 0.0005915957332184032,
-      "loss": 4.8316,
+      "epoch": 0.1509433962264151,
+      "grad_norm": 1.0036956071853638,
+      "learning_rate": 0.0005916006475984889,
+      "loss": 4.8728,
       "step": 1400
     },
     {
-      "epoch": 0.15606500914863847,
-      "grad_norm": 0.7789126634597778,
-      "learning_rate": 0.0005912724921883417,
-      "loss": 4.8028,
+      "epoch": 0.15633423180592992,
+      "grad_norm": 0.8696756958961487,
+      "learning_rate": 0.0005912768483540205,
+      "loss": 4.8206,
       "step": 1450
     },
     {
-      "epoch": 0.16144656118824668,
-      "grad_norm": 1.2111430168151855,
-      "learning_rate": 0.0005909492511582803,
-      "loss": 4.7837,
+      "epoch": 0.16172506738544473,
+      "grad_norm": 1.1622956991195679,
+      "learning_rate": 0.0005909530491095521,
+      "loss": 4.8359,
       "step": 1500
     },
     {
-      "epoch": 0.1668281132278549,
-      "grad_norm": 1.1703121662139893,
-      "learning_rate": 0.0005906260101282189,
-      "loss": 4.7456,
+      "epoch": 0.16711590296495957,
+      "grad_norm": 1.1141048669815063,
+      "learning_rate": 0.0005906292498650836,
+      "loss": 4.7878,
       "step": 1550
     },
     {
-      "epoch": 0.17220966526746315,
-      "grad_norm": 0.9755436182022095,
-      "learning_rate": 0.0005903027690981575,
-      "loss": 4.7303,
+      "epoch": 0.1725067385444744,
+      "grad_norm": 1.274352788925171,
+      "learning_rate": 0.0005903054506206151,
+      "loss": 4.7862,
       "step": 1600
     },
     {
-      "epoch": 0.17759121730707136,
-      "grad_norm": 0.9121830463409424,
-      "learning_rate": 0.000589979528068096,
-      "loss": 4.7437,
+      "epoch": 0.1778975741239892,
+      "grad_norm": 0.8880785703659058,
+      "learning_rate": 0.0005899816513761468,
+      "loss": 4.7445,
       "step": 1650
     },
     {
-      "epoch": 0.18297276934667958,
-      "grad_norm": 1.2337779998779297,
-      "learning_rate": 0.0005896562870380347,
-      "loss": 4.6991,
+      "epoch": 0.18328840970350405,
+      "grad_norm": 0.9446322321891785,
+      "learning_rate": 0.0005896578521316783,
+      "loss": 4.7346,
       "step": 1700
     },
     {
-      "epoch": 0.1883543213862878,
-      "grad_norm": 0.7923302054405212,
-      "learning_rate": 0.0005893330460079732,
-      "loss": 4.6819,
+      "epoch": 0.18867924528301888,
+      "grad_norm": 0.9463135600090027,
+      "learning_rate": 0.0005893340528872099,
+      "loss": 4.6952,
       "step": 1750
     },
     {
-      "epoch": 0.19373587342589602,
-      "grad_norm": 0.9414947032928467,
-      "learning_rate": 0.0005890098049779118,
-      "loss": 4.6401,
+      "epoch": 0.1940700808625337,
+      "grad_norm": 0.8582187294960022,
+      "learning_rate": 0.0005890102536427414,
+      "loss": 4.7029,
       "step": 1800
     },
     {
-      "epoch": 0.19911742546550426,
-      "grad_norm": 0.8575401306152344,
-      "learning_rate": 0.0005886865639478504,
-      "loss": 4.6503,
+      "epoch": 0.19946091644204852,
+      "grad_norm": 0.9058123826980591,
+      "learning_rate": 0.0005886864543982731,
+      "loss": 4.6735,
       "step": 1850
     },
     {
-      "epoch": 0.20449897750511248,
-      "grad_norm": 0.874113917350769,
-      "learning_rate": 0.0005883633229177889,
-      "loss": 4.6262,
+      "epoch": 0.20485175202156333,
+      "grad_norm": 0.9203746318817139,
+      "learning_rate": 0.0005883626551538046,
+      "loss": 4.6596,
       "step": 1900
     },
     {
-      "epoch": 0.2098805295447207,
-      "grad_norm": 0.7579442262649536,
-      "learning_rate": 0.0005880400818877276,
-      "loss": 4.5946,
+      "epoch": 0.21024258760107817,
+      "grad_norm": 1.0241466760635376,
+      "learning_rate": 0.0005880388559093362,
+      "loss": 4.6282,
       "step": 1950
     },
     {
-      "epoch": 0.2152620815843289,
-      "grad_norm": 0.8062272071838379,
-      "learning_rate": 0.0005877168408576662,
-      "loss": 4.579,
+      "epoch": 0.215633423180593,
+      "grad_norm": 1.1103599071502686,
+      "learning_rate": 0.0005877150566648677,
+      "loss": 4.6194,
       "step": 2000
     },
     {
-      "epoch": 0.2152620815843289,
-      "eval_accuracy": 0.2712283567521957,
-      "eval_loss": 4.50425910949707,
-      "eval_runtime": 211.1194,
-      "eval_samples_per_second": 85.312,
-      "eval_steps_per_second": 5.333,
+      "epoch": 0.215633423180593,
+      "eval_accuracy": 0.2670949835939572,
+      "eval_loss": 4.5360541343688965,
+      "eval_runtime": 152.732,
+      "eval_samples_per_second": 117.926,
+      "eval_steps_per_second": 7.372,
       "step": 2000
     },
     {
-      "epoch": 0.22064363362393713,
-      "grad_norm": 1.1572167873382568,
-      "learning_rate": 0.0005873935998276048,
-      "loss": 4.5564,
+      "epoch": 0.2210242587601078,
+      "grad_norm": 0.8629128336906433,
+      "learning_rate": 0.0005873912574203993,
+      "loss": 4.5954,
       "step": 2050
     },
     {
-      "epoch": 0.22602518566354537,
-      "grad_norm": 1.1494579315185547,
-      "learning_rate": 0.0005870703587975433,
-      "loss": 4.5425,
+      "epoch": 0.22641509433962265,
+      "grad_norm": 0.7517758011817932,
+      "learning_rate": 0.0005870674581759309,
+      "loss": 4.5774,
       "step": 2100
     },
     {
-      "epoch": 0.2314067377031536,
-      "grad_norm": 0.9105241298675537,
-      "learning_rate": 0.0005867471177674818,
-      "loss": 4.5083,
+      "epoch": 0.23180592991913745,
+      "grad_norm": 1.1525517702102661,
+      "learning_rate": 0.0005867436589314624,
+      "loss": 4.5567,
       "step": 2150
     },
     {
-      "epoch": 0.2367882897427618,
-      "grad_norm": 0.9526411890983582,
-      "learning_rate": 0.0005864238767374205,
-      "loss": 4.5128,
+      "epoch": 0.2371967654986523,
+      "grad_norm": 1.0665569305419922,
+      "learning_rate": 0.000586419859686994,
+      "loss": 4.5313,
       "step": 2200
     },
     {
-      "epoch": 0.24216984178237003,
-      "grad_norm": 0.7726457715034485,
-      "learning_rate": 0.0005861006357073591,
-      "loss": 4.4899,
+      "epoch": 0.24258760107816713,
+      "grad_norm": 0.8165591359138489,
+      "learning_rate": 0.0005860960604425256,
+      "loss": 4.5156,
       "step": 2250
     },
     {
-      "epoch": 0.24755139382197827,
-      "grad_norm": 1.0442702770233154,
-      "learning_rate": 0.0005857773946772977,
-      "loss": 4.4716,
+      "epoch": 0.24797843665768193,
+      "grad_norm": 1.0752588510513306,
+      "learning_rate": 0.0005857722611980571,
+      "loss": 4.526,
       "step": 2300
     },
     {
-      "epoch": 0.2529329458615865,
-      "grad_norm": 1.1448233127593994,
-      "learning_rate": 0.0005854541536472362,
-      "loss": 4.4607,
+      "epoch": 0.25336927223719674,
+      "grad_norm": 0.9010254740715027,
+      "learning_rate": 0.0005854484619535887,
+      "loss": 4.4768,
       "step": 2350
     },
     {
-      "epoch": 0.2583144979011947,
-      "grad_norm": 0.9515467286109924,
-      "learning_rate": 0.0005851309126171749,
-      "loss": 4.4544,
+      "epoch": 0.2587601078167116,
+      "grad_norm": 0.813308596611023,
+      "learning_rate": 0.0005851246627091202,
+      "loss": 4.486,
       "step": 2400
     },
     {
-      "epoch": 0.2636960499408029,
-      "grad_norm": 0.7929104566574097,
-      "learning_rate": 0.0005848076715871134,
-      "loss": 4.4392,
+      "epoch": 0.2641509433962264,
+      "grad_norm": 0.997901976108551,
+      "learning_rate": 0.0005848008634646519,
+      "loss": 4.4602,
       "step": 2450
     },
     {
-      "epoch": 0.26907760198041114,
-      "grad_norm": 1.1213116645812988,
-      "learning_rate": 0.000584484430557052,
-      "loss": 4.4407,
+      "epoch": 0.2695417789757412,
+      "grad_norm": 1.0612961053848267,
+      "learning_rate": 0.0005844770642201834,
+      "loss": 4.4441,
       "step": 2500
     },
     {
-      "epoch": 0.27445915402001936,
-      "grad_norm": 1.006108045578003,
-      "learning_rate": 0.0005841611895269906,
-      "loss": 4.3997,
+      "epoch": 0.2749326145552561,
+      "grad_norm": 0.7074645757675171,
+      "learning_rate": 0.000584153264975715,
+      "loss": 4.4268,
       "step": 2550
     },
     {
-      "epoch": 0.2798407060596276,
-      "grad_norm": 0.7756773829460144,
-      "learning_rate": 0.0005838379484969291,
-      "loss": 4.3982,
+      "epoch": 0.2803234501347709,
+      "grad_norm": 0.7893711924552917,
+      "learning_rate": 0.0005838294657312465,
+      "loss": 4.3983,
       "step": 2600
     },
     {
-      "epoch": 0.2852222580992358,
-      "grad_norm": 0.7442255020141602,
-      "learning_rate": 0.0005835147074668678,
-      "loss": 4.3751,
+      "epoch": 0.2857142857142857,
+      "grad_norm": 0.8888121843338013,
+      "learning_rate": 0.0005835056664867782,
+      "loss": 4.4127,
       "step": 2650
     },
     {
-      "epoch": 0.29060381013884407,
-      "grad_norm": 0.7645350694656372,
-      "learning_rate": 0.0005831914664368063,
-      "loss": 4.3873,
+      "epoch": 0.29110512129380056,
+      "grad_norm": 0.8462523221969604,
+      "learning_rate": 0.0005831818672423098,
+      "loss": 4.4129,
       "step": 2700
     },
     {
-      "epoch": 0.2959853621784523,
-      "grad_norm": 0.7028666734695435,
-      "learning_rate": 0.0005828682254067449,
-      "loss": 4.3659,
+      "epoch": 0.29649595687331537,
+      "grad_norm": 0.8949580192565918,
+      "learning_rate": 0.0005828580679978413,
+      "loss": 4.3751,
       "step": 2750
     },
     {
-      "epoch": 0.3013669142180605,
-      "grad_norm": 0.8504071831703186,
-      "learning_rate": 0.0005825449843766835,
-      "loss": 4.3617,
+      "epoch": 0.3018867924528302,
+      "grad_norm": 0.7590197324752808,
+      "learning_rate": 0.0005825342687533729,
+      "loss": 4.3503,
       "step": 2800
     },
     {
-      "epoch": 0.3067484662576687,
-      "grad_norm": 0.880102276802063,
-      "learning_rate": 0.0005822217433466221,
-      "loss": 4.3115,
+      "epoch": 0.30727762803234504,
+      "grad_norm": 0.6843862533569336,
+      "learning_rate": 0.0005822104695089044,
+      "loss": 4.3515,
       "step": 2850
     },
     {
-      "epoch": 0.31213001829727693,
-      "grad_norm": 0.7290977835655212,
-      "learning_rate": 0.0005818985023165607,
-      "loss": 4.3215,
+      "epoch": 0.31266846361185985,
+      "grad_norm": 0.8599426746368408,
+      "learning_rate": 0.000581886670264436,
+      "loss": 4.342,
       "step": 2900
     },
     {
-      "epoch": 0.31751157033688515,
-      "grad_norm": 0.8019910454750061,
-      "learning_rate": 0.0005815752612864992,
-      "loss": 4.3149,
+      "epoch": 0.31805929919137466,
+      "grad_norm": 0.893551766872406,
+      "learning_rate": 0.0005815628710199675,
+      "loss": 4.3359,
       "step": 2950
     },
     {
-      "epoch": 0.32289312237649337,
-      "grad_norm": 0.7739085555076599,
-      "learning_rate": 0.0005812520202564378,
-      "loss": 4.322,
+      "epoch": 0.32345013477088946,
+      "grad_norm": 0.8023635149002075,
+      "learning_rate": 0.0005812390717754992,
+      "loss": 4.3296,
       "step": 3000
     },
     {
-      "epoch": 0.32289312237649337,
-      "eval_accuracy": 0.2992396144952079,
-      "eval_loss": 4.226711750030518,
-      "eval_runtime": 199.2176,
-      "eval_samples_per_second": 90.409,
-      "eval_steps_per_second": 5.652,
+      "epoch": 0.32345013477088946,
+      "eval_accuracy": 0.29539221573769714,
+      "eval_loss": 4.2696380615234375,
+      "eval_runtime": 152.7589,
+      "eval_samples_per_second": 117.905,
+      "eval_steps_per_second": 7.371,
       "step": 3000
     },
     {
-      "epoch": 0.3282746744161016,
-      "grad_norm": 1.0869840383529663,
-      "learning_rate": 0.0005809287792263764,
-      "loss": 4.3072,
+      "epoch": 0.3288409703504043,
+      "grad_norm": 0.6987361907958984,
+      "learning_rate": 0.0005809152725310307,
+      "loss": 4.3208,
       "step": 3050
     },
     {
-      "epoch": 0.3336562264557098,
-      "grad_norm": 0.7337630391120911,
-      "learning_rate": 0.0005806055381963151,
-      "loss": 4.2759,
+      "epoch": 0.33423180592991913,
+      "grad_norm": 0.7668080925941467,
+      "learning_rate": 0.0005805914732865623,
+      "loss": 4.3246,
       "step": 3100
     },
     {
-      "epoch": 0.3390377784953181,
-      "grad_norm": 0.7336916923522949,
-      "learning_rate": 0.0005802822971662536,
-      "loss": 4.2838,
+      "epoch": 0.33962264150943394,
+      "grad_norm": 0.7270567417144775,
+      "learning_rate": 0.0005802676740420938,
+      "loss": 4.2823,
       "step": 3150
     },
     {
-      "epoch": 0.3444193305349263,
-      "grad_norm": 0.7901497483253479,
-      "learning_rate": 0.0005799590561361922,
-      "loss": 4.2757,
+      "epoch": 0.3450134770889488,
+      "grad_norm": 0.6801213622093201,
+      "learning_rate": 0.0005799438747976255,
+      "loss": 4.2811,
       "step": 3200
     },
     {
-      "epoch": 0.3498008825745345,
-      "grad_norm": 0.9568474888801575,
-      "learning_rate": 0.0005796358151061307,
-      "loss": 4.2574,
+      "epoch": 0.3504043126684636,
+      "grad_norm": 0.7790431976318359,
+      "learning_rate": 0.000579620075553157,
+      "loss": 4.28,
       "step": 3250
     },
     {
-      "epoch": 0.35518243461414273,
-      "grad_norm": 0.9219122529029846,
-      "learning_rate": 0.0005793125740760694,
-      "loss": 4.2423,
+      "epoch": 0.3557951482479784,
+      "grad_norm": 0.8592586517333984,
+      "learning_rate": 0.0005792962763086886,
+      "loss": 4.2634,
       "step": 3300
     },
     {
-      "epoch": 0.36056398665375095,
-      "grad_norm": 0.6552030444145203,
-      "learning_rate": 0.0005789893330460079,
-      "loss": 4.2339,
+      "epoch": 0.3611859838274933,
+      "grad_norm": 0.7063053846359253,
+      "learning_rate": 0.0005789724770642201,
+      "loss": 4.2784,
       "step": 3350
     },
     {
-      "epoch": 0.36594553869335916,
-      "grad_norm": 0.6318998336791992,
-      "learning_rate": 0.0005786660920159465,
-      "loss": 4.2253,
+      "epoch": 0.3665768194070081,
+      "grad_norm": 0.7619624733924866,
+      "learning_rate": 0.0005786486778197517,
+      "loss": 4.2507,
       "step": 3400
     },
     {
-      "epoch": 0.3713270907329674,
-      "grad_norm": 0.7610264420509338,
-      "learning_rate": 0.0005783428509858851,
-      "loss": 4.2369,
+      "epoch": 0.3719676549865229,
+      "grad_norm": 0.7230337858200073,
+      "learning_rate": 0.0005783248785752833,
+      "loss": 4.2259,
       "step": 3450
     },
     {
-      "epoch": 0.3767086427725756,
-      "grad_norm": 0.6418978571891785,
-      "learning_rate": 0.0005780196099558237,
-      "loss": 4.2239,
+      "epoch": 0.37735849056603776,
+      "grad_norm": 0.7475656270980835,
+      "learning_rate": 0.0005780010793308148,
+      "loss": 4.2353,
       "step": 3500
     },
     {
-      "epoch": 0.3820901948121838,
-      "grad_norm": 0.7791884541511536,
-      "learning_rate": 0.0005776963689257623,
-      "loss": 4.2253,
+      "epoch": 0.38274932614555257,
+      "grad_norm": 0.7857275009155273,
+      "learning_rate": 0.0005776772800863464,
+      "loss": 4.2317,
       "step": 3550
     },
     {
-      "epoch": 0.38747174685179203,
-      "grad_norm": 0.7014104127883911,
-      "learning_rate": 0.0005773731278957008,
-      "loss": 4.1888,
+      "epoch": 0.3881401617250674,
+      "grad_norm": 0.6113690733909607,
+      "learning_rate": 0.000577353480841878,
+      "loss": 4.2344,
       "step": 3600
     },
     {
-      "epoch": 0.3928532988914003,
-      "grad_norm": 0.7691745758056641,
-      "learning_rate": 0.0005770498868656394,
-      "loss": 4.2055,
+      "epoch": 0.3935309973045822,
+      "grad_norm": 0.8256933093070984,
+      "learning_rate": 0.0005770296815974095,
+      "loss": 4.2054,
       "step": 3650
     },
     {
-      "epoch": 0.3982348509310085,
-      "grad_norm": 0.7330165505409241,
-      "learning_rate": 0.000576726645835578,
-      "loss": 4.1917,
+      "epoch": 0.39892183288409705,
+      "grad_norm": 0.8064551949501038,
+      "learning_rate": 0.0005767058823529411,
+      "loss": 4.2078,
       "step": 3700
     },
     {
-      "epoch": 0.40361640297061674,
-      "grad_norm": 0.6409156918525696,
-      "learning_rate": 0.0005764034048055167,
-      "loss": 4.1739,
+      "epoch": 0.40431266846361186,
+      "grad_norm": 0.6000850200653076,
+      "learning_rate": 0.0005763820831084726,
+      "loss": 4.2117,
       "step": 3750
     },
     {
-      "epoch": 0.40899795501022496,
-      "grad_norm": 0.751600444316864,
-      "learning_rate": 0.0005760801637754552,
-      "loss": 4.1815,
+      "epoch": 0.40970350404312667,
+      "grad_norm": 0.6357919573783875,
+      "learning_rate": 0.0005760582838640043,
+      "loss": 4.205,
       "step": 3800
     },
     {
-      "epoch": 0.4143795070498332,
-      "grad_norm": 0.6341384649276733,
-      "learning_rate": 0.0005757569227453937,
-      "loss": 4.1752,
+      "epoch": 0.41509433962264153,
+      "grad_norm": 0.6840759515762329,
+      "learning_rate": 0.0005757344846195359,
+      "loss": 4.1844,
       "step": 3850
     },
     {
-      "epoch": 0.4197610590894414,
-      "grad_norm": 0.8716601729393005,
-      "learning_rate": 0.0005754336817153324,
-      "loss": 4.1772,
+      "epoch": 0.42048517520215634,
+      "grad_norm": 0.7748379111289978,
+      "learning_rate": 0.0005754106853750674,
+      "loss": 4.1815,
       "step": 3900
     },
     {
-      "epoch": 0.4251426111290496,
-      "grad_norm": 0.7268335223197937,
-      "learning_rate": 0.0005751104406852709,
-      "loss": 4.1579,
+      "epoch": 0.42587601078167114,
+      "grad_norm": 0.6725760698318481,
+      "learning_rate": 0.000575086886130599,
+      "loss": 4.1742,
       "step": 3950
     },
     {
-      "epoch": 0.4305241631686578,
-      "grad_norm": 0.7490660548210144,
-      "learning_rate": 0.0005747871996552096,
-      "loss": 4.1439,
+      "epoch": 0.431266846361186,
+      "grad_norm": 0.8388188481330872,
+      "learning_rate": 0.0005747630868861306,
+      "loss": 4.1751,
       "step": 4000
     },
     {
-      "epoch": 0.4305241631686578,
-      "eval_accuracy": 0.31218462820231296,
-      "eval_loss": 4.092499256134033,
-      "eval_runtime": 199.4467,
-      "eval_samples_per_second": 90.305,
-      "eval_steps_per_second": 5.646,
+      "epoch": 0.431266846361186,
+      "eval_accuracy": 0.31143709633414934,
+      "eval_loss": 4.099897384643555,
+      "eval_runtime": 152.799,
+      "eval_samples_per_second": 117.874,
+      "eval_steps_per_second": 7.369,
       "step": 4000
     },
     {
-      "epoch": 0.43590571520826604,
-      "grad_norm": 0.9424974918365479,
-      "learning_rate": 0.0005744639586251481,
-      "loss": 4.1592,
+      "epoch": 0.4366576819407008,
+      "grad_norm": 0.5372797250747681,
+      "learning_rate": 0.0005744392876416622,
+      "loss": 4.1637,
       "step": 4050
     },
     {
-      "epoch": 0.44128726724787426,
-      "grad_norm": 0.771391749382019,
-      "learning_rate": 0.0005741407175950867,
-      "loss": 4.1455,
+      "epoch": 0.4420485175202156,
+      "grad_norm": 0.7316073775291443,
+      "learning_rate": 0.0005741154883971936,
+      "loss": 4.1648,
       "step": 4100
     },
     {
-      "epoch": 0.44666881928748253,
-      "grad_norm": 0.7916195392608643,
-      "learning_rate": 0.0005738174765650253,
-      "loss": 4.1329,
+      "epoch": 0.4474393530997305,
+      "grad_norm": 0.6784662008285522,
+      "learning_rate": 0.0005737916891527253,
+      "loss": 4.1653,
       "step": 4150
     },
     {
-      "epoch": 0.45205037132709075,
-      "grad_norm": 0.7015873789787292,
-      "learning_rate": 0.0005734942355349638,
-      "loss": 4.1338,
+      "epoch": 0.4528301886792453,
+      "grad_norm": 0.6223421692848206,
+      "learning_rate": 0.0005734678899082568,
+      "loss": 4.1481,
       "step": 4200
     },
     {
-      "epoch": 0.45743192336669897,
-      "grad_norm": 0.62185138463974,
-      "learning_rate": 0.0005731709945049025,
-      "loss": 4.1517,
+      "epoch": 0.4582210242587601,
+      "grad_norm": 0.6360011100769043,
+      "learning_rate": 0.0005731440906637884,
+      "loss": 4.1347,
       "step": 4250
     },
     {
-      "epoch": 0.4628134754063072,
-      "grad_norm": 0.6499165296554565,
-      "learning_rate": 0.000572847753474841,
-      "loss": 4.1249,
+      "epoch": 0.4636118598382749,
+      "grad_norm": 0.9023606181144714,
+      "learning_rate": 0.0005728202914193199,
+      "loss": 4.1318,
       "step": 4300
     },
     {
-      "epoch": 0.4681950274459154,
-      "grad_norm": 0.8142028450965881,
-      "learning_rate": 0.0005725245124447796,
-      "loss": 4.1094,
+      "epoch": 0.46900269541778977,
+      "grad_norm": 0.5858056545257568,
+      "learning_rate": 0.0005724964921748516,
+      "loss": 4.1316,
       "step": 4350
     },
     {
-      "epoch": 0.4735765794855236,
-      "grad_norm": 0.686265230178833,
-      "learning_rate": 0.0005722012714147182,
-      "loss": 4.1156,
+      "epoch": 0.4743935309973046,
+      "grad_norm": 0.7229328751564026,
+      "learning_rate": 0.0005721726929303831,
+      "loss": 4.1268,
       "step": 4400
     },
     {
-      "epoch": 0.47895813152513184,
-      "grad_norm": 0.6861261129379272,
-      "learning_rate": 0.0005718780303846568,
-      "loss": 4.0911,
+      "epoch": 0.4797843665768194,
+      "grad_norm": 0.8891165256500244,
+      "learning_rate": 0.0005718488936859147,
+      "loss": 4.1138,
       "step": 4450
     },
     {
-      "epoch": 0.48433968356474005,
-      "grad_norm": 0.717899739742279,
-      "learning_rate": 0.0005715547893545953,
-      "loss": 4.0984,
+      "epoch": 0.48517520215633425,
+      "grad_norm": 0.6901257038116455,
+      "learning_rate": 0.0005715250944414462,
+      "loss": 4.1243,
       "step": 4500
     },
     {
-      "epoch": 0.48972123560434827,
-      "grad_norm": 0.635183572769165,
-      "learning_rate": 0.000571231548324534,
-      "loss": 4.0915,
+      "epoch": 0.49056603773584906,
+      "grad_norm": 0.6422321200370789,
+      "learning_rate": 0.0005712012951969778,
+      "loss": 4.1152,
       "step": 4550
     },
     {
-      "epoch": 0.49510278764395654,
-      "grad_norm": 0.6602552533149719,
-      "learning_rate": 0.0005709083072944725,
-      "loss": 4.0886,
+      "epoch": 0.49595687331536387,
+      "grad_norm": 0.7133573293685913,
+      "learning_rate": 0.0005708774959525094,
+      "loss": 4.0984,
       "step": 4600
     },
     {
-      "epoch": 0.5004843396835648,
-      "grad_norm": 0.7243141531944275,
-      "learning_rate": 0.0005705850662644111,
-      "loss": 4.0928,
+      "epoch": 0.5013477088948787,
+      "grad_norm": 0.625822126865387,
+      "learning_rate": 0.000570553696708041,
+      "loss": 4.0931,
       "step": 4650
     },
     {
-      "epoch": 0.505865891723173,
-      "grad_norm": 0.6310231685638428,
-      "learning_rate": 0.0005702618252343497,
-      "loss": 4.0986,
+      "epoch": 0.5067385444743935,
+      "grad_norm": 0.6901353001594543,
+      "learning_rate": 0.0005702298974635725,
+      "loss": 4.1064,
       "step": 4700
     },
     {
-      "epoch": 0.5112474437627812,
-      "grad_norm": 0.5868064761161804,
-      "learning_rate": 0.0005699385842042882,
-      "loss": 4.0687,
+      "epoch": 0.5121293800539084,
+      "grad_norm": 0.6352577805519104,
+      "learning_rate": 0.0005699060982191041,
+      "loss": 4.0958,
       "step": 4750
     },
     {
-      "epoch": 0.5166289958023894,
-      "grad_norm": 0.6389384269714355,
-      "learning_rate": 0.0005696153431742269,
-      "loss": 4.0789,
+      "epoch": 0.5175202156334232,
+      "grad_norm": 0.7718018293380737,
+      "learning_rate": 0.0005695822989746357,
+      "loss": 4.0929,
       "step": 4800
     },
     {
-      "epoch": 0.5220105478419976,
-      "grad_norm": 0.7776073217391968,
-      "learning_rate": 0.0005692921021441655,
-      "loss": 4.0977,
+      "epoch": 0.522911051212938,
+      "grad_norm": 0.6640039682388306,
+      "learning_rate": 0.0005692584997301672,
+      "loss": 4.0793,
       "step": 4850
     },
     {
-      "epoch": 0.5273920998816058,
-      "grad_norm": 0.6234896779060364,
-      "learning_rate": 0.0005689688611141041,
-      "loss": 4.0547,
+      "epoch": 0.5283018867924528,
+      "grad_norm": 0.5986649990081787,
+      "learning_rate": 0.0005689347004856988,
+      "loss": 4.0624,
       "step": 4900
     },
     {
-      "epoch": 0.5327736519212141,
-      "grad_norm": 0.8750380277633667,
-      "learning_rate": 0.0005686456200840426,
-      "loss": 4.0718,
+      "epoch": 0.5336927223719676,
+      "grad_norm": 0.6509894132614136,
+      "learning_rate": 0.0005686109012412304,
+      "loss": 4.08,
       "step": 4950
     },
     {
-      "epoch": 0.5381552039608223,
-      "grad_norm": 0.7700529098510742,
-      "learning_rate": 0.0005683223790539811,
-      "loss": 4.043,
+      "epoch": 0.5390835579514824,
+      "grad_norm": 0.5881455540657043,
+      "learning_rate": 0.000568287101996762,
+      "loss": 4.0618,
       "step": 5000
     },
     {
-      "epoch": 0.5381552039608223,
-      "eval_accuracy": 0.3214325100957547,
-      "eval_loss": 3.9926369190216064,
-      "eval_runtime": 225.5059,
-      "eval_samples_per_second": 79.869,
-      "eval_steps_per_second": 4.993,
+      "epoch": 0.5390835579514824,
+      "eval_accuracy": 0.3202667732623931,
+      "eval_loss": 4.000813007354736,
+      "eval_runtime": 152.9398,
+      "eval_samples_per_second": 117.765,
+      "eval_steps_per_second": 7.362,
       "step": 5000
     },
     {
-      "epoch": 0.5435367560004305,
-      "grad_norm": 0.7643857002258301,
-      "learning_rate": 0.0005679991380239198,
-      "loss": 4.0605,
+      "epoch": 0.5444743935309974,
+      "grad_norm": 0.6732229590415955,
+      "learning_rate": 0.0005679633027522935,
+      "loss": 4.068,
       "step": 5050
     },
     {
-      "epoch": 0.5489183080400387,
-      "grad_norm": 0.554442286491394,
-      "learning_rate": 0.0005676758969938584,
-      "loss": 4.0446,
+      "epoch": 0.5498652291105122,
+      "grad_norm": 0.6501848697662354,
+      "learning_rate": 0.000567639503507825,
+      "loss": 4.0611,
       "step": 5100
     },
     {
-      "epoch": 0.5542998600796469,
-      "grad_norm": 0.7216308116912842,
-      "learning_rate": 0.000567352655963797,
-      "loss": 4.0313,
+      "epoch": 0.555256064690027,
+      "grad_norm": 0.6546076536178589,
+      "learning_rate": 0.0005673157042633567,
+      "loss": 4.0601,
       "step": 5150
     },
     {
-      "epoch": 0.5596814121192552,
-      "grad_norm": 0.6057455539703369,
-      "learning_rate": 0.0005670294149337355,
-      "loss": 4.0337,
+      "epoch": 0.5606469002695418,
+      "grad_norm": 0.6216645836830139,
+      "learning_rate": 0.0005669919050188883,
+      "loss": 4.0788,
       "step": 5200
     },
     {
-      "epoch": 0.5650629641588634,
-      "grad_norm": 0.535860002040863,
-      "learning_rate": 0.0005667061739036742,
-      "loss": 4.0392,
+      "epoch": 0.5660377358490566,
+      "grad_norm": 0.6688370704650879,
+      "learning_rate": 0.0005666681057744198,
+      "loss": 4.0529,
       "step": 5250
     },
     {
-      "epoch": 0.5704445161984716,
-      "grad_norm": 0.647204577922821,
-      "learning_rate": 0.0005663829328736127,
-      "loss": 4.0421,
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.7299574613571167,
+      "learning_rate": 0.0005663443065299514,
+      "loss": 4.0448,
       "step": 5300
     },
     {
-      "epoch": 0.5758260682380799,
-      "grad_norm": 0.5126392841339111,
-      "learning_rate": 0.0005660596918435512,
-      "loss": 4.0318,
+      "epoch": 0.5768194070080862,
+      "grad_norm": 0.6612991094589233,
+      "learning_rate": 0.000566020507285483,
+      "loss": 4.0386,
       "step": 5350
     },
     {
-      "epoch": 0.5812076202776881,
-      "grad_norm": 0.6040687561035156,
-      "learning_rate": 0.0005657364508134899,
-      "loss": 4.0117,
+      "epoch": 0.5822102425876011,
+      "grad_norm": 0.5907670855522156,
+      "learning_rate": 0.0005656967080410146,
+      "loss": 4.0478,
       "step": 5400
     },
     {
-      "epoch": 0.5865891723172963,
-      "grad_norm": 0.6971921324729919,
-      "learning_rate": 0.0005654132097834284,
-      "loss": 4.025,
+      "epoch": 0.5876010781671159,
+      "grad_norm": 0.539881706237793,
+      "learning_rate": 0.000565372908796546,
+      "loss": 4.0481,
       "step": 5450
     },
     {
-      "epoch": 0.5919707243569046,
-      "grad_norm": 0.5343753099441528,
-      "learning_rate": 0.0005650899687533671,
-      "loss": 4.015,
+      "epoch": 0.5929919137466307,
+      "grad_norm": 0.6923815608024597,
+      "learning_rate": 0.0005650491095520777,
+      "loss": 4.028,
       "step": 5500
     },
     {
-      "epoch": 0.5973522763965128,
-      "grad_norm": 0.5600801706314087,
-      "learning_rate": 0.0005647667277233056,
-      "loss": 4.0087,
+      "epoch": 0.5983827493261455,
+      "grad_norm": 0.6604423522949219,
+      "learning_rate": 0.0005647253103076092,
+      "loss": 4.0191,
       "step": 5550
     },
     {
-      "epoch": 0.602733828436121,
-      "grad_norm": 0.6542683243751526,
-      "learning_rate": 0.0005644434866932442,
-      "loss": 3.9957,
+      "epoch": 0.6037735849056604,
+      "grad_norm": 0.6762202382087708,
+      "learning_rate": 0.0005644015110631408,
+      "loss": 4.038,
       "step": 5600
     },
     {
-      "epoch": 0.6081153804757292,
-      "grad_norm": 0.6249606609344482,
-      "learning_rate": 0.0005641202456631828,
-      "loss": 4.0136,
+      "epoch": 0.6091644204851752,
+      "grad_norm": 0.6797250509262085,
+      "learning_rate": 0.0005640777118186723,
+      "loss": 4.015,
       "step": 5650
     },
     {
-      "epoch": 0.6134969325153374,
-      "grad_norm": 0.6572969555854797,
-      "learning_rate": 0.0005637970046331214,
-      "loss": 4.0046,
+      "epoch": 0.6145552560646901,
+      "grad_norm": 0.5718669891357422,
+      "learning_rate": 0.000563753912574204,
+      "loss": 4.0142,
       "step": 5700
     },
     {
-      "epoch": 0.6188784845549457,
-      "grad_norm": 0.6926669478416443,
-      "learning_rate": 0.00056347376360306,
-      "loss": 3.9869,
+      "epoch": 0.6199460916442049,
+      "grad_norm": 0.6036492586135864,
+      "learning_rate": 0.0005634301133297355,
+      "loss": 3.9904,
       "step": 5750
     },
     {
-      "epoch": 0.6242600365945539,
-      "grad_norm": 0.6491366624832153,
-      "learning_rate": 0.0005631505225729985,
-      "loss": 4.0004,
+      "epoch": 0.6253369272237197,
+      "grad_norm": 0.6295326352119446,
+      "learning_rate": 0.0005631063140852671,
+      "loss": 3.9973,
       "step": 5800
     },
     {
-      "epoch": 0.6296415886341621,
-      "grad_norm": 0.6138956546783447,
-      "learning_rate": 0.0005628272815429371,
-      "loss": 3.9814,
+      "epoch": 0.6307277628032345,
+      "grad_norm": 0.6866003274917603,
+      "learning_rate": 0.0005627825148407986,
+      "loss": 4.0086,
       "step": 5850
     },
     {
-      "epoch": 0.6350231406737703,
-      "grad_norm": 0.6467788219451904,
-      "learning_rate": 0.0005625040405128757,
-      "loss": 3.9771,
+      "epoch": 0.6361185983827493,
+      "grad_norm": 0.6472943425178528,
+      "learning_rate": 0.0005624587155963302,
+      "loss": 4.0048,
       "step": 5900
     },
     {
-      "epoch": 0.6404046927133785,
-      "grad_norm": 0.6469590067863464,
-      "learning_rate": 0.0005621807994828143,
-      "loss": 3.9882,
+      "epoch": 0.6415094339622641,
+      "grad_norm": 0.6805757284164429,
+      "learning_rate": 0.0005621349163518618,
+      "loss": 3.9913,
       "step": 5950
     },
     {
-      "epoch": 0.6457862447529867,
-      "grad_norm": 0.7396846413612366,
-      "learning_rate": 0.0005618575584527529,
-      "loss": 3.9759,
+      "epoch": 0.6469002695417789,
+      "grad_norm": 0.6941430568695068,
+      "learning_rate": 0.0005618111171073934,
+      "loss": 4.0077,
       "step": 6000
     },
     {
-      "epoch": 0.6457862447529867,
-      "eval_accuracy": 0.32833664054615025,
-      "eval_loss": 3.919649839401245,
-      "eval_runtime": 200.1239,
-      "eval_samples_per_second": 89.999,
-      "eval_steps_per_second": 5.627,
+      "epoch": 0.6469002695417789,
+      "eval_accuracy": 0.3263130891634934,
+      "eval_loss": 3.9324727058410645,
+      "eval_runtime": 152.9066,
+      "eval_samples_per_second": 117.791,
+      "eval_steps_per_second": 7.364,
       "step": 6000
     },
     {
-      "epoch": 0.651167796792595,
-      "grad_norm": 0.7368170619010925,
-      "learning_rate": 0.0005615343174226915,
+      "epoch": 0.6522911051212938,
+      "grad_norm": 0.6350868344306946,
+      "learning_rate": 0.0005614873178629249,
       "loss": 3.9789,
       "step": 6050
     },
     {
-      "epoch": 0.6565493488322032,
-      "grad_norm": 0.593928873538971,
-      "learning_rate": 0.00056121107639263,
-      "loss": 3.962,
+      "epoch": 0.6576819407008087,
+      "grad_norm": 0.6860512495040894,
+      "learning_rate": 0.0005611635186184565,
+      "loss": 4.0102,
       "step": 6100
     },
     {
-      "epoch": 0.6619309008718114,
-      "grad_norm": 0.5571395754814148,
-      "learning_rate": 0.0005608878353625687,
-      "loss": 3.9732,
+      "epoch": 0.6630727762803235,
+      "grad_norm": 0.771823525428772,
+      "learning_rate": 0.0005608397193739882,
+      "loss": 3.9907,
       "step": 6150
     },
     {
-      "epoch": 0.6673124529114196,
-      "grad_norm": 0.6184373497962952,
-      "learning_rate": 0.0005605645943325072,
-      "loss": 3.9618,
+      "epoch": 0.6684636118598383,
+      "grad_norm": 0.6326267123222351,
+      "learning_rate": 0.0005605159201295196,
+      "loss": 3.9633,
       "step": 6200
     },
     {
-      "epoch": 0.6726940049510278,
-      "grad_norm": 0.6731172800064087,
-      "learning_rate": 0.0005602413533024458,
-      "loss": 3.9629,
+      "epoch": 0.6738544474393531,
+      "grad_norm": 0.5878480672836304,
+      "learning_rate": 0.0005601921208850511,
+      "loss": 3.9821,
       "step": 6250
     },
     {
-      "epoch": 0.6780755569906362,
-      "grad_norm": Infinity,
-      "learning_rate": 0.0005599245770929855,
-      "loss": 3.9661,
+      "epoch": 0.6792452830188679,
+      "grad_norm": 0.6786486506462097,
+      "learning_rate": 0.0005598683216405828,
+      "loss": 3.9856,
       "step": 6300
     },
     {
-      "epoch": 0.6834571090302444,
-      "grad_norm": 0.6674894690513611,
-      "learning_rate": 0.0005596013360629242,
-      "loss": 3.9598,
+      "epoch": 0.6846361185983828,
+      "grad_norm": 0.5655982494354248,
+      "learning_rate": 0.0005595445223961144,
+      "loss": 3.9732,
       "step": 6350
     },
     {
-      "epoch": 0.6888386610698526,
-      "grad_norm": 0.6270789504051208,
-      "learning_rate": 0.0005592780950328628,
-      "loss": 3.9443,
+      "epoch": 0.6900269541778976,
+      "grad_norm": 0.6204545497894287,
+      "learning_rate": 0.0005592207231516459,
+      "loss": 3.9792,
       "step": 6400
     },
     {
-      "epoch": 0.6942202131094608,
-      "grad_norm": 0.6163814067840576,
-      "learning_rate": 0.0005589548540028014,
-      "loss": 3.9506,
+      "epoch": 0.6954177897574124,
+      "grad_norm": 0.718676745891571,
+      "learning_rate": 0.0005588969239071775,
+      "loss": 3.9757,
       "step": 6450
     },
     {
-      "epoch": 0.699601765149069,
-      "grad_norm": 0.6060782670974731,
-      "learning_rate": 0.0005586316129727399,
-      "loss": 3.9645,
+      "epoch": 0.7008086253369272,
+      "grad_norm": 0.5987905263900757,
+      "learning_rate": 0.0005585731246627091,
+      "loss": 3.9713,
       "step": 6500
     },
     {
-      "epoch": 0.7049833171886772,
-      "grad_norm": 0.6246522068977356,
-      "learning_rate": 0.0005583083719426786,
-      "loss": 3.9614,
+      "epoch": 0.706199460916442,
+      "grad_norm": 0.6304261684417725,
+      "learning_rate": 0.0005582493254182407,
+      "loss": 3.9699,
       "step": 6550
     },
     {
-      "epoch": 0.7103648692282855,
-      "grad_norm": 0.603921115398407,
-      "learning_rate": 0.0005579851309126171,
-      "loss": 3.94,
+      "epoch": 0.7115902964959568,
+      "grad_norm": 0.5621398091316223,
+      "learning_rate": 0.0005579255261737722,
+      "loss": 3.9707,
       "step": 6600
     },
     {
-      "epoch": 0.7157464212678937,
-      "grad_norm": 0.5655505061149597,
-      "learning_rate": 0.0005576618898825558,
-      "loss": 3.9318,
+      "epoch": 0.7169811320754716,
+      "grad_norm": 0.637505829334259,
+      "learning_rate": 0.0005576017269293038,
+      "loss": 3.9813,
       "step": 6650
     },
     {
-      "epoch": 0.7211279733075019,
-      "grad_norm": 0.604542076587677,
-      "learning_rate": 0.0005573386488524943,
-      "loss": 3.9307,
+      "epoch": 0.7223719676549866,
+      "grad_norm": 0.6425164341926575,
+      "learning_rate": 0.0005572779276848353,
+      "loss": 3.9482,
       "step": 6700
     },
     {
-      "epoch": 0.7265095253471101,
-      "grad_norm": 0.600004255771637,
-      "learning_rate": 0.0005570154078224328,
-      "loss": 3.9442,
+      "epoch": 0.7277628032345014,
+      "grad_norm": 0.624649703502655,
+      "learning_rate": 0.000556954128440367,
+      "loss": 3.9741,
       "step": 6750
     },
     {
-      "epoch": 0.7318910773867183,
-      "grad_norm": 0.6960250735282898,
-      "learning_rate": 0.0005566921667923715,
-      "loss": 3.9425,
+      "epoch": 0.7331536388140162,
+      "grad_norm": 0.6176960468292236,
+      "learning_rate": 0.0005566303291958984,
+      "loss": 3.9436,
       "step": 6800
     },
     {
-      "epoch": 0.7372726294263265,
-      "grad_norm": 0.5619109869003296,
-      "learning_rate": 0.00055636892576231,
-      "loss": 3.9073,
+      "epoch": 0.738544474393531,
+      "grad_norm": 0.6119649410247803,
+      "learning_rate": 0.0005563065299514301,
+      "loss": 3.9442,
       "step": 6850
     },
     {
-      "epoch": 0.7426541814659348,
-      "grad_norm": 0.6036113500595093,
-      "learning_rate": 0.0005560456847322487,
-      "loss": 3.9305,
+      "epoch": 0.7439353099730458,
+      "grad_norm": 0.5820748805999756,
+      "learning_rate": 0.0005559827307069616,
+      "loss": 3.9463,
       "step": 6900
     },
     {
-      "epoch": 0.748035733505543,
-      "grad_norm": 0.6234127283096313,
-      "learning_rate": 0.0005557224437021872,
-      "loss": 3.9285,
+      "epoch": 0.7493261455525606,
+      "grad_norm": 0.6512102484703064,
+      "learning_rate": 0.0005556589314624932,
+      "loss": 3.9498,
       "step": 6950
     },
     {
-      "epoch": 0.7534172855451512,
-      "grad_norm": 0.6131523251533508,
-      "learning_rate": 0.0005553992026721258,
-      "loss": 3.922,
+      "epoch": 0.7547169811320755,
+      "grad_norm": 0.548623263835907,
+      "learning_rate": 0.0005553351322180247,
+      "loss": 3.9306,
       "step": 7000
     },
     {
-      "epoch": 0.7534172855451512,
-      "eval_accuracy": 0.3334209437785411,
-      "eval_loss": 3.861107349395752,
-      "eval_runtime": 205.1306,
-      "eval_samples_per_second": 87.803,
-      "eval_steps_per_second": 5.489,
+      "epoch": 0.7547169811320755,
+      "eval_accuracy": 0.3323316985782009,
+      "eval_loss": 3.8741683959960938,
+      "eval_runtime": 152.843,
+      "eval_samples_per_second": 117.84,
+      "eval_steps_per_second": 7.367,
       "step": 7000
     },
     {
-      "epoch": 0.7587988375847594,
-      "grad_norm": 0.6058487296104431,
-      "learning_rate": 0.0005550759616420644,
-      "loss": 3.9083,
+      "epoch": 0.7601078167115903,
+      "grad_norm": 0.6566927433013916,
+      "learning_rate": 0.0005550113329735564,
+      "loss": 3.9435,
       "step": 7050
     },
     {
-      "epoch": 0.7641803896243676,
-      "grad_norm": 0.624620258808136,
-      "learning_rate": 0.000554752720612003,
-      "loss": 3.9349,
+      "epoch": 0.7654986522911051,
+      "grad_norm": 0.8524190187454224,
+      "learning_rate": 0.0005546875337290879,
+      "loss": 3.9177,
       "step": 7100
     },
     {
-      "epoch": 0.7695619416639758,
-      "grad_norm": 0.5525732040405273,
-      "learning_rate": 0.0005544294795819415,
-      "loss": 3.9261,
+      "epoch": 0.77088948787062,
+      "grad_norm": 0.6614950299263,
+      "learning_rate": 0.0005543637344846195,
+      "loss": 3.9448,
       "step": 7150
     },
     {
-      "epoch": 0.7749434937035841,
-      "grad_norm": 0.5950748324394226,
-      "learning_rate": 0.0005541062385518801,
-      "loss": 3.9067,
+      "epoch": 0.7762803234501348,
+      "grad_norm": 0.5254852175712585,
+      "learning_rate": 0.000554039935240151,
+      "loss": 3.9164,
       "step": 7200
     },
     {
-      "epoch": 0.7803250457431924,
-      "grad_norm": 0.5052813291549683,
-      "learning_rate": 0.0005537829975218188,
-      "loss": 3.9004,
+      "epoch": 0.7816711590296496,
+      "grad_norm": 0.6216305494308472,
+      "learning_rate": 0.0005537161359956826,
+      "loss": 3.909,
       "step": 7250
     },
     {
-      "epoch": 0.7857065977828006,
-      "grad_norm": 0.5610913038253784,
-      "learning_rate": 0.0005534662213123586,
-      "loss": 3.8911,
+      "epoch": 0.7870619946091644,
+      "grad_norm": 0.6032219529151917,
+      "learning_rate": 0.0005533923367512143,
+      "loss": 3.9232,
       "step": 7300
     },
     {
-      "epoch": 0.7910881498224088,
-      "grad_norm": 0.5639758110046387,
-      "learning_rate": 0.0005531429802822971,
-      "loss": 3.931,
+      "epoch": 0.7924528301886793,
+      "grad_norm": 0.6744025349617004,
+      "learning_rate": 0.0005530685375067458,
+      "loss": 3.9224,
       "step": 7350
     },
     {
-      "epoch": 0.796469701862017,
-      "grad_norm": 0.6221727728843689,
-      "learning_rate": 0.0005528197392522357,
-      "loss": 3.9288,
+      "epoch": 0.7978436657681941,
+      "grad_norm": 0.568401038646698,
+      "learning_rate": 0.0005527447382622774,
+      "loss": 3.918,
       "step": 7400
     },
     {
-      "epoch": 0.8018512539016253,
-      "grad_norm": 0.5807225108146667,
-      "learning_rate": 0.0005524964982221743,
-      "loss": 3.9206,
+      "epoch": 0.8032345013477089,
+      "grad_norm": 0.5871582627296448,
+      "learning_rate": 0.0005524209390178089,
+      "loss": 3.8955,
       "step": 7450
     },
     {
-      "epoch": 0.8072328059412335,
-      "grad_norm": 0.6214258074760437,
-      "learning_rate": 0.0005521732571921129,
-      "loss": 3.9216,
+      "epoch": 0.8086253369272237,
+      "grad_norm": 0.641424834728241,
+      "learning_rate": 0.0005520971397733406,
+      "loss": 3.9128,
       "step": 7500
     },
     {
-      "epoch": 0.8126143579808417,
-      "grad_norm": 0.6088549494743347,
-      "learning_rate": 0.0005518500161620514,
-      "loss": 3.9102,
+      "epoch": 0.8140161725067385,
+      "grad_norm": 0.60320645570755,
+      "learning_rate": 0.000551773340528872,
+      "loss": 3.908,
       "step": 7550
     },
     {
-      "epoch": 0.8179959100204499,
-      "grad_norm": 0.615315318107605,
-      "learning_rate": 0.00055152677513199,
-      "loss": 3.8932,
+      "epoch": 0.8194070080862533,
+      "grad_norm": 0.6258028149604797,
+      "learning_rate": 0.0005514495412844036,
+      "loss": 3.9018,
       "step": 7600
     },
     {
-      "epoch": 0.8233774620600581,
-      "grad_norm": 0.5932325124740601,
-      "learning_rate": 0.0005512035341019286,
-      "loss": 3.9037,
+      "epoch": 0.8247978436657682,
+      "grad_norm": 0.5553128123283386,
+      "learning_rate": 0.0005511257420399352,
+      "loss": 3.9058,
       "step": 7650
     },
     {
-      "epoch": 0.8287590140996663,
-      "grad_norm": 0.632127583026886,
-      "learning_rate": 0.0005508802930718672,
-      "loss": 3.9074,
+      "epoch": 0.8301886792452831,
+      "grad_norm": 0.6213424801826477,
+      "learning_rate": 0.0005508019427954668,
+      "loss": 3.9005,
       "step": 7700
     },
     {
-      "epoch": 0.8341405661392746,
-      "grad_norm": 0.5906286239624023,
-      "learning_rate": 0.0005505570520418058,
-      "loss": 3.8969,
+      "epoch": 0.8355795148247979,
+      "grad_norm": 0.6496004462242126,
+      "learning_rate": 0.0005504781435509983,
+      "loss": 3.9025,
       "step": 7750
     },
     {
-      "epoch": 0.8395221181788828,
-      "grad_norm": 0.7419958114624023,
-      "learning_rate": 0.0005502338110117443,
-      "loss": 3.911,
+      "epoch": 0.8409703504043127,
+      "grad_norm": 0.6373844146728516,
+      "learning_rate": 0.0005501543443065299,
+      "loss": 3.8844,
       "step": 7800
     },
     {
-      "epoch": 0.844903670218491,
-      "grad_norm": 0.5959650874137878,
-      "learning_rate": 0.000549910569981683,
-      "loss": 3.885,
+      "epoch": 0.8463611859838275,
+      "grad_norm": 0.5957589745521545,
+      "learning_rate": 0.0005498305450620615,
+      "loss": 3.8902,
       "step": 7850
     },
     {
-      "epoch": 0.8502852222580992,
-      "grad_norm": 0.5370450615882874,
-      "learning_rate": 0.0005495873289516215,
-      "loss": 3.8883,
+      "epoch": 0.8517520215633423,
+      "grad_norm": 0.6516701579093933,
+      "learning_rate": 0.0005495067458175931,
+      "loss": 3.8876,
       "step": 7900
     },
     {
-      "epoch": 0.8556667742977074,
-      "grad_norm": 0.6142526865005493,
-      "learning_rate": 0.0005492640879215602,
-      "loss": 3.8891,
+      "epoch": 0.8571428571428571,
+      "grad_norm": 0.6374366283416748,
+      "learning_rate": 0.0005491829465731246,
+      "loss": 3.9173,
       "step": 7950
     },
     {
-      "epoch": 0.8610483263373157,
-      "grad_norm": 0.5250054001808167,
-      "learning_rate": 0.0005489408468914987,
-      "loss": 3.8731,
+      "epoch": 0.862533692722372,
+      "grad_norm": 0.5943326354026794,
+      "learning_rate": 0.0005488591473286562,
+      "loss": 3.8636,
       "step": 8000
     },
     {
-      "epoch": 0.8610483263373157,
-      "eval_accuracy": 0.3380041398923315,
-      "eval_loss": 3.814573049545288,
-      "eval_runtime": 218.7029,
-      "eval_samples_per_second": 82.354,
-      "eval_steps_per_second": 5.149,
+      "epoch": 0.862533692722372,
+      "eval_accuracy": 0.3369106572293666,
+      "eval_loss": 3.824968099594116,
+      "eval_runtime": 152.7242,
+      "eval_samples_per_second": 117.932,
+      "eval_steps_per_second": 7.373,
       "step": 8000
     },
     {
-      "epoch": 0.8664298783769239,
-      "grad_norm": 0.5592741370201111,
-      "learning_rate": 0.0005486176058614372,
-      "loss": 3.8661,
+      "epoch": 0.8679245283018868,
+      "grad_norm": 0.6996535062789917,
+      "learning_rate": 0.0005485353480841877,
+      "loss": 3.906,
       "step": 8050
     },
     {
-      "epoch": 0.8718114304165321,
-      "grad_norm": 0.558358371257782,
-      "learning_rate": 0.0005482943648313759,
-      "loss": 3.8665,
+      "epoch": 0.8733153638814016,
+      "grad_norm": 0.6234637498855591,
+      "learning_rate": 0.0005482115488397194,
+      "loss": 3.8831,
       "step": 8100
     },
     {
-      "epoch": 0.8771929824561403,
-      "grad_norm": 0.6065065860748291,
-      "learning_rate": 0.0005479711238013145,
-      "loss": 3.8815,
+      "epoch": 0.8787061994609164,
+      "grad_norm": 0.6020538806915283,
+      "learning_rate": 0.0005478877495952508,
+      "loss": 3.8864,
       "step": 8150
     },
     {
-      "epoch": 0.8825745344957485,
-      "grad_norm": 0.566906750202179,
-      "learning_rate": 0.0005476478827712531,
-      "loss": 3.8788,
+      "epoch": 0.8840970350404312,
+      "grad_norm": 0.5640770792961121,
+      "learning_rate": 0.0005475639503507825,
+      "loss": 3.8799,
       "step": 8200
     },
     {
-      "epoch": 0.8879560865353568,
-      "grad_norm": 0.5542231202125549,
-      "learning_rate": 0.0005473246417411916,
-      "loss": 3.8757,
+      "epoch": 0.889487870619946,
+      "grad_norm": 0.6602328419685364,
+      "learning_rate": 0.000547240151106314,
+      "loss": 3.8788,
       "step": 8250
     },
     {
-      "epoch": 0.8933376385749651,
-      "grad_norm": 0.5650736093521118,
-      "learning_rate": 0.0005470014007111302,
-      "loss": 3.867,
+      "epoch": 0.894878706199461,
+      "grad_norm": 0.6336698532104492,
+      "learning_rate": 0.0005469163518618456,
+      "loss": 3.8969,
       "step": 8300
     },
     {
-      "epoch": 0.8987191906145733,
-      "grad_norm": 0.6359168887138367,
-      "learning_rate": 0.0005466781596810688,
-      "loss": 3.8704,
+      "epoch": 0.9002695417789758,
+      "grad_norm": 0.562479555606842,
+      "learning_rate": 0.0005465925526173771,
+      "loss": 3.8686,
       "step": 8350
     },
     {
-      "epoch": 0.9041007426541815,
-      "grad_norm": 0.5609973669052124,
-      "learning_rate": 0.0005463549186510073,
-      "loss": 3.8704,
+      "epoch": 0.9056603773584906,
+      "grad_norm": 0.5576388835906982,
+      "learning_rate": 0.0005462687533729087,
+      "loss": 3.8729,
       "step": 8400
     },
     {
-      "epoch": 0.9094822946937897,
-      "grad_norm": 0.5966047048568726,
-      "learning_rate": 0.000546031677620946,
-      "loss": 3.8649,
+      "epoch": 0.9110512129380054,
+      "grad_norm": 0.5111304521560669,
+      "learning_rate": 0.0005459449541284403,
+      "loss": 3.8679,
       "step": 8450
     },
     {
-      "epoch": 0.9148638467333979,
-      "grad_norm": 0.555415689945221,
-      "learning_rate": 0.0005457084365908845,
-      "loss": 3.8775,
+      "epoch": 0.9164420485175202,
+      "grad_norm": 0.6154835224151611,
+      "learning_rate": 0.0005456211548839719,
+      "loss": 3.8749,
       "step": 8500
     },
     {
-      "epoch": 0.9202453987730062,
-      "grad_norm": 0.5719990134239197,
-      "learning_rate": 0.0005453851955608232,
-      "loss": 3.8601,
+      "epoch": 0.921832884097035,
+      "grad_norm": 0.5426989197731018,
+      "learning_rate": 0.0005452973556395034,
+      "loss": 3.8685,
       "step": 8550
     },
     {
-      "epoch": 0.9256269508126144,
-      "grad_norm": 0.5464230179786682,
-      "learning_rate": 0.0005450619545307617,
-      "loss": 3.8599,
+      "epoch": 0.9272237196765498,
+      "grad_norm": 0.5612382888793945,
+      "learning_rate": 0.000544973556395035,
+      "loss": 3.8747,
       "step": 8600
     },
     {
-      "epoch": 0.9310085028522226,
-      "grad_norm": 0.6069023013114929,
-      "learning_rate": 0.0005447387135007003,
-      "loss": 3.8584,
+      "epoch": 0.9326145552560647,
+      "grad_norm": 0.5568158626556396,
+      "learning_rate": 0.0005446497571505667,
+      "loss": 3.8547,
       "step": 8650
     },
     {
-      "epoch": 0.9363900548918308,
-      "grad_norm": 0.556610643863678,
-      "learning_rate": 0.0005444154724706389,
-      "loss": 3.8636,
+      "epoch": 0.9380053908355795,
+      "grad_norm": 0.6064712405204773,
+      "learning_rate": 0.0005443259579060982,
+      "loss": 3.8589,
       "step": 8700
     },
     {
-      "epoch": 0.941771606931439,
-      "grad_norm": 0.5988168120384216,
-      "learning_rate": 0.0005440922314405775,
-      "loss": 3.8366,
+      "epoch": 0.9433962264150944,
+      "grad_norm": 0.5729287266731262,
+      "learning_rate": 0.0005440021586616298,
+      "loss": 3.8548,
       "step": 8750
     },
     {
-      "epoch": 0.9471531589710472,
-      "grad_norm": 0.5634347200393677,
-      "learning_rate": 0.0005437689904105161,
-      "loss": 3.8604,
+      "epoch": 0.9487870619946092,
+      "grad_norm": 0.5489872694015503,
+      "learning_rate": 0.0005436783594171613,
+      "loss": 3.8599,
       "step": 8800
     },
     {
-      "epoch": 0.9525347110106555,
-      "grad_norm": 0.5349180102348328,
-      "learning_rate": 0.0005434457493804546,
-      "loss": 3.8395,
+      "epoch": 0.954177897574124,
+      "grad_norm": 0.6285766959190369,
+      "learning_rate": 0.0005433545601726929,
+      "loss": 3.8606,
       "step": 8850
     },
     {
-      "epoch": 0.9579162630502637,
-      "grad_norm": 0.5537461042404175,
-      "learning_rate": 0.0005431225083503932,
-      "loss": 3.8644,
+      "epoch": 0.9595687331536388,
+      "grad_norm": 0.566647469997406,
+      "learning_rate": 0.0005430307609282244,
+      "loss": 3.8472,
       "step": 8900
     },
     {
-      "epoch": 0.9632978150898719,
-      "grad_norm": 0.5711461901664734,
-      "learning_rate": 0.0005427992673203318,
-      "loss": 3.8352,
+      "epoch": 0.9649595687331537,
+      "grad_norm": 0.5826287269592285,
+      "learning_rate": 0.000542706961683756,
+      "loss": 3.8404,
       "step": 8950
     },
     {
-      "epoch": 0.9686793671294801,
-      "grad_norm": 0.6128414273262024,
-      "learning_rate": 0.0005424760262902704,
-      "loss": 3.8405,
+      "epoch": 0.9703504043126685,
+      "grad_norm": 0.6150389313697815,
+      "learning_rate": 0.0005423831624392876,
+      "loss": 3.8653,
       "step": 9000
     },
     {
-      "epoch": 0.9686793671294801,
-      "eval_accuracy": 0.3417073562677125,
-      "eval_loss": 3.779106378555298,
-      "eval_runtime": 210.7209,
-      "eval_samples_per_second": 85.473,
-      "eval_steps_per_second": 5.344,
+      "epoch": 0.9703504043126685,
+      "eval_accuracy": 0.3401243923451433,
+      "eval_loss": 3.788287878036499,
+      "eval_runtime": 153.2128,
+      "eval_samples_per_second": 117.555,
+      "eval_steps_per_second": 7.349,
       "step": 9000
     },
     {
-      "epoch": 0.9740609191690883,
-      "grad_norm": 0.6156434416770935,
-      "learning_rate": 0.000542152785260209,
-      "loss": 3.8333,
+      "epoch": 0.9757412398921833,
+      "grad_norm": 0.6907781958580017,
+      "learning_rate": 0.0005420593631948192,
+      "loss": 3.8789,
       "step": 9050
     },
     {
-      "epoch": 0.9794424712086965,
-      "grad_norm": 0.5652556419372559,
-      "learning_rate": 0.0005418295442301476,
-      "loss": 3.8327,
+      "epoch": 0.9811320754716981,
+      "grad_norm": 0.5165227055549622,
+      "learning_rate": 0.0005417355639503507,
+      "loss": 3.8372,
       "step": 9100
     },
     {
-      "epoch": 0.9848240232483048,
-      "grad_norm": 0.6443074941635132,
-      "learning_rate": 0.0005415063032000861,
-      "loss": 3.842,
+      "epoch": 0.9865229110512129,
+      "grad_norm": 0.563310980796814,
+      "learning_rate": 0.0005414117647058823,
+      "loss": 3.848,
       "step": 9150
     },
     {
-      "epoch": 0.9902055752879131,
-      "grad_norm": 0.632297158241272,
-      "learning_rate": 0.0005411830621700248,
-      "loss": 3.8312,
+      "epoch": 0.9919137466307277,
+      "grad_norm": 0.5229910016059875,
+      "learning_rate": 0.0005410879654614139,
+      "loss": 3.8388,
       "step": 9200
     },
     {
-      "epoch": 0.9955871273275213,
-      "grad_norm": 0.5863285064697266,
-      "learning_rate": 0.0005408598211399633,
-      "loss": 3.8298,
+      "epoch": 0.9973045822102425,
+      "grad_norm": 0.5532335638999939,
+      "learning_rate": 0.0005407641662169455,
+      "loss": 3.8559,
       "step": 9250
     },
     {
-      "epoch": 1.0009686793671295,
-      "grad_norm": 0.5393086671829224,
-      "learning_rate": 0.0005405365801099019,
-      "loss": 3.8069,
+      "epoch": 1.0026954177897573,
+      "grad_norm": 0.5877416729927063,
+      "learning_rate": 0.000540440366972477,
+      "loss": 3.8053,
       "step": 9300
     },
     {
-      "epoch": 1.0063502314067376,
-      "grad_norm": 0.5285312533378601,
-      "learning_rate": 0.0005402133390798405,
-      "loss": 3.7619,
+      "epoch": 1.0080862533692723,
+      "grad_norm": 0.5423709154129028,
+      "learning_rate": 0.0005401165677280086,
+      "loss": 3.7729,
       "step": 9350
     },
     {
-      "epoch": 1.011731783446346,
-      "grad_norm": 0.5921708941459656,
-      "learning_rate": 0.000539890098049779,
-      "loss": 3.7701,
+      "epoch": 1.013477088948787,
+      "grad_norm": 0.6410120725631714,
+      "learning_rate": 0.0005397927684835401,
+      "loss": 3.7655,
       "step": 9400
     },
     {
-      "epoch": 1.017113335485954,
-      "grad_norm": 0.5384525060653687,
-      "learning_rate": 0.0005395668570197177,
-      "loss": 3.7657,
+      "epoch": 1.0188679245283019,
+      "grad_norm": 0.5875224471092224,
+      "learning_rate": 0.0005394689692390718,
+      "loss": 3.7643,
       "step": 9450
     },
     {
-      "epoch": 1.0224948875255624,
-      "grad_norm": 0.5624929666519165,
-      "learning_rate": 0.0005392436159896562,
-      "loss": 3.7754,
+      "epoch": 1.0242587601078168,
+      "grad_norm": 0.5834726095199585,
+      "learning_rate": 0.0005391451699946032,
+      "loss": 3.7698,
       "step": 9500
     },
     {
-      "epoch": 1.0278764395651705,
-      "grad_norm": 0.5882208943367004,
-      "learning_rate": 0.0005389203749595948,
-      "loss": 3.7848,
+      "epoch": 1.0296495956873315,
+      "grad_norm": 0.5419567227363586,
+      "learning_rate": 0.0005388213707501349,
+      "loss": 3.7799,
       "step": 9550
     },
     {
-      "epoch": 1.0332579916047788,
-      "grad_norm": 0.5588924884796143,
-      "learning_rate": 0.0005385971339295334,
-      "loss": 3.7603,
+      "epoch": 1.0350404312668464,
+      "grad_norm": 0.6511868238449097,
+      "learning_rate": 0.0005384975715056664,
+      "loss": 3.7784,
       "step": 9600
     },
     {
-      "epoch": 1.0386395436443872,
-      "grad_norm": 0.5837873220443726,
-      "learning_rate": 0.000538273892899472,
-      "loss": 3.7785,
+      "epoch": 1.0404312668463611,
+      "grad_norm": 0.6302853226661682,
+      "learning_rate": 0.000538173772261198,
+      "loss": 3.7842,
       "step": 9650
     },
     {
-      "epoch": 1.0440210956839953,
-      "grad_norm": 0.5880899429321289,
-      "learning_rate": 0.0005379506518694106,
-      "loss": 3.7506,
+      "epoch": 1.045822102425876,
+      "grad_norm": 0.531711220741272,
+      "learning_rate": 0.0005378499730167295,
+      "loss": 3.7661,
       "step": 9700
     },
     {
-      "epoch": 1.0494026477236036,
-      "grad_norm": 0.5579642653465271,
-      "learning_rate": 0.0005376274108393491,
-      "loss": 3.7714,
+      "epoch": 1.0512129380053907,
+      "grad_norm": 0.5548348426818848,
+      "learning_rate": 0.0005375261737722611,
+      "loss": 3.7646,
       "step": 9750
     },
     {
-      "epoch": 1.0547841997632117,
-      "grad_norm": 0.576766312122345,
-      "learning_rate": 0.0005373041698092877,
-      "loss": 3.7653,
+      "epoch": 1.0566037735849056,
+      "grad_norm": 0.5238717794418335,
+      "learning_rate": 0.0005372023745277928,
+      "loss": 3.7914,
       "step": 9800
     },
     {
-      "epoch": 1.06016575180282,
-      "grad_norm": 0.5471925735473633,
-      "learning_rate": 0.0005369809287792263,
-      "loss": 3.7686,
+      "epoch": 1.0619946091644206,
+      "grad_norm": 0.5447802543640137,
+      "learning_rate": 0.0005368785752833243,
+      "loss": 3.7911,
       "step": 9850
     },
     {
-      "epoch": 1.0655473038424281,
-      "grad_norm": 0.59092116355896,
-      "learning_rate": 0.000536657687749165,
-      "loss": 3.7667,
+      "epoch": 1.0673854447439353,
+      "grad_norm": 0.5512890815734863,
+      "learning_rate": 0.0005365547760388559,
+      "loss": 3.7828,
       "step": 9900
     },
     {
-      "epoch": 1.0709288558820365,
-      "grad_norm": 0.638380765914917,
-      "learning_rate": 0.0005363344467191035,
-      "loss": 3.7722,
+      "epoch": 1.0727762803234502,
+      "grad_norm": 0.5664125680923462,
+      "learning_rate": 0.0005362309767943874,
+      "loss": 3.7949,
       "step": 9950
     },
     {
-      "epoch": 1.0763104079216446,
-      "grad_norm": 0.5594536066055298,
-      "learning_rate": 0.000536011205689042,
-      "loss": 3.7655,
+      "epoch": 1.0781671159029649,
+      "grad_norm": 0.6107265949249268,
+      "learning_rate": 0.0005359071775499191,
+      "loss": 3.7721,
       "step": 10000
     },
     {
-      "epoch": 1.0763104079216446,
-      "eval_accuracy": 0.3451049320696713,
-      "eval_loss": 3.7489354610443115,
-      "eval_runtime": 202.8767,
-      "eval_samples_per_second": 88.778,
-      "eval_steps_per_second": 5.55,
+      "epoch": 1.0781671159029649,
+      "eval_accuracy": 0.34332498046149446,
+      "eval_loss": 3.7572708129882812,
+      "eval_runtime": 152.6822,
+      "eval_samples_per_second": 117.964,
+      "eval_steps_per_second": 7.375,
       "step": 10000
     },
     {
-      "epoch": 1.081691959961253,
-      "grad_norm": 0.640766441822052,
-      "learning_rate": 0.0005356879646589807,
-      "loss": 3.7646,
+      "epoch": 1.0835579514824798,
+      "grad_norm": 0.5660226941108704,
+      "learning_rate": 0.0005355833783054506,
+      "loss": 3.7667,
       "step": 10050
     },
     {
-      "epoch": 1.087073512000861,
-      "grad_norm": 0.6311773657798767,
-      "learning_rate": 0.0005353647236289192,
-      "loss": 3.7643,
+      "epoch": 1.0889487870619945,
+      "grad_norm": 0.6355052590370178,
+      "learning_rate": 0.0005352595790609822,
+      "loss": 3.7712,
       "step": 10100
     },
     {
-      "epoch": 1.0924550640404693,
-      "grad_norm": 0.5654868483543396,
-      "learning_rate": 0.0005350414825988579,
-      "loss": 3.7592,
+      "epoch": 1.0943396226415094,
+      "grad_norm": 0.5896925330162048,
+      "learning_rate": 0.0005349357798165137,
+      "loss": 3.7638,
       "step": 10150
     },
     {
-      "epoch": 1.0978366160800774,
-      "grad_norm": 0.5341681838035583,
-      "learning_rate": 0.0005347182415687964,
-      "loss": 3.7609,
+      "epoch": 1.0997304582210243,
+      "grad_norm": 0.6454715728759766,
+      "learning_rate": 0.0005346119805720453,
+      "loss": 3.7765,
       "step": 10200
     },
     {
-      "epoch": 1.1032181681196858,
-      "grad_norm": 0.8984493613243103,
-      "learning_rate": 0.000534395000538735,
-      "loss": 3.7588,
+      "epoch": 1.105121293800539,
+      "grad_norm": 0.6419016122817993,
+      "learning_rate": 0.0005342881813275768,
+      "loss": 3.7669,
       "step": 10250
     },
     {
-      "epoch": 1.1085997201592939,
-      "grad_norm": 0.6130673885345459,
-      "learning_rate": 0.0005340717595086736,
-      "loss": 3.7725,
+      "epoch": 1.110512129380054,
+      "grad_norm": 0.6399803161621094,
+      "learning_rate": 0.0005339643820831084,
+      "loss": 3.7723,
       "step": 10300
     },
     {
-      "epoch": 1.1139812721989022,
-      "grad_norm": 0.5171802043914795,
-      "learning_rate": 0.0005337485184786122,
-      "loss": 3.7686,
+      "epoch": 1.1159029649595686,
+      "grad_norm": 0.5695080757141113,
+      "learning_rate": 0.00053364058283864,
+      "loss": 3.7676,
       "step": 10350
     },
     {
-      "epoch": 1.1193628242385103,
-      "grad_norm": 0.6430924534797668,
-      "learning_rate": 0.0005334252774485507,
-      "loss": 3.7493,
+      "epoch": 1.1212938005390836,
+      "grad_norm": 0.641523540019989,
+      "learning_rate": 0.0005333167835941716,
+      "loss": 3.774,
       "step": 10400
     },
     {
-      "epoch": 1.1247443762781186,
-      "grad_norm": 0.5376786589622498,
-      "learning_rate": 0.0005331020364184894,
-      "loss": 3.7686,
+      "epoch": 1.1266846361185983,
+      "grad_norm": 0.5272489786148071,
+      "learning_rate": 0.0005329929843497031,
+      "loss": 3.7699,
       "step": 10450
     },
     {
-      "epoch": 1.1301259283177267,
-      "grad_norm": 0.5798326730728149,
-      "learning_rate": 0.0005327787953884279,
-      "loss": 3.7654,
+      "epoch": 1.1320754716981132,
+      "grad_norm": 0.5674998760223389,
+      "learning_rate": 0.0005326691851052347,
+      "loss": 3.7715,
       "step": 10500
     },
     {
-      "epoch": 1.135507480357335,
-      "grad_norm": 0.5384038090705872,
-      "learning_rate": 0.0005324555543583665,
-      "loss": 3.7485,
+      "epoch": 1.137466307277628,
+      "grad_norm": 0.5550113320350647,
+      "learning_rate": 0.0005323453858607662,
+      "loss": 3.7748,
       "step": 10550
     },
     {
-      "epoch": 1.1408890323969434,
-      "grad_norm": 0.5550758838653564,
-      "learning_rate": 0.0005321323133283051,
-      "loss": 3.748,
+      "epoch": 1.1428571428571428,
+      "grad_norm": 0.5779372453689575,
+      "learning_rate": 0.0005320215866162979,
+      "loss": 3.7667,
       "step": 10600
     },
     {
-      "epoch": 1.1462705844365515,
-      "grad_norm": 0.5656107664108276,
-      "learning_rate": 0.0005318090722982436,
-      "loss": 3.7485,
+      "epoch": 1.1482479784366577,
+      "grad_norm": 0.5660243630409241,
+      "learning_rate": 0.0005316977873718294,
+      "loss": 3.7511,
       "step": 10650
     },
     {
-      "epoch": 1.1516521364761596,
-      "grad_norm": 0.5888227224349976,
-      "learning_rate": 0.0005314858312681823,
-      "loss": 3.7438,
+      "epoch": 1.1536388140161726,
+      "grad_norm": 0.5514928102493286,
+      "learning_rate": 0.000531373988127361,
+      "loss": 3.7664,
       "step": 10700
     },
     {
-      "epoch": 1.157033688515768,
-      "grad_norm": 0.5515899658203125,
-      "learning_rate": 0.0005311625902381209,
-      "loss": 3.7721,
+      "epoch": 1.1590296495956873,
+      "grad_norm": 0.5613722801208496,
+      "learning_rate": 0.0005310501888828925,
+      "loss": 3.7443,
       "step": 10750
     },
     {
-      "epoch": 1.1624152405553763,
-      "grad_norm": 0.6210424900054932,
-      "learning_rate": 0.0005308393492080595,
-      "loss": 3.7652,
+      "epoch": 1.1644204851752022,
+      "grad_norm": 0.5789129734039307,
+      "learning_rate": 0.0005307263896384242,
+      "loss": 3.7512,
       "step": 10800
     },
     {
-      "epoch": 1.1677967925949844,
-      "grad_norm": 0.5752713084220886,
-      "learning_rate": 0.000530516108177998,
-      "loss": 3.7486,
+      "epoch": 1.169811320754717,
+      "grad_norm": 0.5901221036911011,
+      "learning_rate": 0.0005304025903939556,
+      "loss": 3.7406,
       "step": 10850
     },
     {
-      "epoch": 1.1731783446345927,
-      "grad_norm": 0.6572223901748657,
-      "learning_rate": 0.0005301928671479365,
-      "loss": 3.7272,
+      "epoch": 1.1752021563342319,
+      "grad_norm": 0.6217564344406128,
+      "learning_rate": 0.0005300787911494873,
+      "loss": 3.7545,
       "step": 10900
     },
     {
-      "epoch": 1.1785598966742008,
-      "grad_norm": 0.6310757994651794,
-      "learning_rate": 0.0005298696261178752,
-      "loss": 3.7563,
+      "epoch": 1.1805929919137466,
+      "grad_norm": 0.5514366030693054,
+      "learning_rate": 0.0005297549919050189,
+      "loss": 3.7697,
       "step": 10950
     },
     {
-      "epoch": 1.1839414487138091,
-      "grad_norm": 0.5674658417701721,
-      "learning_rate": 0.0005295463850878138,
-      "loss": 3.7441,
+      "epoch": 1.1859838274932615,
+      "grad_norm": 0.5353732109069824,
+      "learning_rate": 0.0005294311926605504,
+      "loss": 3.7783,
       "step": 11000
     },
     {
-      "epoch": 1.1839414487138091,
-      "eval_accuracy": 0.34708784727228553,
-      "eval_loss": 3.7200334072113037,
-      "eval_runtime": 208.4996,
-      "eval_samples_per_second": 86.384,
-      "eval_steps_per_second": 5.4,
+      "epoch": 1.1859838274932615,
+      "eval_accuracy": 0.34627240734923787,
+      "eval_loss": 3.730870246887207,
+      "eval_runtime": 152.3855,
+      "eval_samples_per_second": 118.194,
+      "eval_steps_per_second": 7.389,
       "step": 11000
     },
     {
-      "epoch": 1.1893230007534172,
-      "grad_norm": 0.5541161298751831,
-      "learning_rate": 0.0005292231440577524,
-      "loss": 3.7499,
+      "epoch": 1.1913746630727764,
+      "grad_norm": 0.5589845776557922,
+      "learning_rate": 0.000529107393416082,
+      "loss": 3.7652,
       "step": 11050
     },
     {
-      "epoch": 1.1947045527930256,
-      "grad_norm": 0.5533831715583801,
-      "learning_rate": 0.0005288999030276909,
-      "loss": 3.7418,
+      "epoch": 1.196765498652291,
+      "grad_norm": 0.5585349798202515,
+      "learning_rate": 0.0005287835941716135,
+      "loss": 3.7521,
       "step": 11100
     },
     {
-      "epoch": 1.2000861048326337,
-      "grad_norm": 0.5780303478240967,
-      "learning_rate": 0.0005285766619976295,
-      "loss": 3.7409,
+      "epoch": 1.202156334231806,
+      "grad_norm": 0.5789923071861267,
+      "learning_rate": 0.0005284597949271452,
+      "loss": 3.7476,
       "step": 11150
     },
     {
-      "epoch": 1.205467656872242,
-      "grad_norm": 0.6292614340782166,
-      "learning_rate": 0.0005282534209675681,
-      "loss": 3.7435,
+      "epoch": 1.2075471698113207,
+      "grad_norm": 0.5433546900749207,
+      "learning_rate": 0.0005281359956826767,
+      "loss": 3.7709,
       "step": 11200
     },
     {
-      "epoch": 1.21084920891185,
-      "grad_norm": 0.5940732359886169,
-      "learning_rate": 0.0005279301799375066,
-      "loss": 3.7362,
+      "epoch": 1.2129380053908356,
+      "grad_norm": 0.613101065158844,
+      "learning_rate": 0.0005278121964382083,
+      "loss": 3.7594,
       "step": 11250
     },
     {
-      "epoch": 1.2162307609514584,
-      "grad_norm": 0.5676620006561279,
-      "learning_rate": 0.0005276134037280465,
-      "loss": 3.7241,
+      "epoch": 1.2183288409703503,
+      "grad_norm": 0.5600290298461914,
+      "learning_rate": 0.0005274883971937398,
+      "loss": 3.7442,
       "step": 11300
     },
     {
-      "epoch": 1.2216123129910665,
-      "grad_norm": 0.5294714570045471,
-      "learning_rate": 0.0005272901626979851,
-      "loss": 3.7192,
+      "epoch": 1.2237196765498652,
+      "grad_norm": 0.5522957444190979,
+      "learning_rate": 0.0005271645979492714,
+      "loss": 3.7389,
       "step": 11350
     },
     {
-      "epoch": 1.2269938650306749,
-      "grad_norm": 0.5521119832992554,
-      "learning_rate": 0.0005269669216679236,
-      "loss": 3.734,
+      "epoch": 1.2291105121293802,
+      "grad_norm": 0.7547903656959534,
+      "learning_rate": 0.000526840798704803,
+      "loss": 3.7529,
       "step": 11400
     },
     {
-      "epoch": 1.232375417070283,
-      "grad_norm": 0.7975606918334961,
-      "learning_rate": 0.0005266436806378623,
-      "loss": 3.7188,
+      "epoch": 1.2345013477088949,
+      "grad_norm": 0.5353315472602844,
+      "learning_rate": 0.0005265169994603346,
+      "loss": 3.7639,
       "step": 11450
     },
     {
-      "epoch": 1.2377569691098913,
-      "grad_norm": 0.5794736742973328,
-      "learning_rate": 0.0005263204396078008,
-      "loss": 3.7279,
+      "epoch": 1.2398921832884098,
+      "grad_norm": 0.524540364742279,
+      "learning_rate": 0.0005261932002158661,
+      "loss": 3.7537,
       "step": 11500
     },
     {
-      "epoch": 1.2431385211494996,
-      "grad_norm": 0.5361841917037964,
-      "learning_rate": 0.0005259971985777394,
-      "loss": 3.7432,
+      "epoch": 1.2452830188679245,
+      "grad_norm": 0.5606273412704468,
+      "learning_rate": 0.0005258694009713977,
+      "loss": 3.7457,
       "step": 11550
     },
     {
-      "epoch": 1.2485200731891077,
-      "grad_norm": 0.5248964428901672,
-      "learning_rate": 0.000525673957547678,
-      "loss": 3.7447,
+      "epoch": 1.2506738544474394,
+      "grad_norm": 0.5786051154136658,
+      "learning_rate": 0.0005255456017269292,
+      "loss": 3.761,
       "step": 11600
     },
     {
-      "epoch": 1.2539016252287158,
-      "grad_norm": 0.5553768873214722,
-      "learning_rate": 0.0005253507165176167,
-      "loss": 3.7477,
+      "epoch": 1.256064690026954,
+      "grad_norm": 0.5806413888931274,
+      "learning_rate": 0.0005252218024824608,
+      "loss": 3.7393,
       "step": 11650
     },
     {
-      "epoch": 1.2592831772683242,
-      "grad_norm": 0.5761224627494812,
-      "learning_rate": 0.0005250274754875552,
-      "loss": 3.7211,
+      "epoch": 1.261455525606469,
+      "grad_norm": 0.5003126263618469,
+      "learning_rate": 0.0005248980032379924,
+      "loss": 3.7527,
       "step": 11700
     },
     {
-      "epoch": 1.2646647293079325,
-      "grad_norm": 0.607130229473114,
-      "learning_rate": 0.0005247042344574938,
-      "loss": 3.7359,
+      "epoch": 1.266846361185984,
+      "grad_norm": 0.5108680725097656,
+      "learning_rate": 0.000524574203993524,
+      "loss": 3.7502,
       "step": 11750
     },
     {
-      "epoch": 1.2700462813475406,
-      "grad_norm": 0.5540531873703003,
-      "learning_rate": 0.0005243809934274323,
-      "loss": 3.7217,
+      "epoch": 1.2722371967654986,
+      "grad_norm": 0.6031525135040283,
+      "learning_rate": 0.0005242504047490555,
+      "loss": 3.7638,
       "step": 11800
     },
     {
-      "epoch": 1.275427833387149,
-      "grad_norm": 0.6098884344100952,
-      "learning_rate": 0.0005240577523973709,
-      "loss": 3.7431,
+      "epoch": 1.2776280323450135,
+      "grad_norm": 0.5650216937065125,
+      "learning_rate": 0.0005239266055045871,
+      "loss": 3.7471,
       "step": 11850
     },
     {
-      "epoch": 1.280809385426757,
-      "grad_norm": 0.5816884636878967,
-      "learning_rate": 0.0005237345113673095,
-      "loss": 3.7404,
+      "epoch": 1.2830188679245282,
+      "grad_norm": 0.600111186504364,
+      "learning_rate": 0.0005236028062601186,
+      "loss": 3.7432,
       "step": 11900
     },
     {
-      "epoch": 1.2861909374663654,
-      "grad_norm": 0.5769429802894592,
-      "learning_rate": 0.0005234112703372481,
-      "loss": 3.7404,
+      "epoch": 1.2884097035040432,
+      "grad_norm": 0.5846624374389648,
+      "learning_rate": 0.0005232790070156503,
+      "loss": 3.7259,
       "step": 11950
     },
     {
-      "epoch": 1.2915724895059735,
-      "grad_norm": 0.563298761844635,
-      "learning_rate": 0.0005230880293071867,
-      "loss": 3.7293,
+      "epoch": 1.2938005390835579,
+      "grad_norm": 0.5376623868942261,
+      "learning_rate": 0.0005229552077711818,
+      "loss": 3.7384,
       "step": 12000
     },
     {
-      "epoch": 1.2915724895059735,
-      "eval_accuracy": 0.34958697234490643,
-      "eval_loss": 3.6987245082855225,
-      "eval_runtime": 217.4267,
-      "eval_samples_per_second": 82.837,
-      "eval_steps_per_second": 5.179,
+      "epoch": 1.2938005390835579,
+      "eval_accuracy": 0.34856509193501123,
+      "eval_loss": 3.702909469604492,
+      "eval_runtime": 152.7981,
+      "eval_samples_per_second": 117.875,
+      "eval_steps_per_second": 7.369,
       "step": 12000
     },
     {
-      "epoch": 1.2969540415455818,
-      "grad_norm": 0.5788484811782837,
-      "learning_rate": 0.0005227647882771253,
-      "loss": 3.7336,
+      "epoch": 1.2991913746630728,
+      "grad_norm": 0.5912179350852966,
+      "learning_rate": 0.0005226314085267134,
+      "loss": 3.7548,
       "step": 12050
     },
     {
-      "epoch": 1.30233559358519,
-      "grad_norm": 0.6204023957252502,
-      "learning_rate": 0.0005224415472470639,
-      "loss": 3.7243,
+      "epoch": 1.3045822102425877,
+      "grad_norm": 0.7062541842460632,
+      "learning_rate": 0.0005223140852671344,
+      "loss": 3.7468,
       "step": 12100
     },
     {
-      "epoch": 1.3077171456247982,
-      "grad_norm": 0.5986481308937073,
-      "learning_rate": 0.0005221183062170024,
-      "loss": 3.7316,
+      "epoch": 1.3099730458221024,
+      "grad_norm": 0.5411067008972168,
+      "learning_rate": 0.0005219902860226659,
+      "loss": 3.7358,
       "step": 12150
     },
     {
-      "epoch": 1.3130986976644063,
-      "grad_norm": 0.6356789469718933,
-      "learning_rate": 0.0005217950651869409,
-      "loss": 3.7352,
+      "epoch": 1.3153638814016173,
+      "grad_norm": 0.5543079376220703,
+      "learning_rate": 0.0005216664867781975,
+      "loss": 3.7528,
       "step": 12200
     },
     {
-      "epoch": 1.3184802497040147,
-      "grad_norm": 0.555164098739624,
-      "learning_rate": 0.0005214718241568796,
-      "loss": 3.7149,
+      "epoch": 1.320754716981132,
+      "grad_norm": 0.626389741897583,
+      "learning_rate": 0.000521342687533729,
+      "loss": 3.7289,
       "step": 12250
     },
     {
-      "epoch": 1.3238618017436228,
-      "grad_norm": 0.6156308054924011,
-      "learning_rate": 0.0005211485831268182,
-      "loss": 3.6991,
+      "epoch": 1.326145552560647,
+      "grad_norm": 0.7137591242790222,
+      "learning_rate": 0.0005210188882892606,
+      "loss": 3.7343,
       "step": 12300
     },
     {
-      "epoch": 1.329243353783231,
-      "grad_norm": 0.5790920853614807,
-      "learning_rate": 0.0005208253420967568,
-      "loss": 3.7206,
+      "epoch": 1.3315363881401616,
+      "grad_norm": 0.5231083631515503,
+      "learning_rate": 0.0005206950890447922,
+      "loss": 3.743,
       "step": 12350
     },
     {
-      "epoch": 1.3346249058228392,
-      "grad_norm": 0.5524982810020447,
-      "learning_rate": 0.0005205021010666953,
-      "loss": 3.7344,
+      "epoch": 1.3369272237196765,
+      "grad_norm": 0.6331651210784912,
+      "learning_rate": 0.0005203712898003238,
+      "loss": 3.74,
       "step": 12400
     },
     {
-      "epoch": 1.3400064578624475,
-      "grad_norm": 0.5789136290550232,
-      "learning_rate": 0.0005201788600366339,
-      "loss": 3.714,
+      "epoch": 1.3423180592991915,
+      "grad_norm": 0.5775096416473389,
+      "learning_rate": 0.0005200474905558553,
+      "loss": 3.7299,
       "step": 12450
     },
     {
-      "epoch": 1.3453880099020559,
-      "grad_norm": 0.5845145583152771,
-      "learning_rate": 0.0005198556190065725,
-      "loss": 3.7083,
+      "epoch": 1.3477088948787062,
+      "grad_norm": 0.6246299147605896,
+      "learning_rate": 0.0005197236913113869,
+      "loss": 3.7473,
       "step": 12500
     },
     {
-      "epoch": 1.350769561941664,
-      "grad_norm": 0.6391981840133667,
-      "learning_rate": 0.0005195323779765112,
-      "loss": 3.7103,
+      "epoch": 1.353099730458221,
+      "grad_norm": 0.5844605565071106,
+      "learning_rate": 0.0005193998920669184,
+      "loss": 3.7204,
       "step": 12550
     },
     {
-      "epoch": 1.356151113981272,
-      "grad_norm": 0.543088972568512,
-      "learning_rate": 0.0005192091369464497,
-      "loss": 3.7256,
+      "epoch": 1.3584905660377358,
+      "grad_norm": 0.5848774909973145,
+      "learning_rate": 0.0005190760928224501,
+      "loss": 3.7216,
       "step": 12600
     },
     {
-      "epoch": 1.3615326660208804,
-      "grad_norm": 0.5750318169593811,
-      "learning_rate": 0.0005188858959163882,
-      "loss": 3.7289,
+      "epoch": 1.3638814016172507,
+      "grad_norm": 0.6270635724067688,
+      "learning_rate": 0.0005187522935779816,
+      "loss": 3.7121,
       "step": 12650
     },
     {
-      "epoch": 1.3669142180604887,
-      "grad_norm": 0.6135967969894409,
-      "learning_rate": 0.0005185626548863269,
-      "loss": 3.7014,
+      "epoch": 1.3692722371967654,
+      "grad_norm": 0.5674271583557129,
+      "learning_rate": 0.0005184284943335132,
+      "loss": 3.7243,
       "step": 12700
     },
     {
-      "epoch": 1.3722957701000968,
-      "grad_norm": 0.5833747982978821,
-      "learning_rate": 0.0005182394138562654,
-      "loss": 3.7292,
+      "epoch": 1.3746630727762803,
+      "grad_norm": 0.5642839074134827,
+      "learning_rate": 0.0005181046950890447,
+      "loss": 3.7156,
       "step": 12750
     },
     {
-      "epoch": 1.3776773221397052,
-      "grad_norm": 0.571729838848114,
-      "learning_rate": 0.0005179161728262041,
-      "loss": 3.7158,
+      "epoch": 1.3800539083557952,
+      "grad_norm": 0.6046538949012756,
+      "learning_rate": 0.0005177808958445764,
+      "loss": 3.7149,
       "step": 12800
     },
     {
-      "epoch": 1.3830588741793133,
-      "grad_norm": 0.6265289783477783,
-      "learning_rate": 0.0005175929317961426,
-      "loss": 3.7261,
+      "epoch": 1.38544474393531,
+      "grad_norm": 0.6427596807479858,
+      "learning_rate": 0.0005174570966001078,
+      "loss": 3.7265,
       "step": 12850
     },
     {
-      "epoch": 1.3884404262189216,
-      "grad_norm": 0.6207230091094971,
-      "learning_rate": 0.0005172696907660812,
-      "loss": 3.699,
+      "epoch": 1.3908355795148248,
+      "grad_norm": 0.5846236944198608,
+      "learning_rate": 0.0005171332973556395,
+      "loss": 3.7258,
       "step": 12900
     },
     {
-      "epoch": 1.3938219782585297,
-      "grad_norm": 0.5325709581375122,
-      "learning_rate": 0.0005169464497360198,
-      "loss": 3.7059,
+      "epoch": 1.3962264150943398,
+      "grad_norm": 0.5765063762664795,
+      "learning_rate": 0.000516809498111171,
+      "loss": 3.7341,
       "step": 12950
     },
     {
-      "epoch": 1.399203530298138,
-      "grad_norm": 0.5781683325767517,
-      "learning_rate": 0.0005166232087059583,
-      "loss": 3.7029,
+      "epoch": 1.4016172506738545,
+      "grad_norm": 0.5632540583610535,
+      "learning_rate": 0.0005164856988667026,
+      "loss": 3.7152,
       "step": 13000
     },
     {
-      "epoch": 1.399203530298138,
-      "eval_accuracy": 0.3522667871699628,
-      "eval_loss": 3.674454927444458,
-      "eval_runtime": 217.915,
-      "eval_samples_per_second": 82.651,
-      "eval_steps_per_second": 5.167,
+      "epoch": 1.4016172506738545,
+      "eval_accuracy": 0.351037923008781,
+      "eval_loss": 3.681976079940796,
+      "eval_runtime": 153.0001,
+      "eval_samples_per_second": 117.719,
+      "eval_steps_per_second": 7.359,
       "step": 13000
     },
     {
-      "epoch": 1.4045850823377461,
-      "grad_norm": 0.565986156463623,
-      "learning_rate": 0.0005162999676758969,
-      "loss": 3.7075,
+      "epoch": 1.4070080862533692,
+      "grad_norm": 0.584800660610199,
+      "learning_rate": 0.0005161618996222341,
+      "loss": 3.7137,
       "step": 13050
     },
     {
-      "epoch": 1.4099666343773545,
-      "grad_norm": 0.5653162002563477,
-      "learning_rate": 0.0005159767266458355,
-      "loss": 3.6983,
+      "epoch": 1.412398921832884,
+      "grad_norm": 0.5406800508499146,
+      "learning_rate": 0.0005158381003777657,
+      "loss": 3.7076,
       "step": 13100
     },
     {
-      "epoch": 1.4153481864169626,
-      "grad_norm": 0.5863538384437561,
-      "learning_rate": 0.0005156534856157741,
-      "loss": 3.7083,
+      "epoch": 1.417789757412399,
+      "grad_norm": 0.6078150272369385,
+      "learning_rate": 0.0005155143011332973,
+      "loss": 3.714,
       "step": 13150
     },
     {
-      "epoch": 1.420729738456571,
-      "grad_norm": 0.5486993789672852,
-      "learning_rate": 0.0005153302445857127,
-      "loss": 3.7251,
+      "epoch": 1.4231805929919137,
+      "grad_norm": 0.5497395396232605,
+      "learning_rate": 0.0005151905018888289,
+      "loss": 3.7119,
       "step": 13200
     },
     {
-      "epoch": 1.426111290496179,
-      "grad_norm": 0.6212597489356995,
-      "learning_rate": 0.0005150070035556513,
-      "loss": 3.7196,
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.5663301348686218,
+      "learning_rate": 0.0005148667026443604,
+      "loss": 3.7237,
       "step": 13250
     },
     {
-      "epoch": 1.4314928425357873,
-      "grad_norm": 0.6025354862213135,
-      "learning_rate": 0.0005146837625255898,
-      "loss": 3.7195,
+      "epoch": 1.4339622641509435,
+      "grad_norm": 0.6136855483055115,
+      "learning_rate": 0.000514542903399892,
+      "loss": 3.6982,
       "step": 13300
     },
     {
-      "epoch": 1.4368743945753955,
-      "grad_norm": 0.5640348792076111,
-      "learning_rate": 0.0005143669863161297,
-      "loss": 3.7035,
+      "epoch": 1.4393530997304582,
+      "grad_norm": 0.577499270439148,
+      "learning_rate": 0.0005142191041554237,
+      "loss": 3.7174,
       "step": 13350
     },
     {
-      "epoch": 1.4422559466150038,
-      "grad_norm": 0.5984680652618408,
-      "learning_rate": 0.0005140437452860683,
-      "loss": 3.7052,
+      "epoch": 1.444743935309973,
+      "grad_norm": 0.5561140775680542,
+      "learning_rate": 0.0005139017808958445,
+      "loss": 3.7323,
       "step": 13400
     },
     {
-      "epoch": 1.447637498654612,
-      "grad_norm": 0.5823872685432434,
-      "learning_rate": 0.0005137205042560069,
-      "loss": 3.695,
+      "epoch": 1.4501347708894878,
+      "grad_norm": 0.5748620629310608,
+      "learning_rate": 0.0005135779816513762,
+      "loss": 3.7341,
       "step": 13450
     },
     {
-      "epoch": 1.4530190506942202,
-      "grad_norm": 0.5364094972610474,
-      "learning_rate": 0.0005133972632259455,
-      "loss": 3.7199,
+      "epoch": 1.4555256064690028,
+      "grad_norm": 0.6695783734321594,
+      "learning_rate": 0.0005132541824069076,
+      "loss": 3.7254,
       "step": 13500
     },
     {
-      "epoch": 1.4584006027338283,
-      "grad_norm": 0.5685153603553772,
-      "learning_rate": 0.000513074022195884,
-      "loss": 3.6992,
+      "epoch": 1.4609164420485174,
+      "grad_norm": 0.5569912195205688,
+      "learning_rate": 0.0005129303831624393,
+      "loss": 3.7147,
       "step": 13550
     },
     {
-      "epoch": 1.4637821547734367,
-      "grad_norm": 0.5787658095359802,
-      "learning_rate": 0.0005127507811658226,
-      "loss": 3.6923,
+      "epoch": 1.4663072776280324,
+      "grad_norm": 0.5915151834487915,
+      "learning_rate": 0.0005126065839179708,
+      "loss": 3.73,
       "step": 13600
     },
     {
-      "epoch": 1.469163706813045,
-      "grad_norm": 0.5486370325088501,
-      "learning_rate": 0.0005124275401357612,
-      "loss": 3.7024,
+      "epoch": 1.4716981132075473,
+      "grad_norm": 0.5770183205604553,
+      "learning_rate": 0.0005122827846735024,
+      "loss": 3.7206,
       "step": 13650
     },
     {
-      "epoch": 1.474545258852653,
-      "grad_norm": 0.5473746061325073,
-      "learning_rate": 0.0005121042991056997,
-      "loss": 3.6833,
+      "epoch": 1.477088948787062,
+      "grad_norm": 0.6127224564552307,
+      "learning_rate": 0.0005119589854290339,
+      "loss": 3.7078,
       "step": 13700
     },
     {
-      "epoch": 1.4799268108922612,
-      "grad_norm": 0.5517615675926208,
-      "learning_rate": 0.0005117810580756384,
-      "loss": 3.6902,
+      "epoch": 1.482479784366577,
+      "grad_norm": 0.6317543983459473,
+      "learning_rate": 0.0005116351861845655,
+      "loss": 3.7085,
       "step": 13750
     },
     {
-      "epoch": 1.4853083629318695,
-      "grad_norm": 0.5971811413764954,
-      "learning_rate": 0.0005114578170455769,
-      "loss": 3.6845,
+      "epoch": 1.4878706199460916,
+      "grad_norm": 0.5654078125953674,
+      "learning_rate": 0.0005113113869400971,
+      "loss": 3.7115,
       "step": 13800
     },
     {
-      "epoch": 1.4906899149714778,
-      "grad_norm": 0.5672309398651123,
-      "learning_rate": 0.0005111345760155156,
-      "loss": 3.6854,
+      "epoch": 1.4932614555256065,
+      "grad_norm": 0.6050569415092468,
+      "learning_rate": 0.0005109875876956287,
+      "loss": 3.7161,
       "step": 13850
     },
     {
-      "epoch": 1.496071467011086,
-      "grad_norm": 0.5523454546928406,
-      "learning_rate": 0.0005108113349854541,
-      "loss": 3.6961,
+      "epoch": 1.4986522911051212,
+      "grad_norm": 0.7072622776031494,
+      "learning_rate": 0.0005106637884511602,
+      "loss": 3.6974,
       "step": 13900
     },
     {
-      "epoch": 1.501453019050694,
-      "grad_norm": 0.5691514611244202,
-      "learning_rate": 0.0005104880939553926,
-      "loss": 3.6803,
+      "epoch": 1.5040431266846361,
+      "grad_norm": 0.5389599800109863,
+      "learning_rate": 0.0005103399892066918,
+      "loss": 3.6989,
       "step": 13950
     },
     {
-      "epoch": 1.5068345710903024,
-      "grad_norm": 0.5720673203468323,
-      "learning_rate": 0.0005101648529253313,
-      "loss": 3.6962,
+      "epoch": 1.509433962264151,
+      "grad_norm": 0.5915652513504028,
+      "learning_rate": 0.0005100161899622234,
+      "loss": 3.7106,
       "step": 14000
     },
     {
-      "epoch": 1.5068345710903024,
-      "eval_accuracy": 0.35364048563060124,
-      "eval_loss": 3.655740261077881,
-      "eval_runtime": 204.9718,
-      "eval_samples_per_second": 87.871,
-      "eval_steps_per_second": 5.493,
+      "epoch": 1.509433962264151,
+      "eval_accuracy": 0.35301268924480916,
+      "eval_loss": 3.660759449005127,
+      "eval_runtime": 152.8889,
+      "eval_samples_per_second": 117.805,
+      "eval_steps_per_second": 7.365,
       "step": 14000
     },
     {
-      "epoch": 1.5122161231299107,
-      "grad_norm": 0.5242102742195129,
-      "learning_rate": 0.0005098480767158711,
-      "loss": 3.697,
+      "epoch": 1.5148247978436657,
+      "grad_norm": 0.5557901263237,
+      "learning_rate": 0.000509692390717755,
+      "loss": 3.7113,
       "step": 14050
     },
     {
-      "epoch": 1.5175976751695188,
-      "grad_norm": 0.5905733108520508,
-      "learning_rate": 0.0005095248356858097,
-      "loss": 3.6869,
+      "epoch": 1.5202156334231804,
+      "grad_norm": 0.6000087857246399,
+      "learning_rate": 0.0005093685914732865,
+      "loss": 3.7091,
       "step": 14100
     },
     {
-      "epoch": 1.5229792272091272,
-      "grad_norm": 0.6295709609985352,
-      "learning_rate": 0.0005092015946557483,
-      "loss": 3.6839,
+      "epoch": 1.5256064690026954,
+      "grad_norm": 0.5491834282875061,
+      "learning_rate": 0.0005090447922288181,
+      "loss": 3.7027,
       "step": 14150
     },
     {
-      "epoch": 1.5283607792487355,
-      "grad_norm": 0.6163989901542664,
-      "learning_rate": 0.0005088783536256868,
-      "loss": 3.6979,
+      "epoch": 1.5309973045822103,
+      "grad_norm": 0.5360293388366699,
+      "learning_rate": 0.0005087209929843496,
+      "loss": 3.7071,
       "step": 14200
     },
     {
-      "epoch": 1.5337423312883436,
-      "grad_norm": 0.5677395462989807,
-      "learning_rate": 0.0005085551125956255,
-      "loss": 3.698,
+      "epoch": 1.536388140161725,
+      "grad_norm": 0.6027365326881409,
+      "learning_rate": 0.0005083971937398812,
+      "loss": 3.7083,
       "step": 14250
     },
     {
-      "epoch": 1.5391238833279517,
-      "grad_norm": 0.5294774174690247,
-      "learning_rate": 0.000508231871565564,
-      "loss": 3.6858,
+      "epoch": 1.54177897574124,
+      "grad_norm": 0.5418972373008728,
+      "learning_rate": 0.0005080733944954127,
+      "loss": 3.7045,
       "step": 14300
     },
     {
-      "epoch": 1.54450543536756,
-      "grad_norm": 0.585757315158844,
-      "learning_rate": 0.0005079086305355026,
-      "loss": 3.7028,
+      "epoch": 1.5471698113207548,
+      "grad_norm": 0.5064155459403992,
+      "learning_rate": 0.0005077495952509444,
+      "loss": 3.6947,
       "step": 14350
     },
     {
-      "epoch": 1.5498869874071683,
-      "grad_norm": 0.5384769439697266,
-      "learning_rate": 0.0005075853895054412,
-      "loss": 3.686,
+      "epoch": 1.5525606469002695,
+      "grad_norm": 0.5617753267288208,
+      "learning_rate": 0.0005074257960064759,
+      "loss": 3.6987,
       "step": 14400
     },
     {
-      "epoch": 1.5552685394467765,
-      "grad_norm": 0.5806359052658081,
-      "learning_rate": 0.0005072621484753797,
-      "loss": 3.689,
+      "epoch": 1.5579514824797842,
+      "grad_norm": 0.5464807152748108,
+      "learning_rate": 0.0005071019967620075,
+      "loss": 3.7002,
       "step": 14450
     },
     {
-      "epoch": 1.5606500914863846,
-      "grad_norm": 0.5702711939811707,
-      "learning_rate": 0.0005069389074453184,
-      "loss": 3.675,
+      "epoch": 1.5633423180592994,
+      "grad_norm": 0.5438482165336609,
+      "learning_rate": 0.000506778197517539,
+      "loss": 3.6985,
       "step": 14500
     },
     {
-      "epoch": 1.566031643525993,
-      "grad_norm": 0.5366608500480652,
-      "learning_rate": 0.0005066156664152569,
-      "loss": 3.6914,
+      "epoch": 1.568733153638814,
+      "grad_norm": 0.5997867584228516,
+      "learning_rate": 0.0005064543982730707,
+      "loss": 3.6976,
       "step": 14550
     },
     {
-      "epoch": 1.5714131955656012,
-      "grad_norm": 0.60133296251297,
-      "learning_rate": 0.0005062924253851955,
-      "loss": 3.693,
+      "epoch": 1.5741239892183287,
+      "grad_norm": 0.5712060928344727,
+      "learning_rate": 0.0005061305990286023,
+      "loss": 3.6846,
       "step": 14600
     },
     {
-      "epoch": 1.5767947476052093,
-      "grad_norm": 0.5777215361595154,
-      "learning_rate": 0.0005059691843551341,
-      "loss": 3.6786,
+      "epoch": 1.5795148247978437,
+      "grad_norm": 0.5789122581481934,
+      "learning_rate": 0.0005058067997841338,
+      "loss": 3.689,
       "step": 14650
     },
     {
-      "epoch": 1.5821762996448174,
-      "grad_norm": 0.5946151614189148,
-      "learning_rate": 0.0005056459433250727,
-      "loss": 3.6849,
+      "epoch": 1.5849056603773586,
+      "grad_norm": 0.5818272233009338,
+      "learning_rate": 0.0005054830005396654,
+      "loss": 3.6941,
       "step": 14700
     },
     {
-      "epoch": 1.5875578516844258,
-      "grad_norm": 0.601273775100708,
-      "learning_rate": 0.0005053227022950113,
-      "loss": 3.6833,
+      "epoch": 1.5902964959568733,
+      "grad_norm": 0.595536470413208,
+      "learning_rate": 0.0005051592012951969,
+      "loss": 3.6728,
       "step": 14750
     },
     {
-      "epoch": 1.592939403724034,
-      "grad_norm": 0.5582096576690674,
-      "learning_rate": 0.0005049994612649499,
-      "loss": 3.681,
+      "epoch": 1.595687331536388,
+      "grad_norm": 0.5664342045783997,
+      "learning_rate": 0.0005048354020507286,
+      "loss": 3.666,
       "step": 14800
     },
     {
-      "epoch": 1.5983209557636422,
-      "grad_norm": 0.5676015615463257,
-      "learning_rate": 0.0005046762202348884,
-      "loss": 3.6892,
+      "epoch": 1.6010781671159031,
+      "grad_norm": 0.5805814862251282,
+      "learning_rate": 0.00050451160280626,
+      "loss": 3.6899,
       "step": 14850
     },
     {
-      "epoch": 1.6037025078032503,
-      "grad_norm": 0.5280758738517761,
-      "learning_rate": 0.000504352979204827,
-      "loss": 3.6736,
+      "epoch": 1.6064690026954178,
+      "grad_norm": 0.5888657569885254,
+      "learning_rate": 0.0005041878035617917,
+      "loss": 3.6958,
       "step": 14900
     },
     {
-      "epoch": 1.6090840598428586,
-      "grad_norm": 0.6497607827186584,
-      "learning_rate": 0.0005040297381747656,
-      "loss": 3.6761,
+      "epoch": 1.6118598382749325,
+      "grad_norm": 0.6133636832237244,
+      "learning_rate": 0.0005038640043173232,
+      "loss": 3.6823,
       "step": 14950
     },
     {
-      "epoch": 1.614465611882467,
-      "grad_norm": 0.6543067693710327,
-      "learning_rate": 0.0005037064971447042,
-      "loss": 3.6601,
+      "epoch": 1.6172506738544474,
+      "grad_norm": 0.5385803580284119,
+      "learning_rate": 0.0005035402050728548,
+      "loss": 3.673,
       "step": 15000
     },
     {
-      "epoch": 1.614465611882467,
-      "eval_accuracy": 0.3556327449815676,
-      "eval_loss": 3.6367475986480713,
-      "eval_runtime": 205.3772,
-      "eval_samples_per_second": 87.697,
-      "eval_steps_per_second": 5.483,
+      "epoch": 1.6172506738544474,
+      "eval_accuracy": 0.3548456634622395,
+      "eval_loss": 3.643002986907959,
+      "eval_runtime": 152.5762,
+      "eval_samples_per_second": 118.046,
+      "eval_steps_per_second": 7.38,
       "step": 15000
     },
     {
-      "epoch": 1.619847163922075,
-      "grad_norm": 0.5906841158866882,
-      "learning_rate": 0.0005033832561146428,
-      "loss": 3.6741,
+      "epoch": 1.6226415094339623,
+      "grad_norm": 0.5596092343330383,
+      "learning_rate": 0.0005032164058283863,
+      "loss": 3.6722,
       "step": 15050
     },
     {
-      "epoch": 1.6252287159616834,
-      "grad_norm": 0.5810515880584717,
-      "learning_rate": 0.0005030600150845813,
-      "loss": 3.6827,
+      "epoch": 1.628032345013477,
+      "grad_norm": 0.5818222761154175,
+      "learning_rate": 0.0005028926065839179,
+      "loss": 3.683,
       "step": 15100
     },
     {
-      "epoch": 1.6306102680012917,
-      "grad_norm": 0.6275368928909302,
-      "learning_rate": 0.00050273677405452,
-      "loss": 3.673,
+      "epoch": 1.633423180592992,
+      "grad_norm": 0.5584225058555603,
+      "learning_rate": 0.0005025688073394495,
+      "loss": 3.7002,
       "step": 15150
     },
     {
-      "epoch": 1.6359918200408998,
-      "grad_norm": 0.5321255922317505,
-      "learning_rate": 0.0005024135330244585,
-      "loss": 3.6789,
+      "epoch": 1.6388140161725069,
+      "grad_norm": 0.523932933807373,
+      "learning_rate": 0.0005022450080949811,
+      "loss": 3.6904,
       "step": 15200
     },
     {
-      "epoch": 1.641373372080508,
-      "grad_norm": 0.5455909371376038,
-      "learning_rate": 0.0005020902919943972,
-      "loss": 3.6691,
+      "epoch": 1.6442048517520216,
+      "grad_norm": 0.5471178293228149,
+      "learning_rate": 0.0005019212088505126,
+      "loss": 3.6822,
       "step": 15250
     },
     {
-      "epoch": 1.6467549241201163,
-      "grad_norm": 0.5684463977813721,
-      "learning_rate": 0.0005017670509643357,
-      "loss": 3.656,
+      "epoch": 1.6495956873315363,
+      "grad_norm": 0.5977591276168823,
+      "learning_rate": 0.0005015974096060442,
+      "loss": 3.6788,
       "step": 15300
     },
     {
-      "epoch": 1.6521364761597246,
-      "grad_norm": 0.5891856551170349,
-      "learning_rate": 0.0005014438099342743,
-      "loss": 3.666,
+      "epoch": 1.6549865229110512,
+      "grad_norm": 0.527600884437561,
+      "learning_rate": 0.0005012736103615758,
+      "loss": 3.6653,
       "step": 15350
     },
     {
-      "epoch": 1.6575180281993327,
-      "grad_norm": 0.5768228769302368,
-      "learning_rate": 0.0005011205689042129,
-      "loss": 3.6802,
+      "epoch": 1.6603773584905661,
+      "grad_norm": 0.5747079253196716,
+      "learning_rate": 0.0005009498111171074,
+      "loss": 3.6841,
       "step": 15400
     },
     {
-      "epoch": 1.6628995802389408,
-      "grad_norm": 0.5384355187416077,
-      "learning_rate": 0.0005007973278741514,
-      "loss": 3.6677,
+      "epoch": 1.6657681940700808,
+      "grad_norm": 0.6080580353736877,
+      "learning_rate": 0.0005006260118726389,
+      "loss": 3.6727,
       "step": 15450
     },
     {
-      "epoch": 1.6682811322785491,
-      "grad_norm": 0.5776270031929016,
-      "learning_rate": 0.00050047408684409,
-      "loss": 3.6913,
+      "epoch": 1.6711590296495957,
+      "grad_norm": 0.5732918977737427,
+      "learning_rate": 0.0005003022126281705,
+      "loss": 3.6717,
       "step": 15500
     },
     {
-      "epoch": 1.6736626843181575,
-      "grad_norm": 0.5457106232643127,
-      "learning_rate": 0.0005001508458140286,
-      "loss": 3.6747,
+      "epoch": 1.6765498652291106,
+      "grad_norm": 0.559468686580658,
+      "learning_rate": 0.000499978413383702,
+      "loss": 3.659,
       "step": 15550
     },
     {
-      "epoch": 1.6790442363577656,
-      "grad_norm": 0.5584999322891235,
-      "learning_rate": 0.0004998276047839673,
-      "loss": 3.6656,
+      "epoch": 1.6819407008086253,
+      "grad_norm": 0.6118157505989075,
+      "learning_rate": 0.0004996546141392336,
+      "loss": 3.6778,
       "step": 15600
     },
     {
-      "epoch": 1.6844257883973737,
-      "grad_norm": 0.5563948154449463,
-      "learning_rate": 0.0004995043637539058,
-      "loss": 3.6699,
+      "epoch": 1.68733153638814,
+      "grad_norm": 0.5858181118965149,
+      "learning_rate": 0.0004993308148947651,
+      "loss": 3.6833,
       "step": 15650
     },
     {
-      "epoch": 1.689807340436982,
-      "grad_norm": 0.5881028175354004,
-      "learning_rate": 0.0004991811227238443,
-      "loss": 3.6704,
+      "epoch": 1.692722371967655,
+      "grad_norm": 0.6019673943519592,
+      "learning_rate": 0.0004990070156502968,
+      "loss": 3.6808,
       "step": 15700
     },
     {
-      "epoch": 1.6951888924765903,
-      "grad_norm": 0.5460503101348877,
-      "learning_rate": 0.0004988578816937829,
-      "loss": 3.6895,
+      "epoch": 1.6981132075471699,
+      "grad_norm": 0.5684689879417419,
+      "learning_rate": 0.0004986832164058284,
+      "loss": 3.6617,
       "step": 15750
     },
     {
-      "epoch": 1.7005704445161984,
-      "grad_norm": 0.6377148032188416,
-      "learning_rate": 0.0004985346406637215,
-      "loss": 3.6985,
+      "epoch": 1.7035040431266846,
+      "grad_norm": 0.6181156635284424,
+      "learning_rate": 0.0004983594171613599,
+      "loss": 3.6723,
       "step": 15800
     },
     {
-      "epoch": 1.7059519965558065,
-      "grad_norm": 0.5566238760948181,
-      "learning_rate": 0.0004982113996336602,
-      "loss": 3.6537,
+      "epoch": 1.7088948787061995,
+      "grad_norm": 0.6059028506278992,
+      "learning_rate": 0.0004980356179168915,
+      "loss": 3.6885,
       "step": 15850
     },
     {
-      "epoch": 1.7113335485954149,
-      "grad_norm": 0.5709816813468933,
-      "learning_rate": 0.0004978881586035987,
-      "loss": 3.6434,
+      "epoch": 1.7142857142857144,
+      "grad_norm": 0.5463699698448181,
+      "learning_rate": 0.000497711818672423,
+      "loss": 3.6892,
       "step": 15900
     },
     {
-      "epoch": 1.7167151006350232,
-      "grad_norm": 0.5734738111495972,
-      "learning_rate": 0.0004975649175735373,
-      "loss": 3.6449,
+      "epoch": 1.719676549865229,
+      "grad_norm": 0.5700849890708923,
+      "learning_rate": 0.0004973880194279547,
+      "loss": 3.6756,
       "step": 15950
     },
     {
-      "epoch": 1.7220966526746313,
-      "grad_norm": 0.5572895407676697,
-      "learning_rate": 0.0004972416765434759,
-      "loss": 3.6579,
+      "epoch": 1.7250673854447438,
+      "grad_norm": 0.61159747838974,
+      "learning_rate": 0.0004970642201834862,
+      "loss": 3.6719,
       "step": 16000
     },
     {
-      "epoch": 1.7220966526746313,
-      "eval_accuracy": 0.35718387360800713,
-      "eval_loss": 3.6182830333709717,
-      "eval_runtime": 214.6803,
-      "eval_samples_per_second": 83.897,
-      "eval_steps_per_second": 5.245,
+      "epoch": 1.7250673854447438,
+      "eval_accuracy": 0.3567654513370335,
+      "eval_loss": 3.625068426132202,
+      "eval_runtime": 153.0219,
+      "eval_samples_per_second": 117.702,
+      "eval_steps_per_second": 7.358,
       "step": 16000
     },
     {
-      "epoch": 1.7274782047142396,
-      "grad_norm": 0.5495673418045044,
-      "learning_rate": 0.0004969184355134145,
-      "loss": 3.6629,
+      "epoch": 1.7304582210242587,
+      "grad_norm": 0.6022645831108093,
+      "learning_rate": 0.0004967404209390178,
+      "loss": 3.6581,
       "step": 16050
     },
     {
-      "epoch": 1.732859756753848,
-      "grad_norm": 0.6197149753570557,
-      "learning_rate": 0.0004965951944833531,
-      "loss": 3.6439,
+      "epoch": 1.7358490566037736,
+      "grad_norm": 0.5485215187072754,
+      "learning_rate": 0.0004964166216945493,
+      "loss": 3.6612,
       "step": 16100
     },
     {
-      "epoch": 1.738241308793456,
-      "grad_norm": 0.5787531137466431,
-      "learning_rate": 0.0004962719534532916,
-      "loss": 3.6703,
+      "epoch": 1.7412398921832883,
+      "grad_norm": 0.6632840633392334,
+      "learning_rate": 0.000496092822450081,
+      "loss": 3.6679,
       "step": 16150
     },
     {
-      "epoch": 1.7436228608330642,
-      "grad_norm": 0.5500142574310303,
-      "learning_rate": 0.0004959487124232302,
-      "loss": 3.656,
+      "epoch": 1.7466307277628033,
+      "grad_norm": 0.6232796907424927,
+      "learning_rate": 0.0004957690232056125,
+      "loss": 3.6763,
       "step": 16200
     },
     {
-      "epoch": 1.7490044128726725,
-      "grad_norm": 0.5271518230438232,
-      "learning_rate": 0.0004956254713931688,
-      "loss": 3.6594,
+      "epoch": 1.7520215633423182,
+      "grad_norm": 0.5928359031677246,
+      "learning_rate": 0.0004954452239611441,
+      "loss": 3.6582,
       "step": 16250
     },
     {
-      "epoch": 1.7543859649122808,
-      "grad_norm": 0.5560300350189209,
-      "learning_rate": 0.0004953022303631074,
-      "loss": 3.6622,
+      "epoch": 1.7574123989218329,
+      "grad_norm": 0.542195737361908,
+      "learning_rate": 0.0004951214247166756,
+      "loss": 3.6535,
       "step": 16300
     },
     {
-      "epoch": 1.759767516951889,
-      "grad_norm": 0.5545780062675476,
-      "learning_rate": 0.0004949789893330459,
-      "loss": 3.6657,
+      "epoch": 1.7628032345013476,
+      "grad_norm": 0.5729801058769226,
+      "learning_rate": 0.0004947976254722072,
+      "loss": 3.6752,
       "step": 16350
     },
     {
-      "epoch": 1.765149068991497,
-      "grad_norm": 0.5637168884277344,
-      "learning_rate": 0.0004946557483029846,
-      "loss": 3.6701,
+      "epoch": 1.7681940700808625,
+      "grad_norm": 0.5516073107719421,
+      "learning_rate": 0.0004944738262277387,
+      "loss": 3.6533,
       "step": 16400
     },
     {
-      "epoch": 1.7705306210311054,
-      "grad_norm": 0.6499923467636108,
-      "learning_rate": 0.0004943325072729231,
-      "loss": 3.6591,
+      "epoch": 1.7735849056603774,
+      "grad_norm": 0.5613517165184021,
+      "learning_rate": 0.0004941500269832703,
+      "loss": 3.6646,
       "step": 16450
     },
     {
-      "epoch": 1.7759121730707137,
-      "grad_norm": 0.5951055884361267,
-      "learning_rate": 0.0004940092662428617,
-      "loss": 3.6589,
+      "epoch": 1.778975741239892,
+      "grad_norm": 0.503390908241272,
+      "learning_rate": 0.0004938262277388019,
+      "loss": 3.6498,
       "step": 16500
     },
     {
-      "epoch": 1.7812937251103218,
-      "grad_norm": 0.5989054441452026,
-      "learning_rate": 0.0004936860252128003,
-      "loss": 3.6615,
+      "epoch": 1.784366576819407,
+      "grad_norm": 0.5427384972572327,
+      "learning_rate": 0.0004935024284943335,
+      "loss": 3.6562,
       "step": 16550
     },
     {
-      "epoch": 1.78667527714993,
-      "grad_norm": 0.6071681976318359,
-      "learning_rate": 0.0004933627841827388,
-      "loss": 3.6413,
+      "epoch": 1.789757412398922,
+      "grad_norm": 0.5183855295181274,
+      "learning_rate": 0.000493178629249865,
+      "loss": 3.6698,
       "step": 16600
     },
     {
-      "epoch": 1.7920568291895382,
-      "grad_norm": 0.5401762127876282,
-      "learning_rate": 0.0004930395431526775,
-      "loss": 3.6646,
+      "epoch": 1.7951482479784366,
+      "grad_norm": 0.5511346459388733,
+      "learning_rate": 0.0004928548300053966,
+      "loss": 3.6607,
       "step": 16650
     },
     {
-      "epoch": 1.7974383812291466,
-      "grad_norm": 0.6720755100250244,
-      "learning_rate": 0.0004927163021226161,
-      "loss": 3.6535,
+      "epoch": 1.8005390835579513,
+      "grad_norm": 0.6408042907714844,
+      "learning_rate": 0.0004925310307609282,
+      "loss": 3.669,
       "step": 16700
     },
     {
-      "epoch": 1.8028199332687547,
-      "grad_norm": 0.5372287034988403,
-      "learning_rate": 0.0004923930610925547,
-      "loss": 3.6387,
+      "epoch": 1.8059299191374663,
+      "grad_norm": 0.5517454743385315,
+      "learning_rate": 0.0004922072315164598,
+      "loss": 3.6657,
       "step": 16750
     },
     {
-      "epoch": 1.8082014853083628,
-      "grad_norm": 0.5767584443092346,
-      "learning_rate": 0.0004920698200624932,
-      "loss": 3.6509,
+      "epoch": 1.8113207547169812,
+      "grad_norm": 0.5822569727897644,
+      "learning_rate": 0.0004918834322719913,
+      "loss": 3.6735,
       "step": 16800
     },
     {
-      "epoch": 1.813583037347971,
-      "grad_norm": 0.619735598564148,
-      "learning_rate": 0.0004917465790324317,
-      "loss": 3.6593,
+      "epoch": 1.8167115902964959,
+      "grad_norm": 0.5561976432800293,
+      "learning_rate": 0.0004915596330275229,
+      "loss": 3.6814,
       "step": 16850
     },
     {
-      "epoch": 1.8189645893875794,
-      "grad_norm": 0.612182080745697,
-      "learning_rate": 0.0004914233380023704,
-      "loss": 3.669,
+      "epoch": 1.8221024258760108,
+      "grad_norm": 0.547584593296051,
+      "learning_rate": 0.0004912358337830544,
+      "loss": 3.6533,
       "step": 16900
     },
     {
-      "epoch": 1.8243461414271875,
-      "grad_norm": 0.5973613262176514,
-      "learning_rate": 0.0004911000969723089,
-      "loss": 3.6581,
+      "epoch": 1.8274932614555257,
+      "grad_norm": 0.5689830780029297,
+      "learning_rate": 0.000490912034538586,
+      "loss": 3.6651,
       "step": 16950
     },
     {
-      "epoch": 1.8297276934667959,
-      "grad_norm": 0.5377869606018066,
-      "learning_rate": 0.0004907768559422476,
-      "loss": 3.6582,
+      "epoch": 1.8328840970350404,
+      "grad_norm": 0.5999789237976074,
+      "learning_rate": 0.0004905882352941175,
+      "loss": 3.6449,
       "step": 17000
     },
     {
-      "epoch": 1.8297276934667959,
-      "eval_accuracy": 0.3589950085949867,
-      "eval_loss": 3.60500168800354,
-      "eval_runtime": 203.9944,
-      "eval_samples_per_second": 88.292,
-      "eval_steps_per_second": 5.52,
+      "epoch": 1.8328840970350404,
+      "eval_accuracy": 0.3580432092977318,
+      "eval_loss": 3.6107499599456787,
+      "eval_runtime": 152.6204,
+      "eval_samples_per_second": 118.012,
+      "eval_steps_per_second": 7.378,
       "step": 17000
     },
     {
-      "epoch": 1.8351092455064042,
-      "grad_norm": 0.5382079482078552,
-      "learning_rate": 0.0004904536149121861,
-      "loss": 3.6536,
+      "epoch": 1.838274932614555,
+      "grad_norm": 0.5623023509979248,
+      "learning_rate": 0.0004902644360496492,
+      "loss": 3.6507,
       "step": 17050
     },
     {
-      "epoch": 1.8404907975460123,
-      "grad_norm": 0.5734342336654663,
-      "learning_rate": 0.0004901303738821248,
-      "loss": 3.6325,
+      "epoch": 1.8436657681940702,
+      "grad_norm": 0.5425819754600525,
+      "learning_rate": 0.0004899406368051808,
+      "loss": 3.6409,
       "step": 17100
     },
     {
-      "epoch": 1.8458723495856204,
-      "grad_norm": 0.6593245267868042,
-      "learning_rate": 0.0004898071328520633,
-      "loss": 3.6733,
+      "epoch": 1.849056603773585,
+      "grad_norm": 0.5366511940956116,
+      "learning_rate": 0.0004896168375607123,
+      "loss": 3.6465,
       "step": 17150
     },
     {
-      "epoch": 1.8512539016252287,
-      "grad_norm": 0.5794579982757568,
-      "learning_rate": 0.0004894838918220019,
-      "loss": 3.6634,
+      "epoch": 1.8544474393530996,
+      "grad_norm": 0.5287717580795288,
+      "learning_rate": 0.0004892930383162439,
+      "loss": 3.6553,
       "step": 17200
     },
     {
-      "epoch": 1.856635453664837,
-      "grad_norm": 0.5953862071037292,
-      "learning_rate": 0.0004891606507919405,
-      "loss": 3.6544,
+      "epoch": 1.8598382749326146,
+      "grad_norm": 0.5218010544776917,
+      "learning_rate": 0.0004889692390717754,
+      "loss": 3.6619,
       "step": 17250
     },
     {
-      "epoch": 1.8620170057044452,
-      "grad_norm": 0.6200346946716309,
-      "learning_rate": 0.000488837409761879,
-      "loss": 3.6443,
+      "epoch": 1.8652291105121295,
+      "grad_norm": 0.5530878901481628,
+      "learning_rate": 0.0004886454398273071,
+      "loss": 3.6631,
       "step": 17300
     },
     {
-      "epoch": 1.8673985577440533,
-      "grad_norm": 0.585228681564331,
-      "learning_rate": 0.0004885141687318177,
-      "loss": 3.6478,
+      "epoch": 1.8706199460916442,
+      "grad_norm": 0.5743786096572876,
+      "learning_rate": 0.0004883216405828386,
+      "loss": 3.6628,
       "step": 17350
     },
     {
-      "epoch": 1.8727801097836616,
-      "grad_norm": 0.6394591927528381,
-      "learning_rate": 0.00048819092770175623,
-      "loss": 3.6416,
+      "epoch": 1.8760107816711589,
+      "grad_norm": 0.542460024356842,
+      "learning_rate": 0.00048799784133837017,
+      "loss": 3.6582,
       "step": 17400
     },
     {
-      "epoch": 1.87816166182327,
-      "grad_norm": 0.5823236107826233,
-      "learning_rate": 0.0004878676866716948,
-      "loss": 3.6572,
+      "epoch": 1.881401617250674,
+      "grad_norm": 0.6218643188476562,
+      "learning_rate": 0.0004876740420939017,
+      "loss": 3.6466,
       "step": 17450
     },
     {
-      "epoch": 1.883543213862878,
-      "grad_norm": 0.5914448499679565,
-      "learning_rate": 0.00048754444564163337,
-      "loss": 3.6479,
+      "epoch": 1.8867924528301887,
+      "grad_norm": 0.5412008166313171,
+      "learning_rate": 0.0004873567188343227,
+      "loss": 3.6474,
       "step": 17500
     },
     {
-      "epoch": 1.8889247659024861,
-      "grad_norm": 0.6287000775337219,
-      "learning_rate": 0.000487221204611572,
-      "loss": 3.6431,
+      "epoch": 1.8921832884097034,
+      "grad_norm": 0.5895251035690308,
+      "learning_rate": 0.0004870329195898542,
+      "loss": 3.645,
       "step": 17550
     },
     {
-      "epoch": 1.8943063179420945,
-      "grad_norm": 0.5507499575614929,
-      "learning_rate": 0.00048689796358151056,
-      "loss": 3.6444,
+      "epoch": 1.8975741239892183,
+      "grad_norm": 0.5757617950439453,
+      "learning_rate": 0.00048670912034538583,
+      "loss": 3.66,
       "step": 17600
     },
     {
-      "epoch": 1.8996878699817028,
-      "grad_norm": 0.5838987827301025,
-      "learning_rate": 0.00048657472255144915,
-      "loss": 3.649,
+      "epoch": 1.9029649595687332,
+      "grad_norm": 0.6187827587127686,
+      "learning_rate": 0.0004863853211009174,
+      "loss": 3.6504,
       "step": 17650
     },
     {
-      "epoch": 1.905069422021311,
-      "grad_norm": 0.6239995360374451,
-      "learning_rate": 0.00048625148152138775,
-      "loss": 3.6385,
+      "epoch": 1.908355795148248,
+      "grad_norm": 0.5335280299186707,
+      "learning_rate": 0.000486061521856449,
+      "loss": 3.644,
       "step": 17700
     },
     {
-      "epoch": 1.910450974060919,
-      "grad_norm": 0.5718323588371277,
-      "learning_rate": 0.0004859282404913263,
-      "loss": 3.6248,
+      "epoch": 1.9137466307277629,
+      "grad_norm": 0.5789933800697327,
+      "learning_rate": 0.00048573772261198054,
+      "loss": 3.6661,
       "step": 17750
     },
     {
-      "epoch": 1.9158325261005273,
-      "grad_norm": 0.5826128125190735,
-      "learning_rate": 0.0004856049994612649,
-      "loss": 3.6441,
+      "epoch": 1.9191374663072778,
+      "grad_norm": 0.6365672945976257,
+      "learning_rate": 0.00048541392336751214,
+      "loss": 3.6509,
       "step": 17800
     },
     {
-      "epoch": 1.9212140781401357,
-      "grad_norm": 0.6261082291603088,
-      "learning_rate": 0.00048528175843120353,
-      "loss": 3.6379,
+      "epoch": 1.9245283018867925,
+      "grad_norm": 0.5443440079689026,
+      "learning_rate": 0.0004850901241230437,
+      "loss": 3.6415,
       "step": 17850
     },
     {
-      "epoch": 1.9265956301797438,
-      "grad_norm": 0.5729628205299377,
-      "learning_rate": 0.0004849585174011421,
-      "loss": 3.6403,
+      "epoch": 1.9299191374663072,
+      "grad_norm": 0.5827429294586182,
+      "learning_rate": 0.0004847663248785753,
+      "loss": 3.655,
       "step": 17900
     },
     {
-      "epoch": 1.931977182219352,
-      "grad_norm": 0.6479345560073853,
-      "learning_rate": 0.00048464174119168193,
-      "loss": 3.6513,
+      "epoch": 1.935309973045822,
+      "grad_norm": 0.5369715094566345,
+      "learning_rate": 0.0004844425256341068,
+      "loss": 3.6345,
       "step": 17950
     },
     {
-      "epoch": 1.9373587342589604,
-      "grad_norm": 0.6019191145896912,
-      "learning_rate": 0.0004843185001616205,
-      "loss": 3.6427,
+      "epoch": 1.940700808625337,
+      "grad_norm": 0.5673846006393433,
+      "learning_rate": 0.00048411872638963834,
+      "loss": 3.6395,
       "step": 18000
     },
     {
-      "epoch": 1.9373587342589604,
-      "eval_accuracy": 0.3606186086975985,
-      "eval_loss": 3.589313268661499,
-      "eval_runtime": 204.8701,
-      "eval_samples_per_second": 87.914,
-      "eval_steps_per_second": 5.496,
+      "epoch": 1.940700808625337,
+      "eval_accuracy": 0.35913441025005266,
+      "eval_loss": 3.5955255031585693,
+      "eval_runtime": 152.9984,
+      "eval_samples_per_second": 117.72,
+      "eval_steps_per_second": 7.36,
       "step": 18000
     },
     {
-      "epoch": 1.9427402862985685,
-      "grad_norm": 0.5419387817382812,
-      "learning_rate": 0.00048399525913155907,
-      "loss": 3.6361,
+      "epoch": 1.9460916442048517,
+      "grad_norm": 0.5917626619338989,
+      "learning_rate": 0.00048379492714516995,
+      "loss": 3.6433,
       "step": 18050
     },
     {
-      "epoch": 1.9481218383381766,
-      "grad_norm": 0.6220253109931946,
-      "learning_rate": 0.0004836720181014976,
-      "loss": 3.6207,
+      "epoch": 1.9514824797843666,
+      "grad_norm": 0.5339709520339966,
+      "learning_rate": 0.0004834711279007015,
+      "loss": 3.6414,
       "step": 18100
     },
     {
-      "epoch": 1.953503390377785,
-      "grad_norm": 0.5842125415802002,
-      "learning_rate": 0.0004833487770714362,
-      "loss": 3.6405,
+      "epoch": 1.9568733153638815,
+      "grad_norm": 0.5372532606124878,
+      "learning_rate": 0.0004831473286562331,
+      "loss": 3.6388,
       "step": 18150
     },
     {
-      "epoch": 1.9588849424173933,
-      "grad_norm": 0.5908029675483704,
-      "learning_rate": 0.00048302553604137485,
-      "loss": 3.6445,
+      "epoch": 1.9622641509433962,
+      "grad_norm": 0.605369508266449,
+      "learning_rate": 0.00048282352941176465,
+      "loss": 3.6292,
       "step": 18200
     },
     {
-      "epoch": 1.9642664944570014,
-      "grad_norm": 0.5424453020095825,
-      "learning_rate": 0.0004827022950113134,
-      "loss": 3.6419,
+      "epoch": 1.967654986522911,
+      "grad_norm": 0.566500723361969,
+      "learning_rate": 0.00048249973016729626,
+      "loss": 3.6473,
       "step": 18250
     },
     {
-      "epoch": 1.9696480464966095,
-      "grad_norm": 0.6032727360725403,
-      "learning_rate": 0.000482379053981252,
-      "loss": 3.6507,
+      "epoch": 1.9730458221024259,
+      "grad_norm": 0.6016471982002258,
+      "learning_rate": 0.0004821759309228278,
+      "loss": 3.6583,
       "step": 18300
     },
     {
-      "epoch": 1.9750295985362178,
-      "grad_norm": 0.6096407771110535,
-      "learning_rate": 0.0004820558129511906,
-      "loss": 3.6364,
+      "epoch": 1.9784366576819408,
+      "grad_norm": 0.5543355345726013,
+      "learning_rate": 0.00048185213167835936,
+      "loss": 3.6431,
       "step": 18350
     },
     {
-      "epoch": 1.9804111505758262,
-      "grad_norm": 0.6347204446792603,
-      "learning_rate": 0.0004817325719211291,
-      "loss": 3.6451,
+      "epoch": 1.9838274932614555,
+      "grad_norm": 0.6136170029640198,
+      "learning_rate": 0.00048152833243389096,
+      "loss": 3.643,
       "step": 18400
     },
     {
-      "epoch": 1.9857927026154343,
-      "grad_norm": 0.5634959936141968,
-      "learning_rate": 0.0004814093308910677,
-      "loss": 3.641,
+      "epoch": 1.9892183288409704,
+      "grad_norm": 0.5570639371871948,
+      "learning_rate": 0.0004812045331894225,
+      "loss": 3.6528,
       "step": 18450
     },
     {
-      "epoch": 1.9911742546550424,
-      "grad_norm": 0.5860807299613953,
-      "learning_rate": 0.00048108608986100637,
-      "loss": 3.6336,
+      "epoch": 1.9946091644204853,
+      "grad_norm": 0.5984097719192505,
+      "learning_rate": 0.0004808807339449541,
+      "loss": 3.6418,
       "step": 18500
     },
     {
-      "epoch": 1.9965558066946507,
-      "grad_norm": 0.5338708162307739,
-      "learning_rate": 0.0004807628488309449,
-      "loss": 3.6211,
+      "epoch": 2.0,
+      "grad_norm": 1.259522795677185,
+      "learning_rate": 0.0004805569347004856,
+      "loss": 3.6473,
       "step": 18550
     },
     {
-      "epoch": 2.001937358734259,
-      "grad_norm": 0.5974145531654358,
-      "learning_rate": 0.0004804396078008835,
-      "loss": 3.5958,
+      "epoch": 2.0053908355795147,
+      "grad_norm": 0.5877414345741272,
+      "learning_rate": 0.0004802331354560173,
+      "loss": 3.5795,
       "step": 18600
     },
     {
-      "epoch": 2.007318910773867,
-      "grad_norm": 0.5562605261802673,
-      "learning_rate": 0.00048011636677082204,
-      "loss": 3.5582,
+      "epoch": 2.01078167115903,
+      "grad_norm": 0.5764709711074829,
+      "learning_rate": 0.00047990933621154877,
+      "loss": 3.5482,
       "step": 18650
     },
     {
-      "epoch": 2.0127004628134753,
-      "grad_norm": 0.6376757025718689,
-      "learning_rate": 0.00047979312574076064,
-      "loss": 3.5377,
+      "epoch": 2.0161725067385445,
+      "grad_norm": 0.5639136433601379,
+      "learning_rate": 0.0004795855369670804,
+      "loss": 3.5606,
       "step": 18700
     },
     {
-      "epoch": 2.018082014853084,
-      "grad_norm": 0.5702477097511292,
-      "learning_rate": 0.0004794698847106992,
-      "loss": 3.5431,
+      "epoch": 2.0215633423180592,
+      "grad_norm": 0.5635726451873779,
+      "learning_rate": 0.0004792617377226119,
+      "loss": 3.5408,
       "step": 18750
     },
     {
-      "epoch": 2.023463566892692,
-      "grad_norm": 0.5655612945556641,
-      "learning_rate": 0.0004791466436806378,
-      "loss": 3.5661,
+      "epoch": 2.026954177897574,
+      "grad_norm": 0.6025230884552002,
+      "learning_rate": 0.0004789379384781435,
+      "loss": 3.5591,
       "step": 18800
     },
     {
-      "epoch": 2.0288451189323,
-      "grad_norm": 0.5684214234352112,
-      "learning_rate": 0.0004788234026505764,
-      "loss": 3.5378,
+      "epoch": 2.032345013477089,
+      "grad_norm": 0.5429965257644653,
+      "learning_rate": 0.0004786141392336751,
+      "loss": 3.5674,
       "step": 18850
     },
     {
-      "epoch": 2.034226670971908,
-      "grad_norm": 0.595730185508728,
-      "learning_rate": 0.00047850016162051496,
-      "loss": 3.5512,
+      "epoch": 2.0377358490566038,
+      "grad_norm": 0.5635925531387329,
+      "learning_rate": 0.00047829033998920663,
+      "loss": 3.5557,
       "step": 18900
     },
     {
-      "epoch": 2.0396082230115167,
-      "grad_norm": 0.602304220199585,
-      "learning_rate": 0.00047817692059045356,
-      "loss": 3.5471,
+      "epoch": 2.0431266846361185,
+      "grad_norm": 0.5662844181060791,
+      "learning_rate": 0.00047796654074473824,
+      "loss": 3.5489,
       "step": 18950
     },
     {
-      "epoch": 2.044989775051125,
-      "grad_norm": 0.6218283176422119,
-      "learning_rate": 0.00047785367956039215,
-      "loss": 3.56,
+      "epoch": 2.0485175202156336,
+      "grad_norm": 0.5810090899467468,
+      "learning_rate": 0.0004776427415002698,
+      "loss": 3.5784,
       "step": 19000
     },
     {
-      "epoch": 2.044989775051125,
-      "eval_accuracy": 0.3616626542966078,
-      "eval_loss": 3.5785436630249023,
-      "eval_runtime": 202.2767,
-      "eval_samples_per_second": 89.041,
-      "eval_steps_per_second": 5.567,
+      "epoch": 2.0485175202156336,
+      "eval_accuracy": 0.3612909527673945,
+      "eval_loss": 3.5867395401000977,
+      "eval_runtime": 152.2957,
+      "eval_samples_per_second": 118.263,
+      "eval_steps_per_second": 7.394,
       "step": 19000
     },
     {
-      "epoch": 2.050371327090733,
-      "grad_norm": 0.5897380113601685,
-      "learning_rate": 0.00047753043853033075,
-      "loss": 3.5425,
+      "epoch": 2.0539083557951483,
+      "grad_norm": 0.5732460021972656,
+      "learning_rate": 0.0004773189422558014,
+      "loss": 3.5515,
       "step": 19050
     },
     {
-      "epoch": 2.055752879130341,
-      "grad_norm": 0.5932830572128296,
-      "learning_rate": 0.00047720719750026934,
-      "loss": 3.5699,
+      "epoch": 2.059299191374663,
+      "grad_norm": 0.5630287528038025,
+      "learning_rate": 0.00047699514301133294,
+      "loss": 3.5751,
       "step": 19100
     },
     {
-      "epoch": 2.0611344311699495,
-      "grad_norm": 0.5921191573143005,
-      "learning_rate": 0.00047688395647020793,
-      "loss": 3.5431,
+      "epoch": 2.0646900269541777,
+      "grad_norm": 0.5999619364738464,
+      "learning_rate": 0.00047667134376686455,
+      "loss": 3.569,
       "step": 19150
     },
     {
-      "epoch": 2.0665159832095576,
-      "grad_norm": 0.6052051186561584,
-      "learning_rate": 0.0004765607154401465,
-      "loss": 3.5734,
+      "epoch": 2.070080862533693,
+      "grad_norm": 0.6230016350746155,
+      "learning_rate": 0.0004763475445223961,
+      "loss": 3.5766,
       "step": 19200
     },
     {
-      "epoch": 2.0718975352491658,
-      "grad_norm": 0.562523603439331,
-      "learning_rate": 0.00047623747441008507,
-      "loss": 3.543,
+      "epoch": 2.0754716981132075,
+      "grad_norm": 0.6227099895477295,
+      "learning_rate": 0.0004760237452779276,
+      "loss": 3.5583,
       "step": 19250
     },
     {
-      "epoch": 2.0772790872887743,
-      "grad_norm": 0.546420156955719,
-      "learning_rate": 0.0004759142333800236,
-      "loss": 3.5441,
+      "epoch": 2.0808625336927222,
+      "grad_norm": 0.575489342212677,
+      "learning_rate": 0.0004756999460334592,
+      "loss": 3.5529,
       "step": 19300
     },
     {
-      "epoch": 2.0826606393283824,
-      "grad_norm": 0.590813159942627,
-      "learning_rate": 0.00047559099234996226,
-      "loss": 3.5624,
+      "epoch": 2.0862533692722374,
+      "grad_norm": 0.6161721348762512,
+      "learning_rate": 0.00047537614678899075,
+      "loss": 3.5567,
       "step": 19350
     },
     {
-      "epoch": 2.0880421913679905,
-      "grad_norm": 0.6105584502220154,
-      "learning_rate": 0.00047526775131990085,
-      "loss": 3.557,
+      "epoch": 2.091644204851752,
+      "grad_norm": 0.570197582244873,
+      "learning_rate": 0.00047505234754452235,
+      "loss": 3.551,
       "step": 19400
     },
     {
-      "epoch": 2.0934237434075986,
-      "grad_norm": 0.6170618534088135,
-      "learning_rate": 0.0004749445102898394,
-      "loss": 3.5373,
+      "epoch": 2.0970350404312668,
+      "grad_norm": 0.6006742715835571,
+      "learning_rate": 0.0004747285483000539,
+      "loss": 3.5608,
       "step": 19450
     },
     {
-      "epoch": 2.098805295447207,
-      "grad_norm": 0.5814986228942871,
-      "learning_rate": 0.000474621269259778,
-      "loss": 3.5693,
+      "epoch": 2.1024258760107815,
+      "grad_norm": 0.5746577978134155,
+      "learning_rate": 0.00047441122504047486,
+      "loss": 3.5538,
       "step": 19500
     },
     {
-      "epoch": 2.1041868474868153,
-      "grad_norm": 0.6113174557685852,
-      "learning_rate": 0.0004742980282297166,
-      "loss": 3.5589,
+      "epoch": 2.1078167115902966,
+      "grad_norm": 0.5501875281333923,
+      "learning_rate": 0.0004740874257960064,
+      "loss": 3.559,
       "step": 19550
     },
     {
-      "epoch": 2.1095683995264234,
-      "grad_norm": 0.6312413811683655,
-      "learning_rate": 0.0004739747871996551,
-      "loss": 3.5398,
+      "epoch": 2.1132075471698113,
+      "grad_norm": 0.5990867018699646,
+      "learning_rate": 0.000473763626551538,
+      "loss": 3.5566,
       "step": 19600
     },
     {
-      "epoch": 2.1149499515660315,
-      "grad_norm": 0.6352087259292603,
-      "learning_rate": 0.00047365154616959377,
-      "loss": 3.5541,
+      "epoch": 2.118598382749326,
+      "grad_norm": 0.5872001647949219,
+      "learning_rate": 0.0004734592552617377,
+      "loss": 3.5883,
       "step": 19650
     },
     {
-      "epoch": 2.12033150360564,
-      "grad_norm": 0.6081045269966125,
-      "learning_rate": 0.00047332830513953237,
-      "loss": 3.576,
+      "epoch": 2.123989218328841,
+      "grad_norm": 0.5909648537635803,
+      "learning_rate": 0.00047313545601726926,
+      "loss": 3.5488,
       "step": 19700
     },
     {
-      "epoch": 2.125713055645248,
-      "grad_norm": 0.5704367160797119,
-      "learning_rate": 0.0004730050641094709,
-      "loss": 3.5563,
+      "epoch": 2.129380053908356,
+      "grad_norm": 0.5571679472923279,
+      "learning_rate": 0.0004728116567728008,
+      "loss": 3.5699,
       "step": 19750
     },
     {
-      "epoch": 2.1310946076848563,
-      "grad_norm": 0.6367883682250977,
-      "learning_rate": 0.0004726818230794095,
-      "loss": 3.5358,
+      "epoch": 2.1347708894878705,
+      "grad_norm": 0.5732533931732178,
+      "learning_rate": 0.0004724878575283324,
+      "loss": 3.5616,
       "step": 19800
     },
     {
-      "epoch": 2.1364761597244644,
-      "grad_norm": 0.5543844699859619,
-      "learning_rate": 0.00047235858204934804,
-      "loss": 3.5561,
+      "epoch": 2.1401617250673857,
+      "grad_norm": 0.5497268438339233,
+      "learning_rate": 0.00047216405828386397,
+      "loss": 3.5549,
       "step": 19850
     },
     {
-      "epoch": 2.141857711764073,
-      "grad_norm": 0.6325891017913818,
-      "learning_rate": 0.0004720353410192867,
-      "loss": 3.5459,
+      "epoch": 2.1455525606469004,
+      "grad_norm": 0.5779978632926941,
+      "learning_rate": 0.0004718402590393956,
+      "loss": 3.5673,
       "step": 19900
     },
     {
-      "epoch": 2.147239263803681,
-      "grad_norm": 0.5876049995422363,
-      "learning_rate": 0.0004717120999892253,
-      "loss": 3.5568,
+      "epoch": 2.150943396226415,
+      "grad_norm": 0.5528773069381714,
+      "learning_rate": 0.00047151645979492707,
+      "loss": 3.5603,
       "step": 19950
     },
     {
-      "epoch": 2.152620815843289,
-      "grad_norm": 0.5623601675033569,
-      "learning_rate": 0.0004713888589591638,
-      "loss": 3.5527,
+      "epoch": 2.1563342318059298,
+      "grad_norm": 0.6277232766151428,
+      "learning_rate": 0.0004711926605504587,
+      "loss": 3.5816,
       "step": 20000
     },
     {
-      "epoch": 2.152620815843289,
-      "eval_accuracy": 0.3627479879929867,
-      "eval_loss": 3.5705673694610596,
-      "eval_runtime": 206.2591,
-      "eval_samples_per_second": 87.322,
-      "eval_steps_per_second": 5.459,
+      "epoch": 2.1563342318059298,
+      "eval_accuracy": 0.3615700820361899,
+      "eval_loss": 3.5775678157806396,
+      "eval_runtime": 152.8956,
+      "eval_samples_per_second": 117.799,
+      "eval_steps_per_second": 7.365,
       "step": 20000
     },
     {
-      "epoch": 2.1580023678828972,
-      "grad_norm": 0.5725038051605225,
-      "learning_rate": 0.0004710656179291024,
-      "loss": 3.5565,
+      "epoch": 2.161725067385445,
+      "grad_norm": 0.5952591896057129,
+      "learning_rate": 0.0004708688613059902,
+      "loss": 3.5851,
       "step": 20050
     },
     {
-      "epoch": 2.163383919922506,
-      "grad_norm": 0.6287389993667603,
-      "learning_rate": 0.000470742376899041,
-      "loss": 3.5528,
+      "epoch": 2.1671159029649596,
+      "grad_norm": 0.6090236306190491,
+      "learning_rate": 0.0004705450620615218,
+      "loss": 3.5848,
       "step": 20100
     },
     {
-      "epoch": 2.168765471962114,
-      "grad_norm": 0.5884525179862976,
-      "learning_rate": 0.00047041913586897956,
-      "loss": 3.5597,
+      "epoch": 2.1725067385444743,
+      "grad_norm": 0.5715079307556152,
+      "learning_rate": 0.0004702212628170534,
+      "loss": 3.5757,
       "step": 20150
     },
     {
-      "epoch": 2.174147024001722,
-      "grad_norm": 0.5764286518096924,
-      "learning_rate": 0.0004700958948389182,
-      "loss": 3.5426,
+      "epoch": 2.177897574123989,
+      "grad_norm": 0.6015267968177795,
+      "learning_rate": 0.00046989746357258493,
+      "loss": 3.5736,
       "step": 20200
     },
     {
-      "epoch": 2.1795285760413305,
-      "grad_norm": 0.6041747331619263,
-      "learning_rate": 0.0004697726538088568,
-      "loss": 3.5471,
+      "epoch": 2.183288409703504,
+      "grad_norm": 0.5924991369247437,
+      "learning_rate": 0.00046957366432811654,
+      "loss": 3.5751,
       "step": 20250
     },
     {
-      "epoch": 2.1849101280809387,
-      "grad_norm": 0.5907628536224365,
-      "learning_rate": 0.00046944941277879534,
-      "loss": 3.5453,
+      "epoch": 2.188679245283019,
+      "grad_norm": 0.5487517714500427,
+      "learning_rate": 0.0004692498650836481,
+      "loss": 3.5663,
       "step": 20300
     },
     {
-      "epoch": 2.1902916801205468,
-      "grad_norm": 0.5632272958755493,
-      "learning_rate": 0.00046912617174873394,
-      "loss": 3.5558,
+      "epoch": 2.1940700808625335,
+      "grad_norm": 0.5642894506454468,
+      "learning_rate": 0.0004689260658391797,
+      "loss": 3.5811,
       "step": 20350
     },
     {
-      "epoch": 2.195673232160155,
-      "grad_norm": 0.6226711869239807,
-      "learning_rate": 0.0004688029307186725,
-      "loss": 3.5865,
+      "epoch": 2.1994609164420487,
+      "grad_norm": 0.6076687574386597,
+      "learning_rate": 0.00046860226659471124,
+      "loss": 3.5823,
       "step": 20400
     },
     {
-      "epoch": 2.2010547841997634,
-      "grad_norm": 0.5935404896736145,
-      "learning_rate": 0.00046847968968861107,
-      "loss": 3.5594,
+      "epoch": 2.2048517520215634,
+      "grad_norm": 0.5700116157531738,
+      "learning_rate": 0.00046827846735024285,
+      "loss": 3.5785,
       "step": 20450
     },
     {
-      "epoch": 2.2064363362393715,
-      "grad_norm": 0.5879222750663757,
-      "learning_rate": 0.0004681564486585497,
-      "loss": 3.5637,
+      "epoch": 2.210242587601078,
+      "grad_norm": 0.5856590867042542,
+      "learning_rate": 0.0004679546681057744,
+      "loss": 3.5476,
       "step": 20500
     },
     {
-      "epoch": 2.2118178882789796,
-      "grad_norm": 0.5888842940330505,
-      "learning_rate": 0.00046783320762848826,
-      "loss": 3.5438,
+      "epoch": 2.215633423180593,
+      "grad_norm": 0.5908544659614563,
+      "learning_rate": 0.0004676308688613059,
+      "loss": 3.5791,
       "step": 20550
     },
     {
-      "epoch": 2.2171994403185877,
-      "grad_norm": 0.5803065299987793,
-      "learning_rate": 0.00046750996659842685,
-      "loss": 3.5429,
+      "epoch": 2.221024258760108,
+      "grad_norm": 0.6657243371009827,
+      "learning_rate": 0.00046730706961683755,
+      "loss": 3.5541,
       "step": 20600
     },
     {
-      "epoch": 2.2225809923581963,
-      "grad_norm": 0.5627387166023254,
-      "learning_rate": 0.00046718672556836545,
-      "loss": 3.5601,
+      "epoch": 2.2264150943396226,
+      "grad_norm": 0.5953009724617004,
+      "learning_rate": 0.00046698327037236905,
+      "loss": 3.5625,
       "step": 20650
     },
     {
-      "epoch": 2.2279625443978044,
-      "grad_norm": 0.5622759461402893,
-      "learning_rate": 0.000466863484538304,
-      "loss": 3.5676,
+      "epoch": 2.2318059299191373,
+      "grad_norm": 0.5696465969085693,
+      "learning_rate": 0.00046665947112790065,
+      "loss": 3.5584,
       "step": 20700
     },
     {
-      "epoch": 2.2333440964374125,
-      "grad_norm": 0.5685352087020874,
-      "learning_rate": 0.0004665402435082426,
-      "loss": 3.5572,
+      "epoch": 2.2371967654986524,
+      "grad_norm": 0.5952767729759216,
+      "learning_rate": 0.0004663356718834322,
+      "loss": 3.5659,
       "step": 20750
     },
     {
-      "epoch": 2.2387256484770206,
-      "grad_norm": 0.6102375388145447,
-      "learning_rate": 0.00046621700247818123,
-      "loss": 3.5546,
+      "epoch": 2.242587601078167,
+      "grad_norm": 0.5673699975013733,
+      "learning_rate": 0.0004660118726389638,
+      "loss": 3.5602,
       "step": 20800
     },
     {
-      "epoch": 2.244107200516629,
-      "grad_norm": 0.5915598273277283,
-      "learning_rate": 0.0004658937614481198,
-      "loss": 3.5519,
+      "epoch": 2.247978436657682,
+      "grad_norm": 0.5948861241340637,
+      "learning_rate": 0.00046568807339449536,
+      "loss": 3.5748,
       "step": 20850
     },
     {
-      "epoch": 2.2494887525562373,
-      "grad_norm": 0.5531293153762817,
-      "learning_rate": 0.00046557052041805837,
-      "loss": 3.5491,
+      "epoch": 2.2533692722371965,
+      "grad_norm": 0.5734835267066956,
+      "learning_rate": 0.00046536427415002696,
+      "loss": 3.568,
       "step": 20900
     },
     {
-      "epoch": 2.2548703045958454,
-      "grad_norm": 0.5677109956741333,
-      "learning_rate": 0.0004652472793879969,
-      "loss": 3.5562,
+      "epoch": 2.2587601078167117,
+      "grad_norm": 0.5433420538902283,
+      "learning_rate": 0.0004650404749055585,
+      "loss": 3.5583,
       "step": 20950
     },
     {
-      "epoch": 2.2602518566354535,
-      "grad_norm": 0.5943840742111206,
-      "learning_rate": 0.0004649240383579355,
-      "loss": 3.5487,
+      "epoch": 2.2641509433962264,
+      "grad_norm": 0.6004379391670227,
+      "learning_rate": 0.00046471667566109007,
+      "loss": 3.5716,
       "step": 21000
     },
     {
-      "epoch": 2.2602518566354535,
-      "eval_accuracy": 0.36394262649450687,
-      "eval_loss": 3.560176372528076,
-      "eval_runtime": 204.7535,
-      "eval_samples_per_second": 87.964,
-      "eval_steps_per_second": 5.499,
+      "epoch": 2.2641509433962264,
+      "eval_accuracy": 0.36321606463369144,
+      "eval_loss": 3.5651097297668457,
+      "eval_runtime": 152.3329,
+      "eval_samples_per_second": 118.234,
+      "eval_steps_per_second": 7.392,
       "step": 21000
     },
     {
-      "epoch": 2.265633408675062,
-      "grad_norm": 0.5741783380508423,
-      "learning_rate": 0.00046460079732787415,
-      "loss": 3.5606,
+      "epoch": 2.269541778975741,
+      "grad_norm": 0.5563739538192749,
+      "learning_rate": 0.00046439287641662167,
+      "loss": 3.5663,
       "step": 21050
     },
     {
-      "epoch": 2.27101496071467,
-      "grad_norm": 0.6168876886367798,
-      "learning_rate": 0.0004642775562978127,
-      "loss": 3.5662,
+      "epoch": 2.274932614555256,
+      "grad_norm": 0.6050722599029541,
+      "learning_rate": 0.0004640690771721532,
+      "loss": 3.562,
       "step": 21100
     },
     {
-      "epoch": 2.2763965127542782,
-      "grad_norm": 0.6492966413497925,
-      "learning_rate": 0.0004639543152677513,
-      "loss": 3.5546,
+      "epoch": 2.280323450134771,
+      "grad_norm": 0.5888524055480957,
+      "learning_rate": 0.0004637452779276848,
+      "loss": 3.5838,
       "step": 21150
     },
     {
-      "epoch": 2.281778064793887,
-      "grad_norm": 0.634781539440155,
-      "learning_rate": 0.0004636310742376899,
-      "loss": 3.5674,
+      "epoch": 2.2857142857142856,
+      "grad_norm": 0.6243483424186707,
+      "learning_rate": 0.0004634214786832164,
+      "loss": 3.5691,
       "step": 21200
     },
     {
-      "epoch": 2.287159616833495,
-      "grad_norm": 0.5752384066581726,
-      "learning_rate": 0.0004633078332076284,
-      "loss": 3.5561,
+      "epoch": 2.2911051212938007,
+      "grad_norm": 0.6089298725128174,
+      "learning_rate": 0.000463097679438748,
+      "loss": 3.5804,
       "step": 21250
     },
     {
-      "epoch": 2.292541168873103,
-      "grad_norm": 0.5751746296882629,
-      "learning_rate": 0.000462984592177567,
-      "loss": 3.5505,
+      "epoch": 2.2964959568733154,
+      "grad_norm": 0.5981297492980957,
+      "learning_rate": 0.0004627738801942795,
+      "loss": 3.5454,
       "step": 21300
     },
     {
-      "epoch": 2.297922720912711,
-      "grad_norm": 0.5733442902565002,
-      "learning_rate": 0.00046266135114750567,
-      "loss": 3.5489,
+      "epoch": 2.30188679245283,
+      "grad_norm": 0.7177810072898865,
+      "learning_rate": 0.0004624500809498111,
+      "loss": 3.5644,
       "step": 21350
     },
     {
-      "epoch": 2.303304272952319,
-      "grad_norm": 0.6262395977973938,
-      "learning_rate": 0.0004623381101174442,
-      "loss": 3.552,
+      "epoch": 2.3072776280323453,
+      "grad_norm": 0.5293326377868652,
+      "learning_rate": 0.00046212628170534263,
+      "loss": 3.559,
       "step": 21400
     },
     {
-      "epoch": 2.3086858249919278,
-      "grad_norm": 0.6084133982658386,
-      "learning_rate": 0.0004620148690873828,
-      "loss": 3.5515,
+      "epoch": 2.31266846361186,
+      "grad_norm": 0.5836008191108704,
+      "learning_rate": 0.0004618024824608742,
+      "loss": 3.5843,
       "step": 21450
     },
     {
-      "epoch": 2.314067377031536,
-      "grad_norm": 0.558005690574646,
-      "learning_rate": 0.00046169162805732134,
-      "loss": 3.5501,
+      "epoch": 2.3180592991913747,
+      "grad_norm": 0.5900083780288696,
+      "learning_rate": 0.0004614786832164058,
+      "loss": 3.5552,
       "step": 21500
     },
     {
-      "epoch": 2.319448929071144,
-      "grad_norm": 0.656355619430542,
-      "learning_rate": 0.00046136838702725994,
-      "loss": 3.5677,
+      "epoch": 2.3234501347708894,
+      "grad_norm": 0.5620343685150146,
+      "learning_rate": 0.00046115488397193734,
+      "loss": 3.569,
       "step": 21550
     },
     {
-      "epoch": 2.3248304811107525,
-      "grad_norm": 0.6125053763389587,
-      "learning_rate": 0.00046104514599719853,
-      "loss": 3.5496,
+      "epoch": 2.3288409703504045,
+      "grad_norm": 0.6022666096687317,
+      "learning_rate": 0.00046083108472746894,
+      "loss": 3.5624,
       "step": 21600
     },
     {
-      "epoch": 2.3302120331503606,
-      "grad_norm": 0.708257257938385,
-      "learning_rate": 0.0004607219049671371,
-      "loss": 3.594,
+      "epoch": 2.334231805929919,
+      "grad_norm": 0.7098419070243835,
+      "learning_rate": 0.0004605072854830005,
+      "loss": 3.563,
       "step": 21650
     },
     {
-      "epoch": 2.3355935851899687,
-      "grad_norm": 0.6216398477554321,
-      "learning_rate": 0.0004603986639370757,
-      "loss": 3.5749,
+      "epoch": 2.339622641509434,
+      "grad_norm": 0.6487218141555786,
+      "learning_rate": 0.0004601834862385321,
+      "loss": 3.5825,
       "step": 21700
     },
     {
-      "epoch": 2.340975137229577,
-      "grad_norm": 0.5760979056358337,
-      "learning_rate": 0.0004600754229070143,
-      "loss": 3.5546,
+      "epoch": 2.3450134770889486,
+      "grad_norm": 0.6314107179641724,
+      "learning_rate": 0.00045985968699406365,
+      "loss": 3.5701,
       "step": 21750
     },
     {
-      "epoch": 2.3463566892691854,
-      "grad_norm": 0.5885084867477417,
-      "learning_rate": 0.00045975218187695286,
-      "loss": 3.5654,
+      "epoch": 2.3504043126684637,
+      "grad_norm": 0.5793794989585876,
+      "learning_rate": 0.0004595358877495952,
+      "loss": 3.5527,
       "step": 21800
     },
     {
-      "epoch": 2.3517382413087935,
-      "grad_norm": 0.6174579858779907,
-      "learning_rate": 0.00045942894084689145,
-      "loss": 3.5543,
+      "epoch": 2.3557951482479784,
+      "grad_norm": 0.5867211222648621,
+      "learning_rate": 0.0004592120885051268,
+      "loss": 3.562,
       "step": 21850
     },
     {
-      "epoch": 2.3571197933484016,
-      "grad_norm": 0.6659450531005859,
-      "learning_rate": 0.0004591056998168301,
-      "loss": 3.553,
+      "epoch": 2.361185983827493,
+      "grad_norm": 0.5849786400794983,
+      "learning_rate": 0.0004588882892606583,
+      "loss": 3.555,
       "step": 21900
     },
     {
-      "epoch": 2.3625013453880097,
-      "grad_norm": 0.5789748430252075,
-      "learning_rate": 0.00045878245878676864,
-      "loss": 3.5611,
+      "epoch": 2.3665768194070083,
+      "grad_norm": 0.5810790657997131,
+      "learning_rate": 0.00045856449001618996,
+      "loss": 3.5578,
       "step": 21950
     },
     {
-      "epoch": 2.3678828974276183,
-      "grad_norm": 0.5881466865539551,
-      "learning_rate": 0.00045846568257730845,
-      "loss": 3.5664,
+      "epoch": 2.371967654986523,
+      "grad_norm": 0.5677943825721741,
+      "learning_rate": 0.00045824069077172146,
+      "loss": 3.5795,
       "step": 22000
     },
     {
-      "epoch": 2.3678828974276183,
-      "eval_accuracy": 0.36495016472321057,
-      "eval_loss": 3.5474905967712402,
-      "eval_runtime": 194.8012,
-      "eval_samples_per_second": 92.458,
-      "eval_steps_per_second": 5.78,
+      "epoch": 2.371967654986523,
+      "eval_accuracy": 0.3641010424049404,
+      "eval_loss": 3.556856393814087,
+      "eval_runtime": 152.9462,
+      "eval_samples_per_second": 117.76,
+      "eval_steps_per_second": 7.362,
       "step": 22000
     },
     {
-      "epoch": 2.3732644494672264,
-      "grad_norm": 0.6417264342308044,
-      "learning_rate": 0.00045814244154724704,
-      "loss": 3.5509,
+      "epoch": 2.3773584905660377,
+      "grad_norm": 0.5545417070388794,
+      "learning_rate": 0.00045791689152725306,
+      "loss": 3.5555,
       "step": 22050
     },
     {
-      "epoch": 2.3786460015068345,
-      "grad_norm": 0.6134101748466492,
-      "learning_rate": 0.00045781920051718563,
-      "loss": 3.5482,
+      "epoch": 2.382749326145553,
+      "grad_norm": 0.662285327911377,
+      "learning_rate": 0.0004575930922827846,
+      "loss": 3.5585,
       "step": 22100
     },
     {
-      "epoch": 2.384027553546443,
-      "grad_norm": 0.8174729943275452,
-      "learning_rate": 0.0004574959594871242,
-      "loss": 3.5538,
+      "epoch": 2.3881401617250675,
+      "grad_norm": 0.6141922473907471,
+      "learning_rate": 0.0004572692930383162,
+      "loss": 3.5602,
       "step": 22150
     },
     {
-      "epoch": 2.389409105586051,
-      "grad_norm": 0.6091681122779846,
-      "learning_rate": 0.00045717271845706277,
-      "loss": 3.5516,
+      "epoch": 2.393530997304582,
+      "grad_norm": 0.6011219024658203,
+      "learning_rate": 0.00045694549379384777,
+      "loss": 3.5705,
       "step": 22200
     },
     {
-      "epoch": 2.3947906576256592,
-      "grad_norm": 0.5839729905128479,
-      "learning_rate": 0.0004568494774270013,
-      "loss": 3.5545,
+      "epoch": 2.398921832884097,
+      "grad_norm": 0.5966909527778625,
+      "learning_rate": 0.0004566216945493793,
+      "loss": 3.575,
       "step": 22250
     },
     {
-      "epoch": 2.4001722096652673,
-      "grad_norm": 0.620452344417572,
-      "learning_rate": 0.00045652623639693996,
-      "loss": 3.5427,
+      "epoch": 2.404312668463612,
+      "grad_norm": 0.5801154971122742,
+      "learning_rate": 0.0004562978953049109,
+      "loss": 3.5767,
       "step": 22300
     },
     {
-      "epoch": 2.4055537617048754,
-      "grad_norm": 0.6439001560211182,
-      "learning_rate": 0.00045620299536687855,
-      "loss": 3.554,
+      "epoch": 2.4097035040431267,
+      "grad_norm": 0.5791796445846558,
+      "learning_rate": 0.0004559740960604425,
+      "loss": 3.558,
       "step": 22350
     },
     {
-      "epoch": 2.410935313744484,
-      "grad_norm": 0.6319786906242371,
-      "learning_rate": 0.0004558797543368171,
-      "loss": 3.5723,
+      "epoch": 2.4150943396226414,
+      "grad_norm": 0.6324838399887085,
+      "learning_rate": 0.0004556502968159741,
+      "loss": 3.5805,
       "step": 22400
     },
     {
-      "epoch": 2.416316865784092,
-      "grad_norm": 0.6055615544319153,
-      "learning_rate": 0.0004555565133067557,
-      "loss": 3.5568,
+      "epoch": 2.420485175202156,
+      "grad_norm": 0.589653491973877,
+      "learning_rate": 0.00045532649757150563,
+      "loss": 3.5696,
       "step": 22450
     },
     {
-      "epoch": 2.4216984178237,
-      "grad_norm": 0.6199345588684082,
-      "learning_rate": 0.0004552332722766943,
-      "loss": 3.549,
+      "epoch": 2.4258760107816713,
+      "grad_norm": 0.5994852781295776,
+      "learning_rate": 0.00045500269832703723,
+      "loss": 3.5624,
       "step": 22500
     },
     {
-      "epoch": 2.4270799698633088,
-      "grad_norm": 0.587843656539917,
-      "learning_rate": 0.0004549100312466328,
-      "loss": 3.5311,
+      "epoch": 2.431266846361186,
+      "grad_norm": 0.5957964062690735,
+      "learning_rate": 0.0004546788990825688,
+      "loss": 3.5596,
       "step": 22550
     },
     {
-      "epoch": 2.432461521902917,
-      "grad_norm": 0.5868039727210999,
-      "learning_rate": 0.0004545867902165715,
-      "loss": 3.5624,
+      "epoch": 2.4366576819407006,
+      "grad_norm": 0.5986453890800476,
+      "learning_rate": 0.0004543550998381004,
+      "loss": 3.56,
       "step": 22600
     },
     {
-      "epoch": 2.437843073942525,
-      "grad_norm": 0.5922572016716003,
-      "learning_rate": 0.00045426354918651007,
-      "loss": 3.5538,
+      "epoch": 2.442048517520216,
+      "grad_norm": 0.5685117244720459,
+      "learning_rate": 0.0004540313005936319,
+      "loss": 3.5473,
       "step": 22650
     },
     {
-      "epoch": 2.443224625982133,
-      "grad_norm": 0.6361838579177856,
-      "learning_rate": 0.0004539403081564486,
-      "loss": 3.5364,
+      "epoch": 2.4474393530997305,
+      "grad_norm": 0.6107739210128784,
+      "learning_rate": 0.00045370750134916344,
+      "loss": 3.5517,
       "step": 22700
     },
     {
-      "epoch": 2.4486061780217416,
-      "grad_norm": 0.5849791765213013,
-      "learning_rate": 0.0004536170671263872,
-      "loss": 3.5606,
+      "epoch": 2.452830188679245,
+      "grad_norm": 0.5900905132293701,
+      "learning_rate": 0.00045338370210469504,
+      "loss": 3.555,
       "step": 22750
     },
     {
-      "epoch": 2.4539877300613497,
-      "grad_norm": 0.6513404250144958,
-      "learning_rate": 0.00045329382609632574,
-      "loss": 3.535,
+      "epoch": 2.4582210242587603,
+      "grad_norm": 0.5761315822601318,
+      "learning_rate": 0.0004530599028602266,
+      "loss": 3.5514,
       "step": 22800
     },
     {
-      "epoch": 2.459369282100958,
-      "grad_norm": 0.5861430168151855,
-      "learning_rate": 0.0004529705850662644,
-      "loss": 3.5432,
+      "epoch": 2.463611859838275,
+      "grad_norm": 0.6041805744171143,
+      "learning_rate": 0.0004527361036157582,
+      "loss": 3.5594,
       "step": 22850
     },
     {
-      "epoch": 2.464750834140566,
-      "grad_norm": 0.6476746797561646,
-      "learning_rate": 0.000452647344036203,
-      "loss": 3.5454,
+      "epoch": 2.4690026954177897,
+      "grad_norm": 0.6323763132095337,
+      "learning_rate": 0.00045241230437128975,
+      "loss": 3.5682,
       "step": 22900
     },
     {
-      "epoch": 2.4701323861801745,
-      "grad_norm": 0.6115290522575378,
-      "learning_rate": 0.00045232410300614153,
-      "loss": 3.5439,
+      "epoch": 2.4743935309973044,
+      "grad_norm": 0.6081458926200867,
+      "learning_rate": 0.00045208850512682135,
+      "loss": 3.5714,
       "step": 22950
     },
     {
-      "epoch": 2.4755139382197826,
-      "grad_norm": 0.5903562903404236,
-      "learning_rate": 0.0004520008619760801,
-      "loss": 3.5464,
+      "epoch": 2.4797843665768196,
+      "grad_norm": 0.5616876482963562,
+      "learning_rate": 0.0004517647058823529,
+      "loss": 3.5451,
       "step": 23000
     },
     {
-      "epoch": 2.4755139382197826,
-      "eval_accuracy": 0.3655437354493411,
-      "eval_loss": 3.5417819023132324,
-      "eval_runtime": 206.588,
-      "eval_samples_per_second": 87.183,
-      "eval_steps_per_second": 5.45,
+      "epoch": 2.4797843665768196,
+      "eval_accuracy": 0.3651323756160755,
+      "eval_loss": 3.544189691543579,
+      "eval_runtime": 152.5592,
+      "eval_samples_per_second": 118.059,
+      "eval_steps_per_second": 7.381,
       "step": 23000
     },
     {
-      "epoch": 2.4808954902593907,
-      "grad_norm": 0.5942795276641846,
-      "learning_rate": 0.0004516776209460187,
-      "loss": 3.5293,
+      "epoch": 2.4851752021563343,
+      "grad_norm": 0.5997558832168579,
+      "learning_rate": 0.0004514409066378845,
+      "loss": 3.5474,
       "step": 23050
     },
     {
-      "epoch": 2.4862770422989993,
-      "grad_norm": 0.5559934973716736,
-      "learning_rate": 0.00045135437991595726,
-      "loss": 3.5569,
+      "epoch": 2.490566037735849,
+      "grad_norm": 0.5757705569267273,
+      "learning_rate": 0.00045111710739341606,
+      "loss": 3.5646,
       "step": 23100
     },
     {
-      "epoch": 2.4916585943386074,
-      "grad_norm": 0.6209527254104614,
-      "learning_rate": 0.0004510311388858959,
-      "loss": 3.5374,
+      "epoch": 2.4959568733153636,
+      "grad_norm": 0.5957797765731812,
+      "learning_rate": 0.0004507933081489476,
+      "loss": 3.5681,
       "step": 23150
     },
     {
-      "epoch": 2.4970401463782155,
-      "grad_norm": 0.5962895750999451,
-      "learning_rate": 0.0004507078978558345,
-      "loss": 3.5538,
+      "epoch": 2.501347708894879,
+      "grad_norm": 0.5993410348892212,
+      "learning_rate": 0.0004504695089044792,
+      "loss": 3.5585,
       "step": 23200
     },
     {
-      "epoch": 2.5024216984178236,
-      "grad_norm": 0.6051533222198486,
-      "learning_rate": 0.00045038465682577304,
-      "loss": 3.552,
+      "epoch": 2.5067385444743935,
+      "grad_norm": 0.5758466124534607,
+      "learning_rate": 0.0004501457096600107,
+      "loss": 3.5529,
       "step": 23250
     },
     {
-      "epoch": 2.5078032504574317,
-      "grad_norm": 0.5640018582344055,
-      "learning_rate": 0.00045006141579571164,
-      "loss": 3.5596,
+      "epoch": 2.512129380053908,
+      "grad_norm": 0.584588885307312,
+      "learning_rate": 0.00044982191041554237,
+      "loss": 3.5518,
       "step": 23300
     },
     {
-      "epoch": 2.5131848024970402,
-      "grad_norm": 0.6006429195404053,
-      "learning_rate": 0.0004497381747656502,
-      "loss": 3.5692,
+      "epoch": 2.5175202156334233,
+      "grad_norm": 0.6807150840759277,
+      "learning_rate": 0.00044949811117107386,
+      "loss": 3.5484,
       "step": 23350
     },
     {
-      "epoch": 2.5185663545366483,
-      "grad_norm": 0.6432710886001587,
-      "learning_rate": 0.00044941493373558877,
-      "loss": 3.5302,
+      "epoch": 2.522911051212938,
+      "grad_norm": 0.5969212055206299,
+      "learning_rate": 0.00044917431192660547,
+      "loss": 3.5403,
       "step": 23400
     },
     {
-      "epoch": 2.5239479065762565,
-      "grad_norm": 0.5585759282112122,
-      "learning_rate": 0.0004490916927055274,
-      "loss": 3.5559,
+      "epoch": 2.5283018867924527,
+      "grad_norm": 0.5543271899223328,
+      "learning_rate": 0.000448850512682137,
+      "loss": 3.5529,
       "step": 23450
     },
     {
-      "epoch": 2.529329458615865,
-      "grad_norm": 0.6355800032615662,
-      "learning_rate": 0.00044876845167546596,
-      "loss": 3.5332,
+      "epoch": 2.533692722371968,
+      "grad_norm": 0.5737673044204712,
+      "learning_rate": 0.0004485267134376686,
+      "loss": 3.5474,
       "step": 23500
     },
     {
-      "epoch": 2.534711010655473,
-      "grad_norm": 0.5865529179573059,
-      "learning_rate": 0.00044844521064540455,
-      "loss": 3.5558,
+      "epoch": 2.5390835579514826,
+      "grad_norm": 0.5664697885513306,
+      "learning_rate": 0.0004482029141932002,
+      "loss": 3.5604,
       "step": 23550
     },
     {
-      "epoch": 2.540092562695081,
-      "grad_norm": 0.6548492312431335,
-      "learning_rate": 0.00044812196961534315,
-      "loss": 3.5339,
+      "epoch": 2.5444743935309972,
+      "grad_norm": 0.5698988437652588,
+      "learning_rate": 0.0004478791149487317,
+      "loss": 3.5589,
       "step": 23600
     },
     {
-      "epoch": 2.5454741147346893,
-      "grad_norm": 0.6574897766113281,
-      "learning_rate": 0.0004477987285852817,
-      "loss": 3.5498,
+      "epoch": 2.5498652291105124,
+      "grad_norm": 0.6224414110183716,
+      "learning_rate": 0.00044755531570426333,
+      "loss": 3.5653,
       "step": 23650
     },
     {
-      "epoch": 2.550855666774298,
-      "grad_norm": 0.5814394950866699,
-      "learning_rate": 0.00044747548755522034,
-      "loss": 3.5552,
+      "epoch": 2.555256064690027,
+      "grad_norm": 0.585587203502655,
+      "learning_rate": 0.0004472315164597949,
+      "loss": 3.5702,
       "step": 23700
     },
     {
-      "epoch": 2.556237218813906,
-      "grad_norm": 0.5807026028633118,
-      "learning_rate": 0.00044715224652515893,
-      "loss": 3.5408,
+      "epoch": 2.560646900269542,
+      "grad_norm": 0.5666549205780029,
+      "learning_rate": 0.0004469077172153265,
+      "loss": 3.5777,
       "step": 23750
     },
     {
-      "epoch": 2.561618770853514,
-      "grad_norm": 0.6143361926078796,
-      "learning_rate": 0.0004468290054950975,
-      "loss": 3.5664,
+      "epoch": 2.5660377358490565,
+      "grad_norm": 0.7604691386222839,
+      "learning_rate": 0.00044658391797085804,
+      "loss": 3.5518,
       "step": 23800
     },
     {
-      "epoch": 2.567000322893122,
-      "grad_norm": 0.5572286248207092,
-      "learning_rate": 0.00044650576446503607,
-      "loss": 3.5545,
+      "epoch": 2.571428571428571,
+      "grad_norm": 0.608855128288269,
+      "learning_rate": 0.00044626011872638964,
+      "loss": 3.5655,
       "step": 23850
     },
     {
-      "epoch": 2.5723818749327307,
-      "grad_norm": 0.5634745955467224,
-      "learning_rate": 0.0004461825234349746,
-      "loss": 3.5302,
+      "epoch": 2.5768194070080863,
+      "grad_norm": 0.6249096989631653,
+      "learning_rate": 0.0004459363194819212,
+      "loss": 3.5501,
       "step": 23900
     },
     {
-      "epoch": 2.577763426972339,
-      "grad_norm": 0.5858882665634155,
-      "learning_rate": 0.00044586574722551447,
-      "loss": 3.5528,
+      "epoch": 2.582210242587601,
+      "grad_norm": 0.599327564239502,
+      "learning_rate": 0.0004456125202374527,
+      "loss": 3.5576,
       "step": 23950
     },
     {
-      "epoch": 2.583144979011947,
-      "grad_norm": 0.6945074200630188,
-      "learning_rate": 0.000445542506195453,
-      "loss": 3.5619,
+      "epoch": 2.5876010781671157,
+      "grad_norm": 0.6255910396575928,
+      "learning_rate": 0.0004452887209929843,
+      "loss": 3.5389,
       "step": 24000
     },
     {
-      "epoch": 2.583144979011947,
-      "eval_accuracy": 0.36655561979355733,
-      "eval_loss": 3.5325984954833984,
-      "eval_runtime": 195.3908,
-      "eval_samples_per_second": 92.179,
-      "eval_steps_per_second": 5.763,
+      "epoch": 2.5876010781671157,
+      "eval_accuracy": 0.36652791330716467,
+      "eval_loss": 3.534491777420044,
+      "eval_runtime": 152.998,
+      "eval_samples_per_second": 117.72,
+      "eval_steps_per_second": 7.36,
       "step": 24000
     },
     {
-      "epoch": 2.5885265310515555,
-      "grad_norm": 0.5496438145637512,
-      "learning_rate": 0.0004452192651653916,
-      "loss": 3.5522,
+      "epoch": 2.592991913746631,
+      "grad_norm": 0.5508790612220764,
+      "learning_rate": 0.00044496492174851584,
+      "loss": 3.5528,
       "step": 24050
     },
     {
-      "epoch": 2.5939080830911636,
-      "grad_norm": 0.5970087051391602,
-      "learning_rate": 0.00044489602413533025,
-      "loss": 3.5635,
+      "epoch": 2.5983827493261455,
+      "grad_norm": 0.7283141016960144,
+      "learning_rate": 0.00044464112250404745,
+      "loss": 3.5466,
       "step": 24100
     },
     {
-      "epoch": 2.5992896351307717,
-      "grad_norm": 0.6501139998435974,
-      "learning_rate": 0.0004445727831052688,
-      "loss": 3.533,
+      "epoch": 2.6037735849056602,
+      "grad_norm": 0.5894685983657837,
+      "learning_rate": 0.000444317323259579,
+      "loss": 3.5734,
       "step": 24150
     },
     {
-      "epoch": 2.60467118717038,
-      "grad_norm": 0.6015759110450745,
-      "learning_rate": 0.0004442495420752074,
-      "loss": 3.5451,
+      "epoch": 2.6091644204851754,
+      "grad_norm": 0.6347320079803467,
+      "learning_rate": 0.0004439935240151106,
+      "loss": 3.5372,
       "step": 24200
     },
     {
-      "epoch": 2.610052739209988,
-      "grad_norm": 0.5871186852455139,
-      "learning_rate": 0.00044392630104514593,
-      "loss": 3.5508,
+      "epoch": 2.61455525606469,
+      "grad_norm": 0.5959154367446899,
+      "learning_rate": 0.00044366972477064215,
+      "loss": 3.5464,
       "step": 24250
     },
     {
-      "epoch": 2.6154342912495965,
-      "grad_norm": 0.5801131725311279,
-      "learning_rate": 0.0004436030600150845,
-      "loss": 3.5321,
+      "epoch": 2.6199460916442048,
+      "grad_norm": 0.6151760220527649,
+      "learning_rate": 0.00044334592552617376,
+      "loss": 3.5566,
       "step": 24300
     },
     {
-      "epoch": 2.6208158432892046,
-      "grad_norm": 0.7078419327735901,
-      "learning_rate": 0.0004432798189850231,
-      "loss": 3.5526,
+      "epoch": 2.62533692722372,
+      "grad_norm": 0.5945620536804199,
+      "learning_rate": 0.0004430221262817053,
+      "loss": 3.5717,
       "step": 24350
     },
     {
-      "epoch": 2.6261973953288127,
-      "grad_norm": 0.6336981654167175,
-      "learning_rate": 0.0004429565779549617,
-      "loss": 3.553,
+      "epoch": 2.6307277628032346,
+      "grad_norm": 0.6333128809928894,
+      "learning_rate": 0.00044269832703723686,
+      "loss": 3.5343,
       "step": 24400
     },
     {
-      "epoch": 2.6315789473684212,
-      "grad_norm": 0.621759831905365,
-      "learning_rate": 0.0004426333369249003,
-      "loss": 3.5336,
+      "epoch": 2.6361185983827493,
+      "grad_norm": 0.5592067837715149,
+      "learning_rate": 0.00044237452779276846,
+      "loss": 3.5512,
       "step": 24450
     },
     {
-      "epoch": 2.6369604994080293,
-      "grad_norm": 0.5916746854782104,
-      "learning_rate": 0.0004423100958948389,
-      "loss": 3.5525,
+      "epoch": 2.641509433962264,
+      "grad_norm": 0.5705219507217407,
+      "learning_rate": 0.0004420507285483,
+      "loss": 3.5672,
       "step": 24500
     },
     {
-      "epoch": 2.6423420514476375,
-      "grad_norm": 0.6469952464103699,
-      "learning_rate": 0.00044198685486477744,
-      "loss": 3.5264,
+      "epoch": 2.6469002695417787,
+      "grad_norm": 0.597025454044342,
+      "learning_rate": 0.0004417269293038316,
+      "loss": 3.5344,
       "step": 24550
     },
     {
-      "epoch": 2.6477236034872456,
-      "grad_norm": 0.6205400824546814,
-      "learning_rate": 0.00044166361383471604,
-      "loss": 3.5513,
+      "epoch": 2.652291105121294,
+      "grad_norm": 0.5921151638031006,
+      "learning_rate": 0.0004414031300593631,
+      "loss": 3.54,
       "step": 24600
     },
     {
-      "epoch": 2.653105155526854,
-      "grad_norm": 0.6007969975471497,
-      "learning_rate": 0.0004413403728046547,
-      "loss": 3.5303,
+      "epoch": 2.6576819407008085,
+      "grad_norm": 0.5553827881813049,
+      "learning_rate": 0.0004410793308148948,
+      "loss": 3.5434,
       "step": 24650
     },
     {
-      "epoch": 2.658486707566462,
-      "grad_norm": 0.6586825847625732,
-      "learning_rate": 0.0004410171317745932,
-      "loss": 3.5627,
+      "epoch": 2.6630727762803232,
+      "grad_norm": 0.6193781495094299,
+      "learning_rate": 0.00044075553157042627,
+      "loss": 3.5474,
       "step": 24700
     },
     {
-      "epoch": 2.6638682596060703,
-      "grad_norm": 0.5926357507705688,
-      "learning_rate": 0.0004406938907445318,
-      "loss": 3.5663,
+      "epoch": 2.6684636118598384,
+      "grad_norm": 0.6883496642112732,
+      "learning_rate": 0.0004404317323259579,
+      "loss": 3.5466,
       "step": 24750
     },
     {
-      "epoch": 2.6692498116456784,
-      "grad_norm": 0.5873600840568542,
-      "learning_rate": 0.00044037064971447036,
-      "loss": 3.5241,
+      "epoch": 2.673854447439353,
+      "grad_norm": 0.5652745962142944,
+      "learning_rate": 0.0004401079330814894,
+      "loss": 3.5251,
       "step": 24800
     },
     {
-      "epoch": 2.674631363685287,
-      "grad_norm": 0.592735767364502,
-      "learning_rate": 0.00044004740868440896,
-      "loss": 3.5567,
+      "epoch": 2.6792452830188678,
+      "grad_norm": 0.545843780040741,
+      "learning_rate": 0.000439784133837021,
+      "loss": 3.5213,
       "step": 24850
     },
     {
-      "epoch": 2.680012915724895,
-      "grad_norm": 0.626936674118042,
-      "learning_rate": 0.00043972416765434755,
-      "loss": 3.5204,
+      "epoch": 2.684636118598383,
+      "grad_norm": 0.6006308197975159,
+      "learning_rate": 0.0004394603345925526,
+      "loss": 3.5627,
       "step": 24900
     },
     {
-      "epoch": 2.685394467764503,
-      "grad_norm": 0.5691058039665222,
-      "learning_rate": 0.00043940092662428615,
-      "loss": 3.5214,
+      "epoch": 2.6900269541778976,
+      "grad_norm": 0.6098518967628479,
+      "learning_rate": 0.00043913653534808413,
+      "loss": 3.5446,
       "step": 24950
     },
     {
-      "epoch": 2.6907760198041117,
-      "grad_norm": 0.6450164914131165,
-      "learning_rate": 0.00043907768559422474,
-      "loss": 3.5487,
+      "epoch": 2.6954177897574123,
+      "grad_norm": 0.5801330208778381,
+      "learning_rate": 0.00043881273610361574,
+      "loss": 3.5351,
       "step": 25000
     },
     {
-      "epoch": 2.6907760198041117,
-      "eval_accuracy": 0.36812326365894465,
-      "eval_loss": 3.521350145339966,
-      "eval_runtime": 197.2244,
-      "eval_samples_per_second": 91.322,
-      "eval_steps_per_second": 5.709,
+      "epoch": 2.6954177897574123,
+      "eval_accuracy": 0.36689037934091373,
+      "eval_loss": 3.5260772705078125,
+      "eval_runtime": 152.8924,
+      "eval_samples_per_second": 117.802,
+      "eval_steps_per_second": 7.365,
       "step": 25000
     },
     {
-      "epoch": 2.69615757184372,
-      "grad_norm": 0.6423863768577576,
-      "learning_rate": 0.00043875444456416334,
-      "loss": 3.5373,
+      "epoch": 2.7008086253369274,
+      "grad_norm": 0.595666229724884,
+      "learning_rate": 0.0004384889368591473,
+      "loss": 3.546,
       "step": 25050
     },
     {
-      "epoch": 2.701539123883328,
-      "grad_norm": 0.6400243043899536,
-      "learning_rate": 0.0004384312035341019,
-      "loss": 3.5428,
+      "epoch": 2.706199460916442,
+      "grad_norm": 0.5644325613975525,
+      "learning_rate": 0.0004381651376146789,
+      "loss": 3.5296,
       "step": 25100
     },
     {
-      "epoch": 2.706920675922936,
-      "grad_norm": 0.6252772212028503,
-      "learning_rate": 0.00043810796250404047,
-      "loss": 3.5469,
+      "epoch": 2.711590296495957,
+      "grad_norm": 0.587679386138916,
+      "learning_rate": 0.00043784133837021044,
+      "loss": 3.5217,
       "step": 25150
     },
     {
-      "epoch": 2.712302227962544,
-      "grad_norm": 0.5882490873336792,
-      "learning_rate": 0.000437784721473979,
-      "loss": 3.5376,
+      "epoch": 2.7169811320754715,
+      "grad_norm": 0.6090216040611267,
+      "learning_rate": 0.00043751753912574205,
+      "loss": 3.5267,
       "step": 25200
     },
     {
-      "epoch": 2.7176837800021527,
-      "grad_norm": 0.604751467704773,
-      "learning_rate": 0.00043746148044391766,
-      "loss": 3.5409,
+      "epoch": 2.7223719676549867,
+      "grad_norm": 0.6133639812469482,
+      "learning_rate": 0.0004371937398812736,
+      "loss": 3.5605,
       "step": 25250
     },
     {
-      "epoch": 2.723065332041761,
-      "grad_norm": 0.6292399168014526,
-      "learning_rate": 0.00043713823941385625,
-      "loss": 3.5313,
+      "epoch": 2.7277628032345014,
+      "grad_norm": 0.6062129735946655,
+      "learning_rate": 0.0004368699406368051,
+      "loss": 3.5186,
       "step": 25300
     },
     {
-      "epoch": 2.728446884081369,
-      "grad_norm": 0.6144165396690369,
-      "learning_rate": 0.0004368149983837948,
-      "loss": 3.5484,
+      "epoch": 2.733153638814016,
+      "grad_norm": 0.5763473510742188,
+      "learning_rate": 0.0004365461413923367,
+      "loss": 3.5433,
       "step": 25350
     },
     {
-      "epoch": 2.7338284361209775,
-      "grad_norm": 0.6086465716362,
-      "learning_rate": 0.0004364917573537334,
-      "loss": 3.5381,
+      "epoch": 2.7385444743935308,
+      "grad_norm": 0.6757517457008362,
+      "learning_rate": 0.00043622234214786825,
+      "loss": 3.5452,
       "step": 25400
     },
     {
-      "epoch": 2.7392099881605856,
-      "grad_norm": 0.6256065964698792,
-      "learning_rate": 0.00043616851632367193,
-      "loss": 3.557,
+      "epoch": 2.743935309973046,
+      "grad_norm": 0.5790356993675232,
+      "learning_rate": 0.00043589854290339985,
+      "loss": 3.5194,
       "step": 25450
     },
     {
-      "epoch": 2.7445915402001937,
-      "grad_norm": 0.6041662096977234,
-      "learning_rate": 0.0004358452752936106,
-      "loss": 3.5347,
+      "epoch": 2.7493261455525606,
+      "grad_norm": 0.6045161485671997,
+      "learning_rate": 0.0004355747436589314,
+      "loss": 3.5777,
       "step": 25500
     },
     {
-      "epoch": 2.749973092239802,
-      "grad_norm": 0.6548029780387878,
-      "learning_rate": 0.0004355220342635492,
-      "loss": 3.5374,
+      "epoch": 2.7547169811320753,
+      "grad_norm": 0.5605710744857788,
+      "learning_rate": 0.000435250944414463,
+      "loss": 3.5656,
       "step": 25550
     },
     {
-      "epoch": 2.7553546442794103,
-      "grad_norm": 0.6124310493469238,
-      "learning_rate": 0.00043519879323348777,
-      "loss": 3.529,
+      "epoch": 2.7601078167115904,
+      "grad_norm": 0.6743922233581543,
+      "learning_rate": 0.00043492714516999456,
+      "loss": 3.5443,
       "step": 25600
     },
     {
-      "epoch": 2.7607361963190185,
-      "grad_norm": 0.596762478351593,
-      "learning_rate": 0.0004348755522034263,
-      "loss": 3.5273,
+      "epoch": 2.765498652291105,
+      "grad_norm": 0.6153864860534668,
+      "learning_rate": 0.0004346033459255261,
+      "loss": 3.5308,
       "step": 25650
     },
     {
-      "epoch": 2.7661177483586266,
-      "grad_norm": 0.5779131650924683,
-      "learning_rate": 0.0004345523111733649,
-      "loss": 3.5448,
+      "epoch": 2.77088948787062,
+      "grad_norm": 0.5993860960006714,
+      "learning_rate": 0.0004342795466810577,
+      "loss": 3.5506,
       "step": 25700
     },
     {
-      "epoch": 2.7714993003982347,
-      "grad_norm": 0.6550736427307129,
-      "learning_rate": 0.00043422907014330344,
-      "loss": 3.5405,
+      "epoch": 2.776280323450135,
+      "grad_norm": 0.622205376625061,
+      "learning_rate": 0.00043395574743658927,
+      "loss": 3.5411,
       "step": 25750
     },
     {
-      "epoch": 2.776880852437843,
-      "grad_norm": 0.7348501682281494,
-      "learning_rate": 0.0004339058291132421,
-      "loss": 3.5329,
+      "epoch": 2.7816711590296497,
+      "grad_norm": 0.600141167640686,
+      "learning_rate": 0.00043363194819212087,
+      "loss": 3.5421,
       "step": 25800
     },
     {
-      "epoch": 2.7822624044774513,
-      "grad_norm": 0.6231963038444519,
-      "learning_rate": 0.0004335825880831807,
-      "loss": 3.5465,
+      "epoch": 2.7870619946091644,
+      "grad_norm": 0.605973482131958,
+      "learning_rate": 0.0004333081489476524,
+      "loss": 3.5291,
       "step": 25850
     },
     {
-      "epoch": 2.7876439565170594,
-      "grad_norm": 0.5558022856712341,
-      "learning_rate": 0.00043325934705311923,
-      "loss": 3.5381,
+      "epoch": 2.7924528301886795,
+      "grad_norm": 0.6000856757164001,
+      "learning_rate": 0.000432984349703184,
+      "loss": 3.5385,
       "step": 25900
     },
     {
-      "epoch": 2.793025508556668,
-      "grad_norm": 0.6114339232444763,
-      "learning_rate": 0.0004329361060230578,
-      "loss": 3.5492,
+      "epoch": 2.797843665768194,
+      "grad_norm": 0.5753637552261353,
+      "learning_rate": 0.0004326605504587155,
+      "loss": 3.5278,
       "step": 25950
     },
     {
-      "epoch": 2.798407060596276,
-      "grad_norm": 0.621716320514679,
-      "learning_rate": 0.00043261286499299636,
-      "loss": 3.5277,
+      "epoch": 2.803234501347709,
+      "grad_norm": 0.6122384071350098,
+      "learning_rate": 0.0004323367512142472,
+      "loss": 3.5397,
       "step": 26000
     },
     {
-      "epoch": 2.798407060596276,
-      "eval_accuracy": 0.36862752171129165,
-      "eval_loss": 3.51253342628479,
-      "eval_runtime": 212.7222,
-      "eval_samples_per_second": 84.669,
-      "eval_steps_per_second": 5.293,
+      "epoch": 2.803234501347709,
+      "eval_accuracy": 0.3679035675197838,
+      "eval_loss": 3.5164499282836914,
+      "eval_runtime": 152.7141,
+      "eval_samples_per_second": 117.939,
+      "eval_steps_per_second": 7.373,
       "step": 26000
     },
     {
-      "epoch": 2.803788612635884,
-      "grad_norm": 0.6198678016662598,
-      "learning_rate": 0.00043228962396293496,
-      "loss": 3.5241,
+      "epoch": 2.8086253369272236,
+      "grad_norm": 0.6394927501678467,
+      "learning_rate": 0.0004320129519697787,
+      "loss": 3.5372,
       "step": 26050
     },
     {
-      "epoch": 2.8091701646754923,
-      "grad_norm": 0.5953556299209595,
-      "learning_rate": 0.0004319663829328736,
-      "loss": 3.5257,
+      "epoch": 2.8140161725067383,
+      "grad_norm": 0.6251195073127747,
+      "learning_rate": 0.00043168915272531023,
+      "loss": 3.5121,
       "step": 26100
     },
     {
-      "epoch": 2.8145517167151004,
-      "grad_norm": 0.6188264489173889,
-      "learning_rate": 0.00043164314190281215,
-      "loss": 3.5268,
+      "epoch": 2.8194070080862534,
+      "grad_norm": 0.6234621405601501,
+      "learning_rate": 0.00043136535348084183,
+      "loss": 3.5139,
       "step": 26150
     },
     {
-      "epoch": 2.819933268754709,
-      "grad_norm": 0.5980226993560791,
-      "learning_rate": 0.00043131990087275074,
-      "loss": 3.5478,
+      "epoch": 2.824797843665768,
+      "grad_norm": 0.6286827325820923,
+      "learning_rate": 0.0004310415542363734,
+      "loss": 3.5389,
       "step": 26200
     },
     {
-      "epoch": 2.825314820794317,
-      "grad_norm": 0.6212344169616699,
-      "learning_rate": 0.00043099665984268934,
-      "loss": 3.5297,
+      "epoch": 2.830188679245283,
+      "grad_norm": 0.615374743938446,
+      "learning_rate": 0.000430717754991905,
+      "loss": 3.5499,
       "step": 26250
     },
     {
-      "epoch": 2.830696372833925,
-      "grad_norm": 0.5705240964889526,
-      "learning_rate": 0.0004306734188126279,
-      "loss": 3.5324,
+      "epoch": 2.835579514824798,
+      "grad_norm": 0.5852181911468506,
+      "learning_rate": 0.00043039395574743654,
+      "loss": 3.5664,
       "step": 26300
     },
     {
-      "epoch": 2.8360779248735337,
-      "grad_norm": 0.6547979712486267,
-      "learning_rate": 0.00043035017778256647,
-      "loss": 3.533,
+      "epoch": 2.8409703504043127,
+      "grad_norm": 0.7029324173927307,
+      "learning_rate": 0.00043007015650296814,
+      "loss": 3.5374,
       "step": 26350
     },
     {
-      "epoch": 2.841459476913142,
-      "grad_norm": 0.5784933567047119,
-      "learning_rate": 0.0004300269367525051,
-      "loss": 3.5297,
+      "epoch": 2.8463611859838274,
+      "grad_norm": 0.612174391746521,
+      "learning_rate": 0.0004297463572584997,
+      "loss": 3.5438,
       "step": 26400
     },
     {
-      "epoch": 2.84684102895275,
-      "grad_norm": 0.7146517038345337,
-      "learning_rate": 0.00042970369572244366,
-      "loss": 3.5382,
+      "epoch": 2.8517520215633425,
+      "grad_norm": 0.6446730494499207,
+      "learning_rate": 0.0004294225580140313,
+      "loss": 3.5538,
       "step": 26450
     },
     {
-      "epoch": 2.852222580992358,
-      "grad_norm": 0.6623178124427795,
-      "learning_rate": 0.00042938045469238226,
-      "loss": 3.5374,
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.6072059273719788,
+      "learning_rate": 0.00042909875876956285,
+      "loss": 3.5238,
       "step": 26500
     },
     {
-      "epoch": 2.857604133031966,
-      "grad_norm": 0.5954160094261169,
-      "learning_rate": 0.0004290572136623208,
-      "loss": 3.5509,
+      "epoch": 2.862533692722372,
+      "grad_norm": 0.6181698441505432,
+      "learning_rate": 0.0004287749595250944,
+      "loss": 3.5385,
       "step": 26550
     },
     {
-      "epoch": 2.8629856850715747,
-      "grad_norm": 0.6484636068344116,
-      "learning_rate": 0.0004287339726322594,
-      "loss": 3.5399,
+      "epoch": 2.867924528301887,
+      "grad_norm": 0.6247893571853638,
+      "learning_rate": 0.000428451160280626,
+      "loss": 3.5481,
       "step": 26600
     },
     {
-      "epoch": 2.868367237111183,
-      "grad_norm": 0.6219885349273682,
-      "learning_rate": 0.00042841073160219804,
-      "loss": 3.5334,
+      "epoch": 2.8733153638814017,
+      "grad_norm": 0.612091064453125,
+      "learning_rate": 0.0004281273610361575,
+      "loss": 3.5525,
       "step": 26650
     },
     {
-      "epoch": 2.873748789150791,
-      "grad_norm": 0.6355128288269043,
-      "learning_rate": 0.0004280874905721366,
-      "loss": 3.5135,
+      "epoch": 2.8787061994609164,
+      "grad_norm": 0.6002130508422852,
+      "learning_rate": 0.0004278035617916891,
+      "loss": 3.5394,
       "step": 26700
     },
     {
-      "epoch": 2.8791303411903995,
-      "grad_norm": 0.5998795032501221,
-      "learning_rate": 0.0004277642495420752,
-      "loss": 3.5442,
+      "epoch": 2.884097035040431,
+      "grad_norm": 0.6247308850288391,
+      "learning_rate": 0.00042747976254722066,
+      "loss": 3.5506,
       "step": 26750
     },
     {
-      "epoch": 2.8845118932300076,
-      "grad_norm": 0.5889739394187927,
-      "learning_rate": 0.00042744100851201377,
-      "loss": 3.5133,
+      "epoch": 2.889487870619946,
+      "grad_norm": 0.5961256623268127,
+      "learning_rate": 0.00042715596330275226,
+      "loss": 3.5267,
       "step": 26800
     },
     {
-      "epoch": 2.8898934452696157,
-      "grad_norm": 0.6281293630599976,
-      "learning_rate": 0.0004271177674819523,
-      "loss": 3.5056,
+      "epoch": 2.894878706199461,
+      "grad_norm": 0.620616614818573,
+      "learning_rate": 0.0004268321640582838,
+      "loss": 3.5502,
       "step": 26850
     },
     {
-      "epoch": 2.895274997309224,
-      "grad_norm": 0.5909113883972168,
-      "learning_rate": 0.0004267945264518909,
-      "loss": 3.5157,
+      "epoch": 2.9002695417789757,
+      "grad_norm": 0.6439360976219177,
+      "learning_rate": 0.0004265083648138154,
+      "loss": 3.5179,
       "step": 26900
     },
     {
-      "epoch": 2.9006565493488323,
-      "grad_norm": 0.6297270059585571,
-      "learning_rate": 0.00042647128542182955,
-      "loss": 3.5354,
+      "epoch": 2.9056603773584904,
+      "grad_norm": 0.6051164865493774,
+      "learning_rate": 0.00042618456556934697,
+      "loss": 3.5367,
       "step": 26950
     },
     {
-      "epoch": 2.9060381013884404,
-      "grad_norm": 0.6224883794784546,
-      "learning_rate": 0.0004261480443917681,
-      "loss": 3.5346,
+      "epoch": 2.9110512129380055,
+      "grad_norm": 0.6090266704559326,
+      "learning_rate": 0.0004258607663248785,
+      "loss": 3.5532,
       "step": 27000
     },
     {
-      "epoch": 2.9060381013884404,
-      "eval_accuracy": 0.3694928333098462,
-      "eval_loss": 3.5037596225738525,
-      "eval_runtime": 206.2343,
-      "eval_samples_per_second": 87.333,
-      "eval_steps_per_second": 5.46,
+      "epoch": 2.9110512129380055,
+      "eval_accuracy": 0.36900107033959784,
+      "eval_loss": 3.5088884830474854,
+      "eval_runtime": 152.765,
+      "eval_samples_per_second": 117.9,
+      "eval_steps_per_second": 7.371,
       "step": 27000
     },
     {
-      "epoch": 2.9114196534280485,
-      "grad_norm": 0.6158651113510132,
-      "learning_rate": 0.00042583126818230795,
-      "loss": 3.5352,
+      "epoch": 2.91644204851752,
+      "grad_norm": 0.6238779425621033,
+      "learning_rate": 0.0004255369670804101,
+      "loss": 3.5368,
       "step": 27050
     },
     {
-      "epoch": 2.9168012054676566,
-      "grad_norm": 0.6259952187538147,
-      "learning_rate": 0.0004255080271522465,
-      "loss": 3.5156,
+      "epoch": 2.921832884097035,
+      "grad_norm": 0.6020278334617615,
+      "learning_rate": 0.0004252131678359417,
+      "loss": 3.541,
       "step": 27100
     },
     {
-      "epoch": 2.922182757507265,
-      "grad_norm": 0.5961571335792542,
-      "learning_rate": 0.0004251847861221851,
+      "epoch": 2.92722371967655,
+      "grad_norm": 0.689791738986969,
+      "learning_rate": 0.0004248893685914733,
       "loss": 3.5362,
       "step": 27150
     },
     {
-      "epoch": 2.9275643095468733,
-      "grad_norm": 0.62592613697052,
-      "learning_rate": 0.00042486154509212363,
-      "loss": 3.5094,
+      "epoch": 2.9326145552560647,
+      "grad_norm": 0.644339382648468,
+      "learning_rate": 0.00042456556934700483,
+      "loss": 3.5235,
       "step": 27200
     },
     {
-      "epoch": 2.9329458615864814,
-      "grad_norm": 0.5974050760269165,
-      "learning_rate": 0.0004245383040620622,
-      "loss": 3.5258,
+      "epoch": 2.9380053908355794,
+      "grad_norm": 0.6350017786026001,
+      "learning_rate": 0.00042424177010253643,
+      "loss": 3.5371,
       "step": 27250
     },
     {
-      "epoch": 2.93832741362609,
-      "grad_norm": 0.5732911229133606,
-      "learning_rate": 0.0004242150630320009,
-      "loss": 3.5374,
+      "epoch": 2.9433962264150946,
+      "grad_norm": 0.5984843969345093,
+      "learning_rate": 0.00042391797085806793,
+      "loss": 3.5414,
       "step": 27300
     },
     {
-      "epoch": 2.943708965665698,
-      "grad_norm": 0.6453855633735657,
-      "learning_rate": 0.0004238918220019394,
-      "loss": 3.527,
+      "epoch": 2.9487870619946093,
+      "grad_norm": 0.5899128913879395,
+      "learning_rate": 0.0004235941716135995,
+      "loss": 3.5269,
       "step": 27350
     },
     {
-      "epoch": 2.949090517705306,
-      "grad_norm": 0.5842043161392212,
-      "learning_rate": 0.000423568580971878,
-      "loss": 3.5214,
+      "epoch": 2.954177897574124,
+      "grad_norm": 0.6649240851402283,
+      "learning_rate": 0.0004232703723691311,
+      "loss": 3.5181,
       "step": 27400
     },
     {
-      "epoch": 2.9544720697449143,
-      "grad_norm": 0.6021690964698792,
-      "learning_rate": 0.00042324533994181655,
-      "loss": 3.5026,
+      "epoch": 2.9595687331536387,
+      "grad_norm": 0.5655061602592468,
+      "learning_rate": 0.00042294657312466264,
+      "loss": 3.5257,
       "step": 27450
     },
     {
-      "epoch": 2.9598536217845224,
-      "grad_norm": 0.6042524576187134,
-      "learning_rate": 0.00042292209891175514,
-      "loss": 3.5169,
+      "epoch": 2.964959568733154,
+      "grad_norm": 0.6041932702064514,
+      "learning_rate": 0.00042262277388019424,
+      "loss": 3.535,
       "step": 27500
     },
     {
-      "epoch": 2.965235173824131,
-      "grad_norm": 0.6471766829490662,
-      "learning_rate": 0.00042259885788169374,
-      "loss": 3.5289,
+      "epoch": 2.9703504043126685,
+      "grad_norm": 0.5469972491264343,
+      "learning_rate": 0.0004222989746357258,
+      "loss": 3.524,
       "step": 27550
     },
     {
-      "epoch": 2.970616725863739,
-      "grad_norm": 0.6328997611999512,
-      "learning_rate": 0.00042227561685163233,
-      "loss": 3.521,
+      "epoch": 2.975741239892183,
+      "grad_norm": 0.5674824714660645,
+      "learning_rate": 0.0004219751753912574,
+      "loss": 3.5333,
       "step": 27600
     },
     {
-      "epoch": 2.975998277903347,
-      "grad_norm": 0.6381612420082092,
-      "learning_rate": 0.00042195237582157093,
-      "loss": 3.5258,
+      "epoch": 2.981132075471698,
+      "grad_norm": 0.5904396772384644,
+      "learning_rate": 0.00042165785213167835,
+      "loss": 3.5118,
       "step": 27650
     },
     {
-      "epoch": 2.9813798299429557,
-      "grad_norm": 0.6035014986991882,
-      "learning_rate": 0.0004216291347915095,
-      "loss": 3.5295,
+      "epoch": 2.986522911051213,
+      "grad_norm": 0.6024655699729919,
+      "learning_rate": 0.0004213340528872099,
+      "loss": 3.5224,
       "step": 27700
     },
     {
-      "epoch": 2.986761381982564,
-      "grad_norm": 0.6743372678756714,
-      "learning_rate": 0.00042130589376144806,
-      "loss": 3.5261,
+      "epoch": 2.9919137466307277,
+      "grad_norm": 0.6454286575317383,
+      "learning_rate": 0.00042101025364274145,
+      "loss": 3.5298,
       "step": 27750
     },
     {
-      "epoch": 2.992142934022172,
-      "grad_norm": 0.6017929315567017,
-      "learning_rate": 0.00042098265273138666,
-      "loss": 3.5172,
+      "epoch": 2.9973045822102424,
+      "grad_norm": 0.6410738229751587,
+      "learning_rate": 0.00042068645439827305,
+      "loss": 3.5312,
       "step": 27800
     },
     {
-      "epoch": 2.9975244860617805,
-      "grad_norm": 0.6554933786392212,
-      "learning_rate": 0.0004206594117013252,
-      "loss": 3.5245,
+      "epoch": 3.0026954177897576,
+      "grad_norm": 0.6354919672012329,
+      "learning_rate": 0.0004203626551538046,
+      "loss": 3.4957,
       "step": 27850
     },
     {
-      "epoch": 3.0029060381013886,
-      "grad_norm": 0.6437206864356995,
-      "learning_rate": 0.00042033617067126385,
-      "loss": 3.4742,
+      "epoch": 3.0080862533692723,
+      "grad_norm": 0.6528053283691406,
+      "learning_rate": 0.0004200388559093362,
+      "loss": 3.4367,
       "step": 27900
     },
     {
-      "epoch": 3.0082875901409967,
-      "grad_norm": 0.6490093469619751,
-      "learning_rate": 0.00042001292964120244,
-      "loss": 3.4372,
+      "epoch": 3.013477088948787,
+      "grad_norm": 0.5901045799255371,
+      "learning_rate": 0.0004197150566648677,
+      "loss": 3.4398,
       "step": 27950
     },
     {
-      "epoch": 3.0136691421806048,
-      "grad_norm": 0.6507807374000549,
-      "learning_rate": 0.000419689688611141,
-      "loss": 3.4461,
+      "epoch": 3.018867924528302,
+      "grad_norm": 0.610563337802887,
+      "learning_rate": 0.00041939125742039936,
+      "loss": 3.4533,
       "step": 28000
     },
     {
-      "epoch": 3.0136691421806048,
-      "eval_accuracy": 0.3706709565724186,
-      "eval_loss": 3.497730016708374,
-      "eval_runtime": 215.1239,
-      "eval_samples_per_second": 83.724,
-      "eval_steps_per_second": 5.234,
+      "epoch": 3.018867924528302,
+      "eval_accuracy": 0.36950989181323307,
+      "eval_loss": 3.5048389434814453,
+      "eval_runtime": 151.8303,
+      "eval_samples_per_second": 118.626,
+      "eval_steps_per_second": 7.416,
       "step": 28000
     },
     {
-      "epoch": 3.0190506942202133,
-      "grad_norm": 0.6191187500953674,
-      "learning_rate": 0.0004193664475810796,
-      "loss": 3.4181,
+      "epoch": 3.024258760107817,
+      "grad_norm": 0.648098886013031,
+      "learning_rate": 0.00041906745817593086,
+      "loss": 3.4386,
       "step": 28050
     },
     {
-      "epoch": 3.0244322462598214,
-      "grad_norm": 0.62087082862854,
-      "learning_rate": 0.00041904320655101817,
-      "loss": 3.4181,
+      "epoch": 3.0296495956873315,
+      "grad_norm": 0.662765383720398,
+      "learning_rate": 0.00041874365893146247,
+      "loss": 3.4337,
       "step": 28100
     },
     {
-      "epoch": 3.0298137982994295,
-      "grad_norm": 0.6341911554336548,
-      "learning_rate": 0.0004187199655209567,
-      "loss": 3.4286,
+      "epoch": 3.035040431266846,
+      "grad_norm": 0.5984097719192505,
+      "learning_rate": 0.000418419859686994,
+      "loss": 3.4747,
       "step": 28150
     },
     {
-      "epoch": 3.0351953503390376,
-      "grad_norm": 0.6108666062355042,
-      "learning_rate": 0.00041839672449089536,
-      "loss": 3.4278,
+      "epoch": 3.0404312668463613,
+      "grad_norm": 0.593792200088501,
+      "learning_rate": 0.00041809606044252557,
+      "loss": 3.4557,
       "step": 28200
     },
     {
-      "epoch": 3.040576902378646,
-      "grad_norm": 0.5951593518257141,
-      "learning_rate": 0.00041807348346083395,
-      "loss": 3.441,
+      "epoch": 3.045822102425876,
+      "grad_norm": 0.619614839553833,
+      "learning_rate": 0.00041777226119805717,
+      "loss": 3.4661,
       "step": 28250
     },
     {
-      "epoch": 3.0459584544182543,
-      "grad_norm": 0.6405830383300781,
-      "learning_rate": 0.0004177502424307725,
-      "loss": 3.4542,
+      "epoch": 3.0512129380053907,
+      "grad_norm": 0.650425910949707,
+      "learning_rate": 0.0004174484619535887,
+      "loss": 3.4621,
       "step": 28300
     },
     {
-      "epoch": 3.0513400064578624,
-      "grad_norm": 0.6463958024978638,
-      "learning_rate": 0.0004174270014007111,
-      "loss": 3.4252,
+      "epoch": 3.056603773584906,
+      "grad_norm": 0.6080370545387268,
+      "learning_rate": 0.00041712466270912033,
+      "loss": 3.4525,
       "step": 28350
     },
     {
-      "epoch": 3.0567215584974705,
-      "grad_norm": 0.5980671048164368,
-      "learning_rate": 0.00041710376037064963,
-      "loss": 3.4458,
+      "epoch": 3.0619946091644206,
+      "grad_norm": 0.604832649230957,
+      "learning_rate": 0.0004168008634646519,
+      "loss": 3.4284,
       "step": 28400
     },
     {
-      "epoch": 3.062103110537079,
-      "grad_norm": 0.6921752691268921,
-      "learning_rate": 0.0004167805193405883,
-      "loss": 3.4406,
+      "epoch": 3.0673854447439353,
+      "grad_norm": 0.6373007893562317,
+      "learning_rate": 0.0004164770642201835,
+      "loss": 3.4731,
       "step": 28450
     },
     {
-      "epoch": 3.067484662576687,
-      "grad_norm": 0.6020042896270752,
-      "learning_rate": 0.0004164572783105269,
-      "loss": 3.4446,
+      "epoch": 3.07277628032345,
+      "grad_norm": 0.6558822989463806,
+      "learning_rate": 0.00041615326497571503,
+      "loss": 3.4663,
       "step": 28500
     },
     {
-      "epoch": 3.0728662146162953,
-      "grad_norm": 0.6640591025352478,
-      "learning_rate": 0.0004161340372804654,
-      "loss": 3.4635,
+      "epoch": 3.078167115902965,
+      "grad_norm": 0.6558163166046143,
+      "learning_rate": 0.0004158294657312466,
+      "loss": 3.4533,
       "step": 28550
     },
     {
-      "epoch": 3.0782477666559034,
-      "grad_norm": 0.6354085206985474,
-      "learning_rate": 0.000415810796250404,
-      "loss": 3.4555,
+      "epoch": 3.08355795148248,
+      "grad_norm": 0.6458030343055725,
+      "learning_rate": 0.0004155056664867782,
+      "loss": 3.4506,
       "step": 28600
     },
     {
-      "epoch": 3.083629318695512,
-      "grad_norm": 0.636901319026947,
-      "learning_rate": 0.0004154875552203426,
-      "loss": 3.4515,
+      "epoch": 3.0889487870619945,
+      "grad_norm": 0.665330708026886,
+      "learning_rate": 0.0004151818672423097,
+      "loss": 3.4485,
       "step": 28650
     },
     {
-      "epoch": 3.08901087073512,
-      "grad_norm": 0.6186822056770325,
-      "learning_rate": 0.00041516431419028114,
-      "loss": 3.4345,
+      "epoch": 3.0943396226415096,
+      "grad_norm": 0.614852249622345,
+      "learning_rate": 0.0004148580679978413,
+      "loss": 3.4589,
       "step": 28700
     },
     {
-      "epoch": 3.094392422774728,
-      "grad_norm": 0.6416160464286804,
-      "learning_rate": 0.0004148410731602198,
-      "loss": 3.45,
+      "epoch": 3.0997304582210243,
+      "grad_norm": 0.6223500370979309,
+      "learning_rate": 0.00041453426875337284,
+      "loss": 3.4486,
       "step": 28750
     },
     {
-      "epoch": 3.0997739748143363,
-      "grad_norm": 0.6019704341888428,
-      "learning_rate": 0.0004145178321301584,
-      "loss": 3.4452,
+      "epoch": 3.105121293800539,
+      "grad_norm": 0.5892756581306458,
+      "learning_rate": 0.00041421046950890445,
+      "loss": 3.4498,
       "step": 28800
     },
     {
-      "epoch": 3.105155526853945,
-      "grad_norm": 0.6140844225883484,
-      "learning_rate": 0.00041419459110009693,
-      "loss": 3.4474,
+      "epoch": 3.1105121293800537,
+      "grad_norm": 0.6175945401191711,
+      "learning_rate": 0.000413886670264436,
+      "loss": 3.4339,
       "step": 28850
     },
     {
-      "epoch": 3.110537078893553,
-      "grad_norm": 0.6584010720252991,
-      "learning_rate": 0.0004138713500700355,
-      "loss": 3.442,
+      "epoch": 3.115902964959569,
+      "grad_norm": 0.5722609758377075,
+      "learning_rate": 0.0004135628710199676,
+      "loss": 3.4505,
       "step": 28900
     },
     {
-      "epoch": 3.115918630933161,
-      "grad_norm": 0.6763238906860352,
-      "learning_rate": 0.00041354810903997406,
-      "loss": 3.4475,
+      "epoch": 3.1212938005390836,
+      "grad_norm": 0.6651545763015747,
+      "learning_rate": 0.00041323907177549915,
+      "loss": 3.4691,
       "step": 28950
     },
     {
-      "epoch": 3.121300182972769,
-      "grad_norm": 0.6160973906517029,
-      "learning_rate": 0.00041322486800991266,
-      "loss": 3.4491,
+      "epoch": 3.1266846361185983,
+      "grad_norm": 0.628146231174469,
+      "learning_rate": 0.0004129152725310307,
+      "loss": 3.4393,
       "step": 29000
     },
     {
-      "epoch": 3.121300182972769,
-      "eval_accuracy": 0.3712210661434233,
-      "eval_loss": 3.494593858718872,
-      "eval_runtime": 209.8562,
-      "eval_samples_per_second": 85.825,
-      "eval_steps_per_second": 5.366,
+      "epoch": 3.1266846361185983,
+      "eval_accuracy": 0.3704548459785556,
+      "eval_loss": 3.5026659965515137,
+      "eval_runtime": 151.9804,
+      "eval_samples_per_second": 118.509,
+      "eval_steps_per_second": 7.409,
       "step": 29000
     },
     {
-      "epoch": 3.1266817350123777,
-      "grad_norm": 0.6309645771980286,
-      "learning_rate": 0.0004129016269798513,
-      "loss": 3.452,
+      "epoch": 3.1320754716981134,
+      "grad_norm": 0.653403103351593,
+      "learning_rate": 0.0004125914732865623,
+      "loss": 3.4586,
       "step": 29050
     },
     {
-      "epoch": 3.132063287051986,
-      "grad_norm": 0.616537868976593,
-      "learning_rate": 0.00041257838594978985,
-      "loss": 3.4708,
+      "epoch": 3.137466307277628,
+      "grad_norm": 0.6372281908988953,
+      "learning_rate": 0.00041226767404209386,
+      "loss": 3.4495,
       "step": 29100
     },
     {
-      "epoch": 3.137444839091594,
-      "grad_norm": 0.6185223460197449,
-      "learning_rate": 0.00041225514491972844,
-      "loss": 3.455,
+      "epoch": 3.142857142857143,
+      "grad_norm": 0.6290000081062317,
+      "learning_rate": 0.00041194387479762546,
+      "loss": 3.4673,
       "step": 29150
     },
     {
-      "epoch": 3.1428263911312024,
-      "grad_norm": 0.6138353943824768,
-      "learning_rate": 0.00041193190388966704,
-      "loss": 3.4406,
+      "epoch": 3.1482479784366575,
+      "grad_norm": 0.570479691028595,
+      "learning_rate": 0.000411620075553157,
+      "loss": 3.4668,
       "step": 29200
     },
     {
-      "epoch": 3.1482079431708105,
-      "grad_norm": 0.608535647392273,
-      "learning_rate": 0.0004116086628596056,
-      "loss": 3.4461,
+      "epoch": 3.1536388140161726,
+      "grad_norm": 0.5886173248291016,
+      "learning_rate": 0.0004112962763086886,
+      "loss": 3.4534,
       "step": 29250
     },
     {
-      "epoch": 3.1535894952104186,
-      "grad_norm": 0.6112627387046814,
-      "learning_rate": 0.0004112854218295442,
-      "loss": 3.4395,
+      "epoch": 3.1590296495956873,
+      "grad_norm": 0.5947257280349731,
+      "learning_rate": 0.0004109724770642201,
+      "loss": 3.4598,
       "step": 29300
     },
     {
-      "epoch": 3.1589710472500268,
-      "grad_norm": 0.5768312811851501,
-      "learning_rate": 0.0004109621807994828,
-      "loss": 3.4302,
+      "epoch": 3.164420485175202,
+      "grad_norm": 0.6352401971817017,
+      "learning_rate": 0.00041064867781975177,
+      "loss": 3.4734,
       "step": 29350
     },
     {
-      "epoch": 3.1643525992896353,
-      "grad_norm": 0.6325687170028687,
-      "learning_rate": 0.00041063893976942136,
-      "loss": 3.4465,
+      "epoch": 3.169811320754717,
+      "grad_norm": 0.5621167421340942,
+      "learning_rate": 0.00041032487857528327,
+      "loss": 3.4683,
       "step": 29400
     },
     {
-      "epoch": 3.1697341513292434,
-      "grad_norm": 0.6342477202415466,
-      "learning_rate": 0.00041031569873935996,
-      "loss": 3.442,
+      "epoch": 3.175202156334232,
+      "grad_norm": 0.6027476787567139,
+      "learning_rate": 0.0004100010793308148,
+      "loss": 3.4779,
       "step": 29450
     },
     {
-      "epoch": 3.1751157033688515,
-      "grad_norm": 0.6336598992347717,
-      "learning_rate": 0.0004099924577092985,
-      "loss": 3.4462,
+      "epoch": 3.1805929919137466,
+      "grad_norm": 0.5749760270118713,
+      "learning_rate": 0.0004096772800863464,
+      "loss": 3.473,
       "step": 29500
     },
     {
-      "epoch": 3.1804972554084596,
-      "grad_norm": 0.607629656791687,
-      "learning_rate": 0.0004096692166792371,
-      "loss": 3.4299,
+      "epoch": 3.1859838274932613,
+      "grad_norm": 0.6280606389045715,
+      "learning_rate": 0.000409353480841878,
+      "loss": 3.4493,
       "step": 29550
     },
     {
-      "epoch": 3.185878807448068,
-      "grad_norm": 0.6477324366569519,
-      "learning_rate": 0.00040934597564917574,
-      "loss": 3.448,
+      "epoch": 3.1913746630727764,
+      "grad_norm": 0.6310157179832458,
+      "learning_rate": 0.0004090296815974096,
+      "loss": 3.4706,
       "step": 29600
     },
     {
-      "epoch": 3.1912603594876763,
-      "grad_norm": 0.6741542220115662,
-      "learning_rate": 0.0004090227346191143,
-      "loss": 3.4596,
+      "epoch": 3.196765498652291,
+      "grad_norm": 0.6024655699729919,
+      "learning_rate": 0.00040870588235294113,
+      "loss": 3.4574,
       "step": 29650
     },
     {
-      "epoch": 3.1966419115272844,
-      "grad_norm": 0.6519025564193726,
-      "learning_rate": 0.0004086994935890529,
-      "loss": 3.4715,
+      "epoch": 3.202156334231806,
+      "grad_norm": 0.6026850938796997,
+      "learning_rate": 0.00040838208310847273,
+      "loss": 3.4629,
       "step": 29700
     },
     {
-      "epoch": 3.2020234635668925,
-      "grad_norm": 0.6172560453414917,
-      "learning_rate": 0.00040837625255899147,
-      "loss": 3.4494,
+      "epoch": 3.207547169811321,
+      "grad_norm": 0.5791655778884888,
+      "learning_rate": 0.0004080582838640043,
+      "loss": 3.4699,
       "step": 29750
     },
     {
-      "epoch": 3.207405015606501,
-      "grad_norm": 0.6829916834831238,
-      "learning_rate": 0.00040805301152893,
-      "loss": 3.4398,
+      "epoch": 3.2129380053908356,
+      "grad_norm": 0.5974932312965393,
+      "learning_rate": 0.00040774096060442524,
+      "loss": 3.4527,
       "step": 29800
     },
     {
-      "epoch": 3.212786567646109,
-      "grad_norm": 0.6081319451332092,
-      "learning_rate": 0.0004077297704988686,
-      "loss": 3.446,
+      "epoch": 3.2183288409703503,
+      "grad_norm": 0.6695175766944885,
+      "learning_rate": 0.0004074171613599568,
+      "loss": 3.4755,
       "step": 29850
     },
     {
-      "epoch": 3.2181681196857173,
-      "grad_norm": 0.6863629817962646,
-      "learning_rate": 0.00040740652946880725,
-      "loss": 3.4471,
+      "epoch": 3.223719676549865,
+      "grad_norm": 0.5739556550979614,
+      "learning_rate": 0.0004070933621154884,
+      "loss": 3.4812,
       "step": 29900
     },
     {
-      "epoch": 3.2235496717253254,
-      "grad_norm": 0.6253100633621216,
-      "learning_rate": 0.0004070832884387458,
-      "loss": 3.4593,
+      "epoch": 3.22911051212938,
+      "grad_norm": 0.6452335119247437,
+      "learning_rate": 0.00040676956287101994,
+      "loss": 3.474,
       "step": 29950
     },
     {
-      "epoch": 3.228931223764934,
-      "grad_norm": 0.61070317029953,
-      "learning_rate": 0.0004067600474086844,
-      "loss": 3.4684,
+      "epoch": 3.234501347708895,
+      "grad_norm": 0.6547558903694153,
+      "learning_rate": 0.00040644576362655155,
+      "loss": 3.4587,
       "step": 30000
     },
     {
-      "epoch": 3.228931223764934,
-      "eval_accuracy": 0.3713381939564873,
-      "eval_loss": 3.4922268390655518,
-      "eval_runtime": 208.2583,
-      "eval_samples_per_second": 86.484,
-      "eval_steps_per_second": 5.407,
+      "epoch": 3.234501347708895,
+      "eval_accuracy": 0.37111730263556053,
+      "eval_loss": 3.4946022033691406,
+      "eval_runtime": 151.9,
+      "eval_samples_per_second": 118.571,
+      "eval_steps_per_second": 7.413,
       "step": 30000
     }
   ],
   "logging_steps": 50,
-  "max_steps": 92910,
+  "max_steps": 92750,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 10000,
@@ -4496,7 +4496,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.50830944206848e+17,
+  "total_flos": 2.508207538176e+17,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null