diff --git "a/checkpoint-80000/trainer_state.json" "b/checkpoint-80000/trainer_state.json"
--- "a/checkpoint-80000/trainer_state.json"
+++ "b/checkpoint-80000/trainer_state.json"
@@ -1,7 +1,7 @@
 {
-  "best_metric": 3.323802947998047,
-  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_634/checkpoint-80000",
-  "epoch": 8.610483263373157,
+  "best_metric": 3.3251895904541016,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_low_100_634/checkpoint-80000",
+  "epoch": 8.625336927223719,
   "eval_steps": 1000,
   "global_step": 80000,
   "is_hyper_param_search": false,
@@ -9,11928 +9,11928 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.005381552039608223,
-      "grad_norm": 2.7091472148895264,
+      "epoch": 0.005390835579514825,
+      "grad_norm": 1.4323930740356445,
       "learning_rate": 0.0003,
-      "loss": 8.4865,
+      "loss": 8.6259,
       "step": 50
     },
     {
-      "epoch": 0.010763104079216447,
-      "grad_norm": 1.967639446258545,
+      "epoch": 0.01078167115902965,
+      "grad_norm": 1.0894039869308472,
       "learning_rate": 0.0006,
-      "loss": 6.9044,
+      "loss": 6.9226,
       "step": 100
     },
     {
-      "epoch": 0.01614465611882467,
-      "grad_norm": 2.5172040462493896,
-      "learning_rate": 0.0005996767589699385,
-      "loss": 6.4803,
+      "epoch": 0.016172506738544475,
+      "grad_norm": 1.6132558584213257,
+      "learning_rate": 0.0005996762007555315,
+      "loss": 6.4978,
       "step": 150
     },
     {
-      "epoch": 0.021526208158432893,
-      "grad_norm": 1.3199275732040405,
-      "learning_rate": 0.0005993535179398771,
-      "loss": 6.2354,
+      "epoch": 0.0215633423180593,
+      "grad_norm": 1.2535423040390015,
+      "learning_rate": 0.000599352401511063,
+      "loss": 6.2339,
       "step": 200
     },
     {
-      "epoch": 0.026907760198041114,
-      "grad_norm": 1.2347278594970703,
-      "learning_rate": 0.0005990302769098158,
-      "loss": 6.0754,
+      "epoch": 0.026954177897574125,
+      "grad_norm": 1.3019688129425049,
+      "learning_rate": 0.0005990286022665946,
+      "loss": 6.0869,
       "step": 250
     },
     {
-      "epoch": 0.03228931223764934,
-      "grad_norm": 1.4885425567626953,
-      "learning_rate": 0.0005987070358797543,
-      "loss": 5.9329,
+      "epoch": 0.03234501347708895,
+      "grad_norm": 1.3941422700881958,
+      "learning_rate": 0.0005987048030221263,
+      "loss": 5.9835,
       "step": 300
     },
     {
-      "epoch": 0.03767086427725756,
-      "grad_norm": 1.6232192516326904,
-      "learning_rate": 0.0005983837948496929,
-      "loss": 5.8614,
+      "epoch": 0.03773584905660377,
+      "grad_norm": 2.2118101119995117,
+      "learning_rate": 0.0005983810037776578,
+      "loss": 5.8779,
       "step": 350
     },
     {
-      "epoch": 0.04305241631686579,
-      "grad_norm": 1.598788857460022,
-      "learning_rate": 0.0005980605538196314,
-      "loss": 5.7913,
+      "epoch": 0.0431266846361186,
+      "grad_norm": 0.8065666556358337,
+      "learning_rate": 0.0005980572045331894,
+      "loss": 5.7809,
       "step": 400
     },
     {
-      "epoch": 0.048433968356474004,
-      "grad_norm": 2.481018543243408,
-      "learning_rate": 0.0005977373127895701,
-      "loss": 5.7337,
+      "epoch": 0.04851752021563342,
+      "grad_norm": 1.9112606048583984,
+      "learning_rate": 0.0005977334052887209,
+      "loss": 5.7161,
       "step": 450
     },
     {
-      "epoch": 0.05381552039608223,
-      "grad_norm": 1.4101097583770752,
-      "learning_rate": 0.0005974140717595086,
-      "loss": 5.6182,
+      "epoch": 0.05390835579514825,
+      "grad_norm": 1.0509525537490845,
+      "learning_rate": 0.0005974096060442526,
+      "loss": 5.6556,
       "step": 500
     },
     {
-      "epoch": 0.05919707243569045,
-      "grad_norm": 1.6082617044448853,
-      "learning_rate": 0.0005970908307294472,
-      "loss": 5.5589,
+      "epoch": 0.05929919137466307,
+      "grad_norm": 1.202839970588684,
+      "learning_rate": 0.0005970858067997841,
+      "loss": 5.5734,
       "step": 550
     },
     {
-      "epoch": 0.06457862447529868,
-      "grad_norm": 0.9049023389816284,
-      "learning_rate": 0.0005967675896993858,
-      "loss": 5.5049,
+      "epoch": 0.0646900269541779,
+      "grad_norm": 1.3976365327835083,
+      "learning_rate": 0.0005967620075553157,
+      "loss": 5.5046,
       "step": 600
     },
     {
-      "epoch": 0.0699601765149069,
-      "grad_norm": 1.3293204307556152,
-      "learning_rate": 0.0005964443486693243,
-      "loss": 5.4266,
+      "epoch": 0.07008086253369272,
+      "grad_norm": 1.0246344804763794,
+      "learning_rate": 0.0005964382083108472,
+      "loss": 5.4208,
       "step": 650
     },
     {
-      "epoch": 0.07534172855451512,
-      "grad_norm": 1.7935289144515991,
-      "learning_rate": 0.000596121107639263,
-      "loss": 5.3506,
+      "epoch": 0.07547169811320754,
+      "grad_norm": 1.313137412071228,
+      "learning_rate": 0.0005961144090663788,
+      "loss": 5.367,
       "step": 700
     },
     {
-      "epoch": 0.08072328059412334,
-      "grad_norm": 1.6690304279327393,
-      "learning_rate": 0.0005957978666092015,
-      "loss": 5.3179,
+      "epoch": 0.08086253369272237,
+      "grad_norm": 1.0808387994766235,
+      "learning_rate": 0.0005957906098219104,
+      "loss": 5.3003,
       "step": 750
     },
     {
-      "epoch": 0.08610483263373157,
-      "grad_norm": 1.4270386695861816,
-      "learning_rate": 0.0005954746255791401,
-      "loss": 5.2698,
+      "epoch": 0.0862533692722372,
+      "grad_norm": 0.8700747489929199,
+      "learning_rate": 0.0005954668105774419,
+      "loss": 5.2478,
       "step": 800
     },
     {
-      "epoch": 0.09148638467333979,
-      "grad_norm": 1.5141105651855469,
-      "learning_rate": 0.0005951513845490787,
-      "loss": 5.1827,
+      "epoch": 0.09164420485175202,
+      "grad_norm": 1.0296642780303955,
+      "learning_rate": 0.0005951430113329735,
+      "loss": 5.1893,
       "step": 850
     },
     {
-      "epoch": 0.09686793671294801,
-      "grad_norm": 1.6831254959106445,
-      "learning_rate": 0.0005948281435190174,
-      "loss": 5.1898,
+      "epoch": 0.09703504043126684,
+      "grad_norm": 1.3226677179336548,
+      "learning_rate": 0.0005948192120885051,
+      "loss": 5.1688,
       "step": 900
     },
     {
-      "epoch": 0.10224948875255624,
-      "grad_norm": 1.1933035850524902,
-      "learning_rate": 0.0005945049024889559,
-      "loss": 5.1376,
+      "epoch": 0.10242587601078167,
+      "grad_norm": 1.0636179447174072,
+      "learning_rate": 0.0005944954128440366,
+      "loss": 5.1311,
       "step": 950
     },
     {
-      "epoch": 0.10763104079216446,
-      "grad_norm": 1.2600319385528564,
-      "learning_rate": 0.0005941816614588944,
-      "loss": 5.0756,
+      "epoch": 0.1078167115902965,
+      "grad_norm": 1.4769413471221924,
+      "learning_rate": 0.0005941716135995682,
+      "loss": 5.0765,
       "step": 1000
     },
     {
-      "epoch": 0.10763104079216446,
-      "eval_accuracy": 0.22684951933592223,
-      "eval_loss": 5.017084121704102,
-      "eval_runtime": 185.5208,
-      "eval_samples_per_second": 97.083,
-      "eval_steps_per_second": 6.069,
+      "epoch": 0.1078167115902965,
+      "eval_accuracy": 0.22730021151457672,
+      "eval_loss": 5.021658420562744,
+      "eval_runtime": 183.6554,
+      "eval_samples_per_second": 98.07,
+      "eval_steps_per_second": 6.131,
       "step": 1000
     },
     {
-      "epoch": 0.11301259283177269,
-      "grad_norm": 1.1238996982574463,
-      "learning_rate": 0.000593858420428833,
-      "loss": 5.0471,
+      "epoch": 0.11320754716981132,
+      "grad_norm": 1.1359593868255615,
+      "learning_rate": 0.0005938478143550997,
+      "loss": 5.0369,
       "step": 1050
     },
     {
-      "epoch": 0.1183941448713809,
-      "grad_norm": 1.0484765768051147,
-      "learning_rate": 0.0005935351793987716,
-      "loss": 5.0295,
+      "epoch": 0.11859838274932614,
+      "grad_norm": 0.8879361748695374,
+      "learning_rate": 0.0005935240151106314,
+      "loss": 5.0082,
       "step": 1100
     },
     {
-      "epoch": 0.12377569691098914,
-      "grad_norm": 1.4683005809783936,
-      "learning_rate": 0.0005932119383687103,
-      "loss": 4.9958,
+      "epoch": 0.12398921832884097,
+      "grad_norm": 0.9153700470924377,
+      "learning_rate": 0.0005932002158661629,
+      "loss": 4.9873,
       "step": 1150
     },
     {
-      "epoch": 0.12915724895059735,
-      "grad_norm": 1.0093375444412231,
-      "learning_rate": 0.0005928886973386488,
-      "loss": 4.9523,
+      "epoch": 0.1293800539083558,
+      "grad_norm": 1.2399190664291382,
+      "learning_rate": 0.0005928764166216945,
+      "loss": 4.9252,
       "step": 1200
     },
     {
-      "epoch": 0.13453880099020557,
-      "grad_norm": 1.2035621404647827,
-      "learning_rate": 0.0005925654563085874,
-      "loss": 4.8916,
+      "epoch": 0.1347708894878706,
+      "grad_norm": 0.8370887041091919,
+      "learning_rate": 0.000592552617377226,
+      "loss": 4.8993,
       "step": 1250
     },
     {
-      "epoch": 0.1399203530298138,
-      "grad_norm": 1.0326842069625854,
-      "learning_rate": 0.000592242215278526,
-      "loss": 4.8852,
+      "epoch": 0.14016172506738545,
+      "grad_norm": 1.5546677112579346,
+      "learning_rate": 0.0005922288181327577,
+      "loss": 4.8775,
       "step": 1300
     },
     {
-      "epoch": 0.14530190506942203,
-      "grad_norm": 0.9776148200035095,
-      "learning_rate": 0.0005919189742484645,
-      "loss": 4.8661,
+      "epoch": 0.14555256064690028,
+      "grad_norm": 0.8524234294891357,
+      "learning_rate": 0.0005919050188882893,
+      "loss": 4.841,
       "step": 1350
     },
     {
-      "epoch": 0.15068345710903025,
-      "grad_norm": 1.0485109090805054,
-      "learning_rate": 0.0005915957332184032,
-      "loss": 4.8206,
+      "epoch": 0.1509433962264151,
+      "grad_norm": 0.998586893081665,
+      "learning_rate": 0.0005915812196438207,
+      "loss": 4.8349,
       "step": 1400
     },
     {
-      "epoch": 0.15606500914863847,
-      "grad_norm": 0.931476891040802,
-      "learning_rate": 0.0005912724921883417,
-      "loss": 4.8047,
+      "epoch": 0.15633423180592992,
+      "grad_norm": 1.1706006526947021,
+      "learning_rate": 0.0005912574203993524,
+      "loss": 4.8439,
       "step": 1450
     },
     {
-      "epoch": 0.16144656118824668,
-      "grad_norm": 1.7174251079559326,
-      "learning_rate": 0.0005909492511582803,
-      "loss": 4.8148,
+      "epoch": 0.16172506738544473,
+      "grad_norm": 1.0023614168167114,
+      "learning_rate": 0.0005909336211548839,
+      "loss": 4.7998,
       "step": 1500
     },
     {
-      "epoch": 0.1668281132278549,
-      "grad_norm": 0.9541407823562622,
-      "learning_rate": 0.0005906260101282189,
-      "loss": 4.7619,
+      "epoch": 0.16711590296495957,
+      "grad_norm": 0.874588668346405,
+      "learning_rate": 0.0005906098219104155,
+      "loss": 4.7577,
       "step": 1550
     },
     {
-      "epoch": 0.17220966526746315,
-      "grad_norm": 0.8106948733329773,
-      "learning_rate": 0.0005903027690981575,
-      "loss": 4.7593,
+      "epoch": 0.1725067385444744,
+      "grad_norm": 1.079401969909668,
+      "learning_rate": 0.000590286022665947,
+      "loss": 4.7449,
       "step": 1600
     },
     {
-      "epoch": 0.17759121730707136,
-      "grad_norm": 0.8909491896629333,
-      "learning_rate": 0.000589979528068096,
-      "loss": 4.7116,
+      "epoch": 0.1778975741239892,
+      "grad_norm": 0.9815456867218018,
+      "learning_rate": 0.0005899622234214787,
+      "loss": 4.7201,
       "step": 1650
     },
     {
-      "epoch": 0.18297276934667958,
-      "grad_norm": 0.911844789981842,
-      "learning_rate": 0.0005896562870380347,
-      "loss": 4.6914,
+      "epoch": 0.18328840970350405,
+      "grad_norm": 0.9003214836120605,
+      "learning_rate": 0.0005896384241770102,
+      "loss": 4.696,
       "step": 1700
     },
     {
-      "epoch": 0.1883543213862878,
-      "grad_norm": 1.3206818103790283,
-      "learning_rate": 0.0005893330460079732,
-      "loss": 4.6934,
+      "epoch": 0.18867924528301888,
+      "grad_norm": 0.9949226975440979,
+      "learning_rate": 0.0005893146249325418,
+      "loss": 4.7044,
       "step": 1750
     },
     {
-      "epoch": 0.19373587342589602,
-      "grad_norm": 0.8717551231384277,
-      "learning_rate": 0.0005890098049779118,
-      "loss": 4.6662,
+      "epoch": 0.1940700808625337,
+      "grad_norm": 0.9833788871765137,
+      "learning_rate": 0.0005889908256880733,
+      "loss": 4.672,
       "step": 1800
     },
     {
-      "epoch": 0.19911742546550426,
-      "grad_norm": 0.8639872670173645,
-      "learning_rate": 0.0005886865639478504,
-      "loss": 4.6283,
+      "epoch": 0.19946091644204852,
+      "grad_norm": 0.9863656163215637,
+      "learning_rate": 0.0005886670264436049,
+      "loss": 4.6412,
       "step": 1850
     },
     {
-      "epoch": 0.20449897750511248,
-      "grad_norm": 1.275295615196228,
-      "learning_rate": 0.0005883633229177889,
-      "loss": 4.6063,
+      "epoch": 0.20485175202156333,
+      "grad_norm": 0.9831191897392273,
+      "learning_rate": 0.0005883432271991365,
+      "loss": 4.6256,
       "step": 1900
     },
     {
-      "epoch": 0.2098805295447207,
-      "grad_norm": 0.9782142043113708,
-      "learning_rate": 0.0005880400818877276,
-      "loss": 4.6013,
+      "epoch": 0.21024258760107817,
+      "grad_norm": 0.9612619280815125,
+      "learning_rate": 0.0005880194279546681,
+      "loss": 4.6301,
       "step": 1950
     },
     {
-      "epoch": 0.2152620815843289,
-      "grad_norm": 0.7876592874526978,
-      "learning_rate": 0.0005877168408576662,
-      "loss": 4.5802,
+      "epoch": 0.215633423180593,
+      "grad_norm": 0.8835451006889343,
+      "learning_rate": 0.0005876956287101996,
+      "loss": 4.5772,
       "step": 2000
     },
     {
-      "epoch": 0.2152620815843289,
-      "eval_accuracy": 0.27130006765815323,
-      "eval_loss": 4.498872756958008,
-      "eval_runtime": 185.1447,
-      "eval_samples_per_second": 97.281,
-      "eval_steps_per_second": 6.082,
+      "epoch": 0.215633423180593,
+      "eval_accuracy": 0.2714264309666815,
+      "eval_loss": 4.5014872550964355,
+      "eval_runtime": 183.6725,
+      "eval_samples_per_second": 98.06,
+      "eval_steps_per_second": 6.13,
       "step": 2000
     },
     {
-      "epoch": 0.22064363362393713,
-      "grad_norm": 0.9489641785621643,
-      "learning_rate": 0.0005873935998276048,
-      "loss": 4.576,
+      "epoch": 0.2210242587601078,
+      "grad_norm": 1.0137361288070679,
+      "learning_rate": 0.0005873718294657312,
+      "loss": 4.5575,
       "step": 2050
     },
     {
-      "epoch": 0.22602518566354537,
-      "grad_norm": 0.8717491030693054,
-      "learning_rate": 0.0005870703587975433,
-      "loss": 4.5329,
+      "epoch": 0.22641509433962265,
+      "grad_norm": 0.7878215312957764,
+      "learning_rate": 0.0005870480302212628,
+      "loss": 4.5434,
       "step": 2100
     },
     {
-      "epoch": 0.2314067377031536,
-      "grad_norm": 0.6650089025497437,
-      "learning_rate": 0.0005867471177674818,
-      "loss": 4.5276,
+      "epoch": 0.23180592991913745,
+      "grad_norm": 0.8975843787193298,
+      "learning_rate": 0.0005867242309767943,
+      "loss": 4.5286,
       "step": 2150
     },
     {
-      "epoch": 0.2367882897427618,
-      "grad_norm": 0.9660583138465881,
-      "learning_rate": 0.0005864238767374205,
-      "loss": 4.5157,
+      "epoch": 0.2371967654986523,
+      "grad_norm": 0.9283952116966248,
+      "learning_rate": 0.0005864004317323259,
+      "loss": 4.4883,
       "step": 2200
     },
     {
-      "epoch": 0.24216984178237003,
-      "grad_norm": 0.8200618028640747,
-      "learning_rate": 0.0005861006357073591,
-      "loss": 4.4902,
+      "epoch": 0.24258760107816713,
+      "grad_norm": 0.7984836101531982,
+      "learning_rate": 0.0005860766324878575,
+      "loss": 4.5028,
       "step": 2250
     },
     {
-      "epoch": 0.24755139382197827,
-      "grad_norm": 0.832756519317627,
-      "learning_rate": 0.0005857773946772977,
-      "loss": 4.4648,
+      "epoch": 0.24797843665768193,
+      "grad_norm": 0.8196074366569519,
+      "learning_rate": 0.000585752833243389,
+      "loss": 4.4778,
       "step": 2300
     },
     {
-      "epoch": 0.2529329458615865,
-      "grad_norm": 1.736657977104187,
-      "learning_rate": 0.0005854541536472362,
-      "loss": 4.451,
+      "epoch": 0.25336927223719674,
+      "grad_norm": 1.0101298093795776,
+      "learning_rate": 0.0005854290339989206,
+      "loss": 4.4509,
       "step": 2350
     },
     {
-      "epoch": 0.2583144979011947,
-      "grad_norm": 0.8690645694732666,
-      "learning_rate": 0.0005851309126171749,
-      "loss": 4.442,
+      "epoch": 0.2587601078167116,
+      "grad_norm": 0.9420535564422607,
+      "learning_rate": 0.0005851052347544521,
+      "loss": 4.4503,
       "step": 2400
     },
     {
-      "epoch": 0.2636960499408029,
-      "grad_norm": 0.9058188199996948,
-      "learning_rate": 0.0005848076715871134,
-      "loss": 4.4145,
+      "epoch": 0.2641509433962264,
+      "grad_norm": 0.8534466028213501,
+      "learning_rate": 0.0005847814355099838,
+      "loss": 4.4329,
       "step": 2450
     },
     {
-      "epoch": 0.26907760198041114,
-      "grad_norm": 0.7224371433258057,
-      "learning_rate": 0.000584484430557052,
-      "loss": 4.4354,
+      "epoch": 0.2695417789757412,
+      "grad_norm": 0.7836869359016418,
+      "learning_rate": 0.0005844576362655154,
+      "loss": 4.4255,
       "step": 2500
     },
     {
-      "epoch": 0.27445915402001936,
-      "grad_norm": 0.9763414263725281,
-      "learning_rate": 0.0005841611895269906,
-      "loss": 4.3947,
+      "epoch": 0.2749326145552561,
+      "grad_norm": 1.1862176656723022,
+      "learning_rate": 0.0005841338370210469,
+      "loss": 4.3987,
       "step": 2550
     },
     {
-      "epoch": 0.2798407060596276,
-      "grad_norm": 0.8446633219718933,
-      "learning_rate": 0.0005838379484969291,
-      "loss": 4.3857,
+      "epoch": 0.2803234501347709,
+      "grad_norm": 0.9727432131767273,
+      "learning_rate": 0.0005838100377765785,
+      "loss": 4.3739,
       "step": 2600
     },
     {
-      "epoch": 0.2852222580992358,
-      "grad_norm": 0.7831347584724426,
-      "learning_rate": 0.0005835147074668678,
-      "loss": 4.3803,
+      "epoch": 0.2857142857142857,
+      "grad_norm": 0.9045615196228027,
+      "learning_rate": 0.0005834862385321101,
+      "loss": 4.3749,
       "step": 2650
     },
     {
-      "epoch": 0.29060381013884407,
-      "grad_norm": 0.9835968613624573,
-      "learning_rate": 0.0005831914664368063,
-      "loss": 4.3489,
+      "epoch": 0.29110512129380056,
+      "grad_norm": 0.6496215462684631,
+      "learning_rate": 0.0005831624392876417,
+      "loss": 4.3646,
       "step": 2700
     },
     {
-      "epoch": 0.2959853621784523,
-      "grad_norm": 0.7985646724700928,
-      "learning_rate": 0.0005828682254067449,
-      "loss": 4.3591,
+      "epoch": 0.29649595687331537,
+      "grad_norm": 0.943100094795227,
+      "learning_rate": 0.0005828386400431731,
+      "loss": 4.3741,
       "step": 2750
     },
     {
-      "epoch": 0.3013669142180605,
-      "grad_norm": 0.760099470615387,
-      "learning_rate": 0.0005825449843766835,
-      "loss": 4.3337,
+      "epoch": 0.3018867924528302,
+      "grad_norm": 0.7881677150726318,
+      "learning_rate": 0.0005825148407987048,
+      "loss": 4.3681,
       "step": 2800
     },
     {
-      "epoch": 0.3067484662576687,
-      "grad_norm": 0.7309588193893433,
-      "learning_rate": 0.0005822217433466221,
-      "loss": 4.3136,
+      "epoch": 0.30727762803234504,
+      "grad_norm": 0.8434391021728516,
+      "learning_rate": 0.0005821910415542363,
+      "loss": 4.3316,
       "step": 2850
     },
     {
-      "epoch": 0.31213001829727693,
-      "grad_norm": 0.749220609664917,
-      "learning_rate": 0.0005818985023165607,
-      "loss": 4.3327,
+      "epoch": 0.31266846361185985,
+      "grad_norm": 0.8027449250221252,
+      "learning_rate": 0.0005818672423097679,
+      "loss": 4.3199,
       "step": 2900
     },
     {
-      "epoch": 0.31751157033688515,
-      "grad_norm": 0.7987422943115234,
-      "learning_rate": 0.0005815752612864992,
-      "loss": 4.3193,
+      "epoch": 0.31805929919137466,
+      "grad_norm": 0.7706471681594849,
+      "learning_rate": 0.0005815434430652994,
+      "loss": 4.348,
       "step": 2950
     },
     {
-      "epoch": 0.32289312237649337,
-      "grad_norm": 0.9373783469200134,
-      "learning_rate": 0.0005812520202564378,
-      "loss": 4.2854,
+      "epoch": 0.32345013477088946,
+      "grad_norm": 0.7823014855384827,
+      "learning_rate": 0.0005812196438208311,
+      "loss": 4.3085,
       "step": 3000
     },
     {
-      "epoch": 0.32289312237649337,
-      "eval_accuracy": 0.2990188318271689,
-      "eval_loss": 4.231717109680176,
-      "eval_runtime": 187.2992,
-      "eval_samples_per_second": 96.162,
-      "eval_steps_per_second": 6.012,
+      "epoch": 0.32345013477088946,
+      "eval_accuracy": 0.2989201750050333,
+      "eval_loss": 4.233241081237793,
+      "eval_runtime": 183.5455,
+      "eval_samples_per_second": 98.128,
+      "eval_steps_per_second": 6.135,
       "step": 3000
     },
     {
-      "epoch": 0.3282746744161016,
-      "grad_norm": 0.967170774936676,
-      "learning_rate": 0.0005809287792263764,
-      "loss": 4.286,
+      "epoch": 0.3288409703504043,
+      "grad_norm": 0.7678599953651428,
+      "learning_rate": 0.0005808958445763626,
+      "loss": 4.3041,
       "step": 3050
     },
     {
-      "epoch": 0.3336562264557098,
-      "grad_norm": 0.8400319218635559,
-      "learning_rate": 0.0005806055381963151,
-      "loss": 4.2771,
+      "epoch": 0.33423180592991913,
+      "grad_norm": 0.7595804333686829,
+      "learning_rate": 0.0005805720453318942,
+      "loss": 4.2839,
       "step": 3100
     },
     {
-      "epoch": 0.3390377784953181,
-      "grad_norm": 0.7334919571876526,
-      "learning_rate": 0.0005802822971662536,
-      "loss": 4.2781,
+      "epoch": 0.33962264150943394,
+      "grad_norm": 0.8765571117401123,
+      "learning_rate": 0.0005802482460874257,
+      "loss": 4.2752,
       "step": 3150
     },
     {
-      "epoch": 0.3444193305349263,
-      "grad_norm": 0.744314432144165,
-      "learning_rate": 0.0005799590561361922,
-      "loss": 4.279,
+      "epoch": 0.3450134770889488,
+      "grad_norm": 0.6764340996742249,
+      "learning_rate": 0.0005799244468429573,
+      "loss": 4.2784,
       "step": 3200
     },
     {
-      "epoch": 0.3498008825745345,
-      "grad_norm": 0.8924934267997742,
-      "learning_rate": 0.0005796358151061307,
-      "loss": 4.2584,
+      "epoch": 0.3504043126684636,
+      "grad_norm": 1.5989835262298584,
+      "learning_rate": 0.0005796006475984889,
+      "loss": 4.2516,
       "step": 3250
     },
     {
-      "epoch": 0.35518243461414273,
-      "grad_norm": 0.6779358983039856,
-      "learning_rate": 0.0005793125740760694,
-      "loss": 4.2424,
+      "epoch": 0.3557951482479784,
+      "grad_norm": 0.8266173601150513,
+      "learning_rate": 0.0005792768483540205,
+      "loss": 4.2636,
       "step": 3300
     },
     {
-      "epoch": 0.36056398665375095,
-      "grad_norm": 0.6709272861480713,
-      "learning_rate": 0.0005789893330460079,
-      "loss": 4.233,
+      "epoch": 0.3611859838274933,
+      "grad_norm": 0.9941731095314026,
+      "learning_rate": 0.000578953049109552,
+      "loss": 4.2564,
       "step": 3350
     },
     {
-      "epoch": 0.36594553869335916,
-      "grad_norm": 0.7489773035049438,
-      "learning_rate": 0.0005786660920159465,
-      "loss": 4.2351,
+      "epoch": 0.3665768194070081,
+      "grad_norm": 0.6393300890922546,
+      "learning_rate": 0.0005786292498650836,
+      "loss": 4.2346,
       "step": 3400
     },
     {
-      "epoch": 0.3713270907329674,
-      "grad_norm": 0.8585236668586731,
-      "learning_rate": 0.0005783428509858851,
-      "loss": 4.2377,
+      "epoch": 0.3719676549865229,
+      "grad_norm": 0.8488381505012512,
+      "learning_rate": 0.0005783054506206152,
+      "loss": 4.2266,
       "step": 3450
     },
     {
-      "epoch": 0.3767086427725756,
-      "grad_norm": 0.7472572922706604,
-      "learning_rate": 0.0005780196099558237,
+      "epoch": 0.37735849056603776,
+      "grad_norm": 0.7094395756721497,
+      "learning_rate": 0.0005779816513761467,
       "loss": 4.2177,
       "step": 3500
     },
     {
-      "epoch": 0.3820901948121838,
-      "grad_norm": 0.6559500694274902,
-      "learning_rate": 0.0005776963689257623,
-      "loss": 4.2103,
+      "epoch": 0.38274932614555257,
+      "grad_norm": 0.756734311580658,
+      "learning_rate": 0.0005776578521316782,
+      "loss": 4.1968,
       "step": 3550
     },
     {
-      "epoch": 0.38747174685179203,
-      "grad_norm": 0.8734597563743591,
-      "learning_rate": 0.0005773731278957008,
-      "loss": 4.2007,
+      "epoch": 0.3881401617250674,
+      "grad_norm": 0.803911030292511,
+      "learning_rate": 0.0005773340528872099,
+      "loss": 4.2054,
       "step": 3600
     },
     {
-      "epoch": 0.3928532988914003,
-      "grad_norm": 0.8117654323577881,
-      "learning_rate": 0.0005770498868656394,
-      "loss": 4.194,
+      "epoch": 0.3935309973045822,
+      "grad_norm": 0.6941730976104736,
+      "learning_rate": 0.0005770102536427414,
+      "loss": 4.2234,
       "step": 3650
     },
     {
-      "epoch": 0.3982348509310085,
-      "grad_norm": 0.8064318895339966,
-      "learning_rate": 0.000576726645835578,
-      "loss": 4.1933,
+      "epoch": 0.39892183288409705,
+      "grad_norm": 0.6912753582000732,
+      "learning_rate": 0.000576686454398273,
+      "loss": 4.2084,
       "step": 3700
     },
     {
-      "epoch": 0.40361640297061674,
-      "grad_norm": 0.6744760274887085,
-      "learning_rate": 0.0005764034048055167,
-      "loss": 4.1869,
+      "epoch": 0.40431266846361186,
+      "grad_norm": 0.7011223435401917,
+      "learning_rate": 0.0005763626551538045,
+      "loss": 4.1927,
       "step": 3750
     },
     {
-      "epoch": 0.40899795501022496,
-      "grad_norm": 0.7638756036758423,
-      "learning_rate": 0.0005760801637754552,
-      "loss": 4.1967,
+      "epoch": 0.40970350404312667,
+      "grad_norm": 0.6987757086753845,
+      "learning_rate": 0.0005760388559093362,
+      "loss": 4.1824,
       "step": 3800
     },
     {
-      "epoch": 0.4143795070498332,
-      "grad_norm": 0.8815849423408508,
-      "learning_rate": 0.0005757569227453937,
-      "loss": 4.162,
+      "epoch": 0.41509433962264153,
+      "grad_norm": 0.6419551372528076,
+      "learning_rate": 0.0005757150566648678,
+      "loss": 4.1796,
       "step": 3850
     },
     {
-      "epoch": 0.4197610590894414,
-      "grad_norm": 0.7192302346229553,
-      "learning_rate": 0.0005754336817153324,
-      "loss": 4.1899,
+      "epoch": 0.42048517520215634,
+      "grad_norm": 0.7884727120399475,
+      "learning_rate": 0.0005753912574203993,
+      "loss": 4.1707,
       "step": 3900
     },
     {
-      "epoch": 0.4251426111290496,
-      "grad_norm": 0.5958099365234375,
-      "learning_rate": 0.0005751104406852709,
-      "loss": 4.1567,
+      "epoch": 0.42587601078167114,
+      "grad_norm": 0.772879421710968,
+      "learning_rate": 0.0005750674581759309,
+      "loss": 4.1724,
       "step": 3950
     },
     {
-      "epoch": 0.4305241631686578,
-      "grad_norm": 0.7568697333335876,
-      "learning_rate": 0.0005747871996552096,
-      "loss": 4.167,
+      "epoch": 0.431266846361186,
+      "grad_norm": 0.7401405572891235,
+      "learning_rate": 0.0005747436589314624,
+      "loss": 4.1541,
       "step": 4000
     },
     {
-      "epoch": 0.4305241631686578,
-      "eval_accuracy": 0.31295736754044956,
-      "eval_loss": 4.088932514190674,
-      "eval_runtime": 185.3099,
-      "eval_samples_per_second": 97.194,
-      "eval_steps_per_second": 6.076,
+      "epoch": 0.431266846361186,
+      "eval_accuracy": 0.3129712751100898,
+      "eval_loss": 4.084288597106934,
+      "eval_runtime": 183.5875,
+      "eval_samples_per_second": 98.106,
+      "eval_steps_per_second": 6.133,
       "step": 4000
     },
     {
-      "epoch": 0.43590571520826604,
-      "grad_norm": 0.8409259915351868,
-      "learning_rate": 0.0005744639586251481,
-      "loss": 4.1443,
+      "epoch": 0.4366576819407008,
+      "grad_norm": 0.6693394780158997,
+      "learning_rate": 0.0005744198596869941,
+      "loss": 4.1653,
       "step": 4050
     },
     {
-      "epoch": 0.44128726724787426,
-      "grad_norm": 0.630840003490448,
-      "learning_rate": 0.0005741407175950867,
-      "loss": 4.1604,
+      "epoch": 0.4420485175202156,
+      "grad_norm": 0.6576082110404968,
+      "learning_rate": 0.0005740960604425255,
+      "loss": 4.1609,
       "step": 4100
     },
     {
-      "epoch": 0.44666881928748253,
-      "grad_norm": 0.5622649788856506,
-      "learning_rate": 0.0005738174765650253,
-      "loss": 4.1458,
+      "epoch": 0.4474393530997305,
+      "grad_norm": 0.6224762201309204,
+      "learning_rate": 0.0005737722611980572,
+      "loss": 4.141,
       "step": 4150
     },
     {
-      "epoch": 0.45205037132709075,
-      "grad_norm": 0.681736171245575,
-      "learning_rate": 0.0005734942355349638,
-      "loss": 4.1354,
+      "epoch": 0.4528301886792453,
+      "grad_norm": 0.6698539853096008,
+      "learning_rate": 0.0005734484619535887,
+      "loss": 4.1189,
       "step": 4200
     },
     {
-      "epoch": 0.45743192336669897,
-      "grad_norm": 0.7544794082641602,
-      "learning_rate": 0.0005731709945049025,
-      "loss": 4.1296,
+      "epoch": 0.4582210242587601,
+      "grad_norm": 0.6492160558700562,
+      "learning_rate": 0.0005731246627091203,
+      "loss": 4.1501,
       "step": 4250
     },
     {
-      "epoch": 0.4628134754063072,
-      "grad_norm": 0.7159223556518555,
-      "learning_rate": 0.000572847753474841,
-      "loss": 4.1233,
+      "epoch": 0.4636118598382749,
+      "grad_norm": 0.6496559977531433,
+      "learning_rate": 0.0005728008634646518,
+      "loss": 4.1251,
       "step": 4300
     },
     {
-      "epoch": 0.4681950274459154,
-      "grad_norm": 0.7304403781890869,
-      "learning_rate": 0.0005725245124447796,
-      "loss": 4.1195,
+      "epoch": 0.46900269541778977,
+      "grad_norm": 0.6596023440361023,
+      "learning_rate": 0.0005724770642201835,
+      "loss": 4.1301,
       "step": 4350
     },
     {
-      "epoch": 0.4735765794855236,
-      "grad_norm": 0.6582966446876526,
-      "learning_rate": 0.0005722012714147182,
-      "loss": 4.122,
+      "epoch": 0.4743935309973046,
+      "grad_norm": 0.7114306688308716,
+      "learning_rate": 0.000572153264975715,
+      "loss": 4.1263,
       "step": 4400
     },
     {
-      "epoch": 0.47895813152513184,
-      "grad_norm": 0.6851973533630371,
-      "learning_rate": 0.0005718780303846568,
-      "loss": 4.1075,
+      "epoch": 0.4797843665768194,
+      "grad_norm": 0.6327182054519653,
+      "learning_rate": 0.0005718294657312466,
+      "loss": 4.0962,
       "step": 4450
     },
     {
-      "epoch": 0.48433968356474005,
-      "grad_norm": 0.629666805267334,
-      "learning_rate": 0.0005715547893545953,
-      "loss": 4.1247,
+      "epoch": 0.48517520215633425,
+      "grad_norm": 0.5750563740730286,
+      "learning_rate": 0.0005715056664867781,
+      "loss": 4.0943,
       "step": 4500
     },
     {
-      "epoch": 0.48972123560434827,
-      "grad_norm": 0.6585856080055237,
-      "learning_rate": 0.000571231548324534,
-      "loss": 4.089,
+      "epoch": 0.49056603773584906,
+      "grad_norm": 0.6841105818748474,
+      "learning_rate": 0.0005711818672423097,
+      "loss": 4.09,
       "step": 4550
     },
     {
-      "epoch": 0.49510278764395654,
-      "grad_norm": 0.7275299429893494,
-      "learning_rate": 0.0005709083072944725,
-      "loss": 4.102,
+      "epoch": 0.49595687331536387,
+      "grad_norm": 0.6326912641525269,
+      "learning_rate": 0.0005708580679978413,
+      "loss": 4.0963,
       "step": 4600
     },
     {
-      "epoch": 0.5004843396835648,
-      "grad_norm": 0.7020848393440247,
-      "learning_rate": 0.0005705850662644111,
-      "loss": 4.0871,
+      "epoch": 0.5013477088948787,
+      "grad_norm": 0.856378972530365,
+      "learning_rate": 0.0005705342687533729,
+      "loss": 4.0893,
       "step": 4650
     },
     {
-      "epoch": 0.505865891723173,
-      "grad_norm": 0.7129533290863037,
-      "learning_rate": 0.0005702618252343497,
-      "loss": 4.0816,
+      "epoch": 0.5067385444743935,
+      "grad_norm": 0.5741315484046936,
+      "learning_rate": 0.0005702104695089044,
+      "loss": 4.0698,
       "step": 4700
     },
     {
-      "epoch": 0.5112474437627812,
-      "grad_norm": 0.7143609523773193,
-      "learning_rate": 0.0005699385842042882,
-      "loss": 4.0851,
+      "epoch": 0.5121293800539084,
+      "grad_norm": 0.632841944694519,
+      "learning_rate": 0.000569886670264436,
+      "loss": 4.0845,
       "step": 4750
     },
     {
-      "epoch": 0.5166289958023894,
-      "grad_norm": 0.5908857583999634,
-      "learning_rate": 0.0005696153431742269,
-      "loss": 4.0737,
+      "epoch": 0.5175202156334232,
+      "grad_norm": 0.6167107224464417,
+      "learning_rate": 0.0005695628710199675,
+      "loss": 4.0719,
       "step": 4800
     },
     {
-      "epoch": 0.5220105478419976,
-      "grad_norm": 0.6978116035461426,
-      "learning_rate": 0.0005692921021441655,
-      "loss": 4.0704,
+      "epoch": 0.522911051212938,
+      "grad_norm": 0.6984403133392334,
+      "learning_rate": 0.0005692390717754991,
+      "loss": 4.0659,
       "step": 4850
     },
     {
-      "epoch": 0.5273920998816058,
-      "grad_norm": 0.5921167731285095,
-      "learning_rate": 0.0005689688611141041,
-      "loss": 4.0677,
+      "epoch": 0.5283018867924528,
+      "grad_norm": 0.7294288277626038,
+      "learning_rate": 0.0005689152725310306,
+      "loss": 4.0596,
       "step": 4900
     },
     {
-      "epoch": 0.5327736519212141,
-      "grad_norm": 0.6112943887710571,
-      "learning_rate": 0.0005686456200840426,
-      "loss": 4.0591,
+      "epoch": 0.5336927223719676,
+      "grad_norm": 0.74382084608078,
+      "learning_rate": 0.0005685914732865623,
+      "loss": 4.0633,
       "step": 4950
     },
     {
-      "epoch": 0.5381552039608223,
-      "grad_norm": 0.7539244890213013,
-      "learning_rate": 0.0005683223790539811,
-      "loss": 4.0613,
+      "epoch": 0.5390835579514824,
+      "grad_norm": 0.6517612934112549,
+      "learning_rate": 0.0005682676740420939,
+      "loss": 4.0732,
       "step": 5000
     },
     {
-      "epoch": 0.5381552039608223,
-      "eval_accuracy": 0.32120564286599806,
-      "eval_loss": 3.9930038452148438,
-      "eval_runtime": 185.4107,
-      "eval_samples_per_second": 97.141,
-      "eval_steps_per_second": 6.073,
+      "epoch": 0.5390835579514824,
+      "eval_accuracy": 0.3223035802973634,
+      "eval_loss": 3.9919276237487793,
+      "eval_runtime": 183.6973,
+      "eval_samples_per_second": 98.047,
+      "eval_steps_per_second": 6.13,
       "step": 5000
     },
     {
-      "epoch": 0.5435367560004305,
-      "grad_norm": 0.6740825176239014,
-      "learning_rate": 0.0005679991380239198,
-      "loss": 4.047,
+      "epoch": 0.5444743935309974,
+      "grad_norm": 0.7119221687316895,
+      "learning_rate": 0.0005679438747976254,
+      "loss": 4.0443,
       "step": 5050
     },
     {
-      "epoch": 0.5489183080400387,
-      "grad_norm": 0.755363941192627,
-      "learning_rate": 0.0005676758969938584,
-      "loss": 4.0381,
+      "epoch": 0.5498652291105122,
+      "grad_norm": 0.6482653617858887,
+      "learning_rate": 0.000567620075553157,
+      "loss": 4.045,
       "step": 5100
     },
     {
-      "epoch": 0.5542998600796469,
-      "grad_norm": 0.5839517116546631,
-      "learning_rate": 0.000567352655963797,
-      "loss": 4.0421,
+      "epoch": 0.555256064690027,
+      "grad_norm": 0.6632248163223267,
+      "learning_rate": 0.0005672962763086886,
+      "loss": 4.0499,
       "step": 5150
     },
     {
-      "epoch": 0.5596814121192552,
-      "grad_norm": 0.7238830924034119,
-      "learning_rate": 0.0005670294149337355,
-      "loss": 4.0415,
+      "epoch": 0.5606469002695418,
+      "grad_norm": 0.6758072376251221,
+      "learning_rate": 0.0005669724770642202,
+      "loss": 4.0323,
       "step": 5200
     },
     {
-      "epoch": 0.5650629641588634,
-      "grad_norm": 0.6979734301567078,
-      "learning_rate": 0.0005667061739036742,
-      "loss": 4.0334,
+      "epoch": 0.5660377358490566,
+      "grad_norm": 0.6546369194984436,
+      "learning_rate": 0.0005666486778197517,
+      "loss": 4.03,
       "step": 5250
     },
     {
-      "epoch": 0.5704445161984716,
-      "grad_norm": 0.8152750134468079,
-      "learning_rate": 0.0005663829328736127,
-      "loss": 4.0427,
+      "epoch": 0.5714285714285714,
+      "grad_norm": 0.6808826327323914,
+      "learning_rate": 0.0005663248785752833,
+      "loss": 4.0423,
       "step": 5300
     },
     {
-      "epoch": 0.5758260682380799,
-      "grad_norm": 0.6354012489318848,
-      "learning_rate": 0.0005660596918435512,
-      "loss": 4.0428,
+      "epoch": 0.5768194070080862,
+      "grad_norm": 0.5758659839630127,
+      "learning_rate": 0.0005660010793308148,
+      "loss": 4.0272,
       "step": 5350
     },
     {
-      "epoch": 0.5812076202776881,
-      "grad_norm": 0.7258121371269226,
-      "learning_rate": 0.0005657364508134899,
-      "loss": 4.0439,
+      "epoch": 0.5822102425876011,
+      "grad_norm": 0.601265013217926,
+      "learning_rate": 0.0005656772800863465,
+      "loss": 4.0451,
       "step": 5400
     },
     {
-      "epoch": 0.5865891723172963,
-      "grad_norm": 0.5233718752861023,
-      "learning_rate": 0.0005654132097834284,
-      "loss": 4.0143,
+      "epoch": 0.5876010781671159,
+      "grad_norm": 0.5763106942176819,
+      "learning_rate": 0.0005653534808418779,
+      "loss": 4.0201,
       "step": 5450
     },
     {
-      "epoch": 0.5919707243569046,
-      "grad_norm": 0.578235924243927,
-      "learning_rate": 0.0005650899687533671,
-      "loss": 4.0224,
+      "epoch": 0.5929919137466307,
+      "grad_norm": 0.6029739379882812,
+      "learning_rate": 0.0005650296815974096,
+      "loss": 4.0235,
       "step": 5500
     },
     {
-      "epoch": 0.5973522763965128,
-      "grad_norm": 0.5696157217025757,
-      "learning_rate": 0.0005647667277233056,
-      "loss": 4.012,
+      "epoch": 0.5983827493261455,
+      "grad_norm": 0.5782645344734192,
+      "learning_rate": 0.0005647058823529411,
+      "loss": 4.0153,
       "step": 5550
     },
     {
-      "epoch": 0.602733828436121,
-      "grad_norm": 0.7277674674987793,
-      "learning_rate": 0.0005644434866932442,
-      "loss": 4.0144,
+      "epoch": 0.6037735849056604,
+      "grad_norm": 0.5374035835266113,
+      "learning_rate": 0.0005643820831084727,
+      "loss": 4.0068,
       "step": 5600
     },
     {
-      "epoch": 0.6081153804757292,
-      "grad_norm": 0.760840117931366,
-      "learning_rate": 0.0005641202456631828,
-      "loss": 4.0055,
+      "epoch": 0.6091644204851752,
+      "grad_norm": 0.6709702610969543,
+      "learning_rate": 0.0005640582838640042,
+      "loss": 4.0187,
       "step": 5650
     },
     {
-      "epoch": 0.6134969325153374,
-      "grad_norm": 0.6378624439239502,
-      "learning_rate": 0.0005637970046331214,
-      "loss": 4.0246,
+      "epoch": 0.6145552560646901,
+      "grad_norm": 0.661410391330719,
+      "learning_rate": 0.0005637344846195358,
+      "loss": 4.0118,
       "step": 5700
     },
     {
-      "epoch": 0.6188784845549457,
-      "grad_norm": 0.6083015203475952,
-      "learning_rate": 0.00056347376360306,
-      "loss": 4.0077,
+      "epoch": 0.6199460916442049,
+      "grad_norm": 0.5730959177017212,
+      "learning_rate": 0.0005634106853750674,
+      "loss": 4.0212,
       "step": 5750
     },
     {
-      "epoch": 0.6242600365945539,
-      "grad_norm": 0.6895670890808105,
-      "learning_rate": 0.0005631505225729985,
-      "loss": 3.9889,
+      "epoch": 0.6253369272237197,
+      "grad_norm": 0.7666971683502197,
+      "learning_rate": 0.000563086886130599,
+      "loss": 3.9882,
       "step": 5800
     },
     {
-      "epoch": 0.6296415886341621,
-      "grad_norm": 0.7159281969070435,
-      "learning_rate": 0.0005628272815429371,
-      "loss": 3.9946,
+      "epoch": 0.6307277628032345,
+      "grad_norm": 0.7750036120414734,
+      "learning_rate": 0.0005627630868861305,
+      "loss": 3.9926,
       "step": 5850
     },
     {
-      "epoch": 0.6350231406737703,
-      "grad_norm": 0.7747655510902405,
-      "learning_rate": 0.0005625040405128757,
-      "loss": 3.9819,
+      "epoch": 0.6361185983827493,
+      "grad_norm": 0.6507880687713623,
+      "learning_rate": 0.0005624392876416621,
+      "loss": 3.9778,
       "step": 5900
     },
     {
-      "epoch": 0.6404046927133785,
-      "grad_norm": 0.6201274991035461,
-      "learning_rate": 0.0005621807994828143,
-      "loss": 3.9892,
+      "epoch": 0.6415094339622641,
+      "grad_norm": 0.6215813159942627,
+      "learning_rate": 0.0005621154883971937,
+      "loss": 3.9808,
       "step": 5950
     },
     {
-      "epoch": 0.6457862447529867,
-      "grad_norm": 0.5920886397361755,
-      "learning_rate": 0.0005618575584527529,
-      "loss": 3.9942,
+      "epoch": 0.6469002695417789,
+      "grad_norm": 0.7452011108398438,
+      "learning_rate": 0.0005617916891527253,
+      "loss": 3.9623,
       "step": 6000
     },
     {
-      "epoch": 0.6457862447529867,
-      "eval_accuracy": 0.3285595962719456,
-      "eval_loss": 3.9178080558776855,
-      "eval_runtime": 185.1703,
-      "eval_samples_per_second": 97.267,
-      "eval_steps_per_second": 6.081,
+      "epoch": 0.6469002695417789,
+      "eval_accuracy": 0.3290048558062093,
+      "eval_loss": 3.9154105186462402,
+      "eval_runtime": 183.6461,
+      "eval_samples_per_second": 98.074,
+      "eval_steps_per_second": 6.131,
       "step": 6000
     },
     {
-      "epoch": 0.651167796792595,
-      "grad_norm": 0.6048468351364136,
-      "learning_rate": 0.0005615343174226915,
-      "loss": 3.9991,
+      "epoch": 0.6522911051212938,
+      "grad_norm": 0.6668406128883362,
+      "learning_rate": 0.0005614678899082568,
+      "loss": 3.9958,
       "step": 6050
     },
     {
-      "epoch": 0.6565493488322032,
-      "grad_norm": 0.6038753986358643,
-      "learning_rate": 0.00056121107639263,
-      "loss": 3.971,
+      "epoch": 0.6576819407008087,
+      "grad_norm": 0.5859930515289307,
+      "learning_rate": 0.0005611440906637884,
+      "loss": 3.9842,
       "step": 6100
     },
     {
-      "epoch": 0.6619309008718114,
-      "grad_norm": 0.6248626708984375,
-      "learning_rate": 0.0005608878353625687,
-      "loss": 3.9687,
+      "epoch": 0.6630727762803235,
+      "grad_norm": 0.6970394253730774,
+      "learning_rate": 0.00056082029141932,
+      "loss": 3.9757,
       "step": 6150
     },
     {
-      "epoch": 0.6673124529114196,
-      "grad_norm": 0.7638702988624573,
-      "learning_rate": 0.0005605645943325072,
-      "loss": 3.9686,
+      "epoch": 0.6684636118598383,
+      "grad_norm": 0.6556830406188965,
+      "learning_rate": 0.0005604964921748515,
+      "loss": 3.9884,
       "step": 6200
     },
     {
-      "epoch": 0.6726940049510278,
-      "grad_norm": 0.5385346412658691,
-      "learning_rate": 0.0005602413533024458,
-      "loss": 3.9689,
+      "epoch": 0.6738544474393531,
+      "grad_norm": 0.6606318950653076,
+      "learning_rate": 0.000560172692930383,
+      "loss": 3.9484,
       "step": 6250
     },
     {
-      "epoch": 0.6780755569906362,
-      "grad_norm": 0.6341848373413086,
-      "learning_rate": 0.0005599181122723844,
-      "loss": 3.9662,
+      "epoch": 0.6792452830188679,
+      "grad_norm": 0.607824444770813,
+      "learning_rate": 0.0005598488936859147,
+      "loss": 3.959,
       "step": 6300
     },
     {
-      "epoch": 0.6834571090302444,
-      "grad_norm": 0.5963843464851379,
-      "learning_rate": 0.000559594871242323,
-      "loss": 3.9602,
+      "epoch": 0.6846361185983828,
+      "grad_norm": 0.6594722270965576,
+      "learning_rate": 0.0005595250944414463,
+      "loss": 3.9519,
       "step": 6350
     },
     {
-      "epoch": 0.6888386610698526,
-      "grad_norm": 0.5414633750915527,
-      "learning_rate": 0.0005592716302122616,
-      "loss": 3.9664,
+      "epoch": 0.6900269541778976,
+      "grad_norm": 0.6482193470001221,
+      "learning_rate": 0.0005592012951969778,
+      "loss": 3.9433,
       "step": 6400
     },
     {
-      "epoch": 0.6942202131094608,
-      "grad_norm": 0.581794023513794,
-      "learning_rate": 0.0005589483891822001,
-      "loss": 3.9706,
+      "epoch": 0.6954177897574124,
+      "grad_norm": 0.7418032884597778,
+      "learning_rate": 0.0005588774959525094,
+      "loss": 3.9704,
       "step": 6450
     },
     {
-      "epoch": 0.699601765149069,
-      "grad_norm": 0.588293731212616,
-      "learning_rate": 0.0005586251481521387,
-      "loss": 3.9601,
+      "epoch": 0.7008086253369272,
+      "grad_norm": 0.6005130410194397,
+      "learning_rate": 0.000558553696708041,
+      "loss": 3.9547,
       "step": 6500
     },
     {
-      "epoch": 0.7049833171886772,
-      "grad_norm": 0.6920768618583679,
-      "learning_rate": 0.0005583019071220773,
-      "loss": 3.9606,
+      "epoch": 0.706199460916442,
+      "grad_norm": 0.566909909248352,
+      "learning_rate": 0.0005582298974635726,
+      "loss": 3.9509,
       "step": 6550
     },
     {
-      "epoch": 0.7103648692282855,
-      "grad_norm": 0.6583566665649414,
-      "learning_rate": 0.000557978666092016,
-      "loss": 3.9327,
+      "epoch": 0.7115902964959568,
+      "grad_norm": 0.5656232833862305,
+      "learning_rate": 0.0005579060982191041,
+      "loss": 3.9505,
       "step": 6600
     },
     {
-      "epoch": 0.7157464212678937,
-      "grad_norm": 0.5119813680648804,
-      "learning_rate": 0.0005576554250619545,
-      "loss": 3.9518,
+      "epoch": 0.7169811320754716,
+      "grad_norm": 0.6370468139648438,
+      "learning_rate": 0.0005575822989746357,
+      "loss": 3.9435,
       "step": 6650
     },
     {
-      "epoch": 0.7211279733075019,
-      "grad_norm": 0.7030820250511169,
-      "learning_rate": 0.000557332184031893,
-      "loss": 3.9553,
+      "epoch": 0.7223719676549866,
+      "grad_norm": 0.5794790387153625,
+      "learning_rate": 0.0005572584997301672,
+      "loss": 3.9366,
       "step": 6700
     },
     {
-      "epoch": 0.7265095253471101,
-      "grad_norm": 0.640282928943634,
-      "learning_rate": 0.0005570089430018317,
-      "loss": 3.944,
+      "epoch": 0.7277628032345014,
+      "grad_norm": 0.6862756609916687,
+      "learning_rate": 0.0005569347004856989,
+      "loss": 3.94,
       "step": 6750
     },
     {
-      "epoch": 0.7318910773867183,
-      "grad_norm": 0.6724056005477905,
-      "learning_rate": 0.0005566857019717702,
-      "loss": 3.9515,
+      "epoch": 0.7331536388140162,
+      "grad_norm": 0.524553120136261,
+      "learning_rate": 0.0005566109012412303,
+      "loss": 3.9485,
       "step": 6800
     },
     {
-      "epoch": 0.7372726294263265,
-      "grad_norm": 0.5751603245735168,
-      "learning_rate": 0.0005563624609417089,
-      "loss": 3.955,
+      "epoch": 0.738544474393531,
+      "grad_norm": 0.5408486723899841,
+      "learning_rate": 0.000556287101996762,
+      "loss": 3.9322,
       "step": 6850
     },
     {
-      "epoch": 0.7426541814659348,
-      "grad_norm": 0.5616862177848816,
-      "learning_rate": 0.0005560456847322487,
-      "loss": 3.9535,
+      "epoch": 0.7439353099730458,
+      "grad_norm": 0.5416108965873718,
+      "learning_rate": 0.0005559633027522935,
+      "loss": 3.9263,
       "step": 6900
     },
     {
-      "epoch": 0.748035733505543,
-      "grad_norm": 0.5367492437362671,
-      "learning_rate": 0.0005557224437021872,
-      "loss": 3.9248,
+      "epoch": 0.7493261455525606,
+      "grad_norm": 0.6217844486236572,
+      "learning_rate": 0.0005556395035078251,
+      "loss": 3.9157,
       "step": 6950
     },
     {
-      "epoch": 0.7534172855451512,
-      "grad_norm": 0.5820096135139465,
-      "learning_rate": 0.0005553992026721258,
-      "loss": 3.9229,
+      "epoch": 0.7547169811320755,
+      "grad_norm": 0.5892395973205566,
+      "learning_rate": 0.000555322180248246,
+      "loss": 3.9291,
       "step": 7000
     },
     {
-      "epoch": 0.7534172855451512,
-      "eval_accuracy": 0.33421975980975316,
-      "eval_loss": 3.8591833114624023,
-      "eval_runtime": 185.366,
-      "eval_samples_per_second": 97.165,
-      "eval_steps_per_second": 6.074,
+      "epoch": 0.7547169811320755,
+      "eval_accuracy": 0.33351101702253927,
+      "eval_loss": 3.8597514629364014,
+      "eval_runtime": 183.7646,
+      "eval_samples_per_second": 98.011,
+      "eval_steps_per_second": 6.127,
       "step": 7000
     },
     {
-      "epoch": 0.7587988375847594,
-      "grad_norm": 0.6244399547576904,
-      "learning_rate": 0.0005550759616420644,
-      "loss": 3.9139,
+      "epoch": 0.7601078167115903,
+      "grad_norm": 0.5709213018417358,
+      "learning_rate": 0.0005549983810037776,
+      "loss": 3.932,
       "step": 7050
     },
     {
-      "epoch": 0.7641803896243676,
-      "grad_norm": 0.5721721649169922,
-      "learning_rate": 0.000554752720612003,
-      "loss": 3.9179,
+      "epoch": 0.7654986522911051,
+      "grad_norm": 0.5382469296455383,
+      "learning_rate": 0.0005546745817593091,
+      "loss": 3.9325,
       "step": 7100
     },
     {
-      "epoch": 0.7695619416639758,
-      "grad_norm": 0.5274028182029724,
-      "learning_rate": 0.0005544294795819415,
-      "loss": 3.939,
+      "epoch": 0.77088948787062,
+      "grad_norm": 0.7448933720588684,
+      "learning_rate": 0.0005543507825148408,
+      "loss": 3.9099,
       "step": 7150
     },
     {
-      "epoch": 0.7749434937035841,
-      "grad_norm": 0.6537976861000061,
-      "learning_rate": 0.0005541062385518801,
-      "loss": 3.9104,
+      "epoch": 0.7762803234501348,
+      "grad_norm": 0.6360498070716858,
+      "learning_rate": 0.0005540269832703723,
+      "loss": 3.9173,
       "step": 7200
     },
     {
-      "epoch": 0.7803250457431924,
-      "grad_norm": 0.5737007260322571,
-      "learning_rate": 0.0005537829975218188,
-      "loss": 3.8995,
+      "epoch": 0.7816711590296496,
+      "grad_norm": 0.6259989738464355,
+      "learning_rate": 0.0005537031840259039,
+      "loss": 3.9175,
       "step": 7250
     },
     {
-      "epoch": 0.7857065977828006,
-      "grad_norm": 0.655914306640625,
-      "learning_rate": 0.0005534597564917573,
-      "loss": 3.9133,
+      "epoch": 0.7870619946091644,
+      "grad_norm": 0.6026500463485718,
+      "learning_rate": 0.0005533793847814354,
+      "loss": 3.9054,
       "step": 7300
     },
     {
-      "epoch": 0.7910881498224088,
-      "grad_norm": 0.5855588912963867,
-      "learning_rate": 0.0005531365154616959,
-      "loss": 3.8965,
+      "epoch": 0.7924528301886793,
+      "grad_norm": 0.587363600730896,
+      "learning_rate": 0.000553055585536967,
+      "loss": 3.9147,
       "step": 7350
     },
     {
-      "epoch": 0.796469701862017,
-      "grad_norm": 0.5390224456787109,
-      "learning_rate": 0.0005528132744316344,
-      "loss": 3.9107,
+      "epoch": 0.7978436657681941,
+      "grad_norm": 0.647449791431427,
+      "learning_rate": 0.0005527317862924987,
+      "loss": 3.8796,
       "step": 7400
     },
     {
-      "epoch": 0.8018512539016253,
-      "grad_norm": 0.739325225353241,
-      "learning_rate": 0.0005524900334015731,
-      "loss": 3.9033,
+      "epoch": 0.8032345013477089,
+      "grad_norm": 0.587364137172699,
+      "learning_rate": 0.0005524079870480301,
+      "loss": 3.8893,
       "step": 7450
     },
     {
-      "epoch": 0.8072328059412335,
-      "grad_norm": 0.5827202796936035,
-      "learning_rate": 0.0005521667923715117,
-      "loss": 3.9216,
+      "epoch": 0.8086253369272237,
+      "grad_norm": 0.7309133410453796,
+      "learning_rate": 0.0005520841878035618,
+      "loss": 3.9045,
       "step": 7500
     },
     {
-      "epoch": 0.8126143579808417,
-      "grad_norm": 0.5299261212348938,
-      "learning_rate": 0.0005518435513414502,
-      "loss": 3.9062,
+      "epoch": 0.8140161725067385,
+      "grad_norm": 0.6132270693778992,
+      "learning_rate": 0.0005517603885590933,
+      "loss": 3.8994,
       "step": 7550
     },
     {
-      "epoch": 0.8179959100204499,
-      "grad_norm": 0.7677913308143616,
-      "learning_rate": 0.0005515203103113888,
-      "loss": 3.8891,
+      "epoch": 0.8194070080862533,
+      "grad_norm": 0.5545886158943176,
+      "learning_rate": 0.0005514365893146249,
+      "loss": 3.917,
       "step": 7600
     },
     {
-      "epoch": 0.8233774620600581,
-      "grad_norm": 0.5630691647529602,
-      "learning_rate": 0.0005511970692813274,
-      "loss": 3.8984,
+      "epoch": 0.8247978436657682,
+      "grad_norm": 0.6114673614501953,
+      "learning_rate": 0.0005511127900701564,
+      "loss": 3.9014,
       "step": 7650
     },
     {
-      "epoch": 0.8287590140996663,
-      "grad_norm": 0.6013116240501404,
-      "learning_rate": 0.000550873828251266,
-      "loss": 3.8819,
+      "epoch": 0.8301886792452831,
+      "grad_norm": 0.5956445336341858,
+      "learning_rate": 0.000550788990825688,
+      "loss": 3.8704,
       "step": 7700
     },
     {
-      "epoch": 0.8341405661392746,
-      "grad_norm": 0.6272456049919128,
-      "learning_rate": 0.0005505505872212045,
-      "loss": 3.8984,
+      "epoch": 0.8355795148247979,
+      "grad_norm": 0.6900692582130432,
+      "learning_rate": 0.0005504651915812196,
+      "loss": 3.8857,
       "step": 7750
     },
     {
-      "epoch": 0.8395221181788828,
-      "grad_norm": 0.6356287002563477,
-      "learning_rate": 0.0005502273461911432,
-      "loss": 3.9032,
+      "epoch": 0.8409703504043127,
+      "grad_norm": 0.5441706776618958,
+      "learning_rate": 0.0005501413923367512,
+      "loss": 3.9005,
       "step": 7800
     },
     {
-      "epoch": 0.844903670218491,
-      "grad_norm": 0.650209367275238,
-      "learning_rate": 0.0005499041051610817,
-      "loss": 3.8769,
+      "epoch": 0.8463611859838275,
+      "grad_norm": 0.7785384654998779,
+      "learning_rate": 0.0005498175930922827,
+      "loss": 3.8865,
       "step": 7850
     },
     {
-      "epoch": 0.8502852222580992,
-      "grad_norm": 0.5765166878700256,
-      "learning_rate": 0.0005495808641310204,
-      "loss": 3.8887,
+      "epoch": 0.8517520215633423,
+      "grad_norm": 0.679541289806366,
+      "learning_rate": 0.0005494937938478143,
+      "loss": 3.8872,
       "step": 7900
     },
     {
-      "epoch": 0.8556667742977074,
-      "grad_norm": 0.6685440540313721,
-      "learning_rate": 0.0005492576231009589,
-      "loss": 3.9028,
+      "epoch": 0.8571428571428571,
+      "grad_norm": 0.5554518103599548,
+      "learning_rate": 0.0005491699946033459,
+      "loss": 3.8874,
       "step": 7950
     },
     {
-      "epoch": 0.8610483263373157,
-      "grad_norm": 0.5335162281990051,
-      "learning_rate": 0.0005489343820708974,
-      "loss": 3.8881,
+      "epoch": 0.862533692722372,
+      "grad_norm": 0.5628073215484619,
+      "learning_rate": 0.0005488461953588775,
+      "loss": 3.8652,
       "step": 8000
     },
     {
-      "epoch": 0.8610483263373157,
-      "eval_accuracy": 0.3379374270192134,
-      "eval_loss": 3.8148393630981445,
-      "eval_runtime": 184.805,
-      "eval_samples_per_second": 97.459,
-      "eval_steps_per_second": 6.093,
+      "epoch": 0.862533692722372,
+      "eval_accuracy": 0.337994469785316,
+      "eval_loss": 3.814110040664673,
+      "eval_runtime": 183.5196,
+      "eval_samples_per_second": 98.142,
+      "eval_steps_per_second": 6.136,
       "step": 8000
     },
     {
-      "epoch": 0.8664298783769239,
-      "grad_norm": 0.5469226241111755,
-      "learning_rate": 0.0005486111410408361,
-      "loss": 3.8857,
+      "epoch": 0.8679245283018868,
+      "grad_norm": 0.5652970671653748,
+      "learning_rate": 0.000548522396114409,
+      "loss": 3.8841,
       "step": 8050
     },
     {
-      "epoch": 0.8718114304165321,
-      "grad_norm": 0.5885823369026184,
-      "learning_rate": 0.0005482879000107746,
-      "loss": 3.8642,
+      "epoch": 0.8733153638814016,
+      "grad_norm": 0.7923115491867065,
+      "learning_rate": 0.0005481985968699406,
+      "loss": 3.8611,
       "step": 8100
     },
     {
-      "epoch": 0.8771929824561403,
-      "grad_norm": 0.5075781941413879,
-      "learning_rate": 0.0005479646589807133,
-      "loss": 3.8996,
+      "epoch": 0.8787061994609164,
+      "grad_norm": 0.5407063364982605,
+      "learning_rate": 0.0005478747976254721,
+      "loss": 3.8725,
       "step": 8150
     },
     {
-      "epoch": 0.8825745344957485,
-      "grad_norm": 0.5800351500511169,
-      "learning_rate": 0.0005476414179506518,
-      "loss": 3.8807,
+      "epoch": 0.8840970350404312,
+      "grad_norm": 0.5948967933654785,
+      "learning_rate": 0.0005475509983810037,
+      "loss": 3.8625,
       "step": 8200
     },
     {
-      "epoch": 0.8879560865353568,
-      "grad_norm": 0.5522460341453552,
-      "learning_rate": 0.0005473181769205904,
-      "loss": 3.8747,
+      "epoch": 0.889487870619946,
+      "grad_norm": 0.6480773091316223,
+      "learning_rate": 0.0005472271991365352,
+      "loss": 3.8802,
       "step": 8250
     },
     {
-      "epoch": 0.8933376385749651,
-      "grad_norm": 0.5287204384803772,
-      "learning_rate": 0.000546994935890529,
-      "loss": 3.861,
+      "epoch": 0.894878706199461,
+      "grad_norm": 0.5645371675491333,
+      "learning_rate": 0.0005469033998920669,
+      "loss": 3.852,
       "step": 8300
     },
     {
-      "epoch": 0.8987191906145733,
-      "grad_norm": 0.5514733791351318,
-      "learning_rate": 0.0005466716948604677,
-      "loss": 3.8641,
+      "epoch": 0.9002695417789758,
+      "grad_norm": 0.5577579140663147,
+      "learning_rate": 0.0005465796006475984,
+      "loss": 3.8686,
       "step": 8350
     },
     {
-      "epoch": 0.9041007426541815,
-      "grad_norm": 0.504178524017334,
-      "learning_rate": 0.0005463484538304062,
-      "loss": 3.862,
+      "epoch": 0.9056603773584906,
+      "grad_norm": 0.5483347773551941,
+      "learning_rate": 0.00054625580140313,
+      "loss": 3.8684,
       "step": 8400
     },
     {
-      "epoch": 0.9094822946937897,
-      "grad_norm": 0.5593713521957397,
-      "learning_rate": 0.0005460252128003447,
-      "loss": 3.8712,
+      "epoch": 0.9110512129380054,
+      "grad_norm": 0.6329666972160339,
+      "learning_rate": 0.0005459320021586615,
+      "loss": 3.8403,
       "step": 8450
     },
     {
-      "epoch": 0.9148638467333979,
-      "grad_norm": 0.6000622510910034,
-      "learning_rate": 0.0005457019717702833,
-      "loss": 3.8649,
+      "epoch": 0.9164420485175202,
+      "grad_norm": 0.5311110019683838,
+      "learning_rate": 0.0005456082029141932,
+      "loss": 3.852,
       "step": 8500
     },
     {
-      "epoch": 0.9202453987730062,
-      "grad_norm": 0.5310674905776978,
-      "learning_rate": 0.0005453787307402219,
-      "loss": 3.8597,
+      "epoch": 0.921832884097035,
+      "grad_norm": 0.5692155957221985,
+      "learning_rate": 0.0005452844036697248,
+      "loss": 3.8527,
       "step": 8550
     },
     {
-      "epoch": 0.9256269508126144,
-      "grad_norm": 0.5901429057121277,
-      "learning_rate": 0.0005450554897101605,
-      "loss": 3.8501,
+      "epoch": 0.9272237196765498,
+      "grad_norm": 0.6471691131591797,
+      "learning_rate": 0.0005449606044252563,
+      "loss": 3.8798,
       "step": 8600
     },
     {
-      "epoch": 0.9310085028522226,
-      "grad_norm": 0.5769229531288147,
-      "learning_rate": 0.0005447322486800991,
-      "loss": 3.8616,
+      "epoch": 0.9326145552560647,
+      "grad_norm": 0.6338027119636536,
+      "learning_rate": 0.0005446368051807879,
+      "loss": 3.8554,
       "step": 8650
     },
     {
-      "epoch": 0.9363900548918308,
-      "grad_norm": 0.5501662492752075,
-      "learning_rate": 0.0005444090076500377,
-      "loss": 3.8685,
+      "epoch": 0.9380053908355795,
+      "grad_norm": 0.593163013458252,
+      "learning_rate": 0.0005443130059363194,
+      "loss": 3.863,
       "step": 8700
     },
     {
-      "epoch": 0.941771606931439,
-      "grad_norm": 0.5846860408782959,
-      "learning_rate": 0.0005440857666199763,
-      "loss": 3.8416,
+      "epoch": 0.9433962264150944,
+      "grad_norm": 0.5378130674362183,
+      "learning_rate": 0.0005439892066918511,
+      "loss": 3.845,
       "step": 8750
     },
     {
-      "epoch": 0.9471531589710472,
-      "grad_norm": 0.5883833765983582,
-      "learning_rate": 0.0005437625255899148,
-      "loss": 3.8386,
+      "epoch": 0.9487870619946092,
+      "grad_norm": 0.6089158654212952,
+      "learning_rate": 0.0005436654074473825,
+      "loss": 3.8478,
       "step": 8800
     },
     {
-      "epoch": 0.9525347110106555,
-      "grad_norm": 0.5724109411239624,
-      "learning_rate": 0.0005434392845598534,
-      "loss": 3.8617,
+      "epoch": 0.954177897574124,
+      "grad_norm": 0.5586256980895996,
+      "learning_rate": 0.0005433416082029142,
+      "loss": 3.8464,
       "step": 8850
     },
     {
-      "epoch": 0.9579162630502637,
-      "grad_norm": 0.5604748129844666,
-      "learning_rate": 0.000543116043529792,
-      "loss": 3.8494,
+      "epoch": 0.9595687331536388,
+      "grad_norm": 0.6633445620536804,
+      "learning_rate": 0.0005430178089584457,
+      "loss": 3.8466,
       "step": 8900
     },
     {
-      "epoch": 0.9632978150898719,
-      "grad_norm": 0.6778799295425415,
-      "learning_rate": 0.0005427928024997306,
-      "loss": 3.85,
+      "epoch": 0.9649595687331537,
+      "grad_norm": 0.6361255049705505,
+      "learning_rate": 0.0005426940097139773,
+      "loss": 3.8484,
       "step": 8950
     },
     {
-      "epoch": 0.9686793671294801,
-      "grad_norm": 0.5963719487190247,
-      "learning_rate": 0.0005424695614696692,
-      "loss": 3.8573,
+      "epoch": 0.9703504043126685,
+      "grad_norm": 0.6028106808662415,
+      "learning_rate": 0.0005423702104695088,
+      "loss": 3.8655,
       "step": 9000
     },
     {
-      "epoch": 0.9686793671294801,
-      "eval_accuracy": 0.34179547375973,
-      "eval_loss": 3.7812108993530273,
-      "eval_runtime": 184.5862,
-      "eval_samples_per_second": 97.575,
-      "eval_steps_per_second": 6.1,
+      "epoch": 0.9703504043126685,
+      "eval_accuracy": 0.3412723101049033,
+      "eval_loss": 3.778677463531494,
+      "eval_runtime": 183.8289,
+      "eval_samples_per_second": 97.977,
+      "eval_steps_per_second": 6.125,
       "step": 9000
     },
     {
-      "epoch": 0.9740609191690883,
-      "grad_norm": 0.5469352006912231,
-      "learning_rate": 0.0005421463204396078,
-      "loss": 3.8486,
+      "epoch": 0.9757412398921833,
+      "grad_norm": 0.6262213587760925,
+      "learning_rate": 0.0005420528872099298,
+      "loss": 3.8415,
       "step": 9050
     },
     {
-      "epoch": 0.9794424712086965,
-      "grad_norm": 0.6023728251457214,
-      "learning_rate": 0.0005418230794095463,
-      "loss": 3.8536,
+      "epoch": 0.9811320754716981,
+      "grad_norm": 0.5979717373847961,
+      "learning_rate": 0.0005417290879654613,
+      "loss": 3.8327,
       "step": 9100
     },
     {
-      "epoch": 0.9848240232483048,
-      "grad_norm": 0.5906918048858643,
-      "learning_rate": 0.000541499838379485,
-      "loss": 3.8355,
+      "epoch": 0.9865229110512129,
+      "grad_norm": 0.5795934200286865,
+      "learning_rate": 0.000541405288720993,
+      "loss": 3.845,
       "step": 9150
     },
     {
-      "epoch": 0.9902055752879131,
-      "grad_norm": 0.5566977858543396,
-      "learning_rate": 0.0005411765973494235,
-      "loss": 3.8161,
+      "epoch": 0.9919137466307277,
+      "grad_norm": 0.5323654413223267,
+      "learning_rate": 0.0005410814894765245,
+      "loss": 3.8479,
       "step": 9200
     },
     {
-      "epoch": 0.9955871273275213,
-      "grad_norm": 0.6707938313484192,
-      "learning_rate": 0.0005408533563193621,
-      "loss": 3.8442,
+      "epoch": 0.9973045822102425,
+      "grad_norm": 0.5347334146499634,
+      "learning_rate": 0.0005407576902320561,
+      "loss": 3.8378,
       "step": 9250
     },
     {
-      "epoch": 1.0009686793671295,
-      "grad_norm": 0.5935998558998108,
-      "learning_rate": 0.0005405301152893007,
-      "loss": 3.8425,
+      "epoch": 1.0026954177897573,
+      "grad_norm": 0.5488175749778748,
+      "learning_rate": 0.0005404338909875876,
+      "loss": 3.8159,
       "step": 9300
     },
     {
-      "epoch": 1.0063502314067376,
-      "grad_norm": 0.7090808153152466,
-      "learning_rate": 0.0005402068742592392,
-      "loss": 3.7753,
+      "epoch": 1.0080862533692723,
+      "grad_norm": 0.5864335298538208,
+      "learning_rate": 0.0005401100917431192,
+      "loss": 3.7809,
       "step": 9350
     },
     {
-      "epoch": 1.011731783446346,
-      "grad_norm": 0.5787277221679688,
-      "learning_rate": 0.0005398836332291779,
-      "loss": 3.7816,
+      "epoch": 1.013477088948787,
+      "grad_norm": 0.5702503323554993,
+      "learning_rate": 0.0005397862924986508,
+      "loss": 3.79,
       "step": 9400
     },
     {
-      "epoch": 1.017113335485954,
-      "grad_norm": 0.5593772530555725,
-      "learning_rate": 0.0005395603921991164,
-      "loss": 3.7644,
+      "epoch": 1.0188679245283019,
+      "grad_norm": 0.549822986125946,
+      "learning_rate": 0.0005394624932541824,
+      "loss": 3.7797,
       "step": 9450
     },
     {
-      "epoch": 1.0224948875255624,
-      "grad_norm": 0.616612434387207,
-      "learning_rate": 0.0005392371511690551,
-      "loss": 3.7622,
+      "epoch": 1.0242587601078168,
+      "grad_norm": 0.5751377940177917,
+      "learning_rate": 0.0005391386940097139,
+      "loss": 3.7775,
       "step": 9500
     },
     {
-      "epoch": 1.0278764395651705,
-      "grad_norm": 0.6132011413574219,
-      "learning_rate": 0.0005389139101389936,
-      "loss": 3.7698,
+      "epoch": 1.0296495956873315,
+      "grad_norm": 0.5424072742462158,
+      "learning_rate": 0.0005388148947652455,
+      "loss": 3.7885,
       "step": 9550
     },
     {
-      "epoch": 1.0332579916047788,
-      "grad_norm": 0.583960771560669,
-      "learning_rate": 0.0005385906691089321,
-      "loss": 3.7711,
+      "epoch": 1.0350404312668464,
+      "grad_norm": 0.6139684915542603,
+      "learning_rate": 0.000538491095520777,
+      "loss": 3.7689,
       "step": 9600
     },
     {
-      "epoch": 1.0386395436443872,
-      "grad_norm": 0.5626878142356873,
-      "learning_rate": 0.0005382674280788708,
-      "loss": 3.7675,
+      "epoch": 1.0404312668463611,
+      "grad_norm": 0.507134199142456,
+      "learning_rate": 0.0005381672962763086,
+      "loss": 3.7627,
       "step": 9650
     },
     {
-      "epoch": 1.0440210956839953,
-      "grad_norm": 0.6028419733047485,
-      "learning_rate": 0.0005379441870488093,
-      "loss": 3.7545,
+      "epoch": 1.045822102425876,
+      "grad_norm": 0.5824893712997437,
+      "learning_rate": 0.0005378434970318403,
+      "loss": 3.7732,
       "step": 9700
     },
     {
-      "epoch": 1.0494026477236036,
-      "grad_norm": 0.5416905283927917,
-      "learning_rate": 0.0005376209460187479,
-      "loss": 3.7607,
+      "epoch": 1.0512129380053907,
+      "grad_norm": 0.6038205027580261,
+      "learning_rate": 0.0005375196977873718,
+      "loss": 3.766,
       "step": 9750
     },
     {
-      "epoch": 1.0547841997632117,
-      "grad_norm": 0.571316123008728,
-      "learning_rate": 0.0005372977049886865,
-      "loss": 3.7879,
+      "epoch": 1.0566037735849056,
+      "grad_norm": 0.5839976668357849,
+      "learning_rate": 0.0005371958985429034,
+      "loss": 3.7708,
       "step": 9800
     },
     {
-      "epoch": 1.06016575180282,
-      "grad_norm": 0.6111052632331848,
-      "learning_rate": 0.0005369744639586251,
-      "loss": 3.7666,
+      "epoch": 1.0619946091644206,
+      "grad_norm": 0.6339398622512817,
+      "learning_rate": 0.0005368720992984349,
+      "loss": 3.7547,
       "step": 9850
     },
     {
-      "epoch": 1.0655473038424281,
-      "grad_norm": 0.7614707350730896,
-      "learning_rate": 0.0005366512229285637,
-      "loss": 3.7689,
+      "epoch": 1.0673854447439353,
+      "grad_norm": 0.6098514199256897,
+      "learning_rate": 0.0005365483000539665,
+      "loss": 3.7632,
       "step": 9900
     },
     {
-      "epoch": 1.0709288558820365,
-      "grad_norm": 0.541038990020752,
-      "learning_rate": 0.0005363279818985022,
-      "loss": 3.756,
+      "epoch": 1.0727762803234502,
+      "grad_norm": 0.6129767298698425,
+      "learning_rate": 0.0005362245008094981,
+      "loss": 3.7648,
       "step": 9950
     },
     {
-      "epoch": 1.0763104079216446,
-      "grad_norm": 0.5469908714294434,
-      "learning_rate": 0.0005360047408684408,
-      "loss": 3.7641,
+      "epoch": 1.0781671159029649,
+      "grad_norm": 0.6090075969696045,
+      "learning_rate": 0.0005359007015650297,
+      "loss": 3.7691,
       "step": 10000
     },
     {
-      "epoch": 1.0763104079216446,
-      "eval_accuracy": 0.3449987781982765,
-      "eval_loss": 3.7476563453674316,
-      "eval_runtime": 184.5322,
-      "eval_samples_per_second": 97.604,
-      "eval_steps_per_second": 6.102,
+      "epoch": 1.0781671159029649,
+      "eval_accuracy": 0.3452738873102228,
+      "eval_loss": 3.7464487552642822,
+      "eval_runtime": 183.622,
+      "eval_samples_per_second": 98.087,
+      "eval_steps_per_second": 6.132,
       "step": 10000
     },
     {
-      "epoch": 1.081691959961253,
-      "grad_norm": 0.5756956338882446,
-      "learning_rate": 0.0005356814998383794,
-      "loss": 3.7808,
+      "epoch": 1.0835579514824798,
+      "grad_norm": 0.5005014538764954,
+      "learning_rate": 0.0005355769023205612,
+      "loss": 3.7581,
       "step": 10050
     },
     {
-      "epoch": 1.087073512000861,
-      "grad_norm": 0.5653090476989746,
-      "learning_rate": 0.0005353582588083181,
-      "loss": 3.7728,
+      "epoch": 1.0889487870619945,
+      "grad_norm": 0.6306165456771851,
+      "learning_rate": 0.0005352531030760928,
+      "loss": 3.7484,
       "step": 10100
     },
     {
-      "epoch": 1.0924550640404693,
-      "grad_norm": 0.5604647994041443,
-      "learning_rate": 0.0005350350177782566,
-      "loss": 3.7572,
+      "epoch": 1.0943396226415094,
+      "grad_norm": 0.5631089806556702,
+      "learning_rate": 0.0005349293038316244,
+      "loss": 3.7815,
       "step": 10150
     },
     {
-      "epoch": 1.0978366160800774,
-      "grad_norm": 0.6427847743034363,
-      "learning_rate": 0.0005347117767481952,
-      "loss": 3.7671,
+      "epoch": 1.0997304582210243,
+      "grad_norm": 0.57806396484375,
+      "learning_rate": 0.0005346055045871559,
+      "loss": 3.7813,
       "step": 10200
     },
     {
-      "epoch": 1.1032181681196858,
-      "grad_norm": 0.5688785910606384,
-      "learning_rate": 0.000534395000538735,
-      "loss": 3.7718,
+      "epoch": 1.105121293800539,
+      "grad_norm": 0.5878326296806335,
+      "learning_rate": 0.0005342817053426874,
+      "loss": 3.7473,
       "step": 10250
     },
     {
-      "epoch": 1.1085997201592939,
-      "grad_norm": 0.6266538500785828,
-      "learning_rate": 0.0005340717595086736,
-      "loss": 3.7582,
+      "epoch": 1.110512129380054,
+      "grad_norm": 0.6222975850105286,
+      "learning_rate": 0.0005339579060982191,
+      "loss": 3.7746,
       "step": 10300
     },
     {
-      "epoch": 1.1139812721989022,
-      "grad_norm": 0.6398711800575256,
-      "learning_rate": 0.0005337485184786122,
-      "loss": 3.7626,
+      "epoch": 1.1159029649595686,
+      "grad_norm": 0.5526645183563232,
+      "learning_rate": 0.0005336341068537506,
+      "loss": 3.7748,
       "step": 10350
     },
     {
-      "epoch": 1.1193628242385103,
-      "grad_norm": 0.5167144536972046,
-      "learning_rate": 0.0005334252774485507,
-      "loss": 3.7692,
+      "epoch": 1.1212938005390836,
+      "grad_norm": 0.5977159142494202,
+      "learning_rate": 0.0005333103076092822,
+      "loss": 3.759,
       "step": 10400
     },
     {
-      "epoch": 1.1247443762781186,
-      "grad_norm": 0.5785194039344788,
-      "learning_rate": 0.0005331020364184894,
-      "loss": 3.7633,
+      "epoch": 1.1266846361185983,
+      "grad_norm": 0.5398628115653992,
+      "learning_rate": 0.0005329865083648137,
+      "loss": 3.7247,
       "step": 10450
     },
     {
-      "epoch": 1.1301259283177267,
-      "grad_norm": 0.5897461175918579,
-      "learning_rate": 0.0005327787953884279,
-      "loss": 3.7521,
+      "epoch": 1.1320754716981132,
+      "grad_norm": 0.5337214469909668,
+      "learning_rate": 0.0005326627091203454,
+      "loss": 3.7499,
       "step": 10500
     },
     {
-      "epoch": 1.135507480357335,
-      "grad_norm": 0.5663477182388306,
-      "learning_rate": 0.0005324555543583665,
-      "loss": 3.7543,
+      "epoch": 1.137466307277628,
+      "grad_norm": 0.5868701338768005,
+      "learning_rate": 0.0005323389098758769,
+      "loss": 3.7506,
       "step": 10550
     },
     {
-      "epoch": 1.1408890323969434,
-      "grad_norm": 0.565779983997345,
-      "learning_rate": 0.0005321323133283051,
-      "loss": 3.7576,
+      "epoch": 1.1428571428571428,
+      "grad_norm": 0.6092649698257446,
+      "learning_rate": 0.0005320151106314085,
+      "loss": 3.7578,
       "step": 10600
     },
     {
-      "epoch": 1.1462705844365515,
-      "grad_norm": 0.5521111488342285,
-      "learning_rate": 0.0005318090722982436,
-      "loss": 3.7509,
+      "epoch": 1.1482479784366577,
+      "grad_norm": 0.5305042862892151,
+      "learning_rate": 0.00053169131138694,
+      "loss": 3.7657,
       "step": 10650
     },
     {
-      "epoch": 1.1516521364761596,
-      "grad_norm": 0.5536676645278931,
-      "learning_rate": 0.0005314858312681823,
-      "loss": 3.7664,
+      "epoch": 1.1536388140161726,
+      "grad_norm": 0.5675718784332275,
+      "learning_rate": 0.0005313675121424716,
+      "loss": 3.7481,
       "step": 10700
     },
     {
-      "epoch": 1.157033688515768,
-      "grad_norm": 0.5922389030456543,
-      "learning_rate": 0.0005311625902381209,
-      "loss": 3.7486,
+      "epoch": 1.1590296495956873,
+      "grad_norm": 0.601825475692749,
+      "learning_rate": 0.0005310437128980032,
+      "loss": 3.7453,
       "step": 10750
     },
     {
-      "epoch": 1.1624152405553763,
-      "grad_norm": 0.5422095656394958,
-      "learning_rate": 0.0005308393492080595,
-      "loss": 3.7454,
+      "epoch": 1.1644204851752022,
+      "grad_norm": 0.5961639881134033,
+      "learning_rate": 0.0005307199136535348,
+      "loss": 3.7255,
       "step": 10800
     },
     {
-      "epoch": 1.1677967925949844,
-      "grad_norm": 0.5556425452232361,
-      "learning_rate": 0.000530516108177998,
-      "loss": 3.7493,
+      "epoch": 1.169811320754717,
+      "grad_norm": 0.5785391330718994,
+      "learning_rate": 0.0005303961144090663,
+      "loss": 3.7645,
       "step": 10850
     },
     {
-      "epoch": 1.1731783446345927,
-      "grad_norm": 0.589709997177124,
-      "learning_rate": 0.0005301928671479365,
-      "loss": 3.7442,
+      "epoch": 1.1752021563342319,
+      "grad_norm": 0.5923433303833008,
+      "learning_rate": 0.0005300723151645979,
+      "loss": 3.7614,
       "step": 10900
     },
     {
-      "epoch": 1.1785598966742008,
-      "grad_norm": 0.6388732194900513,
-      "learning_rate": 0.0005298696261178752,
-      "loss": 3.7656,
+      "epoch": 1.1805929919137466,
+      "grad_norm": 0.610974133014679,
+      "learning_rate": 0.0005297485159201295,
+      "loss": 3.7204,
       "step": 10950
     },
     {
-      "epoch": 1.1839414487138091,
-      "grad_norm": 0.6585466265678406,
-      "learning_rate": 0.0005295463850878138,
-      "loss": 3.754,
+      "epoch": 1.1859838274932615,
+      "grad_norm": 0.537177562713623,
+      "learning_rate": 0.000529424716675661,
+      "loss": 3.7393,
       "step": 11000
     },
     {
-      "epoch": 1.1839414487138091,
-      "eval_accuracy": 0.3477161869225167,
-      "eval_loss": 3.722252130508423,
-      "eval_runtime": 184.2851,
-      "eval_samples_per_second": 97.734,
-      "eval_steps_per_second": 6.11,
+      "epoch": 1.1859838274932615,
+      "eval_accuracy": 0.34715510340984274,
+      "eval_loss": 3.7238121032714844,
+      "eval_runtime": 183.3902,
+      "eval_samples_per_second": 98.211,
+      "eval_steps_per_second": 6.14,
       "step": 11000
     },
     {
-      "epoch": 1.1893230007534172,
-      "grad_norm": 0.5404320359230042,
-      "learning_rate": 0.0005292231440577524,
-      "loss": 3.7624,
+      "epoch": 1.1913746630727764,
+      "grad_norm": 0.5162882208824158,
+      "learning_rate": 0.000529107393416082,
+      "loss": 3.749,
       "step": 11050
     },
     {
-      "epoch": 1.1947045527930256,
-      "grad_norm": 0.5776370167732239,
-      "learning_rate": 0.0005288999030276909,
-      "loss": 3.7492,
+      "epoch": 1.196765498652291,
+      "grad_norm": 0.5799063444137573,
+      "learning_rate": 0.0005287835941716135,
+      "loss": 3.7425,
       "step": 11100
     },
     {
-      "epoch": 1.2000861048326337,
-      "grad_norm": 0.6025657653808594,
-      "learning_rate": 0.0005285766619976295,
-      "loss": 3.7407,
+      "epoch": 1.202156334231806,
+      "grad_norm": 0.5635416507720947,
+      "learning_rate": 0.0005284597949271452,
+      "loss": 3.7357,
       "step": 11150
     },
     {
-      "epoch": 1.205467656872242,
-      "grad_norm": 0.5973086357116699,
-      "learning_rate": 0.0005282534209675681,
-      "loss": 3.736,
+      "epoch": 1.2075471698113207,
+      "grad_norm": 0.6384708881378174,
+      "learning_rate": 0.0005281359956826767,
+      "loss": 3.7503,
       "step": 11200
     },
     {
-      "epoch": 1.21084920891185,
-      "grad_norm": 0.5211036801338196,
-      "learning_rate": 0.0005279301799375066,
-      "loss": 3.7491,
+      "epoch": 1.2129380053908356,
+      "grad_norm": 0.5484032034873962,
+      "learning_rate": 0.0005278121964382083,
+      "loss": 3.7588,
       "step": 11250
     },
     {
-      "epoch": 1.2162307609514584,
-      "grad_norm": 0.5768703818321228,
-      "learning_rate": 0.0005276069389074453,
-      "loss": 3.7408,
+      "epoch": 1.2183288409703503,
+      "grad_norm": 0.6434171795845032,
+      "learning_rate": 0.0005274883971937398,
+      "loss": 3.7291,
       "step": 11300
     },
     {
-      "epoch": 1.2216123129910665,
-      "grad_norm": 0.5568981766700745,
-      "learning_rate": 0.0005272836978773838,
-      "loss": 3.7406,
+      "epoch": 1.2237196765498652,
+      "grad_norm": 0.5906100869178772,
+      "learning_rate": 0.0005271645979492714,
+      "loss": 3.7504,
       "step": 11350
     },
     {
-      "epoch": 1.2269938650306749,
-      "grad_norm": 0.5531343221664429,
-      "learning_rate": 0.0005269604568473225,
-      "loss": 3.7345,
+      "epoch": 1.2291105121293802,
+      "grad_norm": 0.58035808801651,
+      "learning_rate": 0.000526840798704803,
+      "loss": 3.7506,
       "step": 11400
     },
     {
-      "epoch": 1.232375417070283,
-      "grad_norm": 0.6032938957214355,
-      "learning_rate": 0.000526637215817261,
-      "loss": 3.7292,
+      "epoch": 1.2345013477088949,
+      "grad_norm": 0.5390593409538269,
+      "learning_rate": 0.0005265169994603346,
+      "loss": 3.7528,
       "step": 11450
     },
     {
-      "epoch": 1.2377569691098913,
-      "grad_norm": 0.5232219099998474,
-      "learning_rate": 0.0005263139747871996,
-      "loss": 3.7334,
+      "epoch": 1.2398921832884098,
+      "grad_norm": 0.5824751257896423,
+      "learning_rate": 0.0005261932002158661,
+      "loss": 3.7185,
       "step": 11500
     },
     {
-      "epoch": 1.2431385211494996,
-      "grad_norm": 0.5911790728569031,
-      "learning_rate": 0.0005259907337571381,
-      "loss": 3.7533,
+      "epoch": 1.2452830188679245,
+      "grad_norm": 0.5806322693824768,
+      "learning_rate": 0.0005258694009713977,
+      "loss": 3.7404,
       "step": 11550
     },
     {
-      "epoch": 1.2485200731891077,
-      "grad_norm": 0.5355823636054993,
-      "learning_rate": 0.000525673957547678,
-      "loss": 3.7246,
+      "epoch": 1.2506738544474394,
+      "grad_norm": 0.5735841989517212,
+      "learning_rate": 0.0005255456017269292,
+      "loss": 3.7371,
       "step": 11600
     },
     {
-      "epoch": 1.2539016252287158,
-      "grad_norm": 0.4860752522945404,
-      "learning_rate": 0.0005253507165176167,
-      "loss": 3.752,
+      "epoch": 1.256064690026954,
+      "grad_norm": 0.5399126410484314,
+      "learning_rate": 0.0005252218024824608,
+      "loss": 3.7348,
       "step": 11650
     },
     {
-      "epoch": 1.2592831772683242,
-      "grad_norm": 0.5607491731643677,
-      "learning_rate": 0.0005250274754875552,
-      "loss": 3.731,
+      "epoch": 1.261455525606469,
+      "grad_norm": 0.581797182559967,
+      "learning_rate": 0.0005248980032379924,
+      "loss": 3.7357,
       "step": 11700
     },
     {
-      "epoch": 1.2646647293079325,
-      "grad_norm": 0.5975137948989868,
-      "learning_rate": 0.0005247042344574938,
-      "loss": 3.7511,
+      "epoch": 1.266846361185984,
+      "grad_norm": 0.6158117651939392,
+      "learning_rate": 0.000524574203993524,
+      "loss": 3.7435,
       "step": 11750
     },
     {
-      "epoch": 1.2700462813475406,
-      "grad_norm": 0.5998445749282837,
-      "learning_rate": 0.0005243809934274323,
-      "loss": 3.7465,
+      "epoch": 1.2722371967654986,
+      "grad_norm": 0.5718212723731995,
+      "learning_rate": 0.0005242504047490555,
+      "loss": 3.7316,
       "step": 11800
     },
     {
-      "epoch": 1.275427833387149,
-      "grad_norm": 0.6529081463813782,
-      "learning_rate": 0.0005240577523973709,
-      "loss": 3.7262,
+      "epoch": 1.2776280323450135,
+      "grad_norm": 0.5981943011283875,
+      "learning_rate": 0.0005239266055045871,
+      "loss": 3.7131,
       "step": 11850
     },
     {
-      "epoch": 1.280809385426757,
-      "grad_norm": 0.5797929167747498,
-      "learning_rate": 0.0005237345113673095,
-      "loss": 3.7591,
+      "epoch": 1.2830188679245282,
+      "grad_norm": 0.48539048433303833,
+      "learning_rate": 0.0005236028062601186,
+      "loss": 3.7193,
       "step": 11900
     },
     {
-      "epoch": 1.2861909374663654,
-      "grad_norm": 0.5823287963867188,
-      "learning_rate": 0.0005234112703372481,
-      "loss": 3.722,
+      "epoch": 1.2884097035040432,
+      "grad_norm": 0.5573267936706543,
+      "learning_rate": 0.0005232790070156503,
+      "loss": 3.7284,
       "step": 11950
     },
     {
-      "epoch": 1.2915724895059735,
-      "grad_norm": 0.7930464148521423,
-      "learning_rate": 0.0005230880293071867,
-      "loss": 3.7378,
+      "epoch": 1.2938005390835579,
+      "grad_norm": 0.5594388246536255,
+      "learning_rate": 0.0005229552077711818,
+      "loss": 3.7248,
       "step": 12000
     },
     {
-      "epoch": 1.2915724895059735,
-      "eval_accuracy": 0.34940867295600286,
-      "eval_loss": 3.700629234313965,
-      "eval_runtime": 184.3929,
-      "eval_samples_per_second": 97.677,
-      "eval_steps_per_second": 6.107,
+      "epoch": 1.2938005390835579,
+      "eval_accuracy": 0.3495350362645311,
+      "eval_loss": 3.6991188526153564,
+      "eval_runtime": 183.7777,
+      "eval_samples_per_second": 98.004,
+      "eval_steps_per_second": 6.127,
       "step": 12000
     },
     {
-      "epoch": 1.2969540415455818,
-      "grad_norm": 0.49873632192611694,
-      "learning_rate": 0.0005227647882771253,
-      "loss": 3.7239,
+      "epoch": 1.2991913746630728,
+      "grad_norm": 0.5452162623405457,
+      "learning_rate": 0.0005226314085267134,
+      "loss": 3.7048,
       "step": 12050
     },
     {
-      "epoch": 1.30233559358519,
-      "grad_norm": 0.5773414373397827,
-      "learning_rate": 0.0005224415472470639,
-      "loss": 3.7392,
+      "epoch": 1.3045822102425877,
+      "grad_norm": 0.6047404408454895,
+      "learning_rate": 0.000522307609282245,
+      "loss": 3.7436,
       "step": 12100
     },
     {
-      "epoch": 1.3077171456247982,
-      "grad_norm": 0.6267489194869995,
-      "learning_rate": 0.0005221183062170024,
-      "loss": 3.7183,
+      "epoch": 1.3099730458221024,
+      "grad_norm": 0.5165073871612549,
+      "learning_rate": 0.0005219838100377766,
+      "loss": 3.7241,
       "step": 12150
     },
     {
-      "epoch": 1.3130986976644063,
-      "grad_norm": 0.5878105759620667,
-      "learning_rate": 0.0005217950651869409,
-      "loss": 3.7342,
+      "epoch": 1.3153638814016173,
+      "grad_norm": 0.5657083988189697,
+      "learning_rate": 0.000521660010793308,
+      "loss": 3.7097,
       "step": 12200
     },
     {
-      "epoch": 1.3184802497040147,
-      "grad_norm": 0.5103281736373901,
-      "learning_rate": 0.0005214718241568796,
-      "loss": 3.7391,
+      "epoch": 1.320754716981132,
+      "grad_norm": 0.6331221461296082,
+      "learning_rate": 0.0005213362115488396,
+      "loss": 3.7363,
       "step": 12250
     },
     {
-      "epoch": 1.3238618017436228,
-      "grad_norm": 0.5685731768608093,
-      "learning_rate": 0.0005211485831268182,
-      "loss": 3.7023,
+      "epoch": 1.326145552560647,
+      "grad_norm": 0.5557056069374084,
+      "learning_rate": 0.0005210124123043713,
+      "loss": 3.7024,
       "step": 12300
     },
     {
-      "epoch": 1.329243353783231,
-      "grad_norm": 0.5740435123443604,
-      "learning_rate": 0.0005208253420967568,
-      "loss": 3.7269,
+      "epoch": 1.3315363881401616,
+      "grad_norm": 0.5646718144416809,
+      "learning_rate": 0.0005206886130599028,
+      "loss": 3.7284,
       "step": 12350
     },
     {
-      "epoch": 1.3346249058228392,
-      "grad_norm": 0.5537181496620178,
-      "learning_rate": 0.0005205021010666953,
-      "loss": 3.7228,
+      "epoch": 1.3369272237196765,
+      "grad_norm": 0.5577690601348877,
+      "learning_rate": 0.0005203648138154344,
+      "loss": 3.699,
       "step": 12400
     },
     {
-      "epoch": 1.3400064578624475,
-      "grad_norm": 0.6004145741462708,
-      "learning_rate": 0.0005201788600366339,
-      "loss": 3.7306,
+      "epoch": 1.3423180592991915,
+      "grad_norm": 0.532707154750824,
+      "learning_rate": 0.0005200410145709659,
+      "loss": 3.7212,
       "step": 12450
     },
     {
-      "epoch": 1.3453880099020559,
-      "grad_norm": 0.5789647698402405,
-      "learning_rate": 0.0005198556190065725,
-      "loss": 3.7216,
+      "epoch": 1.3477088948787062,
+      "grad_norm": 0.5770918130874634,
+      "learning_rate": 0.0005197172153264976,
+      "loss": 3.7258,
       "step": 12500
     },
     {
-      "epoch": 1.350769561941664,
-      "grad_norm": 0.5887240767478943,
-      "learning_rate": 0.0005195323779765112,
-      "loss": 3.7198,
+      "epoch": 1.353099730458221,
+      "grad_norm": 0.548803448677063,
+      "learning_rate": 0.0005193934160820291,
+      "loss": 3.7393,
       "step": 12550
     },
     {
-      "epoch": 1.356151113981272,
-      "grad_norm": 0.5683675408363342,
-      "learning_rate": 0.0005192091369464497,
-      "loss": 3.7304,
+      "epoch": 1.3584905660377358,
+      "grad_norm": 0.5411361455917358,
+      "learning_rate": 0.0005190696168375607,
+      "loss": 3.7124,
       "step": 12600
     },
     {
-      "epoch": 1.3615326660208804,
-      "grad_norm": 0.5594485402107239,
-      "learning_rate": 0.0005188858959163882,
-      "loss": 3.7167,
+      "epoch": 1.3638814016172507,
+      "grad_norm": 0.5823306441307068,
+      "learning_rate": 0.0005187458175930922,
+      "loss": 3.7102,
       "step": 12650
     },
     {
-      "epoch": 1.3669142180604887,
-      "grad_norm": 0.6306636333465576,
-      "learning_rate": 0.0005185626548863269,
-      "loss": 3.7207,
+      "epoch": 1.3692722371967654,
+      "grad_norm": 0.5538209676742554,
+      "learning_rate": 0.0005184220183486238,
+      "loss": 3.7083,
       "step": 12700
     },
     {
-      "epoch": 1.3722957701000968,
-      "grad_norm": 0.5611990094184875,
-      "learning_rate": 0.0005182394138562654,
-      "loss": 3.723,
+      "epoch": 1.3746630727762803,
+      "grad_norm": 0.5278885960578918,
+      "learning_rate": 0.0005180982191041554,
+      "loss": 3.7216,
       "step": 12750
     },
     {
-      "epoch": 1.3776773221397052,
-      "grad_norm": 0.5362935662269592,
-      "learning_rate": 0.0005179161728262041,
-      "loss": 3.7171,
+      "epoch": 1.3800539083557952,
+      "grad_norm": 0.570948600769043,
+      "learning_rate": 0.000517774419859687,
+      "loss": 3.7147,
       "step": 12800
     },
     {
-      "epoch": 1.3830588741793133,
-      "grad_norm": 0.5498529672622681,
-      "learning_rate": 0.0005175929317961426,
-      "loss": 3.72,
+      "epoch": 1.38544474393531,
+      "grad_norm": 0.6416262984275818,
+      "learning_rate": 0.0005174506206152185,
+      "loss": 3.7362,
       "step": 12850
     },
     {
-      "epoch": 1.3884404262189216,
-      "grad_norm": 0.5442519783973694,
-      "learning_rate": 0.0005172696907660812,
-      "loss": 3.7128,
+      "epoch": 1.3908355795148248,
+      "grad_norm": 0.658841073513031,
+      "learning_rate": 0.0005171268213707501,
+      "loss": 3.7334,
       "step": 12900
     },
     {
-      "epoch": 1.3938219782585297,
-      "grad_norm": 0.5849413871765137,
-      "learning_rate": 0.0005169464497360198,
-      "loss": 3.7118,
+      "epoch": 1.3962264150943398,
+      "grad_norm": 0.5554807782173157,
+      "learning_rate": 0.0005168030221262816,
+      "loss": 3.7339,
       "step": 12950
     },
     {
-      "epoch": 1.399203530298138,
-      "grad_norm": 0.5864673852920532,
-      "learning_rate": 0.0005166232087059583,
-      "loss": 3.7056,
+      "epoch": 1.4016172506738545,
+      "grad_norm": 0.5637690424919128,
+      "learning_rate": 0.0005164792228818132,
+      "loss": 3.7151,
       "step": 13000
     },
     {
-      "epoch": 1.399203530298138,
-      "eval_accuracy": 0.3516231274625498,
-      "eval_loss": 3.67989444732666,
-      "eval_runtime": 184.826,
-      "eval_samples_per_second": 97.448,
-      "eval_steps_per_second": 6.092,
+      "epoch": 1.4016172506738545,
+      "eval_accuracy": 0.3519373516141093,
+      "eval_loss": 3.6800601482391357,
+      "eval_runtime": 183.4932,
+      "eval_samples_per_second": 98.156,
+      "eval_steps_per_second": 6.136,
       "step": 13000
     },
     {
-      "epoch": 1.4045850823377461,
-      "grad_norm": 0.5450125336647034,
-      "learning_rate": 0.0005162999676758969,
-      "loss": 3.7177,
+      "epoch": 1.4070080862533692,
+      "grad_norm": 0.6130073666572571,
+      "learning_rate": 0.0005161554236373448,
+      "loss": 3.6973,
       "step": 13050
     },
     {
-      "epoch": 1.4099666343773545,
-      "grad_norm": 0.5438733100891113,
-      "learning_rate": 0.0005159767266458355,
-      "loss": 3.7222,
+      "epoch": 1.412398921832884,
+      "grad_norm": 0.5765049457550049,
+      "learning_rate": 0.0005158381003777657,
+      "loss": 3.7145,
       "step": 13100
     },
     {
-      "epoch": 1.4153481864169626,
-      "grad_norm": 0.5453888773918152,
-      "learning_rate": 0.0005156534856157741,
-      "loss": 3.708,
+      "epoch": 1.417789757412399,
+      "grad_norm": 0.5363691449165344,
+      "learning_rate": 0.0005155143011332973,
+      "loss": 3.7161,
       "step": 13150
     },
     {
-      "epoch": 1.420729738456571,
-      "grad_norm": 0.572364330291748,
-      "learning_rate": 0.0005153302445857127,
-      "loss": 3.7001,
+      "epoch": 1.4231805929919137,
+      "grad_norm": 0.5675169229507446,
+      "learning_rate": 0.0005151905018888289,
+      "loss": 3.7403,
       "step": 13200
     },
     {
-      "epoch": 1.426111290496179,
-      "grad_norm": 0.5122117400169373,
-      "learning_rate": 0.0005150070035556513,
-      "loss": 3.7032,
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.594634473323822,
+      "learning_rate": 0.0005148667026443604,
+      "loss": 3.6966,
       "step": 13250
     },
     {
-      "epoch": 1.4314928425357873,
-      "grad_norm": 0.566405713558197,
-      "learning_rate": 0.0005146837625255898,
-      "loss": 3.7114,
+      "epoch": 1.4339622641509435,
+      "grad_norm": 0.5782607793807983,
+      "learning_rate": 0.000514542903399892,
+      "loss": 3.7007,
       "step": 13300
     },
     {
-      "epoch": 1.4368743945753955,
-      "grad_norm": 0.5422384142875671,
-      "learning_rate": 0.0005143605214955285,
-      "loss": 3.7076,
+      "epoch": 1.4393530997304582,
+      "grad_norm": 0.6014611721038818,
+      "learning_rate": 0.0005142191041554237,
+      "loss": 3.6826,
       "step": 13350
     },
     {
-      "epoch": 1.4422559466150038,
-      "grad_norm": 0.5637435913085938,
-      "learning_rate": 0.0005140372804654671,
-      "loss": 3.6984,
+      "epoch": 1.444743935309973,
+      "grad_norm": 0.5752432346343994,
+      "learning_rate": 0.0005138953049109552,
+      "loss": 3.7229,
       "step": 13400
     },
     {
-      "epoch": 1.447637498654612,
-      "grad_norm": 0.5512182712554932,
-      "learning_rate": 0.0005137140394354056,
-      "loss": 3.6969,
+      "epoch": 1.4501347708894878,
+      "grad_norm": 0.6054044961929321,
+      "learning_rate": 0.0005135715056664868,
+      "loss": 3.7175,
       "step": 13450
     },
     {
-      "epoch": 1.4530190506942202,
-      "grad_norm": 0.5623869299888611,
-      "learning_rate": 0.0005133907984053442,
-      "loss": 3.6991,
+      "epoch": 1.4555256064690028,
+      "grad_norm": 0.6118409037590027,
+      "learning_rate": 0.0005132477064220183,
+      "loss": 3.7031,
       "step": 13500
     },
     {
-      "epoch": 1.4584006027338283,
-      "grad_norm": 0.5506566762924194,
-      "learning_rate": 0.0005130675573752827,
-      "loss": 3.7273,
+      "epoch": 1.4609164420485174,
+      "grad_norm": 0.5970998406410217,
+      "learning_rate": 0.0005129239071775499,
+      "loss": 3.7209,
       "step": 13550
     },
     {
-      "epoch": 1.4637821547734367,
-      "grad_norm": 0.5795466303825378,
-      "learning_rate": 0.0005127443163452214,
-      "loss": 3.7172,
+      "epoch": 1.4663072776280324,
+      "grad_norm": 0.5053044557571411,
+      "learning_rate": 0.0005126001079330814,
+      "loss": 3.7031,
       "step": 13600
     },
     {
-      "epoch": 1.469163706813045,
-      "grad_norm": 0.5493975877761841,
-      "learning_rate": 0.00051242107531516,
-      "loss": 3.6948,
+      "epoch": 1.4716981132075473,
+      "grad_norm": 0.5769761204719543,
+      "learning_rate": 0.000512276308688613,
+      "loss": 3.7068,
       "step": 13650
     },
     {
-      "epoch": 1.474545258852653,
-      "grad_norm": 0.5260751247406006,
-      "learning_rate": 0.0005120978342850986,
-      "loss": 3.7049,
+      "epoch": 1.477088948787062,
+      "grad_norm": 0.5561441779136658,
+      "learning_rate": 0.0005119525094441446,
+      "loss": 3.7206,
       "step": 13700
     },
     {
-      "epoch": 1.4799268108922612,
-      "grad_norm": 0.5563033223152161,
-      "learning_rate": 0.0005117745932550371,
-      "loss": 3.7076,
+      "epoch": 1.482479784366577,
+      "grad_norm": 0.5928412079811096,
+      "learning_rate": 0.0005116287101996762,
+      "loss": 3.7031,
       "step": 13750
     },
     {
-      "epoch": 1.4853083629318695,
-      "grad_norm": 0.5787496566772461,
-      "learning_rate": 0.0005114513522249758,
-      "loss": 3.71,
+      "epoch": 1.4878706199460916,
+      "grad_norm": 0.5242429971694946,
+      "learning_rate": 0.0005113049109552077,
+      "loss": 3.7018,
       "step": 13800
     },
     {
-      "epoch": 1.4906899149714778,
-      "grad_norm": 0.5059001445770264,
-      "learning_rate": 0.0005111281111949143,
-      "loss": 3.684,
+      "epoch": 1.4932614555256065,
+      "grad_norm": 0.5548174977302551,
+      "learning_rate": 0.0005109811117107393,
+      "loss": 3.697,
       "step": 13850
     },
     {
-      "epoch": 1.496071467011086,
-      "grad_norm": 0.5558372735977173,
-      "learning_rate": 0.0005108048701648528,
-      "loss": 3.7065,
+      "epoch": 1.4986522911051212,
+      "grad_norm": 0.6115781664848328,
+      "learning_rate": 0.0005106573124662708,
+      "loss": 3.6875,
       "step": 13900
     },
     {
-      "epoch": 1.501453019050694,
-      "grad_norm": 0.5342027544975281,
-      "learning_rate": 0.0005104816291347915,
-      "loss": 3.6999,
+      "epoch": 1.5040431266846361,
+      "grad_norm": 0.536693811416626,
+      "learning_rate": 0.0005103335132218025,
+      "loss": 3.704,
       "step": 13950
     },
     {
-      "epoch": 1.5068345710903024,
-      "grad_norm": 0.5543688535690308,
-      "learning_rate": 0.00051015838810473,
-      "loss": 3.7016,
+      "epoch": 1.509433962264151,
+      "grad_norm": 0.5926096439361572,
+      "learning_rate": 0.000510009713977334,
+      "loss": 3.6969,
       "step": 14000
     },
     {
-      "epoch": 1.5068345710903024,
-      "eval_accuracy": 0.35396144626120524,
-      "eval_loss": 3.656766176223755,
-      "eval_runtime": 184.2793,
-      "eval_samples_per_second": 97.737,
-      "eval_steps_per_second": 6.11,
+      "epoch": 1.509433962264151,
+      "eval_accuracy": 0.3541569128063835,
+      "eval_loss": 3.6548423767089844,
+      "eval_runtime": 183.7963,
+      "eval_samples_per_second": 97.994,
+      "eval_steps_per_second": 6.126,
       "step": 14000
     },
     {
-      "epoch": 1.5122161231299107,
-      "grad_norm": 0.5722943544387817,
-      "learning_rate": 0.0005098351470746687,
-      "loss": 3.6839,
+      "epoch": 1.5148247978436657,
+      "grad_norm": 0.5280314087867737,
+      "learning_rate": 0.0005096859147328656,
+      "loss": 3.6741,
       "step": 14050
     },
     {
-      "epoch": 1.5175976751695188,
-      "grad_norm": 0.5564660429954529,
-      "learning_rate": 0.0005095119060446072,
-      "loss": 3.6961,
+      "epoch": 1.5202156334231804,
+      "grad_norm": 0.5069935321807861,
+      "learning_rate": 0.0005093621154883971,
+      "loss": 3.707,
       "step": 14100
     },
     {
-      "epoch": 1.5229792272091272,
-      "grad_norm": 0.5130824446678162,
-      "learning_rate": 0.0005091886650145458,
-      "loss": 3.6933,
+      "epoch": 1.5256064690026954,
+      "grad_norm": 0.5404360890388489,
+      "learning_rate": 0.0005090383162439288,
+      "loss": 3.6867,
       "step": 14150
     },
     {
-      "epoch": 1.5283607792487355,
-      "grad_norm": 0.5731927156448364,
-      "learning_rate": 0.0005088654239844844,
-      "loss": 3.6865,
+      "epoch": 1.5309973045822103,
+      "grad_norm": 0.5273117423057556,
+      "learning_rate": 0.0005087145169994602,
+      "loss": 3.6862,
       "step": 14200
     },
     {
-      "epoch": 1.5337423312883436,
-      "grad_norm": 0.5487149357795715,
-      "learning_rate": 0.0005085421829544229,
-      "loss": 3.675,
+      "epoch": 1.536388140161725,
+      "grad_norm": 0.5154308676719666,
+      "learning_rate": 0.0005083907177549918,
+      "loss": 3.6916,
       "step": 14250
     },
     {
-      "epoch": 1.5391238833279517,
-      "grad_norm": 0.5173348188400269,
-      "learning_rate": 0.0005082189419243616,
-      "loss": 3.7095,
+      "epoch": 1.54177897574124,
+      "grad_norm": 0.5643147230148315,
+      "learning_rate": 0.0005080669185105234,
+      "loss": 3.701,
       "step": 14300
     },
     {
-      "epoch": 1.54450543536756,
-      "grad_norm": 0.5780784487724304,
-      "learning_rate": 0.0005078957008943001,
-      "loss": 3.6955,
+      "epoch": 1.5471698113207548,
+      "grad_norm": 0.5398867130279541,
+      "learning_rate": 0.000507743119266055,
+      "loss": 3.708,
       "step": 14350
     },
     {
-      "epoch": 1.5498869874071683,
-      "grad_norm": 0.5526981949806213,
-      "learning_rate": 0.0005075724598642387,
-      "loss": 3.6764,
+      "epoch": 1.5525606469002695,
+      "grad_norm": 0.5693142414093018,
+      "learning_rate": 0.0005074193200215865,
+      "loss": 3.6898,
       "step": 14400
     },
     {
-      "epoch": 1.5552685394467765,
-      "grad_norm": 0.550183117389679,
-      "learning_rate": 0.0005072492188341773,
-      "loss": 3.7052,
+      "epoch": 1.5579514824797842,
+      "grad_norm": 0.5457934141159058,
+      "learning_rate": 0.0005070955207771181,
+      "loss": 3.7126,
       "step": 14450
     },
     {
-      "epoch": 1.5606500914863846,
-      "grad_norm": 0.5323998332023621,
-      "learning_rate": 0.000506925977804116,
-      "loss": 3.6719,
+      "epoch": 1.5633423180592994,
+      "grad_norm": 0.539290726184845,
+      "learning_rate": 0.0005067717215326498,
+      "loss": 3.6751,
       "step": 14500
     },
     {
-      "epoch": 1.566031643525993,
-      "grad_norm": 0.5508608818054199,
-      "learning_rate": 0.0005066027367740545,
-      "loss": 3.6832,
+      "epoch": 1.568733153638814,
+      "grad_norm": 0.5418549180030823,
+      "learning_rate": 0.0005064479222881813,
+      "loss": 3.6877,
       "step": 14550
     },
     {
-      "epoch": 1.5714131955656012,
-      "grad_norm": 0.5756356716156006,
-      "learning_rate": 0.000506279495743993,
-      "loss": 3.6806,
+      "epoch": 1.5741239892183287,
+      "grad_norm": 0.4996977746486664,
+      "learning_rate": 0.0005061241230437129,
+      "loss": 3.6781,
       "step": 14600
     },
     {
-      "epoch": 1.5767947476052093,
-      "grad_norm": 0.5558910369873047,
-      "learning_rate": 0.0005059562547139316,
-      "loss": 3.6743,
+      "epoch": 1.5795148247978437,
+      "grad_norm": 0.545849621295929,
+      "learning_rate": 0.0005058003237992444,
+      "loss": 3.6945,
       "step": 14650
     },
     {
-      "epoch": 1.5821762996448174,
-      "grad_norm": 0.5734641551971436,
-      "learning_rate": 0.0005056330136838702,
-      "loss": 3.6983,
+      "epoch": 1.5849056603773586,
+      "grad_norm": 0.5146830081939697,
+      "learning_rate": 0.000505476524554776,
+      "loss": 3.7012,
       "step": 14700
     },
     {
-      "epoch": 1.5875578516844258,
-      "grad_norm": 0.5970582962036133,
-      "learning_rate": 0.0005053097726538088,
-      "loss": 3.698,
+      "epoch": 1.5902964959568733,
+      "grad_norm": 0.5644547939300537,
+      "learning_rate": 0.0005051527253103076,
+      "loss": 3.6647,
       "step": 14750
     },
     {
-      "epoch": 1.592939403724034,
-      "grad_norm": 0.5290464162826538,
-      "learning_rate": 0.0005049865316237474,
-      "loss": 3.6823,
+      "epoch": 1.595687331536388,
+      "grad_norm": 0.5948972105979919,
+      "learning_rate": 0.0005048289260658392,
+      "loss": 3.6893,
       "step": 14800
     },
     {
-      "epoch": 1.5983209557636422,
-      "grad_norm": 0.5535150766372681,
-      "learning_rate": 0.000504663290593686,
-      "loss": 3.69,
+      "epoch": 1.6010781671159031,
+      "grad_norm": 0.5488345623016357,
+      "learning_rate": 0.0005045051268213707,
+      "loss": 3.6793,
       "step": 14850
     },
     {
-      "epoch": 1.6037025078032503,
-      "grad_norm": 0.5767305493354797,
-      "learning_rate": 0.0005043400495636246,
-      "loss": 3.6766,
+      "epoch": 1.6064690026954178,
+      "grad_norm": 0.5010797381401062,
+      "learning_rate": 0.0005041813275769023,
+      "loss": 3.6759,
       "step": 14900
     },
     {
-      "epoch": 1.6090840598428586,
-      "grad_norm": 0.5387904047966003,
-      "learning_rate": 0.0005040168085335632,
-      "loss": 3.6813,
+      "epoch": 1.6118598382749325,
+      "grad_norm": 0.5631289482116699,
+      "learning_rate": 0.0005038640043173232,
+      "loss": 3.675,
       "step": 14950
     },
     {
-      "epoch": 1.614465611882467,
-      "grad_norm": 0.5717957019805908,
-      "learning_rate": 0.0005036935675035017,
-      "loss": 3.6979,
+      "epoch": 1.6172506738544474,
+      "grad_norm": 0.6699851751327515,
+      "learning_rate": 0.0005035402050728548,
+      "loss": 3.6822,
       "step": 15000
     },
     {
-      "epoch": 1.614465611882467,
-      "eval_accuracy": 0.35539251344660977,
-      "eval_loss": 3.63720703125,
-      "eval_runtime": 184.8407,
-      "eval_samples_per_second": 97.441,
-      "eval_steps_per_second": 6.092,
+      "epoch": 1.6172506738544474,
+      "eval_accuracy": 0.35562035855235674,
+      "eval_loss": 3.640406370162964,
+      "eval_runtime": 184.0918,
+      "eval_samples_per_second": 97.837,
+      "eval_steps_per_second": 6.117,
       "step": 15000
     },
     {
-      "epoch": 1.619847163922075,
-      "grad_norm": 0.5999611020088196,
-      "learning_rate": 0.0005033703264734402,
-      "loss": 3.6837,
+      "epoch": 1.6226415094339623,
+      "grad_norm": 0.5111995339393616,
+      "learning_rate": 0.0005032164058283863,
+      "loss": 3.6909,
       "step": 15050
     },
     {
-      "epoch": 1.6252287159616834,
-      "grad_norm": 0.5812547206878662,
-      "learning_rate": 0.0005030470854433789,
-      "loss": 3.688,
+      "epoch": 1.628032345013477,
+      "grad_norm": 0.5510700941085815,
+      "learning_rate": 0.0005028926065839179,
+      "loss": 3.6663,
       "step": 15100
     },
     {
-      "epoch": 1.6306102680012917,
-      "grad_norm": 0.5843793749809265,
-      "learning_rate": 0.0005027238444133175,
-      "loss": 3.6731,
+      "epoch": 1.633423180592992,
+      "grad_norm": 0.5378367900848389,
+      "learning_rate": 0.0005025688073394495,
+      "loss": 3.6848,
       "step": 15150
     },
     {
-      "epoch": 1.6359918200408998,
-      "grad_norm": 0.5559870600700378,
-      "learning_rate": 0.0005024006033832561,
-      "loss": 3.6801,
+      "epoch": 1.6388140161725069,
+      "grad_norm": 0.6216373443603516,
+      "learning_rate": 0.0005022450080949811,
+      "loss": 3.6618,
       "step": 15200
     },
     {
-      "epoch": 1.641373372080508,
-      "grad_norm": 0.5399496555328369,
-      "learning_rate": 0.0005020773623531946,
-      "loss": 3.6836,
+      "epoch": 1.6442048517520216,
+      "grad_norm": 0.5785908102989197,
+      "learning_rate": 0.0005019212088505126,
+      "loss": 3.7039,
       "step": 15250
     },
     {
-      "epoch": 1.6467549241201163,
-      "grad_norm": 0.5161964297294617,
-      "learning_rate": 0.0005017541213231333,
-      "loss": 3.6662,
+      "epoch": 1.6495956873315363,
+      "grad_norm": 0.5875208377838135,
+      "learning_rate": 0.0005015974096060442,
+      "loss": 3.6864,
       "step": 15300
     },
     {
-      "epoch": 1.6521364761597246,
-      "grad_norm": 0.571781575679779,
-      "learning_rate": 0.0005014308802930718,
-      "loss": 3.6923,
+      "epoch": 1.6549865229110512,
+      "grad_norm": 0.5718501806259155,
+      "learning_rate": 0.0005012736103615758,
+      "loss": 3.6807,
       "step": 15350
     },
     {
-      "epoch": 1.6575180281993327,
-      "grad_norm": 0.54901522397995,
-      "learning_rate": 0.0005011076392630105,
-      "loss": 3.6763,
+      "epoch": 1.6603773584905661,
+      "grad_norm": 0.5843344926834106,
+      "learning_rate": 0.0005009498111171074,
+      "loss": 3.6706,
       "step": 15400
     },
     {
-      "epoch": 1.6628995802389408,
-      "grad_norm": 0.5774824619293213,
-      "learning_rate": 0.000500784398232949,
-      "loss": 3.675,
+      "epoch": 1.6657681940700808,
+      "grad_norm": 0.508507251739502,
+      "learning_rate": 0.0005006260118726389,
+      "loss": 3.6828,
       "step": 15450
     },
     {
-      "epoch": 1.6682811322785491,
-      "grad_norm": 0.5709084868431091,
-      "learning_rate": 0.0005004611572028875,
-      "loss": 3.6655,
+      "epoch": 1.6711590296495957,
+      "grad_norm": 0.5351792573928833,
+      "learning_rate": 0.0005003022126281705,
+      "loss": 3.6937,
       "step": 15500
     },
     {
-      "epoch": 1.6736626843181575,
-      "grad_norm": 0.5834586024284363,
-      "learning_rate": 0.0005001379161728262,
-      "loss": 3.6644,
+      "epoch": 1.6765498652291106,
+      "grad_norm": 0.5323431491851807,
+      "learning_rate": 0.000499978413383702,
+      "loss": 3.667,
       "step": 15550
     },
     {
-      "epoch": 1.6790442363577656,
-      "grad_norm": 0.5875089764595032,
-      "learning_rate": 0.0004998146751427647,
-      "loss": 3.6837,
+      "epoch": 1.6819407008086253,
+      "grad_norm": 0.5795451402664185,
+      "learning_rate": 0.0004996546141392336,
+      "loss": 3.6673,
       "step": 15600
     },
     {
-      "epoch": 1.6844257883973737,
-      "grad_norm": 0.5730432868003845,
-      "learning_rate": 0.0004994914341127034,
-      "loss": 3.6606,
+      "epoch": 1.68733153638814,
+      "grad_norm": 0.5577372908592224,
+      "learning_rate": 0.0004993308148947651,
+      "loss": 3.6636,
       "step": 15650
     },
     {
-      "epoch": 1.689807340436982,
-      "grad_norm": 0.598292887210846,
-      "learning_rate": 0.0004991681930826419,
-      "loss": 3.6657,
+      "epoch": 1.692722371967655,
+      "grad_norm": 0.5510057210922241,
+      "learning_rate": 0.0004990070156502968,
+      "loss": 3.6775,
       "step": 15700
     },
     {
-      "epoch": 1.6951888924765903,
-      "grad_norm": 0.5262405872344971,
-      "learning_rate": 0.0004988449520525805,
-      "loss": 3.6675,
+      "epoch": 1.6981132075471699,
+      "grad_norm": 0.6013910174369812,
+      "learning_rate": 0.0004986832164058284,
+      "loss": 3.6531,
       "step": 15750
     },
     {
-      "epoch": 1.7005704445161984,
-      "grad_norm": 0.562152624130249,
-      "learning_rate": 0.0004985217110225191,
-      "loss": 3.661,
+      "epoch": 1.7035040431266846,
+      "grad_norm": 0.49710965156555176,
+      "learning_rate": 0.0004983594171613599,
+      "loss": 3.674,
       "step": 15800
     },
     {
-      "epoch": 1.7059519965558065,
-      "grad_norm": 0.5154584646224976,
-      "learning_rate": 0.0004981984699924576,
-      "loss": 3.6626,
+      "epoch": 1.7088948787061995,
+      "grad_norm": 0.553176760673523,
+      "learning_rate": 0.0004980356179168915,
+      "loss": 3.6648,
       "step": 15850
     },
     {
-      "epoch": 1.7113335485954149,
-      "grad_norm": 0.5356432795524597,
-      "learning_rate": 0.0004978752289623962,
-      "loss": 3.652,
+      "epoch": 1.7142857142857144,
+      "grad_norm": 0.5849853157997131,
+      "learning_rate": 0.000497711818672423,
+      "loss": 3.6667,
       "step": 15900
     },
     {
-      "epoch": 1.7167151006350232,
-      "grad_norm": 0.5371306538581848,
-      "learning_rate": 0.0004975519879323348,
-      "loss": 3.6717,
+      "epoch": 1.719676549865229,
+      "grad_norm": 0.5109348297119141,
+      "learning_rate": 0.0004973880194279547,
+      "loss": 3.6563,
       "step": 15950
     },
     {
-      "epoch": 1.7220966526746313,
-      "grad_norm": 0.5752100944519043,
-      "learning_rate": 0.0004972352117228746,
-      "loss": 3.6631,
+      "epoch": 1.7250673854447438,
+      "grad_norm": 0.5275253653526306,
+      "learning_rate": 0.0004970642201834862,
+      "loss": 3.6547,
       "step": 16000
     },
     {
-      "epoch": 1.7220966526746313,
-      "eval_accuracy": 0.35704088640764325,
-      "eval_loss": 3.6225626468658447,
-      "eval_runtime": 183.8276,
-      "eval_samples_per_second": 97.978,
-      "eval_steps_per_second": 6.125,
+      "epoch": 1.7250673854447438,
+      "eval_accuracy": 0.3572827477359183,
+      "eval_loss": 3.619877338409424,
+      "eval_runtime": 184.0128,
+      "eval_samples_per_second": 97.879,
+      "eval_steps_per_second": 6.119,
       "step": 16000
     },
     {
-      "epoch": 1.7274782047142396,
-      "grad_norm": 0.49519824981689453,
-      "learning_rate": 0.0004969119706928133,
-      "loss": 3.649,
+      "epoch": 1.7304582210242587,
+      "grad_norm": 0.524674117565155,
+      "learning_rate": 0.0004967404209390178,
+      "loss": 3.6765,
       "step": 16050
     },
     {
-      "epoch": 1.732859756753848,
-      "grad_norm": 0.5395165085792542,
-      "learning_rate": 0.0004965887296627518,
-      "loss": 3.6648,
+      "epoch": 1.7358490566037736,
+      "grad_norm": 0.5455240607261658,
+      "learning_rate": 0.0004964166216945493,
+      "loss": 3.6762,
       "step": 16100
     },
     {
-      "epoch": 1.738241308793456,
-      "grad_norm": 0.5537828207015991,
-      "learning_rate": 0.0004962654886326904,
-      "loss": 3.6808,
+      "epoch": 1.7412398921832883,
+      "grad_norm": 0.6462677717208862,
+      "learning_rate": 0.000496092822450081,
+      "loss": 3.6584,
       "step": 16150
     },
     {
-      "epoch": 1.7436228608330642,
-      "grad_norm": 0.5962070226669312,
-      "learning_rate": 0.000495942247602629,
-      "loss": 3.6507,
+      "epoch": 1.7466307277628033,
+      "grad_norm": 0.49305227398872375,
+      "learning_rate": 0.0004957690232056125,
+      "loss": 3.6656,
       "step": 16200
     },
     {
-      "epoch": 1.7490044128726725,
-      "grad_norm": 0.6224369406700134,
-      "learning_rate": 0.0004956190065725676,
-      "loss": 3.6441,
+      "epoch": 1.7520215633423182,
+      "grad_norm": 0.5830181837081909,
+      "learning_rate": 0.0004954452239611441,
+      "loss": 3.6647,
       "step": 16250
     },
     {
-      "epoch": 1.7543859649122808,
-      "grad_norm": 0.5751168727874756,
-      "learning_rate": 0.0004952957655425062,
-      "loss": 3.6686,
+      "epoch": 1.7574123989218329,
+      "grad_norm": 0.5238760709762573,
+      "learning_rate": 0.0004951214247166756,
+      "loss": 3.6659,
       "step": 16300
     },
     {
-      "epoch": 1.759767516951889,
-      "grad_norm": 0.6183793544769287,
-      "learning_rate": 0.0004949725245124448,
-      "loss": 3.6619,
+      "epoch": 1.7628032345013476,
+      "grad_norm": 0.5242082476615906,
+      "learning_rate": 0.0004947976254722072,
+      "loss": 3.6653,
       "step": 16350
     },
     {
-      "epoch": 1.765149068991497,
-      "grad_norm": 0.5693207383155823,
-      "learning_rate": 0.0004946492834823833,
-      "loss": 3.6391,
+      "epoch": 1.7681940700808625,
+      "grad_norm": 0.5468643307685852,
+      "learning_rate": 0.0004944738262277387,
+      "loss": 3.6704,
       "step": 16400
     },
     {
-      "epoch": 1.7705306210311054,
-      "grad_norm": 0.8416994214057922,
-      "learning_rate": 0.0004943260424523219,
-      "loss": 3.6685,
+      "epoch": 1.7735849056603774,
+      "grad_norm": 0.5299263000488281,
+      "learning_rate": 0.0004941500269832703,
+      "loss": 3.671,
       "step": 16450
     },
     {
-      "epoch": 1.7759121730707137,
-      "grad_norm": 0.5473074913024902,
-      "learning_rate": 0.0004940028014222605,
-      "loss": 3.651,
+      "epoch": 1.778975741239892,
+      "grad_norm": 0.5592052340507507,
+      "learning_rate": 0.0004938262277388019,
+      "loss": 3.6673,
       "step": 16500
     },
     {
-      "epoch": 1.7812937251103218,
-      "grad_norm": 0.6257918477058411,
-      "learning_rate": 0.000493679560392199,
-      "loss": 3.647,
+      "epoch": 1.784366576819407,
+      "grad_norm": 0.5448793768882751,
+      "learning_rate": 0.0004935024284943335,
+      "loss": 3.6462,
       "step": 16550
     },
     {
-      "epoch": 1.78667527714993,
-      "grad_norm": 0.6103613972663879,
-      "learning_rate": 0.0004933563193621377,
-      "loss": 3.6511,
+      "epoch": 1.789757412398922,
+      "grad_norm": 0.5522679686546326,
+      "learning_rate": 0.000493178629249865,
+      "loss": 3.6596,
       "step": 16600
     },
     {
-      "epoch": 1.7920568291895382,
-      "grad_norm": 0.53249591588974,
-      "learning_rate": 0.0004930330783320762,
-      "loss": 3.6511,
+      "epoch": 1.7951482479784366,
+      "grad_norm": 0.5675358772277832,
+      "learning_rate": 0.0004928548300053966,
+      "loss": 3.6647,
       "step": 16650
     },
     {
-      "epoch": 1.7974383812291466,
-      "grad_norm": 0.5363080501556396,
-      "learning_rate": 0.0004927098373020149,
-      "loss": 3.6559,
+      "epoch": 1.8005390835579513,
+      "grad_norm": 0.5482249855995178,
+      "learning_rate": 0.0004925310307609282,
+      "loss": 3.6587,
       "step": 16700
     },
     {
-      "epoch": 1.8028199332687547,
-      "grad_norm": 0.5322283506393433,
-      "learning_rate": 0.0004923865962719534,
-      "loss": 3.6635,
+      "epoch": 1.8059299191374663,
+      "grad_norm": 0.588553249835968,
+      "learning_rate": 0.0004922072315164598,
+      "loss": 3.6471,
       "step": 16750
     },
     {
-      "epoch": 1.8082014853083628,
-      "grad_norm": 0.5365887880325317,
-      "learning_rate": 0.0004920633552418919,
-      "loss": 3.653,
+      "epoch": 1.8113207547169812,
+      "grad_norm": 0.5706552863121033,
+      "learning_rate": 0.0004918834322719913,
+      "loss": 3.6521,
       "step": 16800
     },
     {
-      "epoch": 1.813583037347971,
-      "grad_norm": 0.5999295711517334,
-      "learning_rate": 0.0004917401142118306,
-      "loss": 3.6537,
+      "epoch": 1.8167115902964959,
+      "grad_norm": 0.543329656124115,
+      "learning_rate": 0.0004915596330275229,
+      "loss": 3.649,
       "step": 16850
     },
     {
-      "epoch": 1.8189645893875794,
-      "grad_norm": 0.5953883528709412,
-      "learning_rate": 0.0004914168731817692,
-      "loss": 3.6539,
+      "epoch": 1.8221024258760108,
+      "grad_norm": 0.5550561547279358,
+      "learning_rate": 0.0004912358337830544,
+      "loss": 3.6587,
       "step": 16900
     },
     {
-      "epoch": 1.8243461414271875,
-      "grad_norm": 0.597101628780365,
-      "learning_rate": 0.0004910936321517078,
-      "loss": 3.6471,
+      "epoch": 1.8274932614555257,
+      "grad_norm": 0.565937876701355,
+      "learning_rate": 0.000490912034538586,
+      "loss": 3.6419,
       "step": 16950
     },
     {
-      "epoch": 1.8297276934667959,
-      "grad_norm": 0.5295618772506714,
-      "learning_rate": 0.0004907703911216463,
-      "loss": 3.6326,
+      "epoch": 1.8328840970350404,
+      "grad_norm": 0.5782588720321655,
+      "learning_rate": 0.0004905882352941175,
+      "loss": 3.6355,
       "step": 17000
     },
     {
-      "epoch": 1.8297276934667959,
-      "eval_accuracy": 0.35910942008585534,
-      "eval_loss": 3.6050846576690674,
-      "eval_runtime": 184.1833,
-      "eval_samples_per_second": 97.788,
-      "eval_steps_per_second": 6.113,
+      "epoch": 1.8328840970350404,
+      "eval_accuracy": 0.35895860987756883,
+      "eval_loss": 3.604949474334717,
+      "eval_runtime": 183.9848,
+      "eval_samples_per_second": 97.894,
+      "eval_steps_per_second": 6.12,
       "step": 17000
     },
     {
-      "epoch": 1.8351092455064042,
-      "grad_norm": 0.6159751415252686,
-      "learning_rate": 0.0004904471500915849,
-      "loss": 3.6271,
+      "epoch": 1.838274932614555,
+      "grad_norm": 0.5248723030090332,
+      "learning_rate": 0.0004902644360496492,
+      "loss": 3.6454,
       "step": 17050
     },
     {
-      "epoch": 1.8404907975460123,
-      "grad_norm": 0.6205544471740723,
-      "learning_rate": 0.0004901239090615235,
-      "loss": 3.6651,
+      "epoch": 1.8436657681940702,
+      "grad_norm": 0.5723742246627808,
+      "learning_rate": 0.0004899406368051808,
+      "loss": 3.6462,
       "step": 17100
     },
     {
-      "epoch": 1.8458723495856204,
-      "grad_norm": 0.5399280786514282,
-      "learning_rate": 0.000489800668031462,
-      "loss": 3.6452,
+      "epoch": 1.849056603773585,
+      "grad_norm": 0.5356608033180237,
+      "learning_rate": 0.0004896168375607123,
+      "loss": 3.6337,
       "step": 17150
     },
     {
-      "epoch": 1.8512539016252287,
-      "grad_norm": 0.5837388038635254,
-      "learning_rate": 0.0004894774270014007,
-      "loss": 3.6575,
+      "epoch": 1.8544474393530996,
+      "grad_norm": 0.5415787100791931,
+      "learning_rate": 0.0004892930383162439,
+      "loss": 3.6609,
       "step": 17200
     },
     {
-      "epoch": 1.856635453664837,
-      "grad_norm": 0.5471201539039612,
-      "learning_rate": 0.0004891541859713392,
-      "loss": 3.6555,
+      "epoch": 1.8598382749326146,
+      "grad_norm": 0.5791612863540649,
+      "learning_rate": 0.0004889692390717754,
+      "loss": 3.67,
       "step": 17250
     },
     {
-      "epoch": 1.8620170057044452,
-      "grad_norm": 0.5392946600914001,
-      "learning_rate": 0.0004888309449412779,
-      "loss": 3.6453,
+      "epoch": 1.8652291105121295,
+      "grad_norm": 0.5022833347320557,
+      "learning_rate": 0.0004886454398273071,
+      "loss": 3.6535,
       "step": 17300
     },
     {
-      "epoch": 1.8673985577440533,
-      "grad_norm": 0.5226132869720459,
-      "learning_rate": 0.0004885077039112164,
-      "loss": 3.6449,
+      "epoch": 1.8706199460916442,
+      "grad_norm": 0.5456568598747253,
+      "learning_rate": 0.0004883216405828386,
+      "loss": 3.6421,
       "step": 17350
     },
     {
-      "epoch": 1.8727801097836616,
-      "grad_norm": 0.5569941401481628,
-      "learning_rate": 0.00048818446288115497,
-      "loss": 3.629,
+      "epoch": 1.8760107816711589,
+      "grad_norm": 0.576461672782898,
+      "learning_rate": 0.00048799784133837017,
+      "loss": 3.6569,
       "step": 17400
     },
     {
-      "epoch": 1.87816166182327,
-      "grad_norm": 0.5523909330368042,
-      "learning_rate": 0.0004878612218510936,
-      "loss": 3.6612,
+      "epoch": 1.881401617250674,
+      "grad_norm": 0.5383881330490112,
+      "learning_rate": 0.0004876740420939017,
+      "loss": 3.6289,
       "step": 17450
     },
     {
-      "epoch": 1.883543213862878,
-      "grad_norm": 0.5485296845436096,
-      "learning_rate": 0.0004875379808210322,
-      "loss": 3.6438,
+      "epoch": 1.8867924528301887,
+      "grad_norm": 0.5738885402679443,
+      "learning_rate": 0.0004873502428494333,
+      "loss": 3.6522,
       "step": 17500
     },
     {
-      "epoch": 1.8889247659024861,
-      "grad_norm": 0.5691823959350586,
-      "learning_rate": 0.00048721473979097075,
-      "loss": 3.6349,
+      "epoch": 1.8921832884097034,
+      "grad_norm": 0.5352233052253723,
+      "learning_rate": 0.0004870264436049649,
+      "loss": 3.6572,
       "step": 17550
     },
     {
-      "epoch": 1.8943063179420945,
-      "grad_norm": 0.5617777705192566,
-      "learning_rate": 0.00048689149876090935,
-      "loss": 3.6613,
+      "epoch": 1.8975741239892183,
+      "grad_norm": 0.5754483342170715,
+      "learning_rate": 0.00048670264436049643,
+      "loss": 3.6438,
       "step": 17600
     },
     {
-      "epoch": 1.8996878699817028,
-      "grad_norm": 0.6149376034736633,
-      "learning_rate": 0.0004865682577308479,
-      "loss": 3.6477,
+      "epoch": 1.9029649595687332,
+      "grad_norm": 0.5442389249801636,
+      "learning_rate": 0.00048637884511602803,
+      "loss": 3.6423,
       "step": 17650
     },
     {
-      "epoch": 1.905069422021311,
-      "grad_norm": 0.5364314913749695,
-      "learning_rate": 0.0004862450167007865,
-      "loss": 3.6471,
+      "epoch": 1.908355795148248,
+      "grad_norm": 0.5781295299530029,
+      "learning_rate": 0.0004860550458715596,
+      "loss": 3.6568,
       "step": 17700
     },
     {
-      "epoch": 1.910450974060919,
-      "grad_norm": 0.5560828447341919,
-      "learning_rate": 0.00048592177567072513,
-      "loss": 3.6449,
+      "epoch": 1.9137466307277629,
+      "grad_norm": 0.5495123863220215,
+      "learning_rate": 0.0004857312466270912,
+      "loss": 3.6336,
       "step": 17750
     },
     {
-      "epoch": 1.9158325261005273,
-      "grad_norm": 0.562412440776825,
-      "learning_rate": 0.00048559853464066367,
-      "loss": 3.6439,
+      "epoch": 1.9191374663072778,
+      "grad_norm": 0.5798534750938416,
+      "learning_rate": 0.00048540744738262274,
+      "loss": 3.6362,
       "step": 17800
     },
     {
-      "epoch": 1.9212140781401357,
-      "grad_norm": 0.5733065605163574,
-      "learning_rate": 0.00048527529361060227,
-      "loss": 3.6473,
+      "epoch": 1.9245283018867925,
+      "grad_norm": 0.4734729826450348,
+      "learning_rate": 0.00048508364813815434,
+      "loss": 3.6268,
       "step": 17850
     },
     {
-      "epoch": 1.9265956301797438,
-      "grad_norm": 0.590917706489563,
-      "learning_rate": 0.00048495205258054086,
-      "loss": 3.6377,
+      "epoch": 1.9299191374663072,
+      "grad_norm": 0.5227976441383362,
+      "learning_rate": 0.00048475984889368584,
+      "loss": 3.6317,
       "step": 17900
     },
     {
-      "epoch": 1.931977182219352,
-      "grad_norm": 0.6305859684944153,
-      "learning_rate": 0.0004846288115504794,
-      "loss": 3.6313,
+      "epoch": 1.935309973045822,
+      "grad_norm": 0.5543326735496521,
+      "learning_rate": 0.0004844360496492175,
+      "loss": 3.641,
       "step": 17950
     },
     {
-      "epoch": 1.9373587342589604,
-      "grad_norm": 0.5712413787841797,
-      "learning_rate": 0.000484305570520418,
-      "loss": 3.6246,
+      "epoch": 1.940700808625337,
+      "grad_norm": 0.5557667016983032,
+      "learning_rate": 0.000484112250404749,
+      "loss": 3.6474,
       "step": 18000
     },
     {
-      "epoch": 1.9373587342589604,
-      "eval_accuracy": 0.36066337368737805,
-      "eval_loss": 3.5890228748321533,
-      "eval_runtime": 184.3111,
-      "eval_samples_per_second": 97.721,
-      "eval_steps_per_second": 6.109,
+      "epoch": 1.940700808625337,
+      "eval_accuracy": 0.360239953383565,
+      "eval_loss": 3.5897815227508545,
+      "eval_runtime": 183.5998,
+      "eval_samples_per_second": 98.099,
+      "eval_steps_per_second": 6.133,
       "step": 18000
     },
     {
-      "epoch": 1.9427402862985685,
-      "grad_norm": 0.5944366455078125,
-      "learning_rate": 0.00048398232949035665,
-      "loss": 3.6456,
+      "epoch": 1.9460916442048517,
+      "grad_norm": 0.5602201223373413,
+      "learning_rate": 0.00048378845116028055,
+      "loss": 3.6239,
       "step": 18050
     },
     {
-      "epoch": 1.9481218383381766,
-      "grad_norm": 0.5790172815322876,
-      "learning_rate": 0.0004836590884602952,
-      "loss": 3.6318,
+      "epoch": 1.9514824797843666,
+      "grad_norm": 0.5105127096176147,
+      "learning_rate": 0.00048346465191581215,
+      "loss": 3.6487,
       "step": 18100
     },
     {
-      "epoch": 1.953503390377785,
-      "grad_norm": 0.5425114631652832,
-      "learning_rate": 0.000483342312250835,
-      "loss": 3.6534,
+      "epoch": 1.9568733153638815,
+      "grad_norm": 0.5635515451431274,
+      "learning_rate": 0.0004831408526713437,
+      "loss": 3.6442,
       "step": 18150
     },
     {
-      "epoch": 1.9588849424173933,
-      "grad_norm": 0.5681806802749634,
-      "learning_rate": 0.0004830190712207736,
-      "loss": 3.6524,
+      "epoch": 1.9622641509433962,
+      "grad_norm": 0.5971663594245911,
+      "learning_rate": 0.0004828170534268753,
+      "loss": 3.6348,
       "step": 18200
     },
     {
-      "epoch": 1.9642664944570014,
-      "grad_norm": 0.6699956655502319,
-      "learning_rate": 0.0004826958301907122,
-      "loss": 3.6431,
+      "epoch": 1.967654986522911,
+      "grad_norm": 0.538390040397644,
+      "learning_rate": 0.00048249325418240686,
+      "loss": 3.6496,
       "step": 18250
     },
     {
-      "epoch": 1.9696480464966095,
-      "grad_norm": 0.5629168152809143,
-      "learning_rate": 0.0004823725891606507,
-      "loss": 3.6236,
+      "epoch": 1.9730458221024259,
+      "grad_norm": 0.5383585095405579,
+      "learning_rate": 0.00048216945493793846,
+      "loss": 3.6451,
       "step": 18300
     },
     {
-      "epoch": 1.9750295985362178,
-      "grad_norm": 0.5990884900093079,
-      "learning_rate": 0.0004820493481305893,
-      "loss": 3.6318,
+      "epoch": 1.9784366576819408,
+      "grad_norm": 0.6232275366783142,
+      "learning_rate": 0.00048184565569347,
+      "loss": 3.63,
       "step": 18350
     },
     {
-      "epoch": 1.9804111505758262,
-      "grad_norm": 0.560914158821106,
-      "learning_rate": 0.00048172610710052797,
-      "loss": 3.6473,
+      "epoch": 1.9838274932614555,
+      "grad_norm": 0.5328205823898315,
+      "learning_rate": 0.0004815218564490016,
+      "loss": 3.6627,
       "step": 18400
     },
     {
-      "epoch": 1.9857927026154343,
-      "grad_norm": 0.6116918325424194,
-      "learning_rate": 0.0004814028660704665,
-      "loss": 3.6316,
+      "epoch": 1.9892183288409704,
+      "grad_norm": 0.6225987672805786,
+      "learning_rate": 0.00048119805720453317,
+      "loss": 3.6153,
       "step": 18450
     },
     {
-      "epoch": 1.9911742546550424,
-      "grad_norm": 0.5204751491546631,
-      "learning_rate": 0.0004810796250404051,
-      "loss": 3.6317,
+      "epoch": 1.9946091644204853,
+      "grad_norm": 0.577847957611084,
+      "learning_rate": 0.0004808742579600647,
+      "loss": 3.6233,
       "step": 18500
     },
     {
-      "epoch": 1.9965558066946507,
-      "grad_norm": 0.5705627799034119,
-      "learning_rate": 0.00048075638401034364,
-      "loss": 3.6266,
+      "epoch": 2.0,
+      "grad_norm": 1.1217093467712402,
+      "learning_rate": 0.0004805504587155963,
+      "loss": 3.6389,
       "step": 18550
     },
     {
-      "epoch": 2.001937358734259,
-      "grad_norm": 0.5522881746292114,
-      "learning_rate": 0.00048043314298028224,
-      "loss": 3.6035,
+      "epoch": 2.0053908355795147,
+      "grad_norm": 0.5509372353553772,
+      "learning_rate": 0.0004802266594711278,
+      "loss": 3.5494,
       "step": 18600
     },
     {
-      "epoch": 2.007318910773867,
-      "grad_norm": 0.5946189165115356,
-      "learning_rate": 0.00048010990195022083,
-      "loss": 3.5188,
+      "epoch": 2.01078167115903,
+      "grad_norm": 0.5721856355667114,
+      "learning_rate": 0.0004799028602266594,
+      "loss": 3.5514,
       "step": 18650
     },
     {
-      "epoch": 2.0127004628134753,
-      "grad_norm": 0.5753241777420044,
-      "learning_rate": 0.0004797866609201594,
-      "loss": 3.5602,
+      "epoch": 2.0161725067385445,
+      "grad_norm": 0.5371805429458618,
+      "learning_rate": 0.000479579060982191,
+      "loss": 3.5437,
       "step": 18700
     },
     {
-      "epoch": 2.018082014853084,
-      "grad_norm": 0.5759482979774475,
-      "learning_rate": 0.000479463419890098,
-      "loss": 3.5473,
+      "epoch": 2.0215633423180592,
+      "grad_norm": 0.5734500288963318,
+      "learning_rate": 0.0004792552617377226,
+      "loss": 3.5601,
       "step": 18750
     },
     {
-      "epoch": 2.023463566892692,
-      "grad_norm": 0.5465207695960999,
-      "learning_rate": 0.0004791401788600366,
-      "loss": 3.5444,
+      "epoch": 2.026954177897574,
+      "grad_norm": 0.5779300928115845,
+      "learning_rate": 0.00047893146249325413,
+      "loss": 3.5493,
       "step": 18800
     },
     {
-      "epoch": 2.0288451189323,
-      "grad_norm": 0.5923919081687927,
-      "learning_rate": 0.00047881693782997515,
-      "loss": 3.5464,
+      "epoch": 2.032345013477089,
+      "grad_norm": 0.5518189072608948,
+      "learning_rate": 0.0004786076632487857,
+      "loss": 3.5319,
       "step": 18850
     },
     {
-      "epoch": 2.034226670971908,
-      "grad_norm": 0.5646787881851196,
-      "learning_rate": 0.00047849369679991375,
-      "loss": 3.5536,
+      "epoch": 2.0377358490566038,
+      "grad_norm": 0.5561443567276001,
+      "learning_rate": 0.0004782838640043173,
+      "loss": 3.5588,
       "step": 18900
     },
     {
-      "epoch": 2.0396082230115167,
-      "grad_norm": 0.5978553295135498,
-      "learning_rate": 0.0004781704557698523,
-      "loss": 3.5624,
+      "epoch": 2.0431266846361185,
+      "grad_norm": 0.5284110307693481,
+      "learning_rate": 0.00047796006475984883,
+      "loss": 3.5496,
       "step": 18950
     },
     {
-      "epoch": 2.044989775051125,
-      "grad_norm": 0.5860423445701599,
-      "learning_rate": 0.00047784721473979094,
-      "loss": 3.5539,
+      "epoch": 2.0485175202156336,
+      "grad_norm": 0.5990350246429443,
+      "learning_rate": 0.0004776427415002698,
+      "loss": 3.5646,
       "step": 19000
     },
     {
-      "epoch": 2.044989775051125,
-      "eval_accuracy": 0.36169546746872777,
-      "eval_loss": 3.579106092453003,
-      "eval_runtime": 184.1309,
-      "eval_samples_per_second": 97.816,
-      "eval_steps_per_second": 6.115,
+      "epoch": 2.0485175202156336,
+      "eval_accuracy": 0.3613797221767389,
+      "eval_loss": 3.5839173793792725,
+      "eval_runtime": 183.9308,
+      "eval_samples_per_second": 97.923,
+      "eval_steps_per_second": 6.122,
       "step": 19000
     },
     {
-      "epoch": 2.050371327090733,
-      "grad_norm": 0.5495768785476685,
-      "learning_rate": 0.00047752397370972953,
-      "loss": 3.5449,
+      "epoch": 2.0539083557951483,
+      "grad_norm": 0.5714938640594482,
+      "learning_rate": 0.0004773189422558014,
+      "loss": 3.5236,
       "step": 19050
     },
     {
-      "epoch": 2.055752879130341,
-      "grad_norm": 0.6042999625205994,
-      "learning_rate": 0.0004772007326796681,
-      "loss": 3.554,
+      "epoch": 2.059299191374663,
+      "grad_norm": 0.5469087958335876,
+      "learning_rate": 0.00047699514301133294,
+      "loss": 3.564,
       "step": 19100
     },
     {
-      "epoch": 2.0611344311699495,
-      "grad_norm": 0.6665642857551575,
-      "learning_rate": 0.00047687749164960667,
-      "loss": 3.5502,
+      "epoch": 2.0646900269541777,
+      "grad_norm": 0.5344583988189697,
+      "learning_rate": 0.00047667134376686455,
+      "loss": 3.5494,
       "step": 19150
     },
     {
-      "epoch": 2.0665159832095576,
-      "grad_norm": 0.5724649429321289,
-      "learning_rate": 0.00047655425061954526,
-      "loss": 3.5659,
+      "epoch": 2.070080862533693,
+      "grad_norm": 0.5535686612129211,
+      "learning_rate": 0.0004763475445223961,
+      "loss": 3.573,
       "step": 19200
     },
     {
-      "epoch": 2.0718975352491658,
-      "grad_norm": 0.5169804692268372,
-      "learning_rate": 0.00047623100958948386,
-      "loss": 3.548,
+      "epoch": 2.0754716981132075,
+      "grad_norm": 0.5836669206619263,
+      "learning_rate": 0.0004760237452779276,
+      "loss": 3.5682,
       "step": 19250
     },
     {
-      "epoch": 2.0772790872887743,
-      "grad_norm": 0.5063441395759583,
-      "learning_rate": 0.00047590776855942245,
-      "loss": 3.5655,
+      "epoch": 2.0808625336927222,
+      "grad_norm": 0.5648289918899536,
+      "learning_rate": 0.0004756999460334592,
+      "loss": 3.5661,
       "step": 19300
     },
     {
-      "epoch": 2.0826606393283824,
-      "grad_norm": 0.5674535632133484,
-      "learning_rate": 0.00047558452752936105,
-      "loss": 3.5459,
+      "epoch": 2.0862533692722374,
+      "grad_norm": 0.5942755341529846,
+      "learning_rate": 0.00047537614678899075,
+      "loss": 3.5567,
       "step": 19350
     },
     {
-      "epoch": 2.0880421913679905,
-      "grad_norm": 0.6405752301216125,
-      "learning_rate": 0.0004752612864992996,
-      "loss": 3.5313,
+      "epoch": 2.091644204851752,
+      "grad_norm": 0.5261635780334473,
+      "learning_rate": 0.00047505234754452235,
+      "loss": 3.5605,
       "step": 19400
     },
     {
-      "epoch": 2.0934237434075986,
-      "grad_norm": 0.586518406867981,
-      "learning_rate": 0.0004749380454692382,
-      "loss": 3.5814,
+      "epoch": 2.0970350404312668,
+      "grad_norm": 0.5810580849647522,
+      "learning_rate": 0.0004747285483000539,
+      "loss": 3.548,
       "step": 19450
     },
     {
-      "epoch": 2.098805295447207,
-      "grad_norm": 0.597720742225647,
-      "learning_rate": 0.0004746148044391767,
-      "loss": 3.5498,
+      "epoch": 2.1024258760107815,
+      "grad_norm": 0.5799669623374939,
+      "learning_rate": 0.0004744047490555855,
+      "loss": 3.5427,
       "step": 19500
     },
     {
-      "epoch": 2.1041868474868153,
-      "grad_norm": 0.6323477625846863,
-      "learning_rate": 0.00047429156340911537,
-      "loss": 3.5417,
+      "epoch": 2.1078167115902966,
+      "grad_norm": 0.6052512526512146,
+      "learning_rate": 0.00047408094981111706,
+      "loss": 3.5616,
       "step": 19550
     },
     {
-      "epoch": 2.1095683995264234,
-      "grad_norm": 0.6271188259124756,
-      "learning_rate": 0.00047396832237905397,
-      "loss": 3.5692,
+      "epoch": 2.1132075471698113,
+      "grad_norm": 0.5334910750389099,
+      "learning_rate": 0.00047375715056664866,
+      "loss": 3.5573,
       "step": 19600
     },
     {
-      "epoch": 2.1149499515660315,
-      "grad_norm": 0.5441372394561768,
-      "learning_rate": 0.0004736450813489925,
-      "loss": 3.5634,
+      "epoch": 2.118598382749326,
+      "grad_norm": 0.5874738097190857,
+      "learning_rate": 0.0004734333513221802,
+      "loss": 3.5585,
       "step": 19650
     },
     {
-      "epoch": 2.12033150360564,
-      "grad_norm": 0.5787696838378906,
-      "learning_rate": 0.0004733218403189311,
-      "loss": 3.5525,
+      "epoch": 2.123989218328841,
+      "grad_norm": 0.5650529861450195,
+      "learning_rate": 0.00047310955207771177,
+      "loss": 3.5481,
       "step": 19700
     },
     {
-      "epoch": 2.125713055645248,
-      "grad_norm": 0.6129936575889587,
-      "learning_rate": 0.0004729985992888697,
-      "loss": 3.5492,
+      "epoch": 2.129380053908356,
+      "grad_norm": 0.5615851283073425,
+      "learning_rate": 0.00047278575283324337,
+      "loss": 3.5418,
       "step": 19750
     },
     {
-      "epoch": 2.1310946076848563,
-      "grad_norm": 0.585528552532196,
-      "learning_rate": 0.00047267535825880824,
-      "loss": 3.5742,
+      "epoch": 2.1347708894878705,
+      "grad_norm": 0.546328604221344,
+      "learning_rate": 0.0004724619535887749,
+      "loss": 3.5744,
       "step": 19800
     },
     {
-      "epoch": 2.1364761597244644,
-      "grad_norm": 0.8197336792945862,
-      "learning_rate": 0.0004723521172287469,
-      "loss": 3.5695,
+      "epoch": 2.1401617250673857,
+      "grad_norm": 0.5430625081062317,
+      "learning_rate": 0.0004721381543443065,
+      "loss": 3.5731,
       "step": 19850
     },
     {
-      "epoch": 2.141857711764073,
-      "grad_norm": 0.5523197054862976,
-      "learning_rate": 0.0004720288761986855,
-      "loss": 3.5365,
+      "epoch": 2.1455525606469004,
+      "grad_norm": 0.9113521575927734,
+      "learning_rate": 0.000471814355099838,
+      "loss": 3.5625,
       "step": 19900
     },
     {
-      "epoch": 2.147239263803681,
-      "grad_norm": 0.5677591562271118,
-      "learning_rate": 0.000471705635168624,
-      "loss": 3.5599,
+      "epoch": 2.150943396226415,
+      "grad_norm": 0.5765600800514221,
+      "learning_rate": 0.0004714905558553697,
+      "loss": 3.5601,
       "step": 19950
     },
     {
-      "epoch": 2.152620815843289,
-      "grad_norm": 0.5868083238601685,
-      "learning_rate": 0.0004713823941385626,
-      "loss": 3.562,
+      "epoch": 2.1563342318059298,
+      "grad_norm": 0.5884748101234436,
+      "learning_rate": 0.0004711667566109012,
+      "loss": 3.5599,
       "step": 20000
     },
     {
-      "epoch": 2.152620815843289,
-      "eval_accuracy": 0.3632018311053878,
-      "eval_loss": 3.570688009262085,
-      "eval_runtime": 184.0571,
-      "eval_samples_per_second": 97.856,
-      "eval_steps_per_second": 6.118,
+      "epoch": 2.1563342318059298,
+      "eval_accuracy": 0.3625327466222262,
+      "eval_loss": 3.5735793113708496,
+      "eval_runtime": 183.4843,
+      "eval_samples_per_second": 98.161,
+      "eval_steps_per_second": 6.137,
       "step": 20000
     },
     {
-      "epoch": 2.1580023678828972,
-      "grad_norm": 0.561229944229126,
-      "learning_rate": 0.00047105915310850116,
-      "loss": 3.5775,
+      "epoch": 2.161725067385445,
+      "grad_norm": 0.5843698382377625,
+      "learning_rate": 0.00047084295736643273,
+      "loss": 3.5608,
       "step": 20050
     },
     {
-      "epoch": 2.163383919922506,
-      "grad_norm": 0.5729883909225464,
-      "learning_rate": 0.0004707359120784398,
-      "loss": 3.5575,
+      "epoch": 2.1671159029649596,
+      "grad_norm": 0.5710905194282532,
+      "learning_rate": 0.00047051915812196433,
+      "loss": 3.571,
       "step": 20100
     },
     {
-      "epoch": 2.168765471962114,
-      "grad_norm": 0.6092100739479065,
-      "learning_rate": 0.0004704126710483784,
-      "loss": 3.5663,
+      "epoch": 2.1725067385444743,
+      "grad_norm": 0.5336669683456421,
+      "learning_rate": 0.0004701953588774959,
+      "loss": 3.5473,
       "step": 20150
     },
     {
-      "epoch": 2.174147024001722,
-      "grad_norm": 0.5391753911972046,
-      "learning_rate": 0.0004700958948389182,
-      "loss": 3.5662,
+      "epoch": 2.177897574123989,
+      "grad_norm": 0.5899220705032349,
+      "learning_rate": 0.0004698715596330275,
+      "loss": 3.572,
       "step": 20200
     },
     {
-      "epoch": 2.1795285760413305,
-      "grad_norm": 0.5216943621635437,
-      "learning_rate": 0.0004697726538088568,
-      "loss": 3.5703,
+      "epoch": 2.183288409703504,
+      "grad_norm": 0.5227386355400085,
+      "learning_rate": 0.00046954776038855904,
+      "loss": 3.5711,
       "step": 20250
     },
     {
-      "epoch": 2.1849101280809387,
-      "grad_norm": 0.5354718565940857,
-      "learning_rate": 0.00046944941277879534,
-      "loss": 3.5427,
+      "epoch": 2.188679245283019,
+      "grad_norm": 0.5975064039230347,
+      "learning_rate": 0.00046922396114409064,
+      "loss": 3.546,
       "step": 20300
     },
     {
-      "epoch": 2.1902916801205468,
-      "grad_norm": 0.6368331909179688,
-      "learning_rate": 0.00046912617174873394,
-      "loss": 3.5627,
+      "epoch": 2.1940700808625335,
+      "grad_norm": 0.5341865420341492,
+      "learning_rate": 0.0004689001618996222,
+      "loss": 3.5638,
       "step": 20350
     },
     {
-      "epoch": 2.195673232160155,
-      "grad_norm": 0.5471709370613098,
-      "learning_rate": 0.0004688029307186725,
-      "loss": 3.5501,
+      "epoch": 2.1994609164420487,
+      "grad_norm": 0.5187112092971802,
+      "learning_rate": 0.0004685763626551538,
+      "loss": 3.5612,
       "step": 20400
     },
     {
-      "epoch": 2.2010547841997634,
-      "grad_norm": 0.5695568919181824,
-      "learning_rate": 0.00046847968968861107,
-      "loss": 3.5601,
+      "epoch": 2.2048517520215634,
+      "grad_norm": 0.5640769600868225,
+      "learning_rate": 0.00046825256341068535,
+      "loss": 3.5616,
       "step": 20450
     },
     {
-      "epoch": 2.2064363362393715,
-      "grad_norm": 0.6059097647666931,
-      "learning_rate": 0.0004681564486585497,
-      "loss": 3.5678,
+      "epoch": 2.210242587601078,
+      "grad_norm": 0.49423614144325256,
+      "learning_rate": 0.0004679287641662169,
+      "loss": 3.5668,
       "step": 20500
     },
     {
-      "epoch": 2.2118178882789796,
-      "grad_norm": 0.5702983140945435,
-      "learning_rate": 0.00046783320762848826,
-      "loss": 3.5725,
+      "epoch": 2.215633423180593,
+      "grad_norm": 0.6228650808334351,
+      "learning_rate": 0.0004676049649217485,
+      "loss": 3.5672,
       "step": 20550
     },
     {
-      "epoch": 2.2171994403185877,
-      "grad_norm": 0.5607845783233643,
-      "learning_rate": 0.00046750996659842685,
-      "loss": 3.5553,
+      "epoch": 2.221024258760108,
+      "grad_norm": 0.5874422192573547,
+      "learning_rate": 0.00046728116567728,
+      "loss": 3.5723,
       "step": 20600
     },
     {
-      "epoch": 2.2225809923581963,
-      "grad_norm": 0.5489804148674011,
-      "learning_rate": 0.00046718672556836545,
-      "loss": 3.5429,
+      "epoch": 2.2264150943396226,
+      "grad_norm": 0.5931791663169861,
+      "learning_rate": 0.0004669573664328116,
+      "loss": 3.5598,
       "step": 20650
     },
     {
-      "epoch": 2.2279625443978044,
-      "grad_norm": 0.5686173439025879,
-      "learning_rate": 0.000466863484538304,
-      "loss": 3.5843,
+      "epoch": 2.2318059299191373,
+      "grad_norm": 0.5493602156639099,
+      "learning_rate": 0.00046663356718834316,
+      "loss": 3.5606,
       "step": 20700
     },
     {
-      "epoch": 2.2333440964374125,
-      "grad_norm": 0.6201581358909607,
-      "learning_rate": 0.0004665402435082426,
-      "loss": 3.5601,
+      "epoch": 2.2371967654986524,
+      "grad_norm": 0.5716580748558044,
+      "learning_rate": 0.00046630976794387476,
+      "loss": 3.5628,
       "step": 20750
     },
     {
-      "epoch": 2.2387256484770206,
-      "grad_norm": 0.5640190243721008,
-      "learning_rate": 0.00046621700247818123,
-      "loss": 3.5754,
+      "epoch": 2.242587601078167,
+      "grad_norm": 0.5286273956298828,
+      "learning_rate": 0.0004659859686994063,
+      "loss": 3.5669,
       "step": 20800
     },
     {
-      "epoch": 2.244107200516629,
-      "grad_norm": 0.6988780498504639,
-      "learning_rate": 0.0004658937614481198,
-      "loss": 3.5475,
+      "epoch": 2.247978436657682,
+      "grad_norm": 0.5628600120544434,
+      "learning_rate": 0.0004656621694549379,
+      "loss": 3.5479,
       "step": 20850
     },
     {
-      "epoch": 2.2494887525562373,
-      "grad_norm": 0.616712749004364,
-      "learning_rate": 0.00046557052041805837,
-      "loss": 3.5727,
+      "epoch": 2.2533692722371965,
+      "grad_norm": 0.6254209280014038,
+      "learning_rate": 0.00046533837021046947,
+      "loss": 3.5606,
       "step": 20900
     },
     {
-      "epoch": 2.2548703045958454,
-      "grad_norm": 0.5751016139984131,
-      "learning_rate": 0.0004652472793879969,
-      "loss": 3.5648,
+      "epoch": 2.2587601078167117,
+      "grad_norm": 0.5592355132102966,
+      "learning_rate": 0.000465014570966001,
+      "loss": 3.565,
       "step": 20950
     },
     {
-      "epoch": 2.2602518566354535,
-      "grad_norm": 0.551199734210968,
-      "learning_rate": 0.0004649240383579355,
-      "loss": 3.5603,
+      "epoch": 2.2641509433962264,
+      "grad_norm": 0.559055507183075,
+      "learning_rate": 0.0004646907717215326,
+      "loss": 3.5482,
       "step": 21000
     },
     {
-      "epoch": 2.2602518566354535,
-      "eval_accuracy": 0.36398217614567135,
-      "eval_loss": 3.5630135536193848,
-      "eval_runtime": 184.4108,
-      "eval_samples_per_second": 97.668,
-      "eval_steps_per_second": 6.106,
+      "epoch": 2.2641509433962264,
+      "eval_accuracy": 0.36398869531894024,
+      "eval_loss": 3.559917688369751,
+      "eval_runtime": 183.7189,
+      "eval_samples_per_second": 98.036,
+      "eval_steps_per_second": 6.129,
       "step": 21000
     },
     {
-      "epoch": 2.265633408675062,
-      "grad_norm": 0.6206423044204712,
-      "learning_rate": 0.00046460079732787415,
-      "loss": 3.5794,
+      "epoch": 2.269541778975741,
+      "grad_norm": 0.5590160489082336,
+      "learning_rate": 0.0004643669724770642,
+      "loss": 3.5418,
       "step": 21050
     },
     {
-      "epoch": 2.27101496071467,
-      "grad_norm": 0.6502292156219482,
-      "learning_rate": 0.0004642775562978127,
-      "loss": 3.5543,
+      "epoch": 2.274932614555256,
+      "grad_norm": 0.5151308178901672,
+      "learning_rate": 0.0004640431732325958,
+      "loss": 3.5741,
       "step": 21100
     },
     {
-      "epoch": 2.2763965127542782,
-      "grad_norm": 0.5624255537986755,
-      "learning_rate": 0.0004639543152677513,
-      "loss": 3.5674,
+      "epoch": 2.280323450134771,
+      "grad_norm": 0.5813835859298706,
+      "learning_rate": 0.00046372584997301673,
+      "loss": 3.5675,
       "step": 21150
     },
     {
-      "epoch": 2.281778064793887,
-      "grad_norm": 0.6220043897628784,
-      "learning_rate": 0.0004636310742376899,
-      "loss": 3.5522,
+      "epoch": 2.2857142857142856,
+      "grad_norm": 0.5855079889297485,
+      "learning_rate": 0.0004634020507285483,
+      "loss": 3.5566,
       "step": 21200
     },
     {
-      "epoch": 2.287159616833495,
-      "grad_norm": 0.5274709463119507,
-      "learning_rate": 0.0004633078332076284,
-      "loss": 3.5407,
+      "epoch": 2.2911051212938007,
+      "grad_norm": 0.5452547073364258,
+      "learning_rate": 0.0004630782514840798,
+      "loss": 3.5307,
       "step": 21250
     },
     {
-      "epoch": 2.292541168873103,
-      "grad_norm": 0.5348024368286133,
-      "learning_rate": 0.000462984592177567,
-      "loss": 3.5461,
+      "epoch": 2.2964959568733154,
+      "grad_norm": 0.6098089814186096,
+      "learning_rate": 0.0004627544522396114,
+      "loss": 3.5531,
       "step": 21300
     },
     {
-      "epoch": 2.297922720912711,
-      "grad_norm": 0.5884036421775818,
-      "learning_rate": 0.00046266135114750567,
-      "loss": 3.5643,
+      "epoch": 2.30188679245283,
+      "grad_norm": 0.5738231539726257,
+      "learning_rate": 0.00046243065299514293,
+      "loss": 3.5683,
       "step": 21350
     },
     {
-      "epoch": 2.303304272952319,
-      "grad_norm": 0.5820002555847168,
-      "learning_rate": 0.0004623381101174442,
-      "loss": 3.5638,
+      "epoch": 2.3072776280323453,
+      "grad_norm": 0.5771190524101257,
+      "learning_rate": 0.00046210685375067454,
+      "loss": 3.5461,
       "step": 21400
     },
     {
-      "epoch": 2.3086858249919278,
-      "grad_norm": 0.6022721529006958,
-      "learning_rate": 0.0004620148690873828,
-      "loss": 3.5642,
+      "epoch": 2.31266846361186,
+      "grad_norm": 0.5646674633026123,
+      "learning_rate": 0.0004617830545062061,
+      "loss": 3.5582,
       "step": 21450
     },
     {
-      "epoch": 2.314067377031536,
-      "grad_norm": 0.5931360125541687,
-      "learning_rate": 0.00046169162805732134,
-      "loss": 3.5376,
+      "epoch": 2.3180592991913747,
+      "grad_norm": 0.5868499875068665,
+      "learning_rate": 0.0004614592552617377,
+      "loss": 3.5613,
       "step": 21500
     },
     {
-      "epoch": 2.319448929071144,
-      "grad_norm": 0.6036872863769531,
-      "learning_rate": 0.00046136838702725994,
-      "loss": 3.5702,
+      "epoch": 2.3234501347708894,
+      "grad_norm": 0.5620684027671814,
+      "learning_rate": 0.00046113545601726924,
+      "loss": 3.5472,
       "step": 21550
     },
     {
-      "epoch": 2.3248304811107525,
-      "grad_norm": 0.5146020650863647,
-      "learning_rate": 0.00046104514599719853,
-      "loss": 3.5693,
+      "epoch": 2.3288409703504045,
+      "grad_norm": 0.5650961995124817,
+      "learning_rate": 0.00046081165677280085,
+      "loss": 3.5579,
       "step": 21600
     },
     {
-      "epoch": 2.3302120331503606,
-      "grad_norm": 0.6073507070541382,
-      "learning_rate": 0.0004607219049671371,
-      "loss": 3.5649,
+      "epoch": 2.334231805929919,
+      "grad_norm": 0.5434009432792664,
+      "learning_rate": 0.0004604878575283324,
+      "loss": 3.5593,
       "step": 21650
     },
     {
-      "epoch": 2.3355935851899687,
-      "grad_norm": 0.57147216796875,
-      "learning_rate": 0.0004603986639370757,
-      "loss": 3.5451,
+      "epoch": 2.339622641509434,
+      "grad_norm": 0.512363851070404,
+      "learning_rate": 0.00046016405828386395,
+      "loss": 3.5606,
       "step": 21700
     },
     {
-      "epoch": 2.340975137229577,
-      "grad_norm": 0.6783694624900818,
-      "learning_rate": 0.0004600754229070143,
-      "loss": 3.5448,
+      "epoch": 2.3450134770889486,
+      "grad_norm": 0.6248358488082886,
+      "learning_rate": 0.00045984025903939555,
+      "loss": 3.5592,
       "step": 21750
     },
     {
-      "epoch": 2.3463566892691854,
-      "grad_norm": 0.5610417723655701,
-      "learning_rate": 0.00045975218187695286,
-      "loss": 3.5461,
+      "epoch": 2.3504043126684637,
+      "grad_norm": 0.5577827095985413,
+      "learning_rate": 0.0004595164597949271,
+      "loss": 3.5685,
       "step": 21800
     },
     {
-      "epoch": 2.3517382413087935,
-      "grad_norm": 0.5262538194656372,
-      "learning_rate": 0.00045942894084689145,
+      "epoch": 2.3557951482479784,
+      "grad_norm": 0.5624589920043945,
+      "learning_rate": 0.0004591926605504587,
       "loss": 3.5627,
       "step": 21850
     },
     {
-      "epoch": 2.3571197933484016,
-      "grad_norm": 0.5298727750778198,
-      "learning_rate": 0.0004591056998168301,
-      "loss": 3.5745,
+      "epoch": 2.361185983827493,
+      "grad_norm": 0.54544597864151,
+      "learning_rate": 0.0004588688613059902,
+      "loss": 3.5481,
       "step": 21900
     },
     {
-      "epoch": 2.3625013453880097,
-      "grad_norm": 0.5683722496032715,
-      "learning_rate": 0.00045878245878676864,
-      "loss": 3.5601,
+      "epoch": 2.3665768194070083,
+      "grad_norm": 0.5536805391311646,
+      "learning_rate": 0.00045854506206152186,
+      "loss": 3.5621,
       "step": 21950
     },
     {
-      "epoch": 2.3678828974276183,
-      "grad_norm": 0.5847147703170776,
-      "learning_rate": 0.00045845921775670723,
-      "loss": 3.546,
+      "epoch": 2.371967654986523,
+      "grad_norm": 0.5469595789909363,
+      "learning_rate": 0.00045822126281705336,
+      "loss": 3.562,
       "step": 22000
     },
     {
-      "epoch": 2.3678828974276183,
-      "eval_accuracy": 0.3645442375343357,
-      "eval_loss": 3.5517847537994385,
-      "eval_runtime": 183.8946,
-      "eval_samples_per_second": 97.942,
-      "eval_steps_per_second": 6.123,
+      "epoch": 2.371967654986523,
+      "eval_accuracy": 0.3649076814440751,
+      "eval_loss": 3.5507798194885254,
+      "eval_runtime": 183.8207,
+      "eval_samples_per_second": 97.981,
+      "eval_steps_per_second": 6.126,
       "step": 22000
     },
     {
-      "epoch": 2.3732644494672264,
-      "grad_norm": 0.5403057932853699,
-      "learning_rate": 0.0004581359767266458,
-      "loss": 3.5433,
+      "epoch": 2.3773584905660377,
+      "grad_norm": 0.5918774604797363,
+      "learning_rate": 0.00045789746357258497,
+      "loss": 3.538,
       "step": 22050
     },
     {
-      "epoch": 2.3786460015068345,
-      "grad_norm": 0.5787127614021301,
-      "learning_rate": 0.00045781273569658437,
-      "loss": 3.5425,
+      "epoch": 2.382749326145553,
+      "grad_norm": 0.5457159876823425,
+      "learning_rate": 0.0004575736643281165,
+      "loss": 3.5609,
       "step": 22100
     },
     {
-      "epoch": 2.384027553546443,
-      "grad_norm": 0.5755758881568909,
-      "learning_rate": 0.00045748949466652296,
-      "loss": 3.5561,
+      "epoch": 2.3881401617250675,
+      "grad_norm": 0.5094021558761597,
+      "learning_rate": 0.00045724986508364807,
+      "loss": 3.5564,
       "step": 22150
     },
     {
-      "epoch": 2.389409105586051,
-      "grad_norm": 0.6487693786621094,
-      "learning_rate": 0.00045716625363646156,
-      "loss": 3.5523,
+      "epoch": 2.393530997304582,
+      "grad_norm": 0.6784635186195374,
+      "learning_rate": 0.00045692606583917967,
+      "loss": 3.5821,
       "step": 22200
     },
     {
-      "epoch": 2.3947906576256592,
-      "grad_norm": 0.531506359577179,
-      "learning_rate": 0.00045684301260640015,
-      "loss": 3.5629,
+      "epoch": 2.398921832884097,
+      "grad_norm": 0.5692277550697327,
+      "learning_rate": 0.0004566022665947112,
+      "loss": 3.5521,
       "step": 22250
     },
     {
-      "epoch": 2.4001722096652673,
-      "grad_norm": 0.6475794315338135,
-      "learning_rate": 0.00045651977157633875,
-      "loss": 3.569,
+      "epoch": 2.404312668463612,
+      "grad_norm": 0.616717517375946,
+      "learning_rate": 0.0004562784673502428,
+      "loss": 3.5324,
       "step": 22300
     },
     {
-      "epoch": 2.4055537617048754,
-      "grad_norm": 0.5457855463027954,
-      "learning_rate": 0.0004561965305462773,
-      "loss": 3.5683,
+      "epoch": 2.4097035040431267,
+      "grad_norm": 0.5351888537406921,
+      "learning_rate": 0.0004559546681057744,
+      "loss": 3.5557,
       "step": 22350
     },
     {
-      "epoch": 2.410935313744484,
-      "grad_norm": 0.5556653141975403,
-      "learning_rate": 0.0004558732895162159,
-      "loss": 3.5537,
+      "epoch": 2.4150943396226414,
+      "grad_norm": 0.5345177054405212,
+      "learning_rate": 0.000455630868861306,
+      "loss": 3.5455,
       "step": 22400
     },
     {
-      "epoch": 2.416316865784092,
-      "grad_norm": 0.596947968006134,
-      "learning_rate": 0.0004555565133067557,
-      "loss": 3.552,
+      "epoch": 2.420485175202156,
+      "grad_norm": 0.5832816958427429,
+      "learning_rate": 0.00045530706961683753,
+      "loss": 3.5731,
       "step": 22450
     },
     {
-      "epoch": 2.4216984178237,
-      "grad_norm": 0.552823543548584,
-      "learning_rate": 0.0004552332722766943,
-      "loss": 3.5577,
+      "epoch": 2.4258760107816713,
+      "grad_norm": 0.6277952194213867,
+      "learning_rate": 0.00045498327037236914,
+      "loss": 3.5471,
       "step": 22500
     },
     {
-      "epoch": 2.4270799698633088,
-      "grad_norm": 0.5624712705612183,
-      "learning_rate": 0.0004549100312466328,
-      "loss": 3.5463,
+      "epoch": 2.431266846361186,
+      "grad_norm": 0.5768845677375793,
+      "learning_rate": 0.0004546594711279007,
+      "loss": 3.553,
       "step": 22550
     },
     {
-      "epoch": 2.432461521902917,
-      "grad_norm": 0.5931119322776794,
-      "learning_rate": 0.0004545867902165715,
-      "loss": 3.5592,
+      "epoch": 2.4366576819407006,
+      "grad_norm": 0.5368415713310242,
+      "learning_rate": 0.0004543356718834322,
+      "loss": 3.5538,
       "step": 22600
     },
     {
-      "epoch": 2.437843073942525,
-      "grad_norm": 0.6037808656692505,
-      "learning_rate": 0.00045426354918651007,
-      "loss": 3.547,
+      "epoch": 2.442048517520216,
+      "grad_norm": 0.6223293542861938,
+      "learning_rate": 0.0004540118726389638,
+      "loss": 3.5499,
       "step": 22650
     },
     {
-      "epoch": 2.443224625982133,
-      "grad_norm": 0.5643433928489685,
-      "learning_rate": 0.0004539403081564486,
-      "loss": 3.5637,
+      "epoch": 2.4474393530997305,
+      "grad_norm": 0.5658764243125916,
+      "learning_rate": 0.00045368807339449534,
+      "loss": 3.5387,
       "step": 22700
     },
     {
-      "epoch": 2.4486061780217416,
-      "grad_norm": 0.5612767934799194,
-      "learning_rate": 0.0004536170671263872,
-      "loss": 3.5429,
+      "epoch": 2.452830188679245,
+      "grad_norm": 0.58261638879776,
+      "learning_rate": 0.00045336427415002694,
+      "loss": 3.5648,
       "step": 22750
     },
     {
-      "epoch": 2.4539877300613497,
-      "grad_norm": 0.5697504878044128,
-      "learning_rate": 0.00045329382609632574,
-      "loss": 3.5538,
+      "epoch": 2.4582210242587603,
+      "grad_norm": 0.5521255731582642,
+      "learning_rate": 0.0004530404749055585,
+      "loss": 3.5745,
       "step": 22800
     },
     {
-      "epoch": 2.459369282100958,
-      "grad_norm": 0.5525127053260803,
-      "learning_rate": 0.0004529705850662644,
-      "loss": 3.5529,
+      "epoch": 2.463611859838275,
+      "grad_norm": 0.5698715448379517,
+      "learning_rate": 0.0004527166756610901,
+      "loss": 3.5392,
       "step": 22850
     },
     {
-      "epoch": 2.464750834140566,
-      "grad_norm": 0.6191689968109131,
-      "learning_rate": 0.000452647344036203,
-      "loss": 3.5386,
+      "epoch": 2.4690026954177897,
+      "grad_norm": 0.5453882813453674,
+      "learning_rate": 0.00045239287641662165,
+      "loss": 3.5493,
       "step": 22900
     },
     {
-      "epoch": 2.4701323861801745,
-      "grad_norm": 0.5639016032218933,
-      "learning_rate": 0.00045232410300614153,
-      "loss": 3.5754,
+      "epoch": 2.4743935309973044,
+      "grad_norm": 0.5743011832237244,
+      "learning_rate": 0.0004520690771721532,
+      "loss": 3.553,
       "step": 22950
     },
     {
-      "epoch": 2.4755139382197826,
-      "grad_norm": 0.6116729378700256,
-      "learning_rate": 0.0004520008619760801,
-      "loss": 3.5561,
+      "epoch": 2.4797843665768196,
+      "grad_norm": 0.5805160999298096,
+      "learning_rate": 0.0004517452779276848,
+      "loss": 3.5459,
       "step": 23000
     },
     {
-      "epoch": 2.4755139382197826,
-      "eval_accuracy": 0.3660388666591117,
-      "eval_loss": 3.5395596027374268,
-      "eval_runtime": 184.1301,
-      "eval_samples_per_second": 97.817,
-      "eval_steps_per_second": 6.115,
+      "epoch": 2.4797843665768196,
+      "eval_accuracy": 0.3661536041086438,
+      "eval_loss": 3.5383236408233643,
+      "eval_runtime": 183.8471,
+      "eval_samples_per_second": 97.967,
+      "eval_steps_per_second": 6.125,
       "step": 23000
     },
     {
-      "epoch": 2.4808954902593907,
-      "grad_norm": 0.5503861904144287,
-      "learning_rate": 0.0004516776209460187,
-      "loss": 3.5527,
+      "epoch": 2.4851752021563343,
+      "grad_norm": 0.5620664954185486,
+      "learning_rate": 0.00045142147868321636,
+      "loss": 3.5437,
       "step": 23050
     },
     {
-      "epoch": 2.4862770422989993,
-      "grad_norm": 0.5716922283172607,
-      "learning_rate": 0.00045135437991595726,
-      "loss": 3.5307,
+      "epoch": 2.490566037735849,
+      "grad_norm": 0.5519572496414185,
+      "learning_rate": 0.00045109767943874796,
+      "loss": 3.5415,
       "step": 23100
     },
     {
-      "epoch": 2.4916585943386074,
-      "grad_norm": 0.5459892153739929,
-      "learning_rate": 0.0004510311388858959,
-      "loss": 3.5607,
+      "epoch": 2.4959568733153636,
+      "grad_norm": 0.5925480723381042,
+      "learning_rate": 0.0004507738801942795,
+      "loss": 3.538,
       "step": 23150
     },
     {
-      "epoch": 2.4970401463782155,
-      "grad_norm": 0.6022793054580688,
-      "learning_rate": 0.0004507078978558345,
-      "loss": 3.5287,
+      "epoch": 2.501347708894879,
+      "grad_norm": 0.5759234428405762,
+      "learning_rate": 0.0004504500809498111,
+      "loss": 3.5515,
       "step": 23200
     },
     {
-      "epoch": 2.5024216984178236,
-      "grad_norm": 0.586172342300415,
-      "learning_rate": 0.00045038465682577304,
-      "loss": 3.5393,
+      "epoch": 2.5067385444743935,
+      "grad_norm": 0.5272903442382812,
+      "learning_rate": 0.0004501262817053426,
+      "loss": 3.5644,
       "step": 23250
     },
     {
-      "epoch": 2.5078032504574317,
-      "grad_norm": 0.5649908185005188,
-      "learning_rate": 0.00045006141579571164,
-      "loss": 3.5369,
+      "epoch": 2.512129380053908,
+      "grad_norm": 0.567111074924469,
+      "learning_rate": 0.00044980248246087427,
+      "loss": 3.5624,
       "step": 23300
     },
     {
-      "epoch": 2.5131848024970402,
-      "grad_norm": 0.6035097241401672,
-      "learning_rate": 0.0004497381747656502,
-      "loss": 3.5708,
+      "epoch": 2.5175202156334233,
+      "grad_norm": 0.5605776906013489,
+      "learning_rate": 0.00044947868321640577,
+      "loss": 3.5369,
       "step": 23350
     },
     {
-      "epoch": 2.5185663545366483,
-      "grad_norm": 0.5504043698310852,
-      "learning_rate": 0.00044941493373558877,
-      "loss": 3.5433,
+      "epoch": 2.522911051212938,
+      "grad_norm": 0.5629092454910278,
+      "learning_rate": 0.0004491548839719373,
+      "loss": 3.5407,
       "step": 23400
     },
     {
-      "epoch": 2.5239479065762565,
-      "grad_norm": 0.5592088103294373,
-      "learning_rate": 0.0004490916927055274,
-      "loss": 3.5428,
+      "epoch": 2.5283018867924527,
+      "grad_norm": 0.5650578141212463,
+      "learning_rate": 0.00044883756071235827,
+      "loss": 3.5449,
       "step": 23450
     },
     {
-      "epoch": 2.529329458615865,
-      "grad_norm": 0.5921136140823364,
-      "learning_rate": 0.00044876845167546596,
-      "loss": 3.5478,
+      "epoch": 2.533692722371968,
+      "grad_norm": 0.5151700973510742,
+      "learning_rate": 0.0004485137614678899,
+      "loss": 3.5247,
       "step": 23500
     },
     {
-      "epoch": 2.534711010655473,
-      "grad_norm": 0.5895003080368042,
-      "learning_rate": 0.00044844521064540455,
-      "loss": 3.5392,
+      "epoch": 2.5390835579514826,
+      "grad_norm": 0.6077624559402466,
+      "learning_rate": 0.0004481899622234214,
+      "loss": 3.5611,
       "step": 23550
     },
     {
-      "epoch": 2.540092562695081,
-      "grad_norm": 0.5717202425003052,
-      "learning_rate": 0.00044812196961534315,
-      "loss": 3.5425,
+      "epoch": 2.5444743935309972,
+      "grad_norm": 0.572530210018158,
+      "learning_rate": 0.00044786616297895303,
+      "loss": 3.5533,
       "step": 23600
     },
     {
-      "epoch": 2.5454741147346893,
-      "grad_norm": 0.5879724025726318,
-      "learning_rate": 0.0004477987285852817,
-      "loss": 3.5676,
+      "epoch": 2.5498652291105124,
+      "grad_norm": 0.5659880638122559,
+      "learning_rate": 0.0004475423637344846,
+      "loss": 3.5459,
       "step": 23650
     },
     {
-      "epoch": 2.550855666774298,
-      "grad_norm": 0.5525007247924805,
-      "learning_rate": 0.00044747548755522034,
-      "loss": 3.5493,
+      "epoch": 2.555256064690027,
+      "grad_norm": 0.5345480442047119,
+      "learning_rate": 0.00044721856449001613,
+      "loss": 3.5601,
       "step": 23700
     },
     {
-      "epoch": 2.556237218813906,
-      "grad_norm": 0.6007914543151855,
-      "learning_rate": 0.00044715224652515893,
-      "loss": 3.5563,
+      "epoch": 2.560646900269542,
+      "grad_norm": 0.5903283953666687,
+      "learning_rate": 0.00044689476524554774,
+      "loss": 3.5685,
       "step": 23750
     },
     {
-      "epoch": 2.561618770853514,
-      "grad_norm": 0.5593198537826538,
-      "learning_rate": 0.0004468290054950975,
-      "loss": 3.5568,
+      "epoch": 2.5660377358490565,
+      "grad_norm": 0.6142652630805969,
+      "learning_rate": 0.0004465709660010793,
+      "loss": 3.5422,
       "step": 23800
     },
     {
-      "epoch": 2.567000322893122,
-      "grad_norm": 0.5726816058158875,
-      "learning_rate": 0.00044650576446503607,
-      "loss": 3.5613,
+      "epoch": 2.571428571428571,
+      "grad_norm": 0.5753684043884277,
+      "learning_rate": 0.0004462471667566109,
+      "loss": 3.5405,
       "step": 23850
     },
     {
-      "epoch": 2.5723818749327307,
-      "grad_norm": 0.5541634559631348,
-      "learning_rate": 0.0004461825234349746,
-      "loss": 3.5451,
+      "epoch": 2.5768194070080863,
+      "grad_norm": 0.6161198616027832,
+      "learning_rate": 0.00044592336751214244,
+      "loss": 3.539,
       "step": 23900
     },
     {
-      "epoch": 2.577763426972339,
-      "grad_norm": 0.545291543006897,
-      "learning_rate": 0.0004458592824049132,
-      "loss": 3.5541,
+      "epoch": 2.582210242587601,
+      "grad_norm": 0.5361372828483582,
+      "learning_rate": 0.00044559956826767405,
+      "loss": 3.5651,
       "step": 23950
     },
     {
-      "epoch": 2.583144979011947,
-      "grad_norm": 0.5981674194335938,
-      "learning_rate": 0.00044553604137485185,
-      "loss": 3.5528,
+      "epoch": 2.5876010781671157,
+      "grad_norm": 0.5892603397369385,
+      "learning_rate": 0.00044527576902320554,
+      "loss": 3.5462,
       "step": 24000
     },
     {
-      "epoch": 2.583144979011947,
-      "eval_accuracy": 0.36709540734021967,
-      "eval_loss": 3.530630588531494,
-      "eval_runtime": 184.0185,
-      "eval_samples_per_second": 97.876,
-      "eval_steps_per_second": 6.119,
+      "epoch": 2.5876010781671157,
+      "eval_accuracy": 0.3668596305736623,
+      "eval_loss": 3.531897783279419,
+      "eval_runtime": 183.6864,
+      "eval_samples_per_second": 98.053,
+      "eval_steps_per_second": 6.13,
       "step": 24000
     },
     {
-      "epoch": 2.5885265310515555,
-      "grad_norm": 0.6674214005470276,
-      "learning_rate": 0.0004452128003447904,
-      "loss": 3.5395,
+      "epoch": 2.592991913746631,
+      "grad_norm": 0.6340603828430176,
+      "learning_rate": 0.00044495196977873715,
+      "loss": 3.5501,
       "step": 24050
     },
     {
-      "epoch": 2.5939080830911636,
-      "grad_norm": 0.5705569982528687,
-      "learning_rate": 0.000444889559314729,
-      "loss": 3.5585,
+      "epoch": 2.5983827493261455,
+      "grad_norm": 0.597061812877655,
+      "learning_rate": 0.0004446281705342687,
+      "loss": 3.5263,
       "step": 24100
     },
     {
-      "epoch": 2.5992896351307717,
-      "grad_norm": 0.5629080533981323,
-      "learning_rate": 0.0004445663182846676,
-      "loss": 3.5464,
+      "epoch": 2.6037735849056602,
+      "grad_norm": 0.5808542370796204,
+      "learning_rate": 0.00044430437128980025,
+      "loss": 3.5526,
       "step": 24150
     },
     {
-      "epoch": 2.60467118717038,
-      "grad_norm": 0.5345271229743958,
-      "learning_rate": 0.0004442430772546061,
-      "loss": 3.547,
+      "epoch": 2.6091644204851754,
+      "grad_norm": 0.5798895955085754,
+      "learning_rate": 0.00044398057204533185,
+      "loss": 3.5424,
       "step": 24200
     },
     {
-      "epoch": 2.610052739209988,
-      "grad_norm": 0.6864102482795715,
-      "learning_rate": 0.0004439198362245447,
-      "loss": 3.5398,
+      "epoch": 2.61455525606469,
+      "grad_norm": 0.5814908742904663,
+      "learning_rate": 0.0004436567728008634,
+      "loss": 3.5475,
       "step": 24250
     },
     {
-      "epoch": 2.6154342912495965,
-      "grad_norm": 0.5583524107933044,
-      "learning_rate": 0.00044359659519448337,
-      "loss": 3.534,
+      "epoch": 2.6199460916442048,
+      "grad_norm": 0.5761340856552124,
+      "learning_rate": 0.000443332973556395,
+      "loss": 3.5534,
       "step": 24300
     },
     {
-      "epoch": 2.6208158432892046,
-      "grad_norm": 0.5800455808639526,
-      "learning_rate": 0.0004432733541644219,
-      "loss": 3.5389,
+      "epoch": 2.62533692722372,
+      "grad_norm": 0.5729596614837646,
+      "learning_rate": 0.00044300917431192656,
+      "loss": 3.5558,
       "step": 24350
     },
     {
-      "epoch": 2.6261973953288127,
-      "grad_norm": 0.577061653137207,
-      "learning_rate": 0.0004429501131343605,
-      "loss": 3.5386,
+      "epoch": 2.6307277628032346,
+      "grad_norm": 0.5785598754882812,
+      "learning_rate": 0.00044268537506745816,
+      "loss": 3.5308,
       "step": 24400
     },
     {
-      "epoch": 2.6315789473684212,
-      "grad_norm": 0.5940808653831482,
-      "learning_rate": 0.00044262687210429904,
-      "loss": 3.5421,
+      "epoch": 2.6361185983827493,
+      "grad_norm": 0.5517538785934448,
+      "learning_rate": 0.0004423615758229897,
+      "loss": 3.5567,
       "step": 24450
     },
     {
-      "epoch": 2.6369604994080293,
-      "grad_norm": 0.55911785364151,
-      "learning_rate": 0.00044230363107423764,
-      "loss": 3.5452,
+      "epoch": 2.641509433962264,
+      "grad_norm": 0.5444765686988831,
+      "learning_rate": 0.0004420377765785213,
+      "loss": 3.5214,
       "step": 24500
     },
     {
-      "epoch": 2.6423420514476375,
-      "grad_norm": 0.6087802648544312,
-      "learning_rate": 0.0004419803900441762,
-      "loss": 3.5462,
+      "epoch": 2.6469002695417787,
+      "grad_norm": 0.5362924933433533,
+      "learning_rate": 0.00044171397733405287,
+      "loss": 3.5284,
       "step": 24550
     },
     {
-      "epoch": 2.6477236034872456,
-      "grad_norm": 0.5640580654144287,
-      "learning_rate": 0.0004416571490141148,
-      "loss": 3.5278,
+      "epoch": 2.652291105121294,
+      "grad_norm": 0.5639698505401611,
+      "learning_rate": 0.00044139017808958437,
+      "loss": 3.5345,
       "step": 24600
     },
     {
-      "epoch": 2.653105155526854,
-      "grad_norm": 0.6379339098930359,
-      "learning_rate": 0.0004413339079840534,
-      "loss": 3.5637,
+      "epoch": 2.6576819407008085,
+      "grad_norm": 0.5087600350379944,
+      "learning_rate": 0.00044106637884511597,
+      "loss": 3.5388,
       "step": 24650
     },
     {
-      "epoch": 2.658486707566462,
-      "grad_norm": 0.5877749919891357,
-      "learning_rate": 0.00044101066695399196,
-      "loss": 3.5535,
+      "epoch": 2.6630727762803232,
+      "grad_norm": 0.520511269569397,
+      "learning_rate": 0.0004407425796006475,
+      "loss": 3.5433,
       "step": 24700
     },
     {
-      "epoch": 2.6638682596060703,
-      "grad_norm": 0.5544137954711914,
-      "learning_rate": 0.0004406938907445318,
-      "loss": 3.5375,
+      "epoch": 2.6684636118598384,
+      "grad_norm": 0.5917512774467468,
+      "learning_rate": 0.00044041878035617913,
+      "loss": 3.5557,
       "step": 24750
     },
     {
-      "epoch": 2.6692498116456784,
-      "grad_norm": 0.5193669199943542,
-      "learning_rate": 0.00044037064971447036,
-      "loss": 3.5324,
+      "epoch": 2.673854447439353,
+      "grad_norm": 0.5649670958518982,
+      "learning_rate": 0.0004400949811117107,
+      "loss": 3.5326,
       "step": 24800
     },
     {
-      "epoch": 2.674631363685287,
-      "grad_norm": 0.5993558764457703,
-      "learning_rate": 0.00044004740868440896,
-      "loss": 3.5171,
+      "epoch": 2.6792452830188678,
+      "grad_norm": 0.5855710506439209,
+      "learning_rate": 0.0004397711818672423,
+      "loss": 3.5349,
       "step": 24850
     },
     {
-      "epoch": 2.680012915724895,
-      "grad_norm": 0.606824517250061,
-      "learning_rate": 0.00043972416765434755,
-      "loss": 3.5454,
+      "epoch": 2.684636118598383,
+      "grad_norm": 0.5762926936149597,
+      "learning_rate": 0.00043944738262277383,
+      "loss": 3.5423,
       "step": 24900
     },
     {
-      "epoch": 2.685394467764503,
-      "grad_norm": 0.5386514067649841,
-      "learning_rate": 0.00043940092662428615,
-      "loss": 3.5478,
+      "epoch": 2.6900269541778976,
+      "grad_norm": 0.5336632132530212,
+      "learning_rate": 0.00043912358337830544,
+      "loss": 3.5393,
       "step": 24950
     },
     {
-      "epoch": 2.6907760198041117,
-      "grad_norm": 0.5610313415527344,
-      "learning_rate": 0.00043907768559422474,
-      "loss": 3.5447,
+      "epoch": 2.6954177897574123,
+      "grad_norm": 0.5700528621673584,
+      "learning_rate": 0.000438799784133837,
+      "loss": 3.5523,
       "step": 25000
     },
     {
-      "epoch": 2.6907760198041117,
-      "eval_accuracy": 0.36763606410998456,
-      "eval_loss": 3.5236897468566895,
-      "eval_runtime": 184.5139,
-      "eval_samples_per_second": 97.613,
-      "eval_steps_per_second": 6.103,
+      "epoch": 2.6954177897574123,
+      "eval_accuracy": 0.367980167805693,
+      "eval_loss": 3.522153615951538,
+      "eval_runtime": 184.4043,
+      "eval_samples_per_second": 97.671,
+      "eval_steps_per_second": 6.106,
       "step": 25000
     },
     {
-      "epoch": 2.69615757184372,
-      "grad_norm": 0.5900009274482727,
-      "learning_rate": 0.00043875444456416334,
-      "loss": 3.5374,
+      "epoch": 2.7008086253369274,
+      "grad_norm": 0.6027398705482483,
+      "learning_rate": 0.00043847598488936854,
+      "loss": 3.5516,
       "step": 25050
     },
     {
-      "epoch": 2.701539123883328,
-      "grad_norm": 0.5528219938278198,
-      "learning_rate": 0.0004384312035341019,
-      "loss": 3.5508,
+      "epoch": 2.706199460916442,
+      "grad_norm": 0.5879538059234619,
+      "learning_rate": 0.00043815218564490014,
+      "loss": 3.5622,
       "step": 25100
     },
     {
-      "epoch": 2.706920675922936,
-      "grad_norm": 0.5879670977592468,
-      "learning_rate": 0.00043810796250404047,
-      "loss": 3.5379,
+      "epoch": 2.711590296495957,
+      "grad_norm": 0.5743058323860168,
+      "learning_rate": 0.0004378283864004317,
+      "loss": 3.5443,
       "step": 25150
     },
     {
-      "epoch": 2.712302227962544,
-      "grad_norm": 0.6659033894538879,
-      "learning_rate": 0.000437784721473979,
-      "loss": 3.539,
+      "epoch": 2.7169811320754715,
+      "grad_norm": 0.5928617715835571,
+      "learning_rate": 0.0004375045871559633,
+      "loss": 3.5291,
       "step": 25200
     },
     {
-      "epoch": 2.7176837800021527,
-      "grad_norm": 0.5466210246086121,
-      "learning_rate": 0.00043746148044391766,
-      "loss": 3.5477,
+      "epoch": 2.7223719676549867,
+      "grad_norm": 0.5669053196907043,
+      "learning_rate": 0.00043718078791149485,
+      "loss": 3.5318,
       "step": 25250
     },
     {
-      "epoch": 2.723065332041761,
-      "grad_norm": 0.5327200889587402,
-      "learning_rate": 0.00043713823941385625,
-      "loss": 3.546,
+      "epoch": 2.7277628032345014,
+      "grad_norm": 0.578813374042511,
+      "learning_rate": 0.00043685698866702645,
+      "loss": 3.5408,
       "step": 25300
     },
     {
-      "epoch": 2.728446884081369,
-      "grad_norm": 0.5748312473297119,
-      "learning_rate": 0.0004368149983837948,
-      "loss": 3.5431,
+      "epoch": 2.733153638814016,
+      "grad_norm": 0.538439929485321,
+      "learning_rate": 0.00043653318942255795,
+      "loss": 3.5313,
       "step": 25350
     },
     {
-      "epoch": 2.7338284361209775,
-      "grad_norm": 0.5589819550514221,
-      "learning_rate": 0.0004364917573537334,
-      "loss": 3.5408,
+      "epoch": 2.7385444743935308,
+      "grad_norm": 0.5391579270362854,
+      "learning_rate": 0.00043620939017808956,
+      "loss": 3.5237,
       "step": 25400
     },
     {
-      "epoch": 2.7392099881605856,
-      "grad_norm": 0.5818286538124084,
-      "learning_rate": 0.00043616851632367193,
-      "loss": 3.541,
+      "epoch": 2.743935309973046,
+      "grad_norm": 0.5517474412918091,
+      "learning_rate": 0.0004358855909336211,
+      "loss": 3.5417,
       "step": 25450
     },
     {
-      "epoch": 2.7445915402001937,
-      "grad_norm": 0.6057955026626587,
-      "learning_rate": 0.0004358452752936106,
-      "loss": 3.5448,
+      "epoch": 2.7493261455525606,
+      "grad_norm": 0.5709769129753113,
+      "learning_rate": 0.00043556179168915266,
+      "loss": 3.5577,
       "step": 25500
     },
     {
-      "epoch": 2.749973092239802,
-      "grad_norm": 0.5754573345184326,
-      "learning_rate": 0.0004355220342635492,
-      "loss": 3.5289,
+      "epoch": 2.7547169811320753,
+      "grad_norm": 0.6115706562995911,
+      "learning_rate": 0.00043523799244468426,
+      "loss": 3.5518,
       "step": 25550
     },
     {
-      "epoch": 2.7553546442794103,
-      "grad_norm": 0.5715060830116272,
-      "learning_rate": 0.00043519879323348777,
-      "loss": 3.5369,
+      "epoch": 2.7601078167115904,
+      "grad_norm": 0.58238685131073,
+      "learning_rate": 0.0004349141932002158,
+      "loss": 3.5414,
       "step": 25600
     },
     {
-      "epoch": 2.7607361963190185,
-      "grad_norm": 0.6356561779975891,
-      "learning_rate": 0.0004348755522034263,
-      "loss": 3.5337,
+      "epoch": 2.765498652291105,
+      "grad_norm": 0.5621404051780701,
+      "learning_rate": 0.0004345903939557474,
+      "loss": 3.5288,
       "step": 25650
     },
     {
-      "epoch": 2.7661177483586266,
-      "grad_norm": 0.5628809928894043,
-      "learning_rate": 0.0004345523111733649,
-      "loss": 3.5237,
+      "epoch": 2.77088948787062,
+      "grad_norm": 0.5885303616523743,
+      "learning_rate": 0.00043426659471127897,
+      "loss": 3.5514,
       "step": 25700
     },
     {
-      "epoch": 2.7714993003982347,
-      "grad_norm": 0.5270907282829285,
-      "learning_rate": 0.00043422907014330344,
-      "loss": 3.5477,
+      "epoch": 2.776280323450135,
+      "grad_norm": 0.5688592195510864,
+      "learning_rate": 0.00043394279546681057,
+      "loss": 3.5349,
       "step": 25750
     },
     {
-      "epoch": 2.776880852437843,
-      "grad_norm": 0.6290378570556641,
-      "learning_rate": 0.0004339058291132421,
-      "loss": 3.5433,
+      "epoch": 2.7816711590296497,
+      "grad_norm": 0.5848641395568848,
+      "learning_rate": 0.0004336189962223421,
+      "loss": 3.5355,
       "step": 25800
     },
     {
-      "epoch": 2.7822624044774513,
-      "grad_norm": 0.5788700580596924,
-      "learning_rate": 0.0004335825880831807,
-      "loss": 3.527,
+      "epoch": 2.7870619946091644,
+      "grad_norm": 0.5531324744224548,
+      "learning_rate": 0.0004332951969778737,
+      "loss": 3.523,
       "step": 25850
     },
     {
-      "epoch": 2.7876439565170594,
-      "grad_norm": 0.5836464166641235,
-      "learning_rate": 0.00043325934705311923,
-      "loss": 3.5346,
+      "epoch": 2.7924528301886795,
+      "grad_norm": 0.5385251045227051,
+      "learning_rate": 0.0004329713977334053,
+      "loss": 3.5402,
       "step": 25900
     },
     {
-      "epoch": 2.793025508556668,
-      "grad_norm": 0.5945377349853516,
-      "learning_rate": 0.0004329361060230578,
-      "loss": 3.5364,
+      "epoch": 2.797843665768194,
+      "grad_norm": 0.5430009961128235,
+      "learning_rate": 0.0004326475984889368,
+      "loss": 3.5146,
       "step": 25950
     },
     {
-      "epoch": 2.798407060596276,
-      "grad_norm": 0.5505304336547852,
-      "learning_rate": 0.00043261286499299636,
-      "loss": 3.5386,
+      "epoch": 2.803234501347709,
+      "grad_norm": 0.562092661857605,
+      "learning_rate": 0.0004323237992444684,
+      "loss": 3.5339,
       "step": 26000
     },
     {
-      "epoch": 2.798407060596276,
-      "eval_accuracy": 0.3685904710765469,
-      "eval_loss": 3.5148777961730957,
-      "eval_runtime": 184.2338,
-      "eval_samples_per_second": 97.762,
-      "eval_steps_per_second": 6.112,
+      "epoch": 2.803234501347709,
+      "eval_accuracy": 0.36839402665537835,
+      "eval_loss": 3.515779733657837,
+      "eval_runtime": 183.9113,
+      "eval_samples_per_second": 97.933,
+      "eval_steps_per_second": 6.123,
       "step": 26000
     },
     {
-      "epoch": 2.803788612635884,
-      "grad_norm": 0.6029397249221802,
-      "learning_rate": 0.00043228962396293496,
-      "loss": 3.5251,
+      "epoch": 2.8086253369272236,
+      "grad_norm": 0.5559754967689514,
+      "learning_rate": 0.00043199999999999993,
+      "loss": 3.539,
       "step": 26050
     },
     {
-      "epoch": 2.8091701646754923,
-      "grad_norm": 0.640298068523407,
-      "learning_rate": 0.0004319663829328736,
-      "loss": 3.5283,
+      "epoch": 2.8140161725067383,
+      "grad_norm": 0.5079193711280823,
+      "learning_rate": 0.00043167620075553153,
+      "loss": 3.5246,
       "step": 26100
     },
     {
-      "epoch": 2.8145517167151004,
-      "grad_norm": 0.6309469938278198,
-      "learning_rate": 0.00043164314190281215,
-      "loss": 3.5271,
+      "epoch": 2.8194070080862534,
+      "grad_norm": 0.572603702545166,
+      "learning_rate": 0.0004313524015110631,
+      "loss": 3.5364,
       "step": 26150
     },
     {
-      "epoch": 2.819933268754709,
-      "grad_norm": 0.5581336617469788,
-      "learning_rate": 0.00043131990087275074,
-      "loss": 3.5351,
+      "epoch": 2.824797843665768,
+      "grad_norm": 0.6666261553764343,
+      "learning_rate": 0.0004310286022665947,
+      "loss": 3.5467,
       "step": 26200
     },
     {
-      "epoch": 2.825314820794317,
-      "grad_norm": 0.6591759324073792,
-      "learning_rate": 0.00043099665984268934,
-      "loss": 3.5215,
+      "epoch": 2.830188679245283,
+      "grad_norm": 0.5429426431655884,
+      "learning_rate": 0.00043070480302212624,
+      "loss": 3.5337,
       "step": 26250
     },
     {
-      "epoch": 2.830696372833925,
-      "grad_norm": 0.5295543074607849,
-      "learning_rate": 0.0004306734188126279,
-      "loss": 3.5443,
+      "epoch": 2.835579514824798,
+      "grad_norm": 0.5463466048240662,
+      "learning_rate": 0.0004303874797625472,
+      "loss": 3.5309,
       "step": 26300
     },
     {
-      "epoch": 2.8360779248735337,
-      "grad_norm": 0.564895749092102,
-      "learning_rate": 0.00043035017778256647,
-      "loss": 3.5219,
+      "epoch": 2.8409703504043127,
+      "grad_norm": 0.5213525295257568,
+      "learning_rate": 0.00043006368051807874,
+      "loss": 3.5448,
       "step": 26350
     },
     {
-      "epoch": 2.841459476913142,
-      "grad_norm": 0.578912079334259,
-      "learning_rate": 0.0004300269367525051,
-      "loss": 3.5432,
+      "epoch": 2.8463611859838274,
+      "grad_norm": 0.5590488910675049,
+      "learning_rate": 0.00042973988127361035,
+      "loss": 3.5269,
       "step": 26400
     },
     {
-      "epoch": 2.84684102895275,
-      "grad_norm": 1.0079190731048584,
-      "learning_rate": 0.00042970369572244366,
-      "loss": 3.5434,
+      "epoch": 2.8517520215633425,
+      "grad_norm": 0.5565256476402283,
+      "learning_rate": 0.0004294160820291419,
+      "loss": 3.5379,
       "step": 26450
     },
     {
-      "epoch": 2.852222580992358,
-      "grad_norm": 0.5640878081321716,
-      "learning_rate": 0.00042938045469238226,
-      "loss": 3.5186,
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.5384224057197571,
+      "learning_rate": 0.0004290922827846735,
+      "loss": 3.5254,
       "step": 26500
     },
     {
-      "epoch": 2.857604133031966,
-      "grad_norm": 0.5524346828460693,
-      "learning_rate": 0.0004290572136623208,
-      "loss": 3.5352,
+      "epoch": 2.862533692722372,
+      "grad_norm": 0.5617730617523193,
+      "learning_rate": 0.00042876848354020505,
+      "loss": 3.5092,
       "step": 26550
     },
     {
-      "epoch": 2.8629856850715747,
-      "grad_norm": 0.568709671497345,
-      "learning_rate": 0.0004287339726322594,
-      "loss": 3.5349,
+      "epoch": 2.867924528301887,
+      "grad_norm": 0.5675151944160461,
+      "learning_rate": 0.00042844468429573655,
+      "loss": 3.5319,
       "step": 26600
     },
     {
-      "epoch": 2.868367237111183,
-      "grad_norm": 0.533467710018158,
-      "learning_rate": 0.00042841073160219804,
-      "loss": 3.5455,
+      "epoch": 2.8733153638814017,
+      "grad_norm": 0.5250852108001709,
+      "learning_rate": 0.00042812088505126815,
+      "loss": 3.526,
       "step": 26650
     },
     {
-      "epoch": 2.873748789150791,
-      "grad_norm": 0.5956966280937195,
-      "learning_rate": 0.0004280874905721366,
-      "loss": 3.5321,
+      "epoch": 2.8787061994609164,
+      "grad_norm": 0.5593746304512024,
+      "learning_rate": 0.0004277970858067997,
+      "loss": 3.5234,
       "step": 26700
     },
     {
-      "epoch": 2.8791303411903995,
-      "grad_norm": 0.5749139785766602,
-      "learning_rate": 0.0004277642495420752,
-      "loss": 3.5357,
+      "epoch": 2.884097035040431,
+      "grad_norm": 0.5367515683174133,
+      "learning_rate": 0.0004274732865623313,
+      "loss": 3.527,
       "step": 26750
     },
     {
-      "epoch": 2.8845118932300076,
-      "grad_norm": 0.6081565022468567,
-      "learning_rate": 0.00042744100851201377,
-      "loss": 3.514,
+      "epoch": 2.889487870619946,
+      "grad_norm": 0.588120698928833,
+      "learning_rate": 0.00042714948731786286,
+      "loss": 3.5074,
       "step": 26800
     },
     {
-      "epoch": 2.8898934452696157,
-      "grad_norm": 0.5851724743843079,
-      "learning_rate": 0.0004271177674819523,
-      "loss": 3.5226,
+      "epoch": 2.894878706199461,
+      "grad_norm": 0.5879933834075928,
+      "learning_rate": 0.00042682568807339447,
+      "loss": 3.5276,
       "step": 26850
     },
     {
-      "epoch": 2.895274997309224,
-      "grad_norm": 0.6097344160079956,
-      "learning_rate": 0.00042680099127249217,
-      "loss": 3.5233,
+      "epoch": 2.9002695417789757,
+      "grad_norm": 0.5280848741531372,
+      "learning_rate": 0.000426501888828926,
+      "loss": 3.5137,
       "step": 26900
     },
     {
-      "epoch": 2.9006565493488323,
-      "grad_norm": 0.575410008430481,
-      "learning_rate": 0.0004264777502424307,
-      "loss": 3.5413,
+      "epoch": 2.9056603773584904,
+      "grad_norm": 0.6295096278190613,
+      "learning_rate": 0.0004261780895844576,
+      "loss": 3.5142,
       "step": 26950
     },
     {
-      "epoch": 2.9060381013884404,
-      "grad_norm": 0.6111117601394653,
-      "learning_rate": 0.0004261545092123693,
-      "loss": 3.5145,
+      "epoch": 2.9110512129380055,
+      "grad_norm": 0.6104944944381714,
+      "learning_rate": 0.00042585429033998917,
+      "loss": 3.5378,
       "step": 27000
     },
     {
-      "epoch": 2.9060381013884404,
-      "eval_accuracy": 0.36941308209019036,
-      "eval_loss": 3.5066769123077393,
-      "eval_runtime": 184.6661,
-      "eval_samples_per_second": 97.533,
-      "eval_steps_per_second": 6.097,
+      "epoch": 2.9110512129380055,
+      "eval_accuracy": 0.3697847836194037,
+      "eval_loss": 3.5046472549438477,
+      "eval_runtime": 184.1441,
+      "eval_samples_per_second": 97.809,
+      "eval_steps_per_second": 6.115,
       "step": 27000
     },
     {
-      "epoch": 2.9114196534280485,
-      "grad_norm": 0.5597397089004517,
-      "learning_rate": 0.00042583126818230795,
-      "loss": 3.5104,
+      "epoch": 2.91644204851752,
+      "grad_norm": 0.5822877883911133,
+      "learning_rate": 0.0004255304910955207,
+      "loss": 3.5514,
       "step": 27050
     },
     {
-      "epoch": 2.9168012054676566,
-      "grad_norm": 0.5836156010627747,
-      "learning_rate": 0.0004255080271522465,
-      "loss": 3.5324,
+      "epoch": 2.921832884097035,
+      "grad_norm": 0.5345104336738586,
+      "learning_rate": 0.0004252066918510523,
+      "loss": 3.5334,
       "step": 27100
     },
     {
-      "epoch": 2.922182757507265,
-      "grad_norm": 0.5847307443618774,
-      "learning_rate": 0.0004251847861221851,
-      "loss": 3.5126,
+      "epoch": 2.92722371967655,
+      "grad_norm": 0.5554478168487549,
+      "learning_rate": 0.0004248828926065839,
+      "loss": 3.5461,
       "step": 27150
     },
     {
-      "epoch": 2.9275643095468733,
-      "grad_norm": 0.5847127437591553,
-      "learning_rate": 0.00042486154509212363,
-      "loss": 3.5196,
+      "epoch": 2.9326145552560647,
+      "grad_norm": 0.5697243213653564,
+      "learning_rate": 0.0004245590933621155,
+      "loss": 3.5349,
       "step": 27200
     },
     {
-      "epoch": 2.9329458615864814,
-      "grad_norm": 0.5771723985671997,
-      "learning_rate": 0.0004245383040620622,
-      "loss": 3.5229,
+      "epoch": 2.9380053908355794,
+      "grad_norm": 0.570512056350708,
+      "learning_rate": 0.00042423529411764703,
+      "loss": 3.5493,
       "step": 27250
     },
     {
-      "epoch": 2.93832741362609,
-      "grad_norm": 0.5682346224784851,
-      "learning_rate": 0.0004242150630320009,
-      "loss": 3.5362,
+      "epoch": 2.9433962264150946,
+      "grad_norm": 0.623167872428894,
+      "learning_rate": 0.00042391149487317864,
+      "loss": 3.5298,
       "step": 27300
     },
     {
-      "epoch": 2.943708965665698,
-      "grad_norm": 0.600759744644165,
-      "learning_rate": 0.0004238918220019394,
-      "loss": 3.5464,
+      "epoch": 2.9487870619946093,
+      "grad_norm": 0.5901476144790649,
+      "learning_rate": 0.00042358769562871013,
+      "loss": 3.5171,
       "step": 27350
     },
     {
-      "epoch": 2.949090517705306,
-      "grad_norm": 0.5634221434593201,
-      "learning_rate": 0.000423568580971878,
-      "loss": 3.5436,
+      "epoch": 2.954177897574124,
+      "grad_norm": 0.5781039595603943,
+      "learning_rate": 0.00042326389638424174,
+      "loss": 3.5172,
       "step": 27400
     },
     {
-      "epoch": 2.9544720697449143,
-      "grad_norm": 0.5330511331558228,
-      "learning_rate": 0.00042324533994181655,
-      "loss": 3.5156,
+      "epoch": 2.9595687331536387,
+      "grad_norm": 0.5595638751983643,
+      "learning_rate": 0.0004229400971397733,
+      "loss": 3.5319,
       "step": 27450
     },
     {
-      "epoch": 2.9598536217845224,
-      "grad_norm": 0.6540337800979614,
-      "learning_rate": 0.00042292209891175514,
-      "loss": 3.5174,
+      "epoch": 2.964959568733154,
+      "grad_norm": 0.5504574179649353,
+      "learning_rate": 0.00042261629789530484,
+      "loss": 3.5351,
       "step": 27500
     },
     {
-      "epoch": 2.965235173824131,
-      "grad_norm": 0.5485448241233826,
-      "learning_rate": 0.00042259885788169374,
-      "loss": 3.5411,
+      "epoch": 2.9703504043126685,
+      "grad_norm": 0.5639024376869202,
+      "learning_rate": 0.00042229249865083644,
+      "loss": 3.5209,
       "step": 27550
     },
     {
-      "epoch": 2.970616725863739,
-      "grad_norm": 0.5691707134246826,
-      "learning_rate": 0.00042227561685163233,
-      "loss": 3.5295,
+      "epoch": 2.975741239892183,
+      "grad_norm": 0.5995092391967773,
+      "learning_rate": 0.000421968699406368,
+      "loss": 3.5227,
       "step": 27600
     },
     {
-      "epoch": 2.975998277903347,
-      "grad_norm": 0.5650212168693542,
-      "learning_rate": 0.00042195237582157093,
-      "loss": 3.5074,
+      "epoch": 2.981132075471698,
+      "grad_norm": 0.5549217462539673,
+      "learning_rate": 0.0004216449001618996,
+      "loss": 3.5435,
       "step": 27650
     },
     {
-      "epoch": 2.9813798299429557,
-      "grad_norm": 0.6257863640785217,
-      "learning_rate": 0.0004216291347915095,
-      "loss": 3.5218,
+      "epoch": 2.986522911051213,
+      "grad_norm": 0.6059816479682922,
+      "learning_rate": 0.00042132110091743115,
+      "loss": 3.5094,
       "step": 27700
     },
     {
-      "epoch": 2.986761381982564,
-      "grad_norm": 0.5358322262763977,
-      "learning_rate": 0.00042130589376144806,
-      "loss": 3.527,
+      "epoch": 2.9919137466307277,
+      "grad_norm": 0.5828582048416138,
+      "learning_rate": 0.00042099730167296275,
+      "loss": 3.5018,
       "step": 27750
     },
     {
-      "epoch": 2.992142934022172,
-      "grad_norm": 0.5799550414085388,
-      "learning_rate": 0.00042098265273138666,
-      "loss": 3.5273,
+      "epoch": 2.9973045822102424,
+      "grad_norm": 0.6414007544517517,
+      "learning_rate": 0.0004206735024284943,
+      "loss": 3.5228,
       "step": 27800
     },
     {
-      "epoch": 2.9975244860617805,
-      "grad_norm": 0.5868208408355713,
-      "learning_rate": 0.0004206594117013252,
-      "loss": 3.539,
+      "epoch": 3.0026954177897576,
+      "grad_norm": 0.5754533410072327,
+      "learning_rate": 0.0004203497031840259,
+      "loss": 3.4827,
       "step": 27850
     },
     {
-      "epoch": 3.0029060381013886,
-      "grad_norm": 0.599449872970581,
-      "learning_rate": 0.00042033617067126385,
-      "loss": 3.4554,
+      "epoch": 3.0080862533692723,
+      "grad_norm": 0.5822769403457642,
+      "learning_rate": 0.00042002590393955746,
+      "loss": 3.428,
       "step": 27900
     },
     {
-      "epoch": 3.0082875901409967,
-      "grad_norm": 0.6563495993614197,
-      "learning_rate": 0.00042001292964120244,
-      "loss": 3.4225,
+      "epoch": 3.013477088948787,
+      "grad_norm": 0.6208178997039795,
+      "learning_rate": 0.00041970210469508896,
+      "loss": 3.4073,
       "step": 27950
     },
     {
-      "epoch": 3.0136691421806048,
-      "grad_norm": 0.6105541586875916,
-      "learning_rate": 0.000419689688611141,
-      "loss": 3.4242,
+      "epoch": 3.018867924528302,
+      "grad_norm": 0.5660999417304993,
+      "learning_rate": 0.00041937830545062056,
+      "loss": 3.4285,
       "step": 28000
     },
     {
-      "epoch": 3.0136691421806048,
-      "eval_accuracy": 0.3703614044950352,
-      "eval_loss": 3.4996602535247803,
-      "eval_runtime": 184.2896,
-      "eval_samples_per_second": 97.732,
-      "eval_steps_per_second": 6.11,
+      "epoch": 3.018867924528302,
+      "eval_accuracy": 0.37032391924873914,
+      "eval_loss": 3.5003163814544678,
+      "eval_runtime": 184.587,
+      "eval_samples_per_second": 97.575,
+      "eval_steps_per_second": 6.1,
       "step": 28000
     },
     {
-      "epoch": 3.0190506942202133,
-      "grad_norm": 0.6090134978294373,
-      "learning_rate": 0.0004193664475810796,
-      "loss": 3.4247,
+      "epoch": 3.024258760107817,
+      "grad_norm": 0.6071078181266785,
+      "learning_rate": 0.0004190545062061521,
+      "loss": 3.4211,
       "step": 28050
     },
     {
-      "epoch": 3.0244322462598214,
-      "grad_norm": 0.6373844742774963,
-      "learning_rate": 0.00041904320655101817,
-      "loss": 3.4486,
+      "epoch": 3.0296495956873315,
+      "grad_norm": 0.5652373433113098,
+      "learning_rate": 0.0004187307069616837,
+      "loss": 3.4348,
       "step": 28100
     },
     {
-      "epoch": 3.0298137982994295,
-      "grad_norm": 0.6366713643074036,
-      "learning_rate": 0.0004187199655209567,
-      "loss": 3.4378,
+      "epoch": 3.035040431266846,
+      "grad_norm": 0.5970408916473389,
+      "learning_rate": 0.00041840690771721527,
+      "loss": 3.4384,
       "step": 28150
     },
     {
-      "epoch": 3.0351953503390376,
-      "grad_norm": 0.5975115895271301,
-      "learning_rate": 0.00041839672449089536,
-      "loss": 3.4535,
+      "epoch": 3.0404312668463613,
+      "grad_norm": 0.5694037079811096,
+      "learning_rate": 0.00041808310847274687,
+      "loss": 3.4361,
       "step": 28200
     },
     {
-      "epoch": 3.040576902378646,
-      "grad_norm": 0.6184483170509338,
-      "learning_rate": 0.00041807348346083395,
-      "loss": 3.4623,
+      "epoch": 3.045822102425876,
+      "grad_norm": 0.6052969694137573,
+      "learning_rate": 0.0004177593092282784,
+      "loss": 3.4426,
       "step": 28250
     },
     {
-      "epoch": 3.0459584544182543,
-      "grad_norm": 0.5912527441978455,
-      "learning_rate": 0.0004177502424307725,
-      "loss": 3.4439,
+      "epoch": 3.0512129380053907,
+      "grad_norm": 0.6220088005065918,
+      "learning_rate": 0.0004174419859686994,
+      "loss": 3.442,
       "step": 28300
     },
     {
-      "epoch": 3.0513400064578624,
-      "grad_norm": 0.6498454809188843,
-      "learning_rate": 0.0004174270014007111,
-      "loss": 3.4491,
+      "epoch": 3.056603773584906,
+      "grad_norm": 0.6056026220321655,
+      "learning_rate": 0.0004171181867242309,
+      "loss": 3.4497,
       "step": 28350
     },
     {
-      "epoch": 3.0567215584974705,
-      "grad_norm": 0.574679434299469,
-      "learning_rate": 0.00041710376037064963,
-      "loss": 3.4435,
+      "epoch": 3.0619946091644206,
+      "grad_norm": 0.588292121887207,
+      "learning_rate": 0.00041679438747976253,
+      "loss": 3.4249,
       "step": 28400
     },
     {
-      "epoch": 3.062103110537079,
-      "grad_norm": 0.601494312286377,
-      "learning_rate": 0.0004167805193405883,
-      "loss": 3.4275,
+      "epoch": 3.0673854447439353,
+      "grad_norm": 0.6074293851852417,
+      "learning_rate": 0.0004164705882352941,
+      "loss": 3.4462,
       "step": 28450
     },
     {
-      "epoch": 3.067484662576687,
-      "grad_norm": 0.5695728063583374,
-      "learning_rate": 0.0004164572783105269,
-      "loss": 3.4594,
+      "epoch": 3.07277628032345,
+      "grad_norm": 0.587044894695282,
+      "learning_rate": 0.0004161467889908257,
+      "loss": 3.4316,
       "step": 28500
     },
     {
-      "epoch": 3.0728662146162953,
-      "grad_norm": 0.5873692035675049,
-      "learning_rate": 0.0004161340372804654,
-      "loss": 3.4718,
+      "epoch": 3.078167115902965,
+      "grad_norm": 0.5824921727180481,
+      "learning_rate": 0.00041582298974635724,
+      "loss": 3.429,
       "step": 28550
     },
     {
-      "epoch": 3.0782477666559034,
-      "grad_norm": 0.5661360025405884,
-      "learning_rate": 0.000415810796250404,
-      "loss": 3.4408,
+      "epoch": 3.08355795148248,
+      "grad_norm": 0.5803788304328918,
+      "learning_rate": 0.00041549919050188884,
+      "loss": 3.4512,
       "step": 28600
     },
     {
-      "epoch": 3.083629318695512,
-      "grad_norm": 0.6276664137840271,
-      "learning_rate": 0.0004154875552203426,
-      "loss": 3.4421,
+      "epoch": 3.0889487870619945,
+      "grad_norm": 0.6797330379486084,
+      "learning_rate": 0.00041517539125742034,
+      "loss": 3.4525,
       "step": 28650
     },
     {
-      "epoch": 3.08901087073512,
-      "grad_norm": 0.5694606900215149,
-      "learning_rate": 0.00041516431419028114,
-      "loss": 3.4527,
+      "epoch": 3.0943396226415096,
+      "grad_norm": 0.5927037000656128,
+      "learning_rate": 0.0004148515920129519,
+      "loss": 3.4539,
       "step": 28700
     },
     {
-      "epoch": 3.094392422774728,
-      "grad_norm": 0.6117746829986572,
-      "learning_rate": 0.0004148410731602198,
-      "loss": 3.4553,
+      "epoch": 3.0997304582210243,
+      "grad_norm": 0.5757393836975098,
+      "learning_rate": 0.0004145277927684835,
+      "loss": 3.4479,
       "step": 28750
     },
     {
-      "epoch": 3.0997739748143363,
-      "grad_norm": 0.5743106603622437,
-      "learning_rate": 0.0004145178321301584,
-      "loss": 3.4538,
+      "epoch": 3.105121293800539,
+      "grad_norm": 0.6510079503059387,
+      "learning_rate": 0.00041420399352401504,
+      "loss": 3.4463,
       "step": 28800
     },
     {
-      "epoch": 3.105155526853945,
-      "grad_norm": 0.6104516983032227,
-      "learning_rate": 0.00041419459110009693,
-      "loss": 3.4521,
+      "epoch": 3.1105121293800537,
+      "grad_norm": 0.6105151176452637,
+      "learning_rate": 0.00041388019427954665,
+      "loss": 3.4525,
       "step": 28850
     },
     {
-      "epoch": 3.110537078893553,
-      "grad_norm": 0.6102493405342102,
-      "learning_rate": 0.0004138713500700355,
-      "loss": 3.4463,
+      "epoch": 3.115902964959569,
+      "grad_norm": 0.6211234927177429,
+      "learning_rate": 0.0004135563950350782,
+      "loss": 3.4499,
       "step": 28900
     },
     {
-      "epoch": 3.115918630933161,
-      "grad_norm": 0.6518812775611877,
-      "learning_rate": 0.00041354810903997406,
-      "loss": 3.4416,
+      "epoch": 3.1212938005390836,
+      "grad_norm": 0.5854358673095703,
+      "learning_rate": 0.0004132325957906098,
+      "loss": 3.4444,
       "step": 28950
     },
     {
-      "epoch": 3.121300182972769,
-      "grad_norm": 0.5711073875427246,
-      "learning_rate": 0.00041322486800991266,
-      "loss": 3.4594,
+      "epoch": 3.1266846361185983,
+      "grad_norm": 0.5977933406829834,
+      "learning_rate": 0.00041290879654614135,
+      "loss": 3.4526,
       "step": 29000
     },
     {
-      "epoch": 3.121300182972769,
-      "eval_accuracy": 0.37099832772340363,
-      "eval_loss": 3.499441385269165,
-      "eval_runtime": 184.2527,
-      "eval_samples_per_second": 97.752,
-      "eval_steps_per_second": 6.111,
+      "epoch": 3.1266846361185983,
+      "eval_accuracy": 0.3711602205262472,
+      "eval_loss": 3.493985652923584,
+      "eval_runtime": 183.758,
+      "eval_samples_per_second": 98.015,
+      "eval_steps_per_second": 6.128,
       "step": 29000
     },
     {
-      "epoch": 3.1266817350123777,
-      "grad_norm": 0.5713046789169312,
-      "learning_rate": 0.00041290809180045246,
-      "loss": 3.4588,
+      "epoch": 3.1320754716981134,
+      "grad_norm": 0.5728035569190979,
+      "learning_rate": 0.00041258499730167296,
+      "loss": 3.4309,
       "step": 29050
     },
     {
-      "epoch": 3.132063287051986,
-      "grad_norm": 0.6167808175086975,
-      "learning_rate": 0.0004125848507703911,
-      "loss": 3.4449,
+      "epoch": 3.137466307277628,
+      "grad_norm": 0.5746806263923645,
+      "learning_rate": 0.0004122611980572045,
+      "loss": 3.4581,
       "step": 29100
     },
     {
-      "epoch": 3.137444839091594,
-      "grad_norm": 0.5783477425575256,
-      "learning_rate": 0.0004122616097403297,
-      "loss": 3.4579,
+      "epoch": 3.142857142857143,
+      "grad_norm": 0.5982994437217712,
+      "learning_rate": 0.00041193739881273606,
+      "loss": 3.4529,
       "step": 29150
     },
     {
-      "epoch": 3.1428263911312024,
-      "grad_norm": 0.6156630516052246,
-      "learning_rate": 0.00041193836871026825,
-      "loss": 3.4503,
+      "epoch": 3.1482479784366575,
+      "grad_norm": 0.5874792337417603,
+      "learning_rate": 0.00041161359956826766,
+      "loss": 3.4685,
       "step": 29200
     },
     {
-      "epoch": 3.1482079431708105,
-      "grad_norm": 0.5773206353187561,
-      "learning_rate": 0.00041161512768020684,
-      "loss": 3.457,
+      "epoch": 3.1536388140161726,
+      "grad_norm": 0.6152310967445374,
+      "learning_rate": 0.0004112962763086886,
+      "loss": 3.4688,
       "step": 29250
     },
     {
-      "epoch": 3.1535894952104186,
-      "grad_norm": 0.638312816619873,
-      "learning_rate": 0.0004112918866501454,
-      "loss": 3.4327,
+      "epoch": 3.1590296495956873,
+      "grad_norm": 0.6202549338340759,
+      "learning_rate": 0.0004109724770642201,
+      "loss": 3.4607,
       "step": 29300
     },
     {
-      "epoch": 3.1589710472500268,
-      "grad_norm": 0.5940792560577393,
-      "learning_rate": 0.000410968645620084,
-      "loss": 3.4464,
+      "epoch": 3.164420485175202,
+      "grad_norm": 0.5819991827011108,
+      "learning_rate": 0.00041064867781975177,
+      "loss": 3.4491,
       "step": 29350
     },
     {
-      "epoch": 3.1643525992896353,
-      "grad_norm": 0.6411643624305725,
-      "learning_rate": 0.0004106454045900226,
-      "loss": 3.4502,
+      "epoch": 3.169811320754717,
+      "grad_norm": 0.6437973976135254,
+      "learning_rate": 0.00041032487857528327,
+      "loss": 3.4589,
       "step": 29400
     },
     {
-      "epoch": 3.1697341513292434,
-      "grad_norm": 0.627984881401062,
-      "learning_rate": 0.00041032216355996117,
-      "loss": 3.4583,
+      "epoch": 3.175202156334232,
+      "grad_norm": 0.6172235012054443,
+      "learning_rate": 0.0004100010793308148,
+      "loss": 3.4705,
       "step": 29450
     },
     {
-      "epoch": 3.1751157033688515,
-      "grad_norm": 0.6114353537559509,
-      "learning_rate": 0.00040999892252989976,
-      "loss": 3.4434,
+      "epoch": 3.1805929919137466,
+      "grad_norm": 0.6386169791221619,
+      "learning_rate": 0.0004096772800863464,
+      "loss": 3.4415,
       "step": 29500
     },
     {
-      "epoch": 3.1804972554084596,
-      "grad_norm": 0.6601144671440125,
-      "learning_rate": 0.00040967568149983836,
-      "loss": 3.4406,
+      "epoch": 3.1859838274932613,
+      "grad_norm": 0.6090947985649109,
+      "learning_rate": 0.000409353480841878,
+      "loss": 3.4567,
       "step": 29550
     },
     {
-      "epoch": 3.185878807448068,
-      "grad_norm": 0.5516364574432373,
-      "learning_rate": 0.0004093524404697769,
-      "loss": 3.4536,
+      "epoch": 3.1913746630727764,
+      "grad_norm": 0.6054538488388062,
+      "learning_rate": 0.0004090296815974096,
+      "loss": 3.4675,
       "step": 29600
     },
     {
-      "epoch": 3.1912603594876763,
-      "grad_norm": 0.5681900382041931,
-      "learning_rate": 0.0004090291994397155,
-      "loss": 3.4596,
+      "epoch": 3.196765498652291,
+      "grad_norm": 0.5757823586463928,
+      "learning_rate": 0.00040870588235294113,
+      "loss": 3.4519,
       "step": 29650
     },
     {
-      "epoch": 3.1966419115272844,
-      "grad_norm": 0.6558794975280762,
-      "learning_rate": 0.00040870595840965414,
-      "loss": 3.4553,
+      "epoch": 3.202156334231806,
+      "grad_norm": 0.5926768779754639,
+      "learning_rate": 0.00040838208310847273,
+      "loss": 3.471,
       "step": 29700
     },
     {
-      "epoch": 3.2020234635668925,
-      "grad_norm": 0.6110433340072632,
-      "learning_rate": 0.0004083827173795927,
-      "loss": 3.4507,
+      "epoch": 3.207547169811321,
+      "grad_norm": 0.6421563029289246,
+      "learning_rate": 0.0004080582838640043,
+      "loss": 3.4675,
       "step": 29750
     },
     {
-      "epoch": 3.207405015606501,
-      "grad_norm": 0.6141265630722046,
-      "learning_rate": 0.0004080594763495313,
-      "loss": 3.4408,
+      "epoch": 3.2129380053908356,
+      "grad_norm": 0.6215593218803406,
+      "learning_rate": 0.0004077344846195359,
+      "loss": 3.4446,
       "step": 29800
     },
     {
-      "epoch": 3.212786567646109,
-      "grad_norm": 0.6038244962692261,
-      "learning_rate": 0.0004077362353194698,
-      "loss": 3.4284,
+      "epoch": 3.2183288409703503,
+      "grad_norm": 0.6261037588119507,
+      "learning_rate": 0.00040741068537506744,
+      "loss": 3.4644,
       "step": 29850
     },
     {
-      "epoch": 3.2181681196857173,
-      "grad_norm": 0.6001572608947754,
-      "learning_rate": 0.0004074129942894084,
-      "loss": 3.4569,
+      "epoch": 3.223719676549865,
+      "grad_norm": 0.5810261964797974,
+      "learning_rate": 0.000407086886130599,
+      "loss": 3.4587,
       "step": 29900
     },
     {
-      "epoch": 3.2235496717253254,
-      "grad_norm": 0.6228058338165283,
-      "learning_rate": 0.000407089753259347,
-      "loss": 3.4854,
+      "epoch": 3.22911051212938,
+      "grad_norm": 0.5672079920768738,
+      "learning_rate": 0.0004067630868861306,
+      "loss": 3.4528,
       "step": 29950
     },
     {
-      "epoch": 3.228931223764934,
-      "grad_norm": 0.6055977940559387,
-      "learning_rate": 0.0004067665122292856,
-      "loss": 3.4337,
+      "epoch": 3.234501347708895,
+      "grad_norm": 0.605877697467804,
+      "learning_rate": 0.0004064392876416621,
+      "loss": 3.4631,
       "step": 30000
     },
     {
-      "epoch": 3.228931223764934,
-      "eval_accuracy": 0.37154919786462304,
-      "eval_loss": 3.493218421936035,
-      "eval_runtime": 184.7775,
-      "eval_samples_per_second": 97.474,
-      "eval_steps_per_second": 6.094,
+      "epoch": 3.234501347708895,
+      "eval_accuracy": 0.3715522401454819,
+      "eval_loss": 3.4904942512512207,
+      "eval_runtime": 183.4308,
+      "eval_samples_per_second": 98.19,
+      "eval_steps_per_second": 6.139,
       "step": 30000
     },
     {
-      "epoch": 3.234312775804542,
-      "grad_norm": 0.6261052489280701,
-      "learning_rate": 0.0004064432711992242,
-      "loss": 3.4797,
+      "epoch": 3.2398921832884096,
+      "grad_norm": 0.5602097511291504,
+      "learning_rate": 0.0004061154883971937,
+      "loss": 3.4415,
       "step": 30050
     },
     {
-      "epoch": 3.23969432784415,
-      "grad_norm": 0.5575237274169922,
-      "learning_rate": 0.0004061200301691628,
-      "loss": 3.4505,
+      "epoch": 3.2452830188679247,
+      "grad_norm": 0.594542384147644,
+      "learning_rate": 0.00040579168915272525,
+      "loss": 3.4567,
       "step": 30100
     },
     {
-      "epoch": 3.2450758798837587,
-      "grad_norm": 0.5761787295341492,
-      "learning_rate": 0.00040579678913910133,
-      "loss": 3.4694,
+      "epoch": 3.2506738544474394,
+      "grad_norm": 0.5522177219390869,
+      "learning_rate": 0.00040546788990825685,
+      "loss": 3.4605,
       "step": 30150
     },
     {
-      "epoch": 3.250457431923367,
-      "grad_norm": 0.5979387760162354,
-      "learning_rate": 0.0004054735481090399,
-      "loss": 3.4579,
+      "epoch": 3.256064690026954,
+      "grad_norm": 0.6020456552505493,
+      "learning_rate": 0.0004051440906637884,
+      "loss": 3.4754,
       "step": 30200
     },
     {
-      "epoch": 3.255838983962975,
-      "grad_norm": 0.5906158089637756,
-      "learning_rate": 0.0004051503070789786,
-      "loss": 3.4587,
+      "epoch": 3.2614555256064692,
+      "grad_norm": 0.6003113389015198,
+      "learning_rate": 0.00040482029141931995,
+      "loss": 3.4614,
       "step": 30250
     },
     {
-      "epoch": 3.261220536002583,
-      "grad_norm": 0.6434659361839294,
-      "learning_rate": 0.0004048270660489171,
-      "loss": 3.4295,
+      "epoch": 3.266846361185984,
+      "grad_norm": 0.6419681310653687,
+      "learning_rate": 0.00040449649217485156,
+      "loss": 3.4401,
       "step": 30300
     },
     {
-      "epoch": 3.2666020880421915,
-      "grad_norm": 0.5845122337341309,
-      "learning_rate": 0.0004045038250188557,
-      "loss": 3.4527,
+      "epoch": 3.2722371967654986,
+      "grad_norm": 0.5756829380989075,
+      "learning_rate": 0.0004041726929303831,
+      "loss": 3.4567,
       "step": 30350
     },
     {
-      "epoch": 3.2719836400817996,
-      "grad_norm": 0.608493447303772,
-      "learning_rate": 0.00040418058398879425,
-      "loss": 3.4514,
+      "epoch": 3.2776280323450133,
+      "grad_norm": 0.6107509732246399,
+      "learning_rate": 0.0004038488936859147,
+      "loss": 3.4674,
       "step": 30400
     },
     {
-      "epoch": 3.2773651921214078,
-      "grad_norm": 0.5747212171554565,
-      "learning_rate": 0.00040385734295873284,
-      "loss": 3.4804,
+      "epoch": 3.2830188679245285,
+      "grad_norm": 0.6611049771308899,
+      "learning_rate": 0.00040352509444144626,
+      "loss": 3.4556,
       "step": 30450
     },
     {
-      "epoch": 3.282746744161016,
-      "grad_norm": 0.6562155485153198,
-      "learning_rate": 0.00040354056674927265,
-      "loss": 3.4651,
+      "epoch": 3.288409703504043,
+      "grad_norm": 0.6525335311889648,
+      "learning_rate": 0.00040320129519697787,
+      "loss": 3.4683,
       "step": 30500
     },
     {
-      "epoch": 3.2881282962006244,
-      "grad_norm": 0.6439652442932129,
-      "learning_rate": 0.00040321732571921124,
-      "loss": 3.4652,
+      "epoch": 3.293800539083558,
+      "grad_norm": 0.6149199604988098,
+      "learning_rate": 0.0004028774959525094,
+      "loss": 3.4623,
       "step": 30550
     },
     {
-      "epoch": 3.2935098482402325,
-      "grad_norm": 0.6053032279014587,
-      "learning_rate": 0.0004028940846891498,
-      "loss": 3.4706,
+      "epoch": 3.2991913746630726,
+      "grad_norm": 0.5852982997894287,
+      "learning_rate": 0.000402553696708041,
+      "loss": 3.4567,
       "step": 30600
     },
     {
-      "epoch": 3.2988914002798406,
-      "grad_norm": 0.6160561442375183,
-      "learning_rate": 0.00040257084365908843,
-      "loss": 3.447,
+      "epoch": 3.3045822102425877,
+      "grad_norm": 0.631427526473999,
+      "learning_rate": 0.0004022298974635726,
+      "loss": 3.4527,
       "step": 30650
     },
     {
-      "epoch": 3.304272952319449,
-      "grad_norm": 0.6740018129348755,
-      "learning_rate": 0.00040224760262902703,
-      "loss": 3.4561,
+      "epoch": 3.3099730458221024,
+      "grad_norm": 0.6369751691818237,
+      "learning_rate": 0.00040190609821910407,
+      "loss": 3.466,
       "step": 30700
     },
     {
-      "epoch": 3.3096545043590573,
-      "grad_norm": 0.6057961583137512,
-      "learning_rate": 0.00040192436159896557,
-      "loss": 3.4598,
+      "epoch": 3.315363881401617,
+      "grad_norm": 0.6102563738822937,
+      "learning_rate": 0.0004015822989746357,
+      "loss": 3.4459,
       "step": 30750
     },
     {
-      "epoch": 3.3150360563986654,
-      "grad_norm": 0.5896490216255188,
-      "learning_rate": 0.00040160112056890416,
-      "loss": 3.452,
+      "epoch": 3.3207547169811322,
+      "grad_norm": 0.5564303398132324,
+      "learning_rate": 0.0004012584997301672,
+      "loss": 3.4402,
       "step": 30800
     },
     {
-      "epoch": 3.3204176084382735,
-      "grad_norm": 0.6008713245391846,
-      "learning_rate": 0.00040127787953884276,
-      "loss": 3.4669,
+      "epoch": 3.326145552560647,
+      "grad_norm": 0.5567905902862549,
+      "learning_rate": 0.00040093470048569883,
+      "loss": 3.442,
       "step": 30850
     },
     {
-      "epoch": 3.3257991604778816,
-      "grad_norm": 0.6577219367027283,
-      "learning_rate": 0.00040095463850878135,
-      "loss": 3.4536,
+      "epoch": 3.3315363881401616,
+      "grad_norm": 0.6052920818328857,
+      "learning_rate": 0.0004006109012412304,
+      "loss": 3.4722,
       "step": 30900
     },
     {
-      "epoch": 3.33118071251749,
-      "grad_norm": 0.581386923789978,
-      "learning_rate": 0.00040063139747871995,
-      "loss": 3.4546,
+      "epoch": 3.3369272237196768,
+      "grad_norm": 0.6521329879760742,
+      "learning_rate": 0.000400287101996762,
+      "loss": 3.4698,
       "step": 30950
     },
     {
-      "epoch": 3.3365622645570983,
-      "grad_norm": 0.6249150037765503,
-      "learning_rate": 0.00040030815644865854,
-      "loss": 3.4576,
+      "epoch": 3.3423180592991915,
+      "grad_norm": 0.557958722114563,
+      "learning_rate": 0.00039996330275229354,
+      "loss": 3.4485,
       "step": 31000
     },
     {
-      "epoch": 3.3365622645570983,
-      "eval_accuracy": 0.3725115364919959,
-      "eval_loss": 3.485410213470459,
-      "eval_runtime": 184.3657,
-      "eval_samples_per_second": 97.692,
-      "eval_steps_per_second": 6.107,
+      "epoch": 3.3423180592991915,
+      "eval_accuracy": 0.3723944086789319,
+      "eval_loss": 3.4821529388427734,
+      "eval_runtime": 183.7959,
+      "eval_samples_per_second": 97.995,
+      "eval_steps_per_second": 6.126,
       "step": 31000
     },
     {
-      "epoch": 3.3419438165967064,
-      "grad_norm": 0.60781329870224,
-      "learning_rate": 0.0003999849154185971,
-      "loss": 3.4657,
+      "epoch": 3.347708894878706,
+      "grad_norm": 0.597503125667572,
+      "learning_rate": 0.00039963950350782514,
+      "loss": 3.4578,
       "step": 31050
     },
     {
-      "epoch": 3.347325368636315,
-      "grad_norm": 0.6397443413734436,
-      "learning_rate": 0.0003996616743885357,
-      "loss": 3.4562,
+      "epoch": 3.353099730458221,
+      "grad_norm": 0.6289362907409668,
+      "learning_rate": 0.0003993157042633567,
+      "loss": 3.4768,
       "step": 31100
     },
     {
-      "epoch": 3.352706920675923,
-      "grad_norm": 0.6920917630195618,
-      "learning_rate": 0.0003993384333584742,
-      "loss": 3.4667,
+      "epoch": 3.358490566037736,
+      "grad_norm": 0.5845758318901062,
+      "learning_rate": 0.00039899190501888824,
+      "loss": 3.4587,
       "step": 31150
     },
     {
-      "epoch": 3.358088472715531,
-      "grad_norm": 0.6365469098091125,
-      "learning_rate": 0.00039901519232841287,
-      "loss": 3.4531,
+      "epoch": 3.3638814016172507,
+      "grad_norm": 0.6318169236183167,
+      "learning_rate": 0.00039866810577441985,
+      "loss": 3.4795,
       "step": 31200
     },
     {
-      "epoch": 3.3634700247551392,
-      "grad_norm": 0.6367911696434021,
-      "learning_rate": 0.00039869195129835146,
-      "loss": 3.4559,
+      "epoch": 3.3692722371967654,
+      "grad_norm": 0.6276087760925293,
+      "learning_rate": 0.0003983443065299514,
+      "loss": 3.4764,
       "step": 31250
     },
     {
-      "epoch": 3.368851576794748,
-      "grad_norm": 0.6479049921035767,
-      "learning_rate": 0.00039836871026829,
-      "loss": 3.4599,
+      "epoch": 3.37466307277628,
+      "grad_norm": 0.5823773741722107,
+      "learning_rate": 0.000398020507285483,
+      "loss": 3.4665,
       "step": 31300
     },
     {
-      "epoch": 3.374233128834356,
-      "grad_norm": 0.6077846884727478,
-      "learning_rate": 0.0003980454692382286,
-      "loss": 3.4647,
+      "epoch": 3.3800539083557952,
+      "grad_norm": 0.5732158422470093,
+      "learning_rate": 0.0003976967080410145,
+      "loss": 3.4629,
       "step": 31350
     },
     {
-      "epoch": 3.379614680873964,
-      "grad_norm": 0.610712468624115,
-      "learning_rate": 0.0003977222282081672,
-      "loss": 3.4588,
+      "epoch": 3.38544474393531,
+      "grad_norm": 0.5848298072814941,
+      "learning_rate": 0.0003973729087965461,
+      "loss": 3.4703,
       "step": 31400
     },
     {
-      "epoch": 3.384996232913572,
-      "grad_norm": 0.6273388862609863,
-      "learning_rate": 0.00039739898717810573,
-      "loss": 3.4545,
+      "epoch": 3.3908355795148246,
+      "grad_norm": 0.6035614609718323,
+      "learning_rate": 0.00039704910955207765,
+      "loss": 3.4623,
       "step": 31450
     },
     {
-      "epoch": 3.3903777849531807,
-      "grad_norm": 0.5932088494300842,
-      "learning_rate": 0.0003970757461480444,
-      "loss": 3.4425,
+      "epoch": 3.3962264150943398,
+      "grad_norm": 0.6049636006355286,
+      "learning_rate": 0.00039672531030760926,
+      "loss": 3.4645,
       "step": 31500
     },
     {
-      "epoch": 3.3957593369927888,
-      "grad_norm": 0.5899380445480347,
-      "learning_rate": 0.000396752505117983,
-      "loss": 3.4601,
+      "epoch": 3.4016172506738545,
+      "grad_norm": 0.6059445738792419,
+      "learning_rate": 0.0003964015110631408,
+      "loss": 3.4707,
       "step": 31550
     },
     {
-      "epoch": 3.401140889032397,
-      "grad_norm": 0.6221842765808105,
-      "learning_rate": 0.0003964292640879215,
-      "loss": 3.4652,
+      "epoch": 3.407008086253369,
+      "grad_norm": 0.617137610912323,
+      "learning_rate": 0.00039607771181867236,
+      "loss": 3.4582,
       "step": 31600
     },
     {
-      "epoch": 3.4065224410720054,
-      "grad_norm": 0.596818745136261,
-      "learning_rate": 0.0003961060230578601,
-      "loss": 3.4464,
+      "epoch": 3.4123989218328843,
+      "grad_norm": 0.5670456290245056,
+      "learning_rate": 0.00039575391257420397,
+      "loss": 3.4462,
       "step": 31650
     },
     {
-      "epoch": 3.4119039931116135,
-      "grad_norm": 0.6313436627388,
-      "learning_rate": 0.00039578278202779865,
-      "loss": 3.4746,
+      "epoch": 3.417789757412399,
+      "grad_norm": 0.6105981469154358,
+      "learning_rate": 0.0003954301133297355,
+      "loss": 3.4664,
       "step": 31700
     },
     {
-      "epoch": 3.4172855451512216,
-      "grad_norm": 0.7080163359642029,
-      "learning_rate": 0.00039545954099773725,
-      "loss": 3.4722,
+      "epoch": 3.4231805929919137,
+      "grad_norm": 0.5884623527526855,
+      "learning_rate": 0.0003951063140852671,
+      "loss": 3.47,
       "step": 31750
     },
     {
-      "epoch": 3.4226670971908297,
-      "grad_norm": 0.5776293277740479,
-      "learning_rate": 0.0003951362999676759,
-      "loss": 3.4666,
+      "epoch": 3.4285714285714284,
+      "grad_norm": 0.6073600053787231,
+      "learning_rate": 0.00039478251484079867,
+      "loss": 3.4509,
       "step": 31800
     },
     {
-      "epoch": 3.428048649230438,
-      "grad_norm": 0.6150648593902588,
-      "learning_rate": 0.00039481305893761444,
-      "loss": 3.4449,
+      "epoch": 3.4339622641509435,
+      "grad_norm": 0.6530110836029053,
+      "learning_rate": 0.0003944587155963303,
+      "loss": 3.4647,
       "step": 31850
     },
     {
-      "epoch": 3.4334302012700464,
-      "grad_norm": 0.654681384563446,
-      "learning_rate": 0.00039448981790755303,
-      "loss": 3.4583,
+      "epoch": 3.439353099730458,
+      "grad_norm": 0.5859715342521667,
+      "learning_rate": 0.0003941349163518618,
+      "loss": 3.4745,
       "step": 31900
     },
     {
-      "epoch": 3.4388117533096545,
-      "grad_norm": 0.6297449469566345,
-      "learning_rate": 0.0003941665768774916,
-      "loss": 3.4802,
+      "epoch": 3.444743935309973,
+      "grad_norm": 0.5740375518798828,
+      "learning_rate": 0.00039381111710739343,
+      "loss": 3.4645,
       "step": 31950
     },
     {
-      "epoch": 3.4441933053492626,
-      "grad_norm": 0.5857325792312622,
-      "learning_rate": 0.00039384333584743016,
-      "loss": 3.4854,
+      "epoch": 3.450134770889488,
+      "grad_norm": 0.5447757244110107,
+      "learning_rate": 0.000393487317862925,
+      "loss": 3.4613,
       "step": 32000
     },
     {
-      "epoch": 3.4441933053492626,
-      "eval_accuracy": 0.3729826554135595,
-      "eval_loss": 3.477412700653076,
-      "eval_runtime": 184.1469,
-      "eval_samples_per_second": 97.808,
-      "eval_steps_per_second": 6.115,
+      "epoch": 3.450134770889488,
+      "eval_accuracy": 0.3732764528222099,
+      "eval_loss": 3.4755747318267822,
+      "eval_runtime": 183.5947,
+      "eval_samples_per_second": 98.102,
+      "eval_steps_per_second": 6.133,
       "step": 32000
     },
     {
-      "epoch": 3.449574857388871,
-      "grad_norm": 0.5666436553001404,
-      "learning_rate": 0.0003935200948173688,
-      "loss": 3.4641,
+      "epoch": 3.4555256064690028,
+      "grad_norm": 0.634827733039856,
+      "learning_rate": 0.0003931635186184565,
+      "loss": 3.4625,
       "step": 32050
     },
     {
-      "epoch": 3.4549564094284793,
-      "grad_norm": 0.6132357120513916,
-      "learning_rate": 0.0003931968537873074,
-      "loss": 3.4594,
+      "epoch": 3.4609164420485174,
+      "grad_norm": 0.5631187558174133,
+      "learning_rate": 0.0003928397193739881,
+      "loss": 3.4694,
       "step": 32100
     },
     {
-      "epoch": 3.4603379614680874,
-      "grad_norm": 0.6949991583824158,
-      "learning_rate": 0.00039287361275724595,
-      "loss": 3.4698,
+      "epoch": 3.466307277628032,
+      "grad_norm": 0.6147528290748596,
+      "learning_rate": 0.00039251592012951963,
+      "loss": 3.4653,
       "step": 32150
     },
     {
-      "epoch": 3.4657195135076955,
-      "grad_norm": 0.5902615785598755,
-      "learning_rate": 0.00039255037172718454,
-      "loss": 3.4537,
+      "epoch": 3.4716981132075473,
+      "grad_norm": 0.5503348708152771,
+      "learning_rate": 0.00039219212088505124,
+      "loss": 3.4459,
       "step": 32200
     },
     {
-      "epoch": 3.471101065547304,
-      "grad_norm": 0.6425248980522156,
-      "learning_rate": 0.0003922271306971231,
-      "loss": 3.4657,
+      "epoch": 3.477088948787062,
+      "grad_norm": 0.5855104923248291,
+      "learning_rate": 0.0003918683216405828,
+      "loss": 3.46,
       "step": 32250
     },
     {
-      "epoch": 3.476482617586912,
-      "grad_norm": 0.6459230184555054,
-      "learning_rate": 0.0003919038896670617,
-      "loss": 3.4617,
+      "epoch": 3.4824797843665767,
+      "grad_norm": 0.5776777267456055,
+      "learning_rate": 0.0003915445223961144,
+      "loss": 3.4835,
       "step": 32300
     },
     {
-      "epoch": 3.4818641696265202,
-      "grad_norm": 0.8496313691139221,
-      "learning_rate": 0.00039158064863700033,
-      "loss": 3.4397,
+      "epoch": 3.487870619946092,
+      "grad_norm": 0.6286254525184631,
+      "learning_rate": 0.00039122072315164594,
+      "loss": 3.4538,
       "step": 32350
     },
     {
-      "epoch": 3.4872457216661283,
-      "grad_norm": 0.6361708641052246,
-      "learning_rate": 0.00039125740760693887,
-      "loss": 3.4406,
+      "epoch": 3.4932614555256065,
+      "grad_norm": 0.5974652767181396,
+      "learning_rate": 0.0003908969239071775,
+      "loss": 3.4655,
       "step": 32400
     },
     {
-      "epoch": 3.492627273705737,
-      "grad_norm": 0.6062164902687073,
-      "learning_rate": 0.00039093416657687746,
-      "loss": 3.4758,
+      "epoch": 3.498652291105121,
+      "grad_norm": 0.6419471502304077,
+      "learning_rate": 0.0003905731246627091,
+      "loss": 3.4534,
       "step": 32450
     },
     {
-      "epoch": 3.498008825745345,
-      "grad_norm": 0.6049349904060364,
-      "learning_rate": 0.00039061092554681606,
-      "loss": 3.4733,
+      "epoch": 3.5040431266846364,
+      "grad_norm": 0.5857613682746887,
+      "learning_rate": 0.00039024932541824065,
+      "loss": 3.4636,
       "step": 32500
     },
     {
-      "epoch": 3.503390377784953,
-      "grad_norm": 0.6517959833145142,
-      "learning_rate": 0.0003902876845167546,
-      "loss": 3.4654,
+      "epoch": 3.509433962264151,
+      "grad_norm": 0.591844916343689,
+      "learning_rate": 0.00038992552617377225,
+      "loss": 3.4575,
       "step": 32550
     },
     {
-      "epoch": 3.5087719298245617,
-      "grad_norm": 0.5791226625442505,
-      "learning_rate": 0.0003899644434866932,
-      "loss": 3.4739,
+      "epoch": 3.5148247978436657,
+      "grad_norm": 0.5561016201972961,
+      "learning_rate": 0.0003896017269293038,
+      "loss": 3.4629,
       "step": 32600
     },
     {
-      "epoch": 3.5141534818641698,
-      "grad_norm": 0.6079575419425964,
-      "learning_rate": 0.00038964120245663184,
-      "loss": 3.4733,
+      "epoch": 3.5202156334231804,
+      "grad_norm": 0.5965845584869385,
+      "learning_rate": 0.0003892779276848354,
+      "loss": 3.4548,
       "step": 32650
     },
     {
-      "epoch": 3.519535033903778,
-      "grad_norm": 0.5950577855110168,
-      "learning_rate": 0.0003893179614265704,
-      "loss": 3.4655,
+      "epoch": 3.525606469002695,
+      "grad_norm": 0.586627185344696,
+      "learning_rate": 0.0003889541284403669,
+      "loss": 3.4661,
       "step": 32700
     },
     {
-      "epoch": 3.524916585943386,
-      "grad_norm": 0.8927932381629944,
-      "learning_rate": 0.000388994720396509,
-      "loss": 3.4671,
+      "epoch": 3.5309973045822103,
+      "grad_norm": 0.658444344997406,
+      "learning_rate": 0.0003886303291958985,
+      "loss": 3.4555,
       "step": 32750
     },
     {
-      "epoch": 3.530298137982994,
-      "grad_norm": 0.5713189840316772,
-      "learning_rate": 0.0003886714793664475,
-      "loss": 3.4672,
+      "epoch": 3.536388140161725,
+      "grad_norm": 0.6075886487960815,
+      "learning_rate": 0.00038830652995143006,
+      "loss": 3.4609,
       "step": 32800
     },
     {
-      "epoch": 3.5356796900226026,
-      "grad_norm": 0.6249322891235352,
-      "learning_rate": 0.0003883482383363861,
-      "loss": 3.449,
+      "epoch": 3.5417789757412397,
+      "grad_norm": 0.6294785737991333,
+      "learning_rate": 0.0003879827307069616,
+      "loss": 3.4389,
       "step": 32850
     },
     {
-      "epoch": 3.5410612420622107,
-      "grad_norm": 0.635231077671051,
-      "learning_rate": 0.00038802499730632476,
-      "loss": 3.4726,
+      "epoch": 3.547169811320755,
+      "grad_norm": 0.621010422706604,
+      "learning_rate": 0.0003876589314624932,
+      "loss": 3.4549,
       "step": 32900
     },
     {
-      "epoch": 3.546442794101819,
-      "grad_norm": 0.6587965488433838,
-      "learning_rate": 0.0003877017562762633,
-      "loss": 3.4602,
+      "epoch": 3.5525606469002695,
+      "grad_norm": 0.5769633054733276,
+      "learning_rate": 0.00038733513221802477,
+      "loss": 3.4456,
       "step": 32950
     },
     {
-      "epoch": 3.5518243461414274,
-      "grad_norm": 0.6618245840072632,
-      "learning_rate": 0.0003873785152462019,
-      "loss": 3.4504,
+      "epoch": 3.557951482479784,
+      "grad_norm": 0.6038095951080322,
+      "learning_rate": 0.00038701133297355637,
+      "loss": 3.4479,
       "step": 33000
     },
     {
-      "epoch": 3.5518243461414274,
-      "eval_accuracy": 0.3737593062556574,
-      "eval_loss": 3.4713170528411865,
-      "eval_runtime": 184.6965,
-      "eval_samples_per_second": 97.517,
-      "eval_steps_per_second": 6.096,
+      "epoch": 3.557951482479784,
+      "eval_accuracy": 0.37366271383839034,
+      "eval_loss": 3.4711806774139404,
+      "eval_runtime": 183.6637,
+      "eval_samples_per_second": 98.065,
+      "eval_steps_per_second": 6.131,
       "step": 33000
     },
     {
-      "epoch": 3.5572058981810355,
-      "grad_norm": 0.5909431576728821,
-      "learning_rate": 0.0003870552742161405,
-      "loss": 3.4404,
+      "epoch": 3.5633423180592994,
+      "grad_norm": 0.6252723336219788,
+      "learning_rate": 0.0003866875337290879,
+      "loss": 3.4537,
       "step": 33050
     },
     {
-      "epoch": 3.5625874502206436,
-      "grad_norm": 0.6251736879348755,
-      "learning_rate": 0.00038673203318607903,
-      "loss": 3.4581,
+      "epoch": 3.568733153638814,
+      "grad_norm": 0.6400101780891418,
+      "learning_rate": 0.00038636373448461953,
+      "loss": 3.4645,
       "step": 33100
     },
     {
-      "epoch": 3.5679690022602517,
-      "grad_norm": 0.6399909853935242,
-      "learning_rate": 0.0003864087921560176,
-      "loss": 3.4428,
+      "epoch": 3.5741239892183287,
+      "grad_norm": 0.5554074645042419,
+      "learning_rate": 0.0003860399352401511,
+      "loss": 3.4393,
       "step": 33150
     },
     {
-      "epoch": 3.57335055429986,
-      "grad_norm": 0.6242807507514954,
-      "learning_rate": 0.0003860855511259563,
-      "loss": 3.4589,
+      "epoch": 3.579514824797844,
+      "grad_norm": 0.5511495471000671,
+      "learning_rate": 0.0003857161359956827,
+      "loss": 3.447,
       "step": 33200
     },
     {
-      "epoch": 3.5787321063394684,
-      "grad_norm": 0.6046002507209778,
-      "learning_rate": 0.0003857623100958948,
-      "loss": 3.4656,
+      "epoch": 3.5849056603773586,
+      "grad_norm": 0.6984949111938477,
+      "learning_rate": 0.00038539233675121423,
+      "loss": 3.4765,
       "step": 33250
     },
     {
-      "epoch": 3.5841136583790765,
-      "grad_norm": 0.6714318990707397,
-      "learning_rate": 0.0003854390690658334,
-      "loss": 3.4618,
+      "epoch": 3.5902964959568733,
+      "grad_norm": 0.667104184627533,
+      "learning_rate": 0.0003850750134916352,
+      "loss": 3.4662,
       "step": 33300
     },
     {
-      "epoch": 3.5894952104186846,
-      "grad_norm": 0.5967142581939697,
-      "learning_rate": 0.00038511582803577195,
-      "loss": 3.4555,
+      "epoch": 3.595687331536388,
+      "grad_norm": 0.5665805339813232,
+      "learning_rate": 0.0003847512142471667,
+      "loss": 3.4545,
       "step": 33350
     },
     {
-      "epoch": 3.594876762458293,
-      "grad_norm": 0.6154034733772278,
-      "learning_rate": 0.00038479258700571054,
-      "loss": 3.4456,
+      "epoch": 3.601078167115903,
+      "grad_norm": 0.6430594325065613,
+      "learning_rate": 0.0003844274150026983,
+      "loss": 3.4523,
       "step": 33400
     },
     {
-      "epoch": 3.6002583144979012,
-      "grad_norm": 0.5872713923454285,
-      "learning_rate": 0.00038446934597564914,
-      "loss": 3.4508,
+      "epoch": 3.606469002695418,
+      "grad_norm": 0.5975932478904724,
+      "learning_rate": 0.00038410361575822984,
+      "loss": 3.4644,
       "step": 33450
     },
     {
-      "epoch": 3.6056398665375093,
-      "grad_norm": 0.6076340079307556,
-      "learning_rate": 0.00038414610494558773,
-      "loss": 3.456,
+      "epoch": 3.6118598382749325,
+      "grad_norm": 0.6220806837081909,
+      "learning_rate": 0.00038377981651376144,
+      "loss": 3.4455,
       "step": 33500
     },
     {
-      "epoch": 3.611021418577118,
-      "grad_norm": 0.5723451972007751,
-      "learning_rate": 0.00038382286391552633,
-      "loss": 3.4651,
+      "epoch": 3.617250673854447,
+      "grad_norm": 0.6150628924369812,
+      "learning_rate": 0.000383456017269293,
+      "loss": 3.4532,
       "step": 33550
     },
     {
-      "epoch": 3.616402970616726,
-      "grad_norm": 0.5870938301086426,
-      "learning_rate": 0.0003834996228854649,
-      "loss": 3.4636,
+      "epoch": 3.6226415094339623,
+      "grad_norm": 0.638692319393158,
+      "learning_rate": 0.00038313221802482454,
+      "loss": 3.4592,
       "step": 33600
     },
     {
-      "epoch": 3.621784522656334,
-      "grad_norm": 0.6072779893875122,
-      "learning_rate": 0.00038317638185540346,
-      "loss": 3.4644,
+      "epoch": 3.628032345013477,
+      "grad_norm": 0.6110245585441589,
+      "learning_rate": 0.00038280841878035615,
+      "loss": 3.4702,
       "step": 33650
     },
     {
-      "epoch": 3.627166074695942,
-      "grad_norm": 0.6000068187713623,
-      "learning_rate": 0.00038285960564594327,
-      "loss": 3.4671,
+      "epoch": 3.6334231805929917,
+      "grad_norm": 0.5673446655273438,
+      "learning_rate": 0.0003824846195358877,
+      "loss": 3.4648,
       "step": 33700
     },
     {
-      "epoch": 3.6325476267355503,
-      "grad_norm": 0.6160212159156799,
-      "learning_rate": 0.00038253636461588186,
-      "loss": 3.463,
+      "epoch": 3.638814016172507,
+      "grad_norm": 0.6043810844421387,
+      "learning_rate": 0.0003821608202914193,
+      "loss": 3.4476,
       "step": 33750
     },
     {
-      "epoch": 3.637929178775159,
-      "grad_norm": 0.7063570618629456,
-      "learning_rate": 0.00038221312358582046,
-      "loss": 3.4378,
+      "epoch": 3.6442048517520216,
+      "grad_norm": 0.5578144788742065,
+      "learning_rate": 0.00038183702104695085,
+      "loss": 3.4637,
       "step": 33800
     },
     {
-      "epoch": 3.643310730814767,
-      "grad_norm": 0.6322859525680542,
-      "learning_rate": 0.00038188988255575905,
-      "loss": 3.4596,
+      "epoch": 3.6495956873315363,
+      "grad_norm": 0.6210846304893494,
+      "learning_rate": 0.00038151322180248246,
+      "loss": 3.4607,
       "step": 33850
     },
     {
-      "epoch": 3.648692282854375,
-      "grad_norm": 0.6268704533576965,
-      "learning_rate": 0.00038156664152569765,
-      "loss": 3.4784,
+      "epoch": 3.6549865229110514,
+      "grad_norm": 0.6444474458694458,
+      "learning_rate": 0.000381189422558014,
+      "loss": 3.454,
       "step": 33900
     },
     {
-      "epoch": 3.6540738348939836,
-      "grad_norm": 0.6058254837989807,
-      "learning_rate": 0.00038124340049563624,
-      "loss": 3.455,
+      "epoch": 3.660377358490566,
+      "grad_norm": 0.6198441982269287,
+      "learning_rate": 0.0003808656233135456,
+      "loss": 3.4499,
       "step": 33950
     },
     {
-      "epoch": 3.6594553869335917,
-      "grad_norm": 0.6425801515579224,
-      "learning_rate": 0.0003809201594655748,
-      "loss": 3.449,
+      "epoch": 3.665768194070081,
+      "grad_norm": 0.6694725155830383,
+      "learning_rate": 0.00038054182406907716,
+      "loss": 3.464,
       "step": 34000
     },
     {
-      "epoch": 3.6594553869335917,
-      "eval_accuracy": 0.3744735903401498,
-      "eval_loss": 3.4654529094696045,
-      "eval_runtime": 184.3767,
-      "eval_samples_per_second": 97.686,
-      "eval_steps_per_second": 6.107,
+      "epoch": 3.665768194070081,
+      "eval_accuracy": 0.374257371093399,
+      "eval_loss": 3.4641661643981934,
+      "eval_runtime": 183.6771,
+      "eval_samples_per_second": 98.058,
+      "eval_steps_per_second": 6.13,
       "step": 34000
     },
     {
-      "epoch": 3.6648369389732,
-      "grad_norm": 0.6552558541297913,
-      "learning_rate": 0.0003805969184355134,
-      "loss": 3.4635,
+      "epoch": 3.671159029649596,
+      "grad_norm": 0.6388247609138489,
+      "learning_rate": 0.00038021802482460866,
+      "loss": 3.4471,
       "step": 34050
     },
     {
-      "epoch": 3.670218491012808,
-      "grad_norm": 0.6297048926353455,
-      "learning_rate": 0.0003802736774054519,
-      "loss": 3.4479,
+      "epoch": 3.6765498652291106,
+      "grad_norm": 0.5763834714889526,
+      "learning_rate": 0.00037989422558014027,
+      "loss": 3.4612,
       "step": 34100
     },
     {
-      "epoch": 3.675600043052416,
-      "grad_norm": 0.6090021729469299,
-      "learning_rate": 0.00037995043637539057,
-      "loss": 3.4711,
+      "epoch": 3.6819407008086253,
+      "grad_norm": 0.6030428409576416,
+      "learning_rate": 0.0003795704263356718,
+      "loss": 3.4607,
       "step": 34150
     },
     {
-      "epoch": 3.6809815950920246,
-      "grad_norm": 0.6067349314689636,
-      "learning_rate": 0.00037962719534532916,
-      "loss": 3.4607,
+      "epoch": 3.68733153638814,
+      "grad_norm": 0.6248852610588074,
+      "learning_rate": 0.0003792466270912034,
+      "loss": 3.4441,
       "step": 34200
     },
     {
-      "epoch": 3.6863631471316327,
-      "grad_norm": 0.6618533730506897,
-      "learning_rate": 0.0003793039543152677,
-      "loss": 3.4597,
+      "epoch": 3.6927223719676547,
+      "grad_norm": 0.6712538003921509,
+      "learning_rate": 0.00037892282784673497,
+      "loss": 3.4588,
       "step": 34250
     },
     {
-      "epoch": 3.691744699171241,
-      "grad_norm": 0.5938194990158081,
-      "learning_rate": 0.0003789807132852063,
-      "loss": 3.4435,
+      "epoch": 3.69811320754717,
+      "grad_norm": 0.6329993605613708,
+      "learning_rate": 0.0003785990286022666,
+      "loss": 3.4463,
       "step": 34300
     },
     {
-      "epoch": 3.6971262512108494,
-      "grad_norm": 0.6445248126983643,
-      "learning_rate": 0.0003786574722551449,
-      "loss": 3.4287,
+      "epoch": 3.7035040431266846,
+      "grad_norm": 0.5744032859802246,
+      "learning_rate": 0.00037827522935779813,
+      "loss": 3.4697,
       "step": 34350
     },
     {
-      "epoch": 3.7025078032504575,
-      "grad_norm": 0.6069079041481018,
-      "learning_rate": 0.00037833423122508343,
-      "loss": 3.4655,
+      "epoch": 3.7088948787061993,
+      "grad_norm": 0.609049916267395,
+      "learning_rate": 0.00037795143011332973,
+      "loss": 3.4542,
       "step": 34400
     },
     {
-      "epoch": 3.7078893552900656,
-      "grad_norm": 0.6393512487411499,
-      "learning_rate": 0.0003780109901950221,
-      "loss": 3.457,
+      "epoch": 3.7142857142857144,
+      "grad_norm": 0.6319896578788757,
+      "learning_rate": 0.0003776276308688613,
+      "loss": 3.4407,
       "step": 34450
     },
     {
-      "epoch": 3.713270907329674,
-      "grad_norm": 0.5912173390388489,
-      "learning_rate": 0.0003776877491649607,
-      "loss": 3.4475,
+      "epoch": 3.719676549865229,
+      "grad_norm": 0.7728492617607117,
+      "learning_rate": 0.00037730383162439283,
+      "loss": 3.4583,
       "step": 34500
     },
     {
-      "epoch": 3.7186524593692822,
-      "grad_norm": 0.6467220187187195,
-      "learning_rate": 0.0003773645081348992,
-      "loss": 3.4497,
+      "epoch": 3.725067385444744,
+      "grad_norm": 0.6464096903800964,
+      "learning_rate": 0.00037698003237992444,
+      "loss": 3.4606,
       "step": 34550
     },
     {
-      "epoch": 3.7240340114088903,
-      "grad_norm": 0.6172649264335632,
-      "learning_rate": 0.0003770412671048378,
-      "loss": 3.4442,
+      "epoch": 3.730458221024259,
+      "grad_norm": 0.6643472909927368,
+      "learning_rate": 0.000376656233135456,
+      "loss": 3.4623,
       "step": 34600
     },
     {
-      "epoch": 3.7294155634484984,
-      "grad_norm": 0.7250891327857971,
-      "learning_rate": 0.00037671802607477635,
-      "loss": 3.4512,
+      "epoch": 3.7358490566037736,
+      "grad_norm": 0.5791902542114258,
+      "learning_rate": 0.0003763324338909876,
+      "loss": 3.4528,
       "step": 34650
     },
     {
-      "epoch": 3.7347971154881066,
-      "grad_norm": 0.6259332299232483,
-      "learning_rate": 0.000376394785044715,
-      "loss": 3.4547,
+      "epoch": 3.7412398921832883,
+      "grad_norm": 0.5891251564025879,
+      "learning_rate": 0.0003760086346465191,
+      "loss": 3.4573,
       "step": 34700
     },
     {
-      "epoch": 3.740178667527715,
-      "grad_norm": 0.6603428721427917,
-      "learning_rate": 0.0003760715440146536,
-      "loss": 3.4524,
+      "epoch": 3.7466307277628035,
+      "grad_norm": 0.6279162764549255,
+      "learning_rate": 0.0003756848354020507,
+      "loss": 3.4399,
       "step": 34750
     },
     {
-      "epoch": 3.745560219567323,
-      "grad_norm": 0.6384559273719788,
-      "learning_rate": 0.00037574830298459214,
-      "loss": 3.4597,
+      "epoch": 3.752021563342318,
+      "grad_norm": 0.6248940825462341,
+      "learning_rate": 0.00037536103615758224,
+      "loss": 3.474,
       "step": 34800
     },
     {
-      "epoch": 3.7509417716069313,
-      "grad_norm": 0.5936117172241211,
-      "learning_rate": 0.00037542506195453073,
-      "loss": 3.4618,
+      "epoch": 3.757412398921833,
+      "grad_norm": 0.6553635597229004,
+      "learning_rate": 0.0003750372369131138,
+      "loss": 3.46,
       "step": 34850
     },
     {
-      "epoch": 3.75632332364654,
-      "grad_norm": 0.6444550156593323,
-      "learning_rate": 0.0003751018209244693,
-      "loss": 3.4451,
+      "epoch": 3.7628032345013476,
+      "grad_norm": 0.6073916554450989,
+      "learning_rate": 0.0003747134376686454,
+      "loss": 3.459,
       "step": 34900
     },
     {
-      "epoch": 3.761704875686148,
-      "grad_norm": 0.6175785660743713,
-      "learning_rate": 0.00037477857989440787,
-      "loss": 3.4498,
+      "epoch": 3.7681940700808623,
+      "grad_norm": 0.580033004283905,
+      "learning_rate": 0.00037438963842417695,
+      "loss": 3.4599,
       "step": 34950
     },
     {
-      "epoch": 3.767086427725756,
-      "grad_norm": 0.6283439993858337,
-      "learning_rate": 0.0003744553388643465,
-      "loss": 3.4402,
+      "epoch": 3.7735849056603774,
+      "grad_norm": 0.6097987294197083,
+      "learning_rate": 0.00037406583917970856,
+      "loss": 3.447,
       "step": 35000
     },
     {
-      "epoch": 3.767086427725756,
-      "eval_accuracy": 0.37506064189301147,
-      "eval_loss": 3.4579710960388184,
-      "eval_runtime": 184.4841,
-      "eval_samples_per_second": 97.629,
-      "eval_steps_per_second": 6.104,
+      "epoch": 3.7735849056603774,
+      "eval_accuracy": 0.3745048823718404,
+      "eval_loss": 3.4596734046936035,
+      "eval_runtime": 183.8415,
+      "eval_samples_per_second": 97.97,
+      "eval_steps_per_second": 6.125,
       "step": 35000
     },
     {
-      "epoch": 3.772467979765364,
-      "grad_norm": 0.6437238454818726,
-      "learning_rate": 0.0003741320978342851,
-      "loss": 3.45,
+      "epoch": 3.778975741239892,
+      "grad_norm": 0.6718741655349731,
+      "learning_rate": 0.0003737420399352401,
+      "loss": 3.4594,
       "step": 35050
     },
     {
-      "epoch": 3.7778495318049723,
-      "grad_norm": 0.6655225157737732,
-      "learning_rate": 0.00037380885680422365,
-      "loss": 3.4558,
+      "epoch": 3.784366576819407,
+      "grad_norm": 0.5822587013244629,
+      "learning_rate": 0.0003734182406907717,
+      "loss": 3.4532,
       "step": 35100
     },
     {
-      "epoch": 3.783231083844581,
-      "grad_norm": 0.6094281673431396,
-      "learning_rate": 0.00037348561577416224,
-      "loss": 3.451,
+      "epoch": 3.789757412398922,
+      "grad_norm": 0.5875661373138428,
+      "learning_rate": 0.00037309444144630326,
+      "loss": 3.4483,
       "step": 35150
     },
     {
-      "epoch": 3.788612635884189,
-      "grad_norm": 0.6181848645210266,
-      "learning_rate": 0.0003731623747441008,
-      "loss": 3.443,
+      "epoch": 3.7951482479784366,
+      "grad_norm": 0.6691891551017761,
+      "learning_rate": 0.00037277064220183487,
+      "loss": 3.4729,
       "step": 35200
     },
     {
-      "epoch": 3.793994187923797,
-      "grad_norm": 0.6608075499534607,
-      "learning_rate": 0.0003728391337140394,
-      "loss": 3.4426,
+      "epoch": 3.8005390835579513,
+      "grad_norm": 0.6240682005882263,
+      "learning_rate": 0.0003724468429573664,
+      "loss": 3.4471,
       "step": 35250
     },
     {
-      "epoch": 3.7993757399634056,
-      "grad_norm": 0.6112875938415527,
-      "learning_rate": 0.00037251589268397803,
-      "loss": 3.4448,
+      "epoch": 3.8059299191374665,
+      "grad_norm": 0.6282724142074585,
+      "learning_rate": 0.0003721230437128979,
+      "loss": 3.4403,
       "step": 35300
     },
     {
-      "epoch": 3.8047572920030137,
-      "grad_norm": 0.650355339050293,
-      "learning_rate": 0.00037219265165391657,
-      "loss": 3.4504,
+      "epoch": 3.811320754716981,
+      "grad_norm": 0.6456165909767151,
+      "learning_rate": 0.00037180572045331887,
+      "loss": 3.4422,
       "step": 35350
     },
     {
-      "epoch": 3.810138844042622,
-      "grad_norm": 0.6517062187194824,
-      "learning_rate": 0.00037186941062385516,
-      "loss": 3.4615,
+      "epoch": 3.816711590296496,
+      "grad_norm": 0.6330288648605347,
+      "learning_rate": 0.00037148192120885047,
+      "loss": 3.4571,
       "step": 35400
     },
     {
-      "epoch": 3.8155203960822304,
-      "grad_norm": 0.7008894085884094,
-      "learning_rate": 0.0003715461695937937,
-      "loss": 3.4503,
+      "epoch": 3.822102425876011,
+      "grad_norm": 0.5821753740310669,
+      "learning_rate": 0.000371158121964382,
+      "loss": 3.4666,
       "step": 35450
     },
     {
-      "epoch": 3.8209019481218385,
-      "grad_norm": 0.6126015186309814,
-      "learning_rate": 0.0003712229285637323,
-      "loss": 3.4488,
+      "epoch": 3.8274932614555257,
+      "grad_norm": 0.6028748750686646,
+      "learning_rate": 0.0003708343227199136,
+      "loss": 3.4447,
       "step": 35500
     },
     {
-      "epoch": 3.8262835001614466,
-      "grad_norm": 0.6550661325454712,
-      "learning_rate": 0.0003708996875336709,
-      "loss": 3.4358,
+      "epoch": 3.8328840970350404,
+      "grad_norm": 0.5814579129219055,
+      "learning_rate": 0.0003705105234754452,
+      "loss": 3.4496,
       "step": 35550
     },
     {
-      "epoch": 3.8316650522010547,
-      "grad_norm": 0.6141082644462585,
-      "learning_rate": 0.0003705764465036095,
-      "loss": 3.4613,
+      "epoch": 3.838274932614555,
+      "grad_norm": 0.6393029093742371,
+      "learning_rate": 0.0003701867242309768,
+      "loss": 3.4504,
       "step": 35600
     },
     {
-      "epoch": 3.837046604240663,
-      "grad_norm": 0.6304882764816284,
-      "learning_rate": 0.0003702532054735481,
-      "loss": 3.4615,
+      "epoch": 3.8436657681940702,
+      "grad_norm": 0.6390960216522217,
+      "learning_rate": 0.00036986292498650833,
+      "loss": 3.4478,
       "step": 35650
     },
     {
-      "epoch": 3.8424281562802713,
-      "grad_norm": 0.6499120593070984,
-      "learning_rate": 0.0003699299644434867,
-      "loss": 3.4667,
+      "epoch": 3.849056603773585,
+      "grad_norm": 0.5880372524261475,
+      "learning_rate": 0.0003695391257420399,
+      "loss": 3.4499,
       "step": 35700
     },
     {
-      "epoch": 3.8478097083198795,
-      "grad_norm": 0.6515492796897888,
-      "learning_rate": 0.0003696067234134252,
-      "loss": 3.4591,
+      "epoch": 3.8544474393530996,
+      "grad_norm": 0.5923008918762207,
+      "learning_rate": 0.0003692153264975715,
+      "loss": 3.4565,
       "step": 35750
     },
     {
-      "epoch": 3.8531912603594876,
-      "grad_norm": 0.6420202255249023,
-      "learning_rate": 0.0003692834823833638,
-      "loss": 3.4544,
+      "epoch": 3.8598382749326143,
+      "grad_norm": 0.585785984992981,
+      "learning_rate": 0.00036889152725310304,
+      "loss": 3.4454,
       "step": 35800
     },
     {
-      "epoch": 3.858572812399096,
-      "grad_norm": 0.5830044150352478,
-      "learning_rate": 0.00036896024135330246,
-      "loss": 3.4592,
+      "epoch": 3.8652291105121295,
+      "grad_norm": 0.600885808467865,
+      "learning_rate": 0.00036856772800863464,
+      "loss": 3.4475,
       "step": 35850
     },
     {
-      "epoch": 3.863954364438704,
-      "grad_norm": 0.61103355884552,
-      "learning_rate": 0.000368637000323241,
-      "loss": 3.4685,
+      "epoch": 3.870619946091644,
+      "grad_norm": 0.6213895678520203,
+      "learning_rate": 0.0003682439287641662,
+      "loss": 3.4587,
       "step": 35900
     },
     {
-      "epoch": 3.8693359164783123,
-      "grad_norm": 0.7067725658416748,
-      "learning_rate": 0.0003683137592931796,
-      "loss": 3.469,
+      "epoch": 3.876010781671159,
+      "grad_norm": 0.5899269580841064,
+      "learning_rate": 0.0003679201295196978,
+      "loss": 3.455,
       "step": 35950
     },
     {
-      "epoch": 3.8747174685179204,
-      "grad_norm": 0.5974989533424377,
-      "learning_rate": 0.00036799051826311814,
-      "loss": 3.4581,
+      "epoch": 3.881401617250674,
+      "grad_norm": 0.6595240831375122,
+      "learning_rate": 0.00036759633027522935,
+      "loss": 3.453,
       "step": 36000
     },
     {
-      "epoch": 3.8747174685179204,
-      "eval_accuracy": 0.37549394960961563,
-      "eval_loss": 3.4535152912139893,
-      "eval_runtime": 184.307,
-      "eval_samples_per_second": 97.723,
-      "eval_steps_per_second": 6.109,
+      "epoch": 3.881401617250674,
+      "eval_accuracy": 0.37545374804112425,
+      "eval_loss": 3.4512643814086914,
+      "eval_runtime": 183.5211,
+      "eval_samples_per_second": 98.141,
+      "eval_steps_per_second": 6.136,
       "step": 36000
     },
     {
-      "epoch": 3.8800990205575285,
-      "grad_norm": 0.568252682685852,
-      "learning_rate": 0.00036766727723305673,
-      "loss": 3.4568,
+      "epoch": 3.8867924528301887,
+      "grad_norm": 0.6180174350738525,
+      "learning_rate": 0.00036727253103076084,
+      "loss": 3.4486,
       "step": 36050
     },
     {
-      "epoch": 3.885480572597137,
-      "grad_norm": 0.6623275279998779,
-      "learning_rate": 0.0003673440362029953,
-      "loss": 3.4443,
+      "epoch": 3.8921832884097034,
+      "grad_norm": 0.5822926163673401,
+      "learning_rate": 0.00036694873178629245,
+      "loss": 3.4582,
       "step": 36100
     },
     {
-      "epoch": 3.890862124636745,
-      "grad_norm": 0.6388083696365356,
-      "learning_rate": 0.0003670207951729339,
-      "loss": 3.456,
+      "epoch": 3.8975741239892185,
+      "grad_norm": 0.6376808285713196,
+      "learning_rate": 0.000366624932541824,
+      "loss": 3.435,
       "step": 36150
     },
     {
-      "epoch": 3.8962436766763533,
-      "grad_norm": 0.634536862373352,
-      "learning_rate": 0.0003666975541428725,
-      "loss": 3.4427,
+      "epoch": 3.9029649595687332,
+      "grad_norm": 0.6071598529815674,
+      "learning_rate": 0.0003663011332973556,
+      "loss": 3.4454,
       "step": 36200
     },
     {
-      "epoch": 3.901625228715962,
-      "grad_norm": 0.5885773301124573,
-      "learning_rate": 0.0003663743131128111,
-      "loss": 3.4392,
+      "epoch": 3.908355795148248,
+      "grad_norm": 0.5952266454696655,
+      "learning_rate": 0.00036597733405288715,
+      "loss": 3.4652,
       "step": 36250
     },
     {
-      "epoch": 3.90700678075557,
-      "grad_norm": 0.7272632718086243,
-      "learning_rate": 0.00036605107208274965,
-      "loss": 3.4446,
+      "epoch": 3.913746630727763,
+      "grad_norm": 0.6375381946563721,
+      "learning_rate": 0.00036565353480841876,
+      "loss": 3.4503,
       "step": 36300
     },
     {
-      "epoch": 3.912388332795178,
-      "grad_norm": 0.590799868106842,
-      "learning_rate": 0.00036572783105268824,
-      "loss": 3.4486,
+      "epoch": 3.9191374663072778,
+      "grad_norm": 0.6298218965530396,
+      "learning_rate": 0.0003653297355639503,
+      "loss": 3.4557,
       "step": 36350
     },
     {
-      "epoch": 3.9177698848347866,
-      "grad_norm": 0.6622004508972168,
-      "learning_rate": 0.0003654045900226268,
-      "loss": 3.4538,
+      "epoch": 3.9245283018867925,
+      "grad_norm": 0.6288528442382812,
+      "learning_rate": 0.0003650059363194819,
+      "loss": 3.445,
       "step": 36400
     },
     {
-      "epoch": 3.9231514368743947,
-      "grad_norm": 0.6048895120620728,
-      "learning_rate": 0.00036508134899256543,
-      "loss": 3.4494,
+      "epoch": 3.929919137466307,
+      "grad_norm": 0.5978701710700989,
+      "learning_rate": 0.00036468213707501347,
+      "loss": 3.4672,
       "step": 36450
     },
     {
-      "epoch": 3.928532988914003,
-      "grad_norm": 0.6244843602180481,
-      "learning_rate": 0.00036475810796250403,
-      "loss": 3.4429,
+      "epoch": 3.935309973045822,
+      "grad_norm": 0.6203653812408447,
+      "learning_rate": 0.000364358337830545,
+      "loss": 3.4593,
       "step": 36500
     },
     {
-      "epoch": 3.933914540953611,
-      "grad_norm": 0.7105559706687927,
-      "learning_rate": 0.00036443486693244257,
-      "loss": 3.4539,
+      "epoch": 3.940700808625337,
+      "grad_norm": 0.5815897583961487,
+      "learning_rate": 0.0003640345385860766,
+      "loss": 3.4576,
       "step": 36550
     },
     {
-      "epoch": 3.939296092993219,
-      "grad_norm": 0.5961714386940002,
-      "learning_rate": 0.00036411162590238116,
-      "loss": 3.4572,
+      "epoch": 3.9460916442048517,
+      "grad_norm": 0.5865249633789062,
+      "learning_rate": 0.00036371073934160817,
+      "loss": 3.4599,
       "step": 36600
     },
     {
-      "epoch": 3.9446776450328276,
-      "grad_norm": 0.6044503450393677,
-      "learning_rate": 0.00036378838487231976,
-      "loss": 3.4577,
+      "epoch": 3.9514824797843664,
+      "grad_norm": 0.6294233202934265,
+      "learning_rate": 0.0003633869400971398,
+      "loss": 3.4563,
       "step": 36650
     },
     {
-      "epoch": 3.9500591970724357,
-      "grad_norm": 0.6206450462341309,
-      "learning_rate": 0.00036346514384225835,
-      "loss": 3.4502,
+      "epoch": 3.9568733153638815,
+      "grad_norm": 0.7033206224441528,
+      "learning_rate": 0.00036306314085267127,
+      "loss": 3.4494,
       "step": 36700
     },
     {
-      "epoch": 3.955440749112044,
-      "grad_norm": 0.6197695732116699,
-      "learning_rate": 0.00036314190281219695,
-      "loss": 3.4515,
+      "epoch": 3.9622641509433962,
+      "grad_norm": 0.6106424331665039,
+      "learning_rate": 0.00036273934160820293,
+      "loss": 3.482,
       "step": 36750
     },
     {
-      "epoch": 3.9608223011516523,
-      "grad_norm": 0.6754399538040161,
-      "learning_rate": 0.00036281866178213554,
-      "loss": 3.4478,
+      "epoch": 3.967654986522911,
+      "grad_norm": 0.6779366731643677,
+      "learning_rate": 0.00036241554236373443,
+      "loss": 3.4663,
       "step": 36800
     },
     {
-      "epoch": 3.9662038531912605,
-      "grad_norm": 0.6515491604804993,
-      "learning_rate": 0.0003624954207520741,
-      "loss": 3.4414,
+      "epoch": 3.973045822102426,
+      "grad_norm": 0.6342257261276245,
+      "learning_rate": 0.00036209174311926603,
+      "loss": 3.4406,
       "step": 36850
     },
     {
-      "epoch": 3.9715854052308686,
-      "grad_norm": 0.7095997929573059,
-      "learning_rate": 0.0003621721797220127,
-      "loss": 3.4407,
+      "epoch": 3.9784366576819408,
+      "grad_norm": 0.6370275020599365,
+      "learning_rate": 0.0003617679438747976,
+      "loss": 3.4522,
       "step": 36900
     },
     {
-      "epoch": 3.9769669572704767,
-      "grad_norm": 0.6265909671783447,
-      "learning_rate": 0.0003618489386919512,
-      "loss": 3.4681,
+      "epoch": 3.9838274932614555,
+      "grad_norm": 0.6269297003746033,
+      "learning_rate": 0.00036144414463032913,
+      "loss": 3.4313,
       "step": 36950
     },
     {
-      "epoch": 3.9823485093100848,
-      "grad_norm": 0.6624922156333923,
-      "learning_rate": 0.00036152569766188987,
-      "loss": 3.4399,
+      "epoch": 3.9892183288409706,
+      "grad_norm": 0.6149274110794067,
+      "learning_rate": 0.00036112034538586074,
+      "loss": 3.4401,
       "step": 37000
     },
     {
-      "epoch": 3.9823485093100848,
-      "eval_accuracy": 0.3764659583440039,
-      "eval_loss": 3.4450950622558594,
-      "eval_runtime": 184.7248,
-      "eval_samples_per_second": 97.502,
-      "eval_steps_per_second": 6.096,
+      "epoch": 3.9892183288409706,
+      "eval_accuracy": 0.3759343197639277,
+      "eval_loss": 3.4460744857788086,
+      "eval_runtime": 183.9017,
+      "eval_samples_per_second": 97.938,
+      "eval_steps_per_second": 6.123,
       "step": 37000
     },
     {
-      "epoch": 3.9877300613496933,
-      "grad_norm": 0.675195574760437,
-      "learning_rate": 0.00036120245663182846,
-      "loss": 3.4563,
+      "epoch": 3.9946091644204853,
+      "grad_norm": 0.6133876442909241,
+      "learning_rate": 0.0003607965461413923,
+      "loss": 3.4475,
       "step": 37050
     },
     {
-      "epoch": 3.9931116133893014,
-      "grad_norm": 0.6407047510147095,
-      "learning_rate": 0.000360879215601767,
-      "loss": 3.4663,
+      "epoch": 4.0,
+      "grad_norm": 1.1652570962905884,
+      "learning_rate": 0.0003604727468969239,
+      "loss": 3.4314,
       "step": 37100
     },
     {
-      "epoch": 3.9984931654289095,
-      "grad_norm": 0.6153064966201782,
-      "learning_rate": 0.0003605559745717056,
-      "loss": 3.457,
+      "epoch": 4.005390835579515,
+      "grad_norm": 0.6168572306632996,
+      "learning_rate": 0.00036014894765245544,
+      "loss": 3.3569,
       "step": 37150
     },
     {
-      "epoch": 4.003874717468518,
-      "grad_norm": 0.6592976450920105,
-      "learning_rate": 0.0003602327335416442,
-      "loss": 3.3949,
+      "epoch": 4.010781671159029,
+      "grad_norm": 0.6085106134414673,
+      "learning_rate": 0.00035982514840798705,
+      "loss": 3.3585,
       "step": 37200
     },
     {
-      "epoch": 4.009256269508126,
-      "grad_norm": 0.6417883634567261,
-      "learning_rate": 0.00035990949251158273,
-      "loss": 3.3379,
+      "epoch": 4.0161725067385445,
+      "grad_norm": 0.5930397510528564,
+      "learning_rate": 0.00035950782514840795,
+      "loss": 3.3546,
       "step": 37250
     },
     {
-      "epoch": 4.014637821547734,
-      "grad_norm": 0.6505351066589355,
-      "learning_rate": 0.0003595862514815214,
-      "loss": 3.3506,
+      "epoch": 4.02156334231806,
+      "grad_norm": 0.6373165249824524,
+      "learning_rate": 0.00035918402590393955,
+      "loss": 3.3747,
       "step": 37300
     },
     {
-      "epoch": 4.020019373587343,
-      "grad_norm": 0.622450053691864,
-      "learning_rate": 0.00035926301045146,
-      "loss": 3.354,
+      "epoch": 4.026954177897574,
+      "grad_norm": 0.5819648504257202,
+      "learning_rate": 0.00035886022665947105,
+      "loss": 3.3607,
       "step": 37350
     },
     {
-      "epoch": 4.0254009256269505,
-      "grad_norm": 0.6620802283287048,
-      "learning_rate": 0.0003589397694213985,
-      "loss": 3.3948,
+      "epoch": 4.032345013477089,
+      "grad_norm": 0.596740186214447,
+      "learning_rate": 0.0003585364274150027,
+      "loss": 3.3562,
       "step": 37400
     },
     {
-      "epoch": 4.030782477666559,
-      "grad_norm": 0.674140214920044,
-      "learning_rate": 0.0003586165283913371,
-      "loss": 3.374,
+      "epoch": 4.037735849056604,
+      "grad_norm": 0.6327202916145325,
+      "learning_rate": 0.0003582126281705342,
+      "loss": 3.3695,
       "step": 37450
     },
     {
-      "epoch": 4.036164029706168,
-      "grad_norm": 0.6449016332626343,
-      "learning_rate": 0.00035829328736127565,
-      "loss": 3.3668,
+      "epoch": 4.0431266846361185,
+      "grad_norm": 0.6567413210868835,
+      "learning_rate": 0.0003578888289260658,
+      "loss": 3.3703,
       "step": 37500
     },
     {
-      "epoch": 4.041545581745775,
-      "grad_norm": 0.6092488169670105,
-      "learning_rate": 0.00035797004633121425,
-      "loss": 3.3746,
+      "epoch": 4.048517520215634,
+      "grad_norm": 0.6062192916870117,
+      "learning_rate": 0.00035756502968159736,
+      "loss": 3.368,
       "step": 37550
     },
     {
-      "epoch": 4.046927133785384,
-      "grad_norm": 0.655305027961731,
-      "learning_rate": 0.0003576468053011529,
-      "loss": 3.352,
+      "epoch": 4.053908355795148,
+      "grad_norm": 0.668158769607544,
+      "learning_rate": 0.00035724123043712896,
+      "loss": 3.356,
       "step": 37600
     },
     {
-      "epoch": 4.0523086858249915,
-      "grad_norm": 0.650629460811615,
-      "learning_rate": 0.00035732356427109143,
-      "loss": 3.3727,
+      "epoch": 4.059299191374663,
+      "grad_norm": 0.5983052849769592,
+      "learning_rate": 0.0003569174311926605,
+      "loss": 3.3772,
       "step": 37650
     },
     {
-      "epoch": 4.0576902378646,
-      "grad_norm": 0.6936136484146118,
-      "learning_rate": 0.00035700032324103003,
-      "loss": 3.3571,
+      "epoch": 4.064690026954178,
+      "grad_norm": 0.5878796577453613,
+      "learning_rate": 0.00035659363194819206,
+      "loss": 3.3821,
       "step": 37700
     },
     {
-      "epoch": 4.063071789904209,
-      "grad_norm": 0.6292705535888672,
-      "learning_rate": 0.00035668354703156984,
-      "loss": 3.3797,
+      "epoch": 4.070080862533692,
+      "grad_norm": 0.6337990760803223,
+      "learning_rate": 0.00035626983270372367,
+      "loss": 3.3793,
       "step": 37750
     },
     {
-      "epoch": 4.068453341943816,
-      "grad_norm": 0.6342557072639465,
-      "learning_rate": 0.00035636030600150843,
-      "loss": 3.3791,
+      "epoch": 4.0754716981132075,
+      "grad_norm": 0.594143271446228,
+      "learning_rate": 0.0003559460334592552,
+      "loss": 3.3521,
       "step": 37800
     },
     {
-      "epoch": 4.073834893983425,
-      "grad_norm": 0.6861465573310852,
-      "learning_rate": 0.00035603706497144697,
-      "loss": 3.3849,
+      "epoch": 4.080862533692723,
+      "grad_norm": 0.6610286831855774,
+      "learning_rate": 0.0003556222342147868,
+      "loss": 3.3667,
       "step": 37850
     },
     {
-      "epoch": 4.079216446023033,
-      "grad_norm": 0.6176989674568176,
-      "learning_rate": 0.00035571382394138557,
-      "loss": 3.3603,
+      "epoch": 4.086253369272237,
+      "grad_norm": 0.6618052124977112,
+      "learning_rate": 0.0003552984349703184,
+      "loss": 3.3683,
       "step": 37900
     },
     {
-      "epoch": 4.084597998062641,
-      "grad_norm": 0.6110954880714417,
-      "learning_rate": 0.0003553905829113242,
-      "loss": 3.3471,
+      "epoch": 4.091644204851752,
+      "grad_norm": 0.6083658933639526,
+      "learning_rate": 0.00035497463572585,
+      "loss": 3.3693,
       "step": 37950
     },
     {
-      "epoch": 4.08997955010225,
-      "grad_norm": 0.6518476009368896,
-      "learning_rate": 0.00035506734188126275,
-      "loss": 3.3962,
+      "epoch": 4.097035040431267,
+      "grad_norm": 0.6676791906356812,
+      "learning_rate": 0.00035465083648138153,
+      "loss": 3.3826,
       "step": 38000
     },
     {
-      "epoch": 4.08997955010225,
-      "eval_accuracy": 0.37652365302743346,
-      "eval_loss": 3.4501006603240967,
-      "eval_runtime": 184.0449,
-      "eval_samples_per_second": 97.862,
-      "eval_steps_per_second": 6.118,
+      "epoch": 4.097035040431267,
+      "eval_accuracy": 0.3765557056293387,
+      "eval_loss": 3.4486758708953857,
+      "eval_runtime": 183.4657,
+      "eval_samples_per_second": 98.171,
+      "eval_steps_per_second": 6.137,
       "step": 38000
     },
     {
-      "epoch": 4.095361102141858,
-      "grad_norm": 0.7071831822395325,
-      "learning_rate": 0.00035474410085120135,
-      "loss": 3.3712,
+      "epoch": 4.1024258760107815,
+      "grad_norm": 0.6280378103256226,
+      "learning_rate": 0.00035432703723691314,
+      "loss": 3.3745,
       "step": 38050
     },
     {
-      "epoch": 4.100742654181466,
-      "grad_norm": 0.6040788292884827,
-      "learning_rate": 0.00035442085982113994,
-      "loss": 3.3925,
+      "epoch": 4.107816711590297,
+      "grad_norm": 0.639232873916626,
+      "learning_rate": 0.00035400323799244463,
+      "loss": 3.383,
       "step": 38100
     },
     {
-      "epoch": 4.106124206221074,
-      "grad_norm": 0.6902562975883484,
-      "learning_rate": 0.0003540976187910785,
-      "loss": 3.3507,
+      "epoch": 4.113207547169812,
+      "grad_norm": 0.6617724299430847,
+      "learning_rate": 0.0003536794387479762,
+      "loss": 3.3792,
       "step": 38150
     },
     {
-      "epoch": 4.111505758260682,
-      "grad_norm": 0.6432631015777588,
-      "learning_rate": 0.0003537743777610171,
-      "loss": 3.3778,
+      "epoch": 4.118598382749326,
+      "grad_norm": 0.5961510539054871,
+      "learning_rate": 0.0003533556395035078,
+      "loss": 3.3645,
       "step": 38200
     },
     {
-      "epoch": 4.1168873103002905,
-      "grad_norm": 0.6424291729927063,
-      "learning_rate": 0.00035345113673095573,
-      "loss": 3.3656,
+      "epoch": 4.123989218328841,
+      "grad_norm": 0.6423681378364563,
+      "learning_rate": 0.00035303184025903934,
+      "loss": 3.3716,
       "step": 38250
     },
     {
-      "epoch": 4.122268862339899,
-      "grad_norm": 0.6857510209083557,
-      "learning_rate": 0.00035312789570089427,
-      "loss": 3.3442,
+      "epoch": 4.129380053908355,
+      "grad_norm": 0.5851473808288574,
+      "learning_rate": 0.00035270804101457094,
+      "loss": 3.3768,
       "step": 38300
     },
     {
-      "epoch": 4.127650414379507,
-      "grad_norm": 0.6548146605491638,
-      "learning_rate": 0.00035280465467083286,
-      "loss": 3.3866,
+      "epoch": 4.1347708894878705,
+      "grad_norm": 0.6345853805541992,
+      "learning_rate": 0.0003523842417701025,
+      "loss": 3.3602,
       "step": 38350
     },
     {
-      "epoch": 4.133031966419115,
-      "grad_norm": 0.6027295589447021,
-      "learning_rate": 0.0003524814136407714,
-      "loss": 3.3592,
+      "epoch": 4.140161725067386,
+      "grad_norm": 0.6322952508926392,
+      "learning_rate": 0.0003520604425256341,
+      "loss": 3.3742,
       "step": 38400
     },
     {
-      "epoch": 4.138413518458724,
-      "grad_norm": 0.6444876194000244,
-      "learning_rate": 0.00035215817261071,
-      "loss": 3.3592,
+      "epoch": 4.1455525606469,
+      "grad_norm": 0.6358514428138733,
+      "learning_rate": 0.00035173664328116565,
+      "loss": 3.3609,
       "step": 38450
     },
     {
-      "epoch": 4.1437950704983315,
-      "grad_norm": 0.6224494576454163,
-      "learning_rate": 0.00035183493158064865,
-      "loss": 3.3752,
+      "epoch": 4.150943396226415,
+      "grad_norm": 0.5968437194824219,
+      "learning_rate": 0.00035141284403669725,
+      "loss": 3.3902,
       "step": 38500
     },
     {
-      "epoch": 4.14917662253794,
-      "grad_norm": 0.5903575420379639,
-      "learning_rate": 0.0003515116905505872,
-      "loss": 3.3655,
+      "epoch": 4.15633423180593,
+      "grad_norm": 0.6028518676757812,
+      "learning_rate": 0.0003510890447922288,
+      "loss": 3.3678,
       "step": 38550
     },
     {
-      "epoch": 4.154558174577549,
-      "grad_norm": 0.7473770976066589,
-      "learning_rate": 0.0003511884495205258,
-      "loss": 3.3896,
+      "epoch": 4.1617250673854445,
+      "grad_norm": 0.6527324914932251,
+      "learning_rate": 0.00035076524554776035,
+      "loss": 3.3751,
       "step": 38600
     },
     {
-      "epoch": 4.159939726617156,
-      "grad_norm": 0.6288912296295166,
-      "learning_rate": 0.0003508652084904644,
-      "loss": 3.3903,
+      "epoch": 4.16711590296496,
+      "grad_norm": 0.6300586462020874,
+      "learning_rate": 0.00035044144630329196,
+      "loss": 3.3759,
       "step": 38650
     },
     {
-      "epoch": 4.165321278656765,
-      "grad_norm": 0.6335214376449585,
-      "learning_rate": 0.0003505419674604029,
-      "loss": 3.3766,
+      "epoch": 4.172506738544475,
+      "grad_norm": 0.6409329771995544,
+      "learning_rate": 0.00035011764705882346,
+      "loss": 3.397,
       "step": 38700
     },
     {
-      "epoch": 4.1707028306963725,
-      "grad_norm": 0.6742566823959351,
-      "learning_rate": 0.0003502187264303415,
-      "loss": 3.3673,
+      "epoch": 4.177897574123989,
+      "grad_norm": 0.6493109464645386,
+      "learning_rate": 0.0003497938478143551,
+      "loss": 3.3677,
       "step": 38750
     },
     {
-      "epoch": 4.176084382735981,
-      "grad_norm": 0.6295595169067383,
-      "learning_rate": 0.00034989548540028016,
-      "loss": 3.384,
+      "epoch": 4.183288409703504,
+      "grad_norm": 0.6340633630752563,
+      "learning_rate": 0.0003494700485698866,
+      "loss": 3.3919,
       "step": 38800
     },
     {
-      "epoch": 4.18146593477559,
-      "grad_norm": 0.6516152024269104,
-      "learning_rate": 0.0003495787091908199,
-      "loss": 3.3958,
+      "epoch": 4.188679245283019,
+      "grad_norm": 0.6204349994659424,
+      "learning_rate": 0.0003491462493254182,
+      "loss": 3.3822,
       "step": 38850
     },
     {
-      "epoch": 4.186847486815197,
-      "grad_norm": 0.6820975542068481,
-      "learning_rate": 0.0003492554681607585,
-      "loss": 3.3867,
+      "epoch": 4.1940700808625335,
+      "grad_norm": 0.6226798892021179,
+      "learning_rate": 0.00034882245008094977,
+      "loss": 3.3894,
       "step": 38900
     },
     {
-      "epoch": 4.192229038854806,
-      "grad_norm": 0.650514543056488,
-      "learning_rate": 0.0003489322271306971,
-      "loss": 3.3893,
+      "epoch": 4.199460916442049,
+      "grad_norm": 0.6485829949378967,
+      "learning_rate": 0.0003484986508364813,
+      "loss": 3.385,
       "step": 38950
     },
     {
-      "epoch": 4.197610590894414,
-      "grad_norm": 0.6839084625244141,
-      "learning_rate": 0.0003486089861006357,
-      "loss": 3.3787,
+      "epoch": 4.204851752021563,
+      "grad_norm": 0.710810661315918,
+      "learning_rate": 0.0003481748515920129,
+      "loss": 3.3797,
       "step": 39000
     },
     {
-      "epoch": 4.197610590894414,
-      "eval_accuracy": 0.37696271934709175,
-      "eval_loss": 3.4459009170532227,
-      "eval_runtime": 184.0684,
-      "eval_samples_per_second": 97.849,
-      "eval_steps_per_second": 6.117,
+      "epoch": 4.204851752021563,
+      "eval_accuracy": 0.3766247002131009,
+      "eval_loss": 3.4453039169311523,
+      "eval_runtime": 183.5571,
+      "eval_samples_per_second": 98.122,
+      "eval_steps_per_second": 6.134,
       "step": 39000
     },
     {
-      "epoch": 4.202992142934022,
-      "grad_norm": 0.6607173085212708,
-      "learning_rate": 0.00034828574507057424,
-      "loss": 3.3867,
+      "epoch": 4.210242587601078,
+      "grad_norm": 0.6208673715591431,
+      "learning_rate": 0.00034785105234754447,
+      "loss": 3.3822,
       "step": 39050
     },
     {
-      "epoch": 4.208373694973631,
-      "grad_norm": 0.6593530774116516,
-      "learning_rate": 0.00034796250404051283,
-      "loss": 3.3728,
+      "epoch": 4.215633423180593,
+      "grad_norm": 0.6665465235710144,
+      "learning_rate": 0.0003475272531030761,
+      "loss": 3.385,
       "step": 39100
     },
     {
-      "epoch": 4.213755247013238,
-      "grad_norm": 0.6216228008270264,
-      "learning_rate": 0.00034763926301045137,
-      "loss": 3.3738,
+      "epoch": 4.2210242587601075,
+      "grad_norm": 0.6304270029067993,
+      "learning_rate": 0.00034720345385860763,
+      "loss": 3.3941,
       "step": 39150
     },
     {
-      "epoch": 4.219136799052847,
-      "grad_norm": 0.6555355191230774,
-      "learning_rate": 0.00034731602198039,
-      "loss": 3.3772,
+      "epoch": 4.226415094339623,
+      "grad_norm": 0.6752989888191223,
+      "learning_rate": 0.00034687965461413923,
+      "loss": 3.3906,
       "step": 39200
     },
     {
-      "epoch": 4.224518351092455,
-      "grad_norm": 0.6763815879821777,
-      "learning_rate": 0.0003469927809503286,
-      "loss": 3.3861,
+      "epoch": 4.231805929919138,
+      "grad_norm": 0.5948718786239624,
+      "learning_rate": 0.0003465558553696708,
+      "loss": 3.387,
       "step": 39250
     },
     {
-      "epoch": 4.229899903132063,
-      "grad_norm": 0.6993557810783386,
-      "learning_rate": 0.00034666953992026716,
-      "loss": 3.3736,
+      "epoch": 4.237196765498652,
+      "grad_norm": 0.6458513736724854,
+      "learning_rate": 0.0003462320561252024,
+      "loss": 3.3823,
       "step": 39300
     },
     {
-      "epoch": 4.2352814551716715,
-      "grad_norm": 0.6681185960769653,
-      "learning_rate": 0.00034634629889020575,
-      "loss": 3.3867,
+      "epoch": 4.242587601078167,
+      "grad_norm": 0.6506422758102417,
+      "learning_rate": 0.00034590825688073394,
+      "loss": 3.3977,
       "step": 39350
     },
     {
-      "epoch": 4.24066300721128,
-      "grad_norm": 0.673267126083374,
-      "learning_rate": 0.00034602305786014435,
-      "loss": 3.3843,
+      "epoch": 4.247978436657682,
+      "grad_norm": 0.6215380430221558,
+      "learning_rate": 0.00034558445763626543,
+      "loss": 3.3828,
       "step": 39400
     },
     {
-      "epoch": 4.246044559250888,
-      "grad_norm": 0.6191767454147339,
-      "learning_rate": 0.00034569981683008294,
-      "loss": 3.3884,
+      "epoch": 4.2533692722371965,
+      "grad_norm": 0.6403311491012573,
+      "learning_rate": 0.00034526065839179704,
+      "loss": 3.3864,
       "step": 39450
     },
     {
-      "epoch": 4.251426111290496,
-      "grad_norm": 0.6691135764122009,
-      "learning_rate": 0.00034537657580002154,
-      "loss": 3.3979,
+      "epoch": 4.258760107816712,
+      "grad_norm": 0.6085329651832581,
+      "learning_rate": 0.0003449368591473286,
+      "loss": 3.3863,
       "step": 39500
     },
     {
-      "epoch": 4.256807663330104,
-      "grad_norm": 0.6780727505683899,
-      "learning_rate": 0.00034505333476996013,
-      "loss": 3.3776,
+      "epoch": 4.264150943396227,
+      "grad_norm": 0.6336445212364197,
+      "learning_rate": 0.00034461953588774954,
+      "loss": 3.3869,
       "step": 39550
     },
     {
-      "epoch": 4.2621892153697125,
-      "grad_norm": 0.6386967897415161,
-      "learning_rate": 0.00034473009373989867,
-      "loss": 3.3767,
+      "epoch": 4.269541778975741,
+      "grad_norm": 0.6452109217643738,
+      "learning_rate": 0.00034429573664328115,
+      "loss": 3.3841,
       "step": 39600
     },
     {
-      "epoch": 4.267570767409321,
-      "grad_norm": 0.6862613558769226,
-      "learning_rate": 0.00034440685270983727,
-      "loss": 3.3865,
+      "epoch": 4.274932614555256,
+      "grad_norm": 0.6085174083709717,
+      "learning_rate": 0.0003439719373988127,
+      "loss": 3.3809,
       "step": 39650
     },
     {
-      "epoch": 4.272952319448929,
-      "grad_norm": 0.6203247904777527,
-      "learning_rate": 0.0003440836116797758,
-      "loss": 3.3937,
+      "epoch": 4.280323450134771,
+      "grad_norm": 0.6353808641433716,
+      "learning_rate": 0.00034364813815434425,
+      "loss": 3.3975,
       "step": 39700
     },
     {
-      "epoch": 4.278333871488537,
-      "grad_norm": 0.6543366312980652,
-      "learning_rate": 0.00034376037064971445,
-      "loss": 3.3948,
+      "epoch": 4.285714285714286,
+      "grad_norm": 0.6164100170135498,
+      "learning_rate": 0.00034332433890987585,
+      "loss": 3.3868,
       "step": 39750
     },
     {
-      "epoch": 4.283715423528146,
-      "grad_norm": 0.6546263098716736,
-      "learning_rate": 0.00034343712961965305,
-      "loss": 3.375,
+      "epoch": 4.291105121293801,
+      "grad_norm": 0.6289504170417786,
+      "learning_rate": 0.0003430005396654074,
+      "loss": 3.3913,
       "step": 39800
     },
     {
-      "epoch": 4.2890969755677535,
-      "grad_norm": 0.6535509824752808,
-      "learning_rate": 0.0003431138885895916,
-      "loss": 3.3772,
+      "epoch": 4.296495956873315,
+      "grad_norm": 0.6531320810317993,
+      "learning_rate": 0.000342676740420939,
+      "loss": 3.3607,
       "step": 39850
     },
     {
-      "epoch": 4.294478527607362,
-      "grad_norm": 0.6452800035476685,
-      "learning_rate": 0.0003427906475595302,
-      "loss": 3.3722,
+      "epoch": 4.30188679245283,
+      "grad_norm": 0.598456621170044,
+      "learning_rate": 0.00034235294117647056,
+      "loss": 3.4004,
       "step": 39900
     },
     {
-      "epoch": 4.299860079646971,
-      "grad_norm": 0.6130262613296509,
-      "learning_rate": 0.0003424674065294688,
-      "loss": 3.3759,
+      "epoch": 4.307277628032345,
+      "grad_norm": 0.6940332651138306,
+      "learning_rate": 0.00034202914193200216,
+      "loss": 3.3772,
       "step": 39950
     },
     {
-      "epoch": 4.305241631686578,
-      "grad_norm": 0.6329742670059204,
-      "learning_rate": 0.0003421441654994073,
-      "loss": 3.3976,
+      "epoch": 4.3126684636118595,
+      "grad_norm": 0.6354912519454956,
+      "learning_rate": 0.0003417053426875337,
+      "loss": 3.397,
       "step": 40000
     },
     {
-      "epoch": 4.305241631686578,
-      "eval_accuracy": 0.37801404468958466,
-      "eval_loss": 3.4401395320892334,
-      "eval_runtime": 184.1282,
-      "eval_samples_per_second": 97.818,
-      "eval_steps_per_second": 6.115,
+      "epoch": 4.3126684636118595,
+      "eval_accuracy": 0.3773185575546842,
+      "eval_loss": 3.440795660018921,
+      "eval_runtime": 183.676,
+      "eval_samples_per_second": 98.059,
+      "eval_steps_per_second": 6.13,
       "step": 40000
     },
     {
-      "epoch": 4.310623183726187,
-      "grad_norm": 0.6644855737686157,
-      "learning_rate": 0.00034182092446934597,
-      "loss": 3.3697,
+      "epoch": 4.318059299191375,
+      "grad_norm": 0.6281748414039612,
+      "learning_rate": 0.0003413815434430653,
+      "loss": 3.4086,
       "step": 40050
     },
     {
-      "epoch": 4.3160047357657945,
-      "grad_norm": 0.6772639751434326,
-      "learning_rate": 0.00034149768343928456,
-      "loss": 3.389,
+      "epoch": 4.32345013477089,
+      "grad_norm": 0.6316752433776855,
+      "learning_rate": 0.0003410577441985968,
+      "loss": 3.3781,
       "step": 40100
     },
     {
-      "epoch": 4.321386287805403,
-      "grad_norm": 0.6445088386535645,
-      "learning_rate": 0.0003411744424092231,
-      "loss": 3.39,
+      "epoch": 4.328840970350404,
+      "grad_norm": 0.6012856364250183,
+      "learning_rate": 0.00034073394495412837,
+      "loss": 3.3941,
       "step": 40150
     },
     {
-      "epoch": 4.326767839845012,
-      "grad_norm": 0.6840226054191589,
-      "learning_rate": 0.0003408512013791617,
-      "loss": 3.3781,
+      "epoch": 4.334231805929919,
+      "grad_norm": 0.6168755888938904,
+      "learning_rate": 0.00034041014570965997,
+      "loss": 3.4104,
       "step": 40200
     },
     {
-      "epoch": 4.332149391884619,
-      "grad_norm": 0.7001984715461731,
-      "learning_rate": 0.00034052796034910024,
-      "loss": 3.4049,
+      "epoch": 4.339622641509434,
+      "grad_norm": 0.6107414960861206,
+      "learning_rate": 0.0003400863464651915,
+      "loss": 3.3907,
       "step": 40250
     },
     {
-      "epoch": 4.337530943924228,
-      "grad_norm": 0.6908742189407349,
-      "learning_rate": 0.0003402047193190389,
-      "loss": 3.3838,
+      "epoch": 4.345013477088949,
+      "grad_norm": 0.6335568428039551,
+      "learning_rate": 0.0003397625472207231,
+      "loss": 3.4018,
       "step": 40300
     },
     {
-      "epoch": 4.342912495963836,
-      "grad_norm": 0.6147873401641846,
-      "learning_rate": 0.0003398814782889775,
-      "loss": 3.3824,
+      "epoch": 4.350404312668464,
+      "grad_norm": 0.6497905254364014,
+      "learning_rate": 0.0003394387479762547,
+      "loss": 3.4083,
       "step": 40350
     },
     {
-      "epoch": 4.348294048003444,
-      "grad_norm": 0.6918706893920898,
-      "learning_rate": 0.000339558237258916,
-      "loss": 3.4047,
+      "epoch": 4.355795148247978,
+      "grad_norm": 0.6541007161140442,
+      "learning_rate": 0.0003391149487317863,
+      "loss": 3.397,
       "step": 40400
     },
     {
-      "epoch": 4.3536756000430525,
-      "grad_norm": 0.6723856925964355,
-      "learning_rate": 0.0003392349962288546,
-      "loss": 3.4004,
+      "epoch": 4.361185983827493,
+      "grad_norm": 0.6449550986289978,
+      "learning_rate": 0.00033879114948731783,
+      "loss": 3.4032,
       "step": 40450
     },
     {
-      "epoch": 4.359057152082661,
-      "grad_norm": 0.6632176637649536,
-      "learning_rate": 0.0003389117551987932,
-      "loss": 3.3876,
+      "epoch": 4.366576819407008,
+      "grad_norm": 0.6576782464981079,
+      "learning_rate": 0.00033846735024284944,
+      "loss": 3.3723,
       "step": 40500
     },
     {
-      "epoch": 4.364438704122269,
-      "grad_norm": 0.6662275791168213,
-      "learning_rate": 0.00033858851416873175,
-      "loss": 3.396,
+      "epoch": 4.3719676549865225,
+      "grad_norm": 0.6394531726837158,
+      "learning_rate": 0.000338143550998381,
+      "loss": 3.3806,
       "step": 40550
     },
     {
-      "epoch": 4.369820256161877,
-      "grad_norm": 0.6214112043380737,
-      "learning_rate": 0.0003382652731386704,
-      "loss": 3.3953,
+      "epoch": 4.377358490566038,
+      "grad_norm": 0.6244447827339172,
+      "learning_rate": 0.00033781975175391254,
+      "loss": 3.3881,
       "step": 40600
     },
     {
-      "epoch": 4.375201808201485,
-      "grad_norm": 0.6457570195198059,
-      "learning_rate": 0.000337942032108609,
-      "loss": 3.3698,
+      "epoch": 4.382749326145553,
+      "grad_norm": 0.6317362189292908,
+      "learning_rate": 0.00033749595250944414,
+      "loss": 3.3891,
       "step": 40650
     },
     {
-      "epoch": 4.3805833602410935,
-      "grad_norm": 0.7015320062637329,
-      "learning_rate": 0.00033761879107854754,
-      "loss": 3.4086,
+      "epoch": 4.388140161725067,
+      "grad_norm": 0.6562482118606567,
+      "learning_rate": 0.00033717215326497564,
+      "loss": 3.3921,
       "step": 40700
     },
     {
-      "epoch": 4.385964912280702,
-      "grad_norm": 0.6032451391220093,
-      "learning_rate": 0.00033729555004848613,
-      "loss": 3.3934,
+      "epoch": 4.393530997304582,
+      "grad_norm": 0.6376356482505798,
+      "learning_rate": 0.0003368483540205073,
+      "loss": 3.3925,
       "step": 40750
     },
     {
-      "epoch": 4.39134646432031,
-      "grad_norm": 0.7044788002967834,
-      "learning_rate": 0.00033697230901842467,
-      "loss": 3.387,
+      "epoch": 4.398921832884097,
+      "grad_norm": 0.5876014232635498,
+      "learning_rate": 0.0003365245547760388,
+      "loss": 3.3929,
       "step": 40800
     },
     {
-      "epoch": 4.396728016359918,
-      "grad_norm": 0.6239883899688721,
-      "learning_rate": 0.00033664906798836327,
-      "loss": 3.4018,
+      "epoch": 4.404312668463612,
+      "grad_norm": 0.6881380081176758,
+      "learning_rate": 0.0003362007555315704,
+      "loss": 3.3851,
       "step": 40850
     },
     {
-      "epoch": 4.402109568399527,
-      "grad_norm": 0.6300894021987915,
-      "learning_rate": 0.0003363258269583019,
-      "loss": 3.3962,
+      "epoch": 4.409703504043127,
+      "grad_norm": 0.6236041188240051,
+      "learning_rate": 0.00033587695628710195,
+      "loss": 3.39,
       "step": 40900
     },
     {
-      "epoch": 4.4074911204391345,
-      "grad_norm": 0.6542284488677979,
-      "learning_rate": 0.00033600258592824046,
-      "loss": 3.3824,
+      "epoch": 4.415094339622642,
+      "grad_norm": 0.6073125600814819,
+      "learning_rate": 0.00033555315704263355,
+      "loss": 3.3982,
       "step": 40950
     },
     {
-      "epoch": 4.412872672478743,
-      "grad_norm": 0.6408656239509583,
-      "learning_rate": 0.00033567934489817905,
-      "loss": 3.3844,
+      "epoch": 4.420485175202156,
+      "grad_norm": 0.6230391263961792,
+      "learning_rate": 0.0003352293577981651,
+      "loss": 3.3907,
       "step": 41000
     },
     {
-      "epoch": 4.412872672478743,
-      "eval_accuracy": 0.37776924973333864,
-      "eval_loss": 3.43493390083313,
-      "eval_runtime": 184.3217,
-      "eval_samples_per_second": 97.715,
-      "eval_steps_per_second": 6.109,
+      "epoch": 4.420485175202156,
+      "eval_accuracy": 0.37776414304761136,
+      "eval_loss": 3.4336507320404053,
+      "eval_runtime": 183.8326,
+      "eval_samples_per_second": 97.975,
+      "eval_steps_per_second": 6.125,
       "step": 41000
     },
     {
-      "epoch": 4.418254224518351,
-      "grad_norm": 0.6970384120941162,
-      "learning_rate": 0.00033535610386811764,
-      "loss": 3.3984,
+      "epoch": 4.425876010781671,
+      "grad_norm": 0.6647446155548096,
+      "learning_rate": 0.00033490555855369665,
+      "loss": 3.3783,
       "step": 41050
     },
     {
-      "epoch": 4.423635776557959,
-      "grad_norm": 0.719208300113678,
-      "learning_rate": 0.0003350328628380562,
-      "loss": 3.4049,
+      "epoch": 4.431266846361186,
+      "grad_norm": 0.7069098353385925,
+      "learning_rate": 0.00033458175930922826,
+      "loss": 3.3957,
       "step": 41100
     },
     {
-      "epoch": 4.429017328597568,
-      "grad_norm": 0.6588863134384155,
-      "learning_rate": 0.0003347096218079948,
-      "loss": 3.3783,
+      "epoch": 4.436657681940701,
+      "grad_norm": 0.6159655451774597,
+      "learning_rate": 0.0003342579600647598,
+      "loss": 3.3931,
       "step": 41150
     },
     {
-      "epoch": 4.4343988806371755,
-      "grad_norm": 0.6663377285003662,
-      "learning_rate": 0.00033438638077793343,
-      "loss": 3.3826,
+      "epoch": 4.442048517520216,
+      "grad_norm": 0.6176379323005676,
+      "learning_rate": 0.0003339341608202914,
+      "loss": 3.376,
       "step": 41200
     },
     {
-      "epoch": 4.439780432676784,
-      "grad_norm": 0.649978518486023,
-      "learning_rate": 0.00033406313974787197,
-      "loss": 3.3947,
+      "epoch": 4.44743935309973,
+      "grad_norm": 0.6525024771690369,
+      "learning_rate": 0.00033361036157582297,
+      "loss": 3.3894,
       "step": 41250
     },
     {
-      "epoch": 4.445161984716393,
-      "grad_norm": 0.6662973761558533,
-      "learning_rate": 0.00033373989871781056,
-      "loss": 3.3922,
+      "epoch": 4.452830188679245,
+      "grad_norm": 0.6509463787078857,
+      "learning_rate": 0.00033328656233135457,
+      "loss": 3.3807,
       "step": 41300
     },
     {
-      "epoch": 4.450543536756,
-      "grad_norm": 0.6409628391265869,
-      "learning_rate": 0.0003334166576877491,
-      "loss": 3.3832,
+      "epoch": 4.45822102425876,
+      "grad_norm": 0.6493340134620667,
+      "learning_rate": 0.0003329627630868861,
+      "loss": 3.4081,
       "step": 41350
     },
     {
-      "epoch": 4.455925088795609,
-      "grad_norm": 0.6468245387077332,
-      "learning_rate": 0.0003330934166576877,
-      "loss": 3.3892,
+      "epoch": 4.463611859838275,
+      "grad_norm": 0.623526930809021,
+      "learning_rate": 0.0003326389638424177,
+      "loss": 3.4001,
       "step": 41400
     },
     {
-      "epoch": 4.461306640835216,
-      "grad_norm": 0.6183198690414429,
-      "learning_rate": 0.00033277017562762635,
-      "loss": 3.4058,
+      "epoch": 4.46900269541779,
+      "grad_norm": 0.6079691052436829,
+      "learning_rate": 0.0003323151645979492,
+      "loss": 3.3872,
       "step": 41450
     },
     {
-      "epoch": 4.466688192874825,
-      "grad_norm": 0.6112544536590576,
-      "learning_rate": 0.0003324469345975649,
-      "loss": 3.4,
+      "epoch": 4.474393530997305,
+      "grad_norm": 0.6355500817298889,
+      "learning_rate": 0.00033199136535348077,
+      "loss": 3.3767,
       "step": 41500
     },
     {
-      "epoch": 4.4720697449144335,
-      "grad_norm": 0.6665729880332947,
-      "learning_rate": 0.0003321236935675035,
-      "loss": 3.4103,
+      "epoch": 4.479784366576819,
+      "grad_norm": 0.6550726294517517,
+      "learning_rate": 0.0003316675661090124,
+      "loss": 3.3942,
       "step": 41550
     },
     {
-      "epoch": 4.477451296954041,
-      "grad_norm": 0.6289717555046082,
-      "learning_rate": 0.0003318004525374421,
-      "loss": 3.3997,
+      "epoch": 4.485175202156334,
+      "grad_norm": 0.6663205027580261,
+      "learning_rate": 0.00033134376686454393,
+      "loss": 3.3968,
       "step": 41600
     },
     {
-      "epoch": 4.48283284899365,
-      "grad_norm": 0.6453819274902344,
-      "learning_rate": 0.0003314772115073806,
-      "loss": 3.3909,
+      "epoch": 4.490566037735849,
+      "grad_norm": 0.6686147451400757,
+      "learning_rate": 0.00033101996762007553,
+      "loss": 3.3851,
       "step": 41650
     },
     {
-      "epoch": 4.488214401033258,
-      "grad_norm": 0.7051796913146973,
-      "learning_rate": 0.0003311539704773192,
-      "loss": 3.4105,
+      "epoch": 4.495956873315364,
+      "grad_norm": 0.6377660632133484,
+      "learning_rate": 0.0003306961683756071,
+      "loss": 3.4061,
       "step": 41700
     },
     {
-      "epoch": 4.493595953072866,
-      "grad_norm": 0.7093035578727722,
-      "learning_rate": 0.00033083072944725786,
-      "loss": 3.3901,
+      "epoch": 4.501347708894879,
+      "grad_norm": 0.6469005942344666,
+      "learning_rate": 0.0003303723691311387,
+      "loss": 3.3871,
       "step": 41750
     },
     {
-      "epoch": 4.4989775051124745,
-      "grad_norm": 0.6689607501029968,
-      "learning_rate": 0.0003305074884171964,
-      "loss": 3.3928,
+      "epoch": 4.506738544474393,
+      "grad_norm": 0.6090621948242188,
+      "learning_rate": 0.00033004856988667024,
+      "loss": 3.3872,
       "step": 41800
     },
     {
-      "epoch": 4.504359057152083,
-      "grad_norm": 0.6262616515159607,
-      "learning_rate": 0.000330184247387135,
-      "loss": 3.3853,
+      "epoch": 4.512129380053908,
+      "grad_norm": 0.6723948121070862,
+      "learning_rate": 0.0003297247706422018,
+      "loss": 3.3949,
       "step": 41850
     },
     {
-      "epoch": 4.509740609191691,
-      "grad_norm": 0.663016676902771,
-      "learning_rate": 0.00032986100635707354,
-      "loss": 3.3819,
+      "epoch": 4.517520215633423,
+      "grad_norm": 0.6774519085884094,
+      "learning_rate": 0.0003294009713977334,
+      "loss": 3.3961,
       "step": 41900
     },
     {
-      "epoch": 4.515122161231299,
-      "grad_norm": 0.6628140807151794,
-      "learning_rate": 0.00032953776532701213,
-      "loss": 3.3863,
+      "epoch": 4.5229110512129385,
+      "grad_norm": 0.61894690990448,
+      "learning_rate": 0.00032907717215326494,
+      "loss": 3.3959,
       "step": 41950
     },
     {
-      "epoch": 4.520503713270907,
-      "grad_norm": 0.6778246760368347,
-      "learning_rate": 0.00032921452429695067,
-      "loss": 3.3913,
+      "epoch": 4.528301886792453,
+      "grad_norm": 0.6208569407463074,
+      "learning_rate": 0.00032875337290879655,
+      "loss": 3.3865,
       "step": 42000
     },
     {
-      "epoch": 4.520503713270907,
-      "eval_accuracy": 0.37859772800292407,
-      "eval_loss": 3.43190598487854,
-      "eval_runtime": 183.9779,
-      "eval_samples_per_second": 97.898,
-      "eval_steps_per_second": 6.12,
+      "epoch": 4.528301886792453,
+      "eval_accuracy": 0.37887283711487035,
+      "eval_loss": 3.42965030670166,
+      "eval_runtime": 184.057,
+      "eval_samples_per_second": 97.856,
+      "eval_steps_per_second": 6.118,
       "step": 42000
     },
     {
-      "epoch": 4.5258852653105155,
-      "grad_norm": 0.6411513686180115,
-      "learning_rate": 0.0003288912832668893,
-      "loss": 3.3865,
+      "epoch": 4.533692722371968,
+      "grad_norm": 0.6180669069290161,
+      "learning_rate": 0.00032842957366432805,
+      "loss": 3.3838,
       "step": 42050
     },
     {
-      "epoch": 4.531266817350124,
-      "grad_norm": 0.6528933644294739,
-      "learning_rate": 0.0003285680422368279,
-      "loss": 3.4013,
+      "epoch": 4.539083557951482,
+      "grad_norm": 0.6204472184181213,
+      "learning_rate": 0.0003281057744198597,
+      "loss": 3.4125,
       "step": 42100
     },
     {
-      "epoch": 4.536648369389732,
-      "grad_norm": 0.6158764362335205,
-      "learning_rate": 0.0003282448012067665,
-      "loss": 3.3782,
+      "epoch": 4.544474393530997,
+      "grad_norm": 0.6484192609786987,
+      "learning_rate": 0.0003277819751753912,
+      "loss": 3.3988,
       "step": 42150
     },
     {
-      "epoch": 4.54202992142934,
-      "grad_norm": 0.7035838961601257,
-      "learning_rate": 0.00032792156017670505,
-      "loss": 3.4084,
+      "epoch": 4.549865229110512,
+      "grad_norm": 0.5903990864753723,
+      "learning_rate": 0.0003274581759309228,
+      "loss": 3.3797,
       "step": 42200
     },
     {
-      "epoch": 4.547411473468949,
-      "grad_norm": 0.6502971053123474,
-      "learning_rate": 0.00032759831914664365,
-      "loss": 3.3929,
+      "epoch": 4.555256064690027,
+      "grad_norm": 0.8267539739608765,
+      "learning_rate": 0.00032713437668645436,
+      "loss": 3.4023,
       "step": 42250
     },
     {
-      "epoch": 4.5527930255085565,
-      "grad_norm": 0.6222167611122131,
-      "learning_rate": 0.0003272750781165823,
-      "loss": 3.3799,
+      "epoch": 4.560646900269542,
+      "grad_norm": 0.645702600479126,
+      "learning_rate": 0.0003268105774419859,
+      "loss": 3.4002,
       "step": 42300
     },
     {
-      "epoch": 4.558174577548165,
-      "grad_norm": 0.7282003164291382,
-      "learning_rate": 0.00032695183708652083,
-      "loss": 3.3734,
+      "epoch": 4.566037735849057,
+      "grad_norm": 0.6652170419692993,
+      "learning_rate": 0.0003264867781975175,
+      "loss": 3.3839,
       "step": 42350
     },
     {
-      "epoch": 4.563556129587774,
-      "grad_norm": 0.6520505547523499,
-      "learning_rate": 0.00032662859605645943,
-      "loss": 3.3722,
+      "epoch": 4.571428571428571,
+      "grad_norm": 0.7304674983024597,
+      "learning_rate": 0.00032616297895304906,
+      "loss": 3.4082,
       "step": 42400
     },
     {
-      "epoch": 4.568937681627381,
-      "grad_norm": 0.6786864399909973,
-      "learning_rate": 0.00032630535502639797,
-      "loss": 3.4029,
+      "epoch": 4.576819407008086,
+      "grad_norm": 0.590006411075592,
+      "learning_rate": 0.00032583917970858067,
+      "loss": 3.3774,
       "step": 42450
     },
     {
-      "epoch": 4.57431923366699,
-      "grad_norm": 0.651889979839325,
-      "learning_rate": 0.00032598211399633656,
-      "loss": 3.3934,
+      "epoch": 4.5822102425876015,
+      "grad_norm": 0.6259329915046692,
+      "learning_rate": 0.0003255153804641122,
+      "loss": 3.3776,
       "step": 42500
     },
     {
-      "epoch": 4.579700785706597,
-      "grad_norm": 0.662212610244751,
-      "learning_rate": 0.0003256588729662751,
-      "loss": 3.3881,
+      "epoch": 4.587601078167116,
+      "grad_norm": 0.6422986388206482,
+      "learning_rate": 0.0003251915812196438,
+      "loss": 3.3924,
       "step": 42550
     },
     {
-      "epoch": 4.585082337746206,
-      "grad_norm": 0.6370143294334412,
-      "learning_rate": 0.00032533563193621375,
-      "loss": 3.3962,
+      "epoch": 4.592991913746631,
+      "grad_norm": 0.6286759972572327,
+      "learning_rate": 0.00032486778197517537,
+      "loss": 3.391,
       "step": 42600
     },
     {
-      "epoch": 4.5904638897858145,
-      "grad_norm": 0.6888731718063354,
-      "learning_rate": 0.00032501239090615235,
-      "loss": 3.3792,
+      "epoch": 4.598382749326145,
+      "grad_norm": 0.6902345418930054,
+      "learning_rate": 0.000324543982730707,
+      "loss": 3.4023,
       "step": 42650
     },
     {
-      "epoch": 4.595845441825422,
-      "grad_norm": 0.6885330677032471,
-      "learning_rate": 0.0003246891498760909,
-      "loss": 3.3741,
+      "epoch": 4.60377358490566,
+      "grad_norm": 0.6514720320701599,
+      "learning_rate": 0.00032422018348623853,
+      "loss": 3.3967,
       "step": 42700
     },
     {
-      "epoch": 4.601226993865031,
-      "grad_norm": 0.6344414949417114,
-      "learning_rate": 0.0003243659088460295,
-      "loss": 3.3869,
+      "epoch": 4.609164420485175,
+      "grad_norm": 0.6290830373764038,
+      "learning_rate": 0.0003239028602266595,
+      "loss": 3.3837,
       "step": 42750
     },
     {
-      "epoch": 4.606608545904638,
-      "grad_norm": 0.680259108543396,
-      "learning_rate": 0.0003240426678159681,
-      "loss": 3.3875,
+      "epoch": 4.6145552560646905,
+      "grad_norm": 0.6454638838768005,
+      "learning_rate": 0.000323579060982191,
+      "loss": 3.4053,
       "step": 42800
     },
     {
-      "epoch": 4.611990097944247,
-      "grad_norm": 0.6663812398910522,
-      "learning_rate": 0.0003237258916065079,
-      "loss": 3.415,
+      "epoch": 4.619946091644205,
+      "grad_norm": 0.6117614507675171,
+      "learning_rate": 0.0003232552617377226,
+      "loss": 3.3996,
       "step": 42850
     },
     {
-      "epoch": 4.6173716499838555,
-      "grad_norm": 0.6414598822593689,
-      "learning_rate": 0.0003234026505764465,
-      "loss": 3.3925,
+      "epoch": 4.62533692722372,
+      "grad_norm": 0.5953809022903442,
+      "learning_rate": 0.00032293146249325413,
+      "loss": 3.3871,
       "step": 42900
     },
     {
-      "epoch": 4.622753202023463,
-      "grad_norm": 0.657367467880249,
-      "learning_rate": 0.000323079409546385,
-      "loss": 3.3902,
+      "epoch": 4.630727762803234,
+      "grad_norm": 0.6086660027503967,
+      "learning_rate": 0.00032260766324878574,
+      "loss": 3.3894,
       "step": 42950
     },
     {
-      "epoch": 4.628134754063072,
-      "grad_norm": 0.6666000485420227,
-      "learning_rate": 0.00032275616851632367,
-      "loss": 3.3802,
+      "epoch": 4.636118598382749,
+      "grad_norm": 0.6196882128715515,
+      "learning_rate": 0.0003222838640043173,
+      "loss": 3.4048,
       "step": 43000
     },
     {
-      "epoch": 4.628134754063072,
-      "eval_accuracy": 0.378789283044141,
-      "eval_loss": 3.4260668754577637,
-      "eval_runtime": 184.1527,
-      "eval_samples_per_second": 97.805,
-      "eval_steps_per_second": 6.114,
+      "epoch": 4.636118598382749,
+      "eval_accuracy": 0.37893466060803677,
+      "eval_loss": 3.424194097518921,
+      "eval_runtime": 183.5359,
+      "eval_samples_per_second": 98.133,
+      "eval_steps_per_second": 6.135,
       "step": 43000
     },
     {
-      "epoch": 4.63351630610268,
-      "grad_norm": 0.6748586893081665,
-      "learning_rate": 0.00032243292748626226,
-      "loss": 3.4022,
+      "epoch": 4.6415094339622645,
+      "grad_norm": 0.5985151529312134,
+      "learning_rate": 0.00032196006475984884,
+      "loss": 3.3852,
       "step": 43050
     },
     {
-      "epoch": 4.638897858142288,
-      "grad_norm": 0.6140267252922058,
-      "learning_rate": 0.0003221096864562008,
-      "loss": 3.3837,
+      "epoch": 4.646900269541779,
+      "grad_norm": 0.6808710694313049,
+      "learning_rate": 0.00032163626551538044,
+      "loss": 3.3863,
       "step": 43100
     },
     {
-      "epoch": 4.6442794101818965,
-      "grad_norm": 0.6773905754089355,
-      "learning_rate": 0.0003217864454261394,
-      "loss": 3.3804,
+      "epoch": 4.652291105121294,
+      "grad_norm": 0.6364196538925171,
+      "learning_rate": 0.000321312466270912,
+      "loss": 3.3832,
       "step": 43150
     },
     {
-      "epoch": 4.649660962221505,
-      "grad_norm": 0.6858988404273987,
-      "learning_rate": 0.00032146320439607794,
-      "loss": 3.3782,
+      "epoch": 4.657681940700809,
+      "grad_norm": 0.6553201675415039,
+      "learning_rate": 0.0003209886670264436,
+      "loss": 3.3917,
       "step": 43200
     },
     {
-      "epoch": 4.655042514261113,
-      "grad_norm": 0.6207734942436218,
-      "learning_rate": 0.0003211399633660166,
-      "loss": 3.3924,
+      "epoch": 4.663072776280323,
+      "grad_norm": 0.6401380300521851,
+      "learning_rate": 0.00032066486778197515,
+      "loss": 3.3898,
       "step": 43250
     },
     {
-      "epoch": 4.660424066300721,
-      "grad_norm": 0.6594465374946594,
-      "learning_rate": 0.0003208167223359552,
-      "loss": 3.4044,
+      "epoch": 4.668463611859838,
+      "grad_norm": 0.6377171277999878,
+      "learning_rate": 0.00032034106853750675,
+      "loss": 3.398,
       "step": 43300
     },
     {
-      "epoch": 4.665805618340329,
-      "grad_norm": 0.7123188972473145,
-      "learning_rate": 0.0003204934813058937,
-      "loss": 3.3888,
+      "epoch": 4.6738544474393535,
+      "grad_norm": 0.6544540524482727,
+      "learning_rate": 0.0003200172692930383,
+      "loss": 3.3757,
       "step": 43350
     },
     {
-      "epoch": 4.6711871703799375,
-      "grad_norm": 0.6485627889633179,
-      "learning_rate": 0.0003201702402758323,
-      "loss": 3.4016,
+      "epoch": 4.679245283018868,
+      "grad_norm": 0.7005414366722107,
+      "learning_rate": 0.0003196934700485699,
+      "loss": 3.3839,
       "step": 43400
     },
     {
-      "epoch": 4.676568722419546,
-      "grad_norm": 0.6207435727119446,
-      "learning_rate": 0.0003198469992457709,
-      "loss": 3.3982,
+      "epoch": 4.684636118598383,
+      "grad_norm": 0.6461729407310486,
+      "learning_rate": 0.0003193696708041014,
+      "loss": 3.3907,
       "step": 43450
     },
     {
-      "epoch": 4.681950274459154,
-      "grad_norm": 0.6076232194900513,
-      "learning_rate": 0.00031952375821570945,
-      "loss": 3.3853,
+      "epoch": 4.690026954177897,
+      "grad_norm": 0.6113434433937073,
+      "learning_rate": 0.00031904587155963296,
+      "loss": 3.3896,
       "step": 43500
     },
     {
-      "epoch": 4.687331826498762,
-      "grad_norm": 0.6661508679389954,
-      "learning_rate": 0.0003192005171856481,
-      "loss": 3.397,
+      "epoch": 4.695417789757412,
+      "grad_norm": 0.6722586154937744,
+      "learning_rate": 0.00031872207231516456,
+      "loss": 3.377,
       "step": 43550
     },
     {
-      "epoch": 4.692713378538371,
-      "grad_norm": 0.6532484889030457,
-      "learning_rate": 0.0003188772761555867,
-      "loss": 3.3898,
+      "epoch": 4.7008086253369274,
+      "grad_norm": 0.6458831429481506,
+      "learning_rate": 0.0003183982730706961,
+      "loss": 3.3819,
       "step": 43600
     },
     {
-      "epoch": 4.6980949305779784,
-      "grad_norm": 0.6694145798683167,
-      "learning_rate": 0.00031855403512552524,
-      "loss": 3.4124,
+      "epoch": 4.706199460916442,
+      "grad_norm": 0.6122461557388306,
+      "learning_rate": 0.0003180744738262277,
+      "loss": 3.3987,
       "step": 43650
     },
     {
-      "epoch": 4.703476482617587,
-      "grad_norm": 0.7081990838050842,
-      "learning_rate": 0.00031823079409546383,
-      "loss": 3.3859,
+      "epoch": 4.711590296495957,
+      "grad_norm": 0.6254520416259766,
+      "learning_rate": 0.00031775067458175927,
+      "loss": 3.3896,
       "step": 43700
     },
     {
-      "epoch": 4.7088580346571955,
-      "grad_norm": 0.6278550028800964,
-      "learning_rate": 0.00031790755306540237,
-      "loss": 3.3846,
+      "epoch": 4.716981132075472,
+      "grad_norm": 0.6644843816757202,
+      "learning_rate": 0.00031742687533729087,
+      "loss": 3.39,
       "step": 43750
     },
     {
-      "epoch": 4.714239586696803,
-      "grad_norm": 0.6548607349395752,
-      "learning_rate": 0.00031758431203534097,
-      "loss": 3.388,
+      "epoch": 4.722371967654986,
+      "grad_norm": 0.6221049427986145,
+      "learning_rate": 0.0003171030760928224,
+      "loss": 3.3815,
       "step": 43800
     },
     {
-      "epoch": 4.719621138736412,
-      "grad_norm": 0.6434149742126465,
-      "learning_rate": 0.0003172610710052796,
-      "loss": 3.3952,
+      "epoch": 4.727762803234501,
+      "grad_norm": 0.6321914792060852,
+      "learning_rate": 0.000316779276848354,
+      "loss": 3.402,
       "step": 43850
     },
     {
-      "epoch": 4.725002690776019,
-      "grad_norm": 0.6363683938980103,
-      "learning_rate": 0.00031693782997521816,
-      "loss": 3.3814,
+      "epoch": 4.7331536388140165,
+      "grad_norm": 0.6218891143798828,
+      "learning_rate": 0.0003164554776038856,
+      "loss": 3.4087,
       "step": 43900
     },
     {
-      "epoch": 4.730384242815628,
-      "grad_norm": 0.6362830996513367,
-      "learning_rate": 0.00031661458894515675,
-      "loss": 3.3889,
+      "epoch": 4.738544474393531,
+      "grad_norm": 0.6218807101249695,
+      "learning_rate": 0.00031613167835941713,
+      "loss": 3.4077,
       "step": 43950
     },
     {
-      "epoch": 4.7357657948552365,
-      "grad_norm": 0.6621768474578857,
-      "learning_rate": 0.0003162913479150953,
-      "loss": 3.393,
+      "epoch": 4.743935309973046,
+      "grad_norm": 0.6768582463264465,
+      "learning_rate": 0.00031580787911494873,
+      "loss": 3.3723,
       "step": 44000
     },
     {
-      "epoch": 4.7357657948552365,
-      "eval_accuracy": 0.3792806114028381,
-      "eval_loss": 3.420133352279663,
-      "eval_runtime": 184.4795,
-      "eval_samples_per_second": 97.631,
-      "eval_steps_per_second": 6.104,
+      "epoch": 4.743935309973046,
+      "eval_accuracy": 0.37957755974523505,
+      "eval_loss": 3.418135166168213,
+      "eval_runtime": 183.6046,
+      "eval_samples_per_second": 98.097,
+      "eval_steps_per_second": 6.133,
       "step": 44000
     },
     {
-      "epoch": 4.741147346894844,
-      "grad_norm": 0.6609659194946289,
-      "learning_rate": 0.0003159681068850339,
-      "loss": 3.3955,
+      "epoch": 4.74932614555256,
+      "grad_norm": 0.6679436564445496,
+      "learning_rate": 0.00031548407987048023,
+      "loss": 3.4082,
       "step": 44050
     },
     {
-      "epoch": 4.746528898934453,
-      "grad_norm": 0.6639893054962158,
-      "learning_rate": 0.00031564486585497253,
-      "loss": 3.3899,
+      "epoch": 4.754716981132075,
+      "grad_norm": 0.698415994644165,
+      "learning_rate": 0.0003151602806260119,
+      "loss": 3.3843,
       "step": 44100
     },
     {
-      "epoch": 4.751910450974061,
-      "grad_norm": 0.6364026069641113,
-      "learning_rate": 0.0003153216248249111,
-      "loss": 3.3913,
+      "epoch": 4.7601078167115904,
+      "grad_norm": 0.6204515695571899,
+      "learning_rate": 0.0003148364813815434,
+      "loss": 3.3792,
       "step": 44150
     },
     {
-      "epoch": 4.757292003013669,
-      "grad_norm": 0.6532016396522522,
-      "learning_rate": 0.00031499838379484967,
-      "loss": 3.4,
+      "epoch": 4.765498652291106,
+      "grad_norm": 0.60456782579422,
+      "learning_rate": 0.000314512682137075,
+      "loss": 3.401,
       "step": 44200
     },
     {
-      "epoch": 4.7626735550532775,
-      "grad_norm": 0.6646419167518616,
-      "learning_rate": 0.00031467514276478826,
-      "loss": 3.3959,
+      "epoch": 4.77088948787062,
+      "grad_norm": 0.6760085225105286,
+      "learning_rate": 0.00031418888289260654,
+      "loss": 3.3948,
       "step": 44250
     },
     {
-      "epoch": 4.768055107092886,
-      "grad_norm": 0.6860271096229553,
-      "learning_rate": 0.0003143519017347268,
-      "loss": 3.4054,
+      "epoch": 4.776280323450135,
+      "grad_norm": 0.6306468844413757,
+      "learning_rate": 0.0003138650836481381,
+      "loss": 3.3888,
       "step": 44300
     },
     {
-      "epoch": 4.773436659132494,
-      "grad_norm": 0.6953567862510681,
-      "learning_rate": 0.0003140286607046654,
-      "loss": 3.3912,
+      "epoch": 4.781671159029649,
+      "grad_norm": 0.618640124797821,
+      "learning_rate": 0.0003135412844036697,
+      "loss": 3.3987,
       "step": 44350
     },
     {
-      "epoch": 4.778818211172102,
-      "grad_norm": 0.6766008734703064,
-      "learning_rate": 0.00031370541967460405,
-      "loss": 3.3885,
+      "epoch": 4.787061994609164,
+      "grad_norm": 0.7838083505630493,
+      "learning_rate": 0.00031321748515920124,
+      "loss": 3.3795,
       "step": 44400
     },
     {
-      "epoch": 4.78419976321171,
-      "grad_norm": 0.6261332631111145,
-      "learning_rate": 0.0003133821786445426,
-      "loss": 3.3869,
+      "epoch": 4.7924528301886795,
+      "grad_norm": 0.6274157166481018,
+      "learning_rate": 0.00031289368591473285,
+      "loss": 3.3961,
       "step": 44450
     },
     {
-      "epoch": 4.7895813152513185,
-      "grad_norm": 0.6700854897499084,
-      "learning_rate": 0.0003130589376144812,
-      "loss": 3.4018,
+      "epoch": 4.797843665768194,
+      "grad_norm": 0.6056527495384216,
+      "learning_rate": 0.0003125698866702644,
+      "loss": 3.379,
       "step": 44500
     },
     {
-      "epoch": 4.794962867290927,
-      "grad_norm": 0.680160641670227,
-      "learning_rate": 0.0003127356965844197,
-      "loss": 3.3795,
+      "epoch": 4.803234501347709,
+      "grad_norm": 0.6391441226005554,
+      "learning_rate": 0.000312246087425796,
+      "loss": 3.3785,
       "step": 44550
     },
     {
-      "epoch": 4.800344419330535,
-      "grad_norm": 0.6283779740333557,
-      "learning_rate": 0.0003124124555543583,
-      "loss": 3.3899,
+      "epoch": 4.808625336927224,
+      "grad_norm": 0.6809561252593994,
+      "learning_rate": 0.00031192228818132756,
+      "loss": 3.3926,
       "step": 44600
     },
     {
-      "epoch": 4.805725971370143,
-      "grad_norm": 0.6881921887397766,
-      "learning_rate": 0.0003120892145242969,
-      "loss": 3.3931,
+      "epoch": 4.814016172506738,
+      "grad_norm": 0.6451635360717773,
+      "learning_rate": 0.00031159848893685916,
+      "loss": 3.3838,
       "step": 44650
     },
     {
-      "epoch": 4.811107523409751,
-      "grad_norm": 0.7671242356300354,
-      "learning_rate": 0.0003117659734942355,
-      "loss": 3.4,
+      "epoch": 4.819407008086253,
+      "grad_norm": 0.6554741859436035,
+      "learning_rate": 0.0003112746896923907,
+      "loss": 3.3749,
       "step": 44700
     },
     {
-      "epoch": 4.8164890754493594,
-      "grad_norm": 0.6230366826057434,
-      "learning_rate": 0.00031144919728477526,
-      "loss": 3.3955,
+      "epoch": 4.824797843665769,
+      "grad_norm": 0.6644231677055359,
+      "learning_rate": 0.0003109508904479222,
+      "loss": 3.3767,
       "step": 44750
     },
     {
-      "epoch": 4.821870627488968,
-      "grad_norm": 0.6817081570625305,
-      "learning_rate": 0.0003111259562547139,
-      "loss": 3.3913,
+      "epoch": 4.830188679245283,
+      "grad_norm": 0.6254944205284119,
+      "learning_rate": 0.0003106270912034538,
+      "loss": 3.3938,
       "step": 44800
     },
     {
-      "epoch": 4.827252179528576,
-      "grad_norm": 0.6694238185882568,
-      "learning_rate": 0.0003108027152246525,
-      "loss": 3.3856,
+      "epoch": 4.835579514824798,
+      "grad_norm": 0.6484498977661133,
+      "learning_rate": 0.00031030329195898536,
+      "loss": 3.3877,
       "step": 44850
     },
     {
-      "epoch": 4.832633731568184,
-      "grad_norm": 0.6804167628288269,
-      "learning_rate": 0.0003104794741945911,
-      "loss": 3.3836,
+      "epoch": 4.840970350404312,
+      "grad_norm": 0.6299556493759155,
+      "learning_rate": 0.00030997949271451697,
+      "loss": 3.3888,
       "step": 44900
     },
     {
-      "epoch": 4.838015283607793,
-      "grad_norm": 0.7400542497634888,
-      "learning_rate": 0.00031015623316452964,
-      "loss": 3.3945,
+      "epoch": 4.846361185983827,
+      "grad_norm": 0.6518325209617615,
+      "learning_rate": 0.0003096556934700485,
+      "loss": 3.3854,
       "step": 44950
     },
     {
-      "epoch": 4.8433968356474,
-      "grad_norm": 0.6841332912445068,
-      "learning_rate": 0.00030983299213446823,
-      "loss": 3.4061,
+      "epoch": 4.8517520215633425,
+      "grad_norm": 0.6268826723098755,
+      "learning_rate": 0.0003093318942255801,
+      "loss": 3.3969,
       "step": 45000
     },
     {
-      "epoch": 4.8433968356474,
-      "eval_accuracy": 0.3799427421011795,
-      "eval_loss": 3.4151318073272705,
-      "eval_runtime": 184.0085,
-      "eval_samples_per_second": 97.881,
-      "eval_steps_per_second": 6.119,
+      "epoch": 4.8517520215633425,
+      "eval_accuracy": 0.38006736696350274,
+      "eval_loss": 3.4140915870666504,
+      "eval_runtime": 183.6741,
+      "eval_samples_per_second": 98.06,
+      "eval_steps_per_second": 6.13,
       "step": 45000
     },
     {
-      "epoch": 4.848778387687009,
-      "grad_norm": 0.6875273585319519,
-      "learning_rate": 0.0003095097511044069,
-      "loss": 3.381,
+      "epoch": 4.857142857142857,
+      "grad_norm": 0.6254715919494629,
+      "learning_rate": 0.0003090080949811117,
+      "loss": 3.3722,
       "step": 45050
     },
     {
-      "epoch": 4.8541599397266175,
-      "grad_norm": 0.630673348903656,
-      "learning_rate": 0.0003091865100743454,
-      "loss": 3.3863,
+      "epoch": 4.862533692722372,
+      "grad_norm": 0.6830570697784424,
+      "learning_rate": 0.0003086842957366433,
+      "loss": 3.3984,
       "step": 45100
     },
     {
-      "epoch": 4.859541491766225,
-      "grad_norm": 0.6930129528045654,
-      "learning_rate": 0.000308863269044284,
-      "loss": 3.3922,
+      "epoch": 4.867924528301887,
+      "grad_norm": 0.701416015625,
+      "learning_rate": 0.00030836049649217483,
+      "loss": 3.3715,
       "step": 45150
     },
     {
-      "epoch": 4.864923043805834,
-      "grad_norm": 0.6872902512550354,
-      "learning_rate": 0.00030854002801422256,
-      "loss": 3.4027,
+      "epoch": 4.873315363881401,
+      "grad_norm": 0.6478974223136902,
+      "learning_rate": 0.0003080366972477064,
+      "loss": 3.3933,
       "step": 45200
     },
     {
-      "epoch": 4.870304595845441,
-      "grad_norm": 0.6678783297538757,
-      "learning_rate": 0.00030821678698416115,
-      "loss": 3.3917,
+      "epoch": 4.878706199460916,
+      "grad_norm": 0.6832784414291382,
+      "learning_rate": 0.000307712898003238,
+      "loss": 3.3898,
       "step": 45250
     },
     {
-      "epoch": 4.87568614788505,
-      "grad_norm": 0.7070086598396301,
-      "learning_rate": 0.0003078935459540997,
-      "loss": 3.3919,
+      "epoch": 4.884097035040432,
+      "grad_norm": 0.6223458647727966,
+      "learning_rate": 0.00030738909875876953,
+      "loss": 3.3962,
       "step": 45300
     },
     {
-      "epoch": 4.8810676999246585,
-      "grad_norm": 0.7027028203010559,
-      "learning_rate": 0.00030757030492403834,
-      "loss": 3.3948,
+      "epoch": 4.889487870619946,
+      "grad_norm": 0.6796822547912598,
+      "learning_rate": 0.00030706529951430114,
+      "loss": 3.3877,
       "step": 45350
     },
     {
-      "epoch": 4.886449251964266,
-      "grad_norm": 0.6884028911590576,
-      "learning_rate": 0.00030724706389397694,
-      "loss": 3.4086,
+      "epoch": 4.894878706199461,
+      "grad_norm": 0.6090751886367798,
+      "learning_rate": 0.0003067415002698327,
+      "loss": 3.3837,
       "step": 45400
     },
     {
-      "epoch": 4.891830804003875,
-      "grad_norm": 0.7701573967933655,
-      "learning_rate": 0.0003069238228639155,
-      "loss": 3.3962,
+      "epoch": 4.900269541778976,
+      "grad_norm": 0.6062365174293518,
+      "learning_rate": 0.0003064177010253643,
+      "loss": 3.3928,
       "step": 45450
     },
     {
-      "epoch": 4.897212356043483,
-      "grad_norm": 0.6813535094261169,
-      "learning_rate": 0.00030660058183385407,
-      "loss": 3.4013,
+      "epoch": 4.90566037735849,
+      "grad_norm": 0.6820232272148132,
+      "learning_rate": 0.0003060939017808958,
+      "loss": 3.391,
       "step": 45500
     },
     {
-      "epoch": 4.902593908083091,
-      "grad_norm": 0.6463482975959778,
-      "learning_rate": 0.00030627734080379267,
-      "loss": 3.3926,
+      "epoch": 4.9110512129380055,
+      "grad_norm": 0.641365647315979,
+      "learning_rate": 0.0003057701025364274,
+      "loss": 3.3799,
       "step": 45550
     },
     {
-      "epoch": 4.9079754601226995,
-      "grad_norm": 0.6464250683784485,
-      "learning_rate": 0.0003059540997737312,
-      "loss": 3.3822,
+      "epoch": 4.916442048517521,
+      "grad_norm": 0.6937678456306458,
+      "learning_rate": 0.00030544630329195895,
+      "loss": 3.4047,
       "step": 45600
     },
     {
-      "epoch": 4.913357012162308,
-      "grad_norm": 0.6345609426498413,
-      "learning_rate": 0.00030563085874366986,
-      "loss": 3.393,
+      "epoch": 4.921832884097035,
+      "grad_norm": 0.6434536576271057,
+      "learning_rate": 0.0003051225040474905,
+      "loss": 3.3787,
       "step": 45650
     },
     {
-      "epoch": 4.918738564201916,
-      "grad_norm": 0.6901547908782959,
-      "learning_rate": 0.00030530761771360845,
-      "loss": 3.3725,
+      "epoch": 4.92722371967655,
+      "grad_norm": 0.655083417892456,
+      "learning_rate": 0.0003047987048030221,
+      "loss": 3.3902,
       "step": 45700
     },
     {
-      "epoch": 4.924120116241524,
-      "grad_norm": 0.6914418935775757,
-      "learning_rate": 0.000304984376683547,
-      "loss": 3.3789,
+      "epoch": 4.932614555256064,
+      "grad_norm": 0.6419015526771545,
+      "learning_rate": 0.00030447490555855365,
+      "loss": 3.3935,
       "step": 45750
     },
     {
-      "epoch": 4.929501668281132,
-      "grad_norm": 0.734369695186615,
-      "learning_rate": 0.0003046611356534856,
-      "loss": 3.3904,
+      "epoch": 4.938005390835579,
+      "grad_norm": 0.6669827103614807,
+      "learning_rate": 0.00030415110631408526,
+      "loss": 3.3875,
       "step": 45800
     },
     {
-      "epoch": 4.9348832203207404,
-      "grad_norm": 0.6802307963371277,
-      "learning_rate": 0.0003043378946234241,
-      "loss": 3.3965,
+      "epoch": 4.943396226415095,
+      "grad_norm": 0.6806350350379944,
+      "learning_rate": 0.0003038273070696168,
+      "loss": 3.3902,
       "step": 45850
     },
     {
-      "epoch": 4.940264772360349,
-      "grad_norm": 0.6808255314826965,
-      "learning_rate": 0.0003040146535933628,
-      "loss": 3.3812,
+      "epoch": 4.948787061994609,
+      "grad_norm": 0.662300705909729,
+      "learning_rate": 0.0003035035078251484,
+      "loss": 3.3905,
       "step": 45900
     },
     {
-      "epoch": 4.945646324399957,
-      "grad_norm": 0.6597874760627747,
-      "learning_rate": 0.00030369141256330137,
-      "loss": 3.3646,
+      "epoch": 4.954177897574124,
+      "grad_norm": 0.6265142560005188,
+      "learning_rate": 0.00030317970858067996,
+      "loss": 3.3757,
       "step": 45950
     },
     {
-      "epoch": 4.951027876439565,
-      "grad_norm": 0.6720778346061707,
-      "learning_rate": 0.0003033681715332399,
-      "loss": 3.3894,
+      "epoch": 4.959568733153639,
+      "grad_norm": 0.6365964412689209,
+      "learning_rate": 0.00030285590933621157,
+      "loss": 3.3867,
       "step": 46000
     },
     {
-      "epoch": 4.951027876439565,
-      "eval_accuracy": 0.38033237135688225,
-      "eval_loss": 3.410205841064453,
-      "eval_runtime": 184.0,
-      "eval_samples_per_second": 97.886,
-      "eval_steps_per_second": 6.12,
+      "epoch": 4.959568733153639,
+      "eval_accuracy": 0.3805977017089252,
+      "eval_loss": 3.410641670227051,
+      "eval_runtime": 183.6273,
+      "eval_samples_per_second": 98.085,
+      "eval_steps_per_second": 6.132,
       "step": 46000
     },
     {
-      "epoch": 4.956409428479174,
-      "grad_norm": 0.6535595059394836,
-      "learning_rate": 0.0003030449305031785,
-      "loss": 3.3842,
+      "epoch": 4.964959568733153,
+      "grad_norm": 0.5964719653129578,
+      "learning_rate": 0.0003025321100917431,
+      "loss": 3.3936,
       "step": 46050
     },
     {
-      "epoch": 4.961790980518781,
-      "grad_norm": 0.6358091235160828,
-      "learning_rate": 0.0003027216894731171,
-      "loss": 3.3774,
+      "epoch": 4.9703504043126685,
+      "grad_norm": 0.6579270958900452,
+      "learning_rate": 0.0003022083108472746,
+      "loss": 3.3882,
       "step": 46100
     },
     {
-      "epoch": 4.96717253255839,
-      "grad_norm": 1.3344215154647827,
-      "learning_rate": 0.00030239844844305564,
-      "loss": 3.3972,
+      "epoch": 4.975741239892184,
+      "grad_norm": 0.6702588200569153,
+      "learning_rate": 0.0003018845116028062,
+      "loss": 3.3865,
       "step": 46150
     },
     {
-      "epoch": 4.9725540845979985,
-      "grad_norm": 0.6452608108520508,
-      "learning_rate": 0.0003020752074129943,
-      "loss": 3.3968,
+      "epoch": 4.981132075471698,
+      "grad_norm": 0.6525739431381226,
+      "learning_rate": 0.00030156071235833777,
+      "loss": 3.3865,
       "step": 46200
     },
     {
-      "epoch": 4.977935636637606,
-      "grad_norm": 0.7212709188461304,
-      "learning_rate": 0.0003017519663829329,
-      "loss": 3.3917,
+      "epoch": 4.986522911051213,
+      "grad_norm": 0.6661361455917358,
+      "learning_rate": 0.0003012369131138694,
+      "loss": 3.4074,
       "step": 46250
     },
     {
-      "epoch": 4.983317188677215,
-      "grad_norm": 0.6865209341049194,
-      "learning_rate": 0.0003014287253528714,
-      "loss": 3.3774,
+      "epoch": 4.991913746630727,
+      "grad_norm": 0.6791821122169495,
+      "learning_rate": 0.0003009131138694009,
+      "loss": 3.3957,
       "step": 46300
     },
     {
-      "epoch": 4.988698740716822,
-      "grad_norm": 0.7008549571037292,
-      "learning_rate": 0.00030110548432281,
-      "loss": 3.384,
+      "epoch": 4.997304582210242,
+      "grad_norm": 0.6901088953018188,
+      "learning_rate": 0.00030058931462493253,
+      "loss": 3.3834,
       "step": 46350
     },
     {
-      "epoch": 4.994080292756431,
-      "grad_norm": 0.6504687666893005,
-      "learning_rate": 0.00030078224329274856,
-      "loss": 3.3863,
+      "epoch": 5.002695417789758,
+      "grad_norm": 0.6481421589851379,
+      "learning_rate": 0.0003002655153804641,
+      "loss": 3.34,
       "step": 46400
     },
     {
-      "epoch": 4.9994618447960395,
-      "grad_norm": 0.6690630912780762,
-      "learning_rate": 0.00030045900226268715,
-      "loss": 3.3824,
+      "epoch": 5.008086253369272,
+      "grad_norm": 0.6943140625953674,
+      "learning_rate": 0.00029994171613599563,
+      "loss": 3.2891,
       "step": 46450
     },
     {
-      "epoch": 5.004843396835647,
-      "grad_norm": 0.6338120102882385,
-      "learning_rate": 0.0003001357612326258,
-      "loss": 3.2995,
+      "epoch": 5.013477088948787,
+      "grad_norm": 0.638863205909729,
+      "learning_rate": 0.00029961791689152724,
+      "loss": 3.2942,
       "step": 46500
     },
     {
-      "epoch": 5.010224948875256,
-      "grad_norm": 0.6364915370941162,
-      "learning_rate": 0.00029981252020256434,
-      "loss": 3.2997,
+      "epoch": 5.018867924528302,
+      "grad_norm": 0.6785563230514526,
+      "learning_rate": 0.0002992941176470588,
+      "loss": 3.3015,
       "step": 46550
     },
     {
-      "epoch": 5.015606500914864,
-      "grad_norm": 0.6910309791564941,
-      "learning_rate": 0.00029948927917250294,
-      "loss": 3.3019,
+      "epoch": 5.024258760107816,
+      "grad_norm": 0.6411895751953125,
+      "learning_rate": 0.0002989703184025904,
+      "loss": 3.2999,
       "step": 46600
     },
     {
-      "epoch": 5.020988052954472,
-      "grad_norm": 0.6775645613670349,
-      "learning_rate": 0.00029916603814244153,
-      "loss": 3.2952,
+      "epoch": 5.0296495956873315,
+      "grad_norm": 0.7144386768341064,
+      "learning_rate": 0.00029864651915812194,
+      "loss": 3.3046,
       "step": 46650
     },
     {
-      "epoch": 5.0263696049940805,
-      "grad_norm": 0.7527064681053162,
-      "learning_rate": 0.0002988427971123801,
-      "loss": 3.3052,
+      "epoch": 5.035040431266847,
+      "grad_norm": 0.657825767993927,
+      "learning_rate": 0.0002983227199136535,
+      "loss": 3.2974,
       "step": 46700
     },
     {
-      "epoch": 5.031751157033688,
-      "grad_norm": 0.6929546594619751,
-      "learning_rate": 0.00029851955608231867,
-      "loss": 3.3145,
+      "epoch": 5.040431266846361,
+      "grad_norm": 0.6665033102035522,
+      "learning_rate": 0.0002979989206691851,
+      "loss": 3.313,
       "step": 46750
     },
     {
-      "epoch": 5.037132709073297,
-      "grad_norm": 0.624110996723175,
-      "learning_rate": 0.00029819631505225726,
-      "loss": 3.3169,
+      "epoch": 5.045822102425876,
+      "grad_norm": 0.6799901127815247,
+      "learning_rate": 0.000297681597409606,
+      "loss": 3.2983,
       "step": 46800
     },
     {
-      "epoch": 5.042514261112905,
-      "grad_norm": 0.698697566986084,
-      "learning_rate": 0.00029787307402219586,
-      "loss": 3.3072,
+      "epoch": 5.051212938005391,
+      "grad_norm": 0.7204418778419495,
+      "learning_rate": 0.0002973577981651376,
+      "loss": 3.315,
       "step": 46850
     },
     {
-      "epoch": 5.047895813152513,
-      "grad_norm": 0.760403573513031,
-      "learning_rate": 0.00029754983299213445,
-      "loss": 3.3023,
+      "epoch": 5.056603773584905,
+      "grad_norm": 0.679721474647522,
+      "learning_rate": 0.00029703399892066915,
+      "loss": 3.3029,
       "step": 46900
     },
     {
-      "epoch": 5.0532773651921215,
-      "grad_norm": 0.7195192575454712,
-      "learning_rate": 0.000297226591962073,
-      "loss": 3.3267,
+      "epoch": 5.061994609164421,
+      "grad_norm": 0.6594417095184326,
+      "learning_rate": 0.00029671019967620076,
+      "loss": 3.2934,
       "step": 46950
     },
     {
-      "epoch": 5.05865891723173,
-      "grad_norm": 0.640288770198822,
-      "learning_rate": 0.00029690335093201164,
-      "loss": 3.2988,
+      "epoch": 5.067385444743936,
+      "grad_norm": 0.7225853204727173,
+      "learning_rate": 0.0002963864004317323,
+      "loss": 3.3111,
       "step": 47000
     },
     {
-      "epoch": 5.05865891723173,
-      "eval_accuracy": 0.38089660580330287,
-      "eval_loss": 3.4140713214874268,
-      "eval_runtime": 184.5126,
-      "eval_samples_per_second": 97.614,
-      "eval_steps_per_second": 6.103,
+      "epoch": 5.067385444743936,
+      "eval_accuracy": 0.3804139696756309,
+      "eval_loss": 3.4154088497161865,
+      "eval_runtime": 183.2512,
+      "eval_samples_per_second": 98.286,
+      "eval_steps_per_second": 6.145,
       "step": 47000
     },
     {
-      "epoch": 5.064040469271338,
-      "grad_norm": 0.7123163938522339,
-      "learning_rate": 0.0002965801099019502,
-      "loss": 3.3039,
+      "epoch": 5.07277628032345,
+      "grad_norm": 0.6604564785957336,
+      "learning_rate": 0.00029606260118726386,
+      "loss": 3.313,
       "step": 47050
     },
     {
-      "epoch": 5.069422021310946,
-      "grad_norm": 0.6437656879425049,
-      "learning_rate": 0.0002962568688718888,
-      "loss": 3.308,
+      "epoch": 5.078167115902965,
+      "grad_norm": 0.647045910358429,
+      "learning_rate": 0.0002957388019427954,
+      "loss": 3.3159,
       "step": 47100
     },
     {
-      "epoch": 5.074803573350554,
-      "grad_norm": 0.6438406109809875,
-      "learning_rate": 0.00029593362784182737,
-      "loss": 3.3086,
+      "epoch": 5.083557951482479,
+      "grad_norm": 0.6736243367195129,
+      "learning_rate": 0.000295415002698327,
+      "loss": 3.3109,
       "step": 47150
     },
     {
-      "epoch": 5.080185125390162,
-      "grad_norm": 0.6933695673942566,
-      "learning_rate": 0.00029561038681176596,
-      "loss": 3.3286,
+      "epoch": 5.0889487870619945,
+      "grad_norm": 0.6551802754402161,
+      "learning_rate": 0.00029509120345385856,
+      "loss": 3.3092,
       "step": 47200
     },
     {
-      "epoch": 5.085566677429771,
-      "grad_norm": 0.6499897241592407,
-      "learning_rate": 0.00029528714578170456,
-      "loss": 3.3282,
+      "epoch": 5.09433962264151,
+      "grad_norm": 0.6889127492904663,
+      "learning_rate": 0.00029476740420939017,
+      "loss": 3.3115,
       "step": 47250
     },
     {
-      "epoch": 5.090948229469379,
-      "grad_norm": 0.6867937445640564,
-      "learning_rate": 0.0002949639047516431,
-      "loss": 3.3086,
+      "epoch": 5.099730458221024,
+      "grad_norm": 0.6516887545585632,
+      "learning_rate": 0.0002944436049649217,
+      "loss": 3.3085,
       "step": 47300
     },
     {
-      "epoch": 5.096329781508987,
-      "grad_norm": 0.7074191570281982,
-      "learning_rate": 0.0002946406637215817,
-      "loss": 3.2997,
+      "epoch": 5.105121293800539,
+      "grad_norm": 0.6999820470809937,
+      "learning_rate": 0.0002941198057204533,
+      "loss": 3.2985,
       "step": 47350
     },
     {
-      "epoch": 5.101711333548596,
-      "grad_norm": 0.6474207043647766,
-      "learning_rate": 0.0002943174226915203,
-      "loss": 3.3096,
+      "epoch": 5.110512129380054,
+      "grad_norm": 0.6833156943321228,
+      "learning_rate": 0.00029379600647598487,
+      "loss": 3.3138,
       "step": 47400
     },
     {
-      "epoch": 5.107092885588203,
-      "grad_norm": 0.7220308780670166,
-      "learning_rate": 0.0002939941816614589,
-      "loss": 3.3151,
+      "epoch": 5.115902964959568,
+      "grad_norm": 0.6783242225646973,
+      "learning_rate": 0.0002934722072315164,
+      "loss": 3.3288,
       "step": 47450
     },
     {
-      "epoch": 5.112474437627812,
-      "grad_norm": 0.6929433941841125,
-      "learning_rate": 0.0002936709406313974,
-      "loss": 3.3169,
+      "epoch": 5.121293800539084,
+      "grad_norm": 0.6660628914833069,
+      "learning_rate": 0.000293148407987048,
+      "loss": 3.3228,
       "step": 47500
     },
     {
-      "epoch": 5.1178559896674205,
-      "grad_norm": 0.6843468546867371,
-      "learning_rate": 0.0002933476996013361,
-      "loss": 3.3147,
+      "epoch": 5.126684636118599,
+      "grad_norm": 0.6789077520370483,
+      "learning_rate": 0.0002928246087425796,
+      "loss": 3.3296,
       "step": 47550
     },
     {
-      "epoch": 5.123237541707028,
-      "grad_norm": 0.6644719243049622,
-      "learning_rate": 0.0002930244585712746,
-      "loss": 3.3146,
+      "epoch": 5.132075471698113,
+      "grad_norm": 0.6862949132919312,
+      "learning_rate": 0.00029250080949811113,
+      "loss": 3.3219,
       "step": 47600
     },
     {
-      "epoch": 5.128619093746637,
-      "grad_norm": 0.7787826657295227,
-      "learning_rate": 0.0002927012175412132,
-      "loss": 3.3084,
+      "epoch": 5.137466307277628,
+      "grad_norm": 0.6389700770378113,
+      "learning_rate": 0.00029217701025364273,
+      "loss": 3.3121,
       "step": 47650
     },
     {
-      "epoch": 5.134000645786244,
-      "grad_norm": 0.7502739429473877,
-      "learning_rate": 0.0002923779765111518,
-      "loss": 3.32,
+      "epoch": 5.142857142857143,
+      "grad_norm": 0.6830697655677795,
+      "learning_rate": 0.0002918532110091743,
+      "loss": 3.3252,
       "step": 47700
     },
     {
-      "epoch": 5.139382197825853,
-      "grad_norm": 0.6958211660385132,
-      "learning_rate": 0.0002920547354810904,
-      "loss": 3.3128,
+      "epoch": 5.1482479784366575,
+      "grad_norm": 0.64373779296875,
+      "learning_rate": 0.0002915294117647059,
+      "loss": 3.3187,
       "step": 47750
     },
     {
-      "epoch": 5.1447637498654615,
-      "grad_norm": 0.7047584652900696,
-      "learning_rate": 0.00029173149445102894,
-      "loss": 3.344,
+      "epoch": 5.153638814016173,
+      "grad_norm": 0.6759216785430908,
+      "learning_rate": 0.00029120561252023744,
+      "loss": 3.308,
       "step": 47800
     },
     {
-      "epoch": 5.150145301905069,
-      "grad_norm": 0.6888059377670288,
-      "learning_rate": 0.00029140825342096753,
-      "loss": 3.3247,
+      "epoch": 5.159029649595688,
+      "grad_norm": 0.651127278804779,
+      "learning_rate": 0.000290881813275769,
+      "loss": 3.3319,
       "step": 47850
     },
     {
-      "epoch": 5.155526853944678,
-      "grad_norm": 0.7417131066322327,
-      "learning_rate": 0.0002910850123909061,
-      "loss": 3.3113,
+      "epoch": 5.164420485175202,
+      "grad_norm": 0.6850906610488892,
+      "learning_rate": 0.00029055801403130054,
+      "loss": 3.3238,
       "step": 47900
     },
     {
-      "epoch": 5.160908405984286,
-      "grad_norm": 0.6905604600906372,
-      "learning_rate": 0.0002907617713608447,
-      "loss": 3.3091,
+      "epoch": 5.169811320754717,
+      "grad_norm": 0.9405094981193542,
+      "learning_rate": 0.00029023421478683215,
+      "loss": 3.3112,
       "step": 47950
     },
     {
-      "epoch": 5.166289958023894,
-      "grad_norm": 0.6897056698799133,
-      "learning_rate": 0.0002904385303307833,
-      "loss": 3.3063,
+      "epoch": 5.175202156334231,
+      "grad_norm": 0.6840382218360901,
+      "learning_rate": 0.0002899104155423637,
+      "loss": 3.3174,
       "step": 48000
     },
     {
-      "epoch": 5.166289958023894,
-      "eval_accuracy": 0.3808204401289449,
-      "eval_loss": 3.41259765625,
-      "eval_runtime": 184.1998,
-      "eval_samples_per_second": 97.78,
-      "eval_steps_per_second": 6.113,
+      "epoch": 5.175202156334231,
+      "eval_accuracy": 0.38094006695842864,
+      "eval_loss": 3.411670446395874,
+      "eval_runtime": 183.7964,
+      "eval_samples_per_second": 97.994,
+      "eval_steps_per_second": 6.126,
       "step": 48000
     },
     {
-      "epoch": 5.1716715100635025,
-      "grad_norm": 0.7102882862091064,
-      "learning_rate": 0.00029011528930072186,
-      "loss": 3.3235,
+      "epoch": 5.180592991913747,
+      "grad_norm": 0.6287639737129211,
+      "learning_rate": 0.0002895866162978953,
+      "loss": 3.3208,
       "step": 48050
     },
     {
-      "epoch": 5.17705306210311,
-      "grad_norm": 0.6777679920196533,
-      "learning_rate": 0.00028979204827066045,
-      "loss": 3.325,
+      "epoch": 5.185983827493262,
+      "grad_norm": 0.6605691909790039,
+      "learning_rate": 0.00028926281705342685,
+      "loss": 3.3316,
       "step": 48100
     },
     {
-      "epoch": 5.182434614142719,
-      "grad_norm": 0.7214860916137695,
-      "learning_rate": 0.00028946880724059905,
-      "loss": 3.3411,
+      "epoch": 5.191374663072776,
+      "grad_norm": 0.6740576028823853,
+      "learning_rate": 0.0002889390178089584,
+      "loss": 3.3262,
       "step": 48150
     },
     {
-      "epoch": 5.187816166182327,
-      "grad_norm": 0.6319332718849182,
-      "learning_rate": 0.00028914556621053764,
-      "loss": 3.3318,
+      "epoch": 5.196765498652291,
+      "grad_norm": 0.6887629628181458,
+      "learning_rate": 0.00028861521856449,
+      "loss": 3.3363,
       "step": 48200
     },
     {
-      "epoch": 5.193197718221935,
-      "grad_norm": 0.707127034664154,
-      "learning_rate": 0.00028882232518047624,
-      "loss": 3.3286,
+      "epoch": 5.202156334231806,
+      "grad_norm": 0.6479724645614624,
+      "learning_rate": 0.00028829141932002156,
+      "loss": 3.3306,
       "step": 48250
     },
     {
-      "epoch": 5.198579270261543,
-      "grad_norm": 0.7037991285324097,
-      "learning_rate": 0.00028849908415041483,
-      "loss": 3.3115,
+      "epoch": 5.2075471698113205,
+      "grad_norm": 0.6248446106910706,
+      "learning_rate": 0.0002879676200755531,
+      "loss": 3.3056,
       "step": 48300
     },
     {
-      "epoch": 5.203960822301152,
-      "grad_norm": 0.6779250502586365,
-      "learning_rate": 0.00028817584312035337,
-      "loss": 3.3145,
+      "epoch": 5.212938005390836,
+      "grad_norm": 0.6646822690963745,
+      "learning_rate": 0.0002876438208310847,
+      "loss": 3.3394,
       "step": 48350
     },
     {
-      "epoch": 5.20934237434076,
-      "grad_norm": 0.6654755473136902,
-      "learning_rate": 0.00028785260209029197,
-      "loss": 3.3269,
+      "epoch": 5.218328840970351,
+      "grad_norm": 0.6973866820335388,
+      "learning_rate": 0.00028732002158661626,
+      "loss": 3.3275,
       "step": 48400
     },
     {
-      "epoch": 5.214723926380368,
-      "grad_norm": 0.713239848613739,
-      "learning_rate": 0.00028752936106023056,
-      "loss": 3.3167,
+      "epoch": 5.223719676549865,
+      "grad_norm": 0.6720141768455505,
+      "learning_rate": 0.00028699622234214787,
+      "loss": 3.3337,
       "step": 48450
     },
     {
-      "epoch": 5.220105478419977,
-      "grad_norm": 0.6545668244361877,
-      "learning_rate": 0.00028720612003016915,
-      "loss": 3.3313,
+      "epoch": 5.22911051212938,
+      "grad_norm": 0.686879575252533,
+      "learning_rate": 0.0002866724230976794,
+      "loss": 3.3398,
       "step": 48500
     },
     {
-      "epoch": 5.225487030459584,
-      "grad_norm": 0.6693907380104065,
-      "learning_rate": 0.00028688287900010775,
-      "loss": 3.3347,
+      "epoch": 5.234501347708895,
+      "grad_norm": 0.7400215268135071,
+      "learning_rate": 0.00028634862385321097,
+      "loss": 3.3177,
       "step": 48550
     },
     {
-      "epoch": 5.230868582499193,
-      "grad_norm": 0.7082215547561646,
-      "learning_rate": 0.0002865596379700463,
-      "loss": 3.3182,
+      "epoch": 5.2398921832884096,
+      "grad_norm": 0.6599448323249817,
+      "learning_rate": 0.0002860248246087426,
+      "loss": 3.3204,
       "step": 48600
     },
     {
-      "epoch": 5.236250134538801,
-      "grad_norm": 0.6843087673187256,
-      "learning_rate": 0.0002862363969399849,
-      "loss": 3.3212,
+      "epoch": 5.245283018867925,
+      "grad_norm": 0.6918583512306213,
+      "learning_rate": 0.0002857010253642741,
+      "loss": 3.3422,
       "step": 48650
     },
     {
-      "epoch": 5.241631686578409,
-      "grad_norm": 0.6862715482711792,
-      "learning_rate": 0.0002859131559099235,
-      "loss": 3.3275,
+      "epoch": 5.250673854447439,
+      "grad_norm": 0.693386971950531,
+      "learning_rate": 0.0002853772261198057,
+      "loss": 3.3288,
       "step": 48700
     },
     {
-      "epoch": 5.247013238618018,
-      "grad_norm": 0.6855129599571228,
-      "learning_rate": 0.0002855963797004633,
-      "loss": 3.33,
+      "epoch": 5.256064690026954,
+      "grad_norm": 0.7383105158805847,
+      "learning_rate": 0.0002850534268753373,
+      "loss": 3.3309,
       "step": 48750
     },
     {
-      "epoch": 5.252394790657625,
-      "grad_norm": 0.6837905049324036,
-      "learning_rate": 0.0002852731386704019,
-      "loss": 3.3324,
+      "epoch": 5.261455525606469,
+      "grad_norm": 0.6746464371681213,
+      "learning_rate": 0.0002847361036157582,
+      "loss": 3.3401,
       "step": 48800
     },
     {
-      "epoch": 5.257776342697234,
-      "grad_norm": 0.6358972787857056,
-      "learning_rate": 0.0002849498976403405,
-      "loss": 3.3167,
+      "epoch": 5.2668463611859835,
+      "grad_norm": 0.6412659287452698,
+      "learning_rate": 0.0002844123043712898,
+      "loss": 3.3324,
       "step": 48850
     },
     {
-      "epoch": 5.2631578947368425,
-      "grad_norm": 0.6836221218109131,
-      "learning_rate": 0.000284626656610279,
-      "loss": 3.3239,
+      "epoch": 5.272237196765499,
+      "grad_norm": 0.7231650948524475,
+      "learning_rate": 0.00028408850512682133,
+      "loss": 3.3225,
       "step": 48900
     },
     {
-      "epoch": 5.26853944677645,
-      "grad_norm": 0.7453120946884155,
-      "learning_rate": 0.0002843034155802176,
-      "loss": 3.3433,
+      "epoch": 5.277628032345014,
+      "grad_norm": 0.6406319737434387,
+      "learning_rate": 0.00028376470588235294,
+      "loss": 3.3355,
       "step": 48950
     },
     {
-      "epoch": 5.273920998816059,
-      "grad_norm": 0.6284498572349548,
-      "learning_rate": 0.0002839801745501562,
-      "loss": 3.3432,
+      "epoch": 5.283018867924528,
+      "grad_norm": 0.6728077530860901,
+      "learning_rate": 0.0002834409066378845,
+      "loss": 3.3123,
       "step": 49000
     },
     {
-      "epoch": 5.273920998816059,
-      "eval_accuracy": 0.3816899891901242,
-      "eval_loss": 3.4081289768218994,
-      "eval_runtime": 183.9045,
-      "eval_samples_per_second": 97.937,
-      "eval_steps_per_second": 6.123,
+      "epoch": 5.283018867924528,
+      "eval_accuracy": 0.38115900252737484,
+      "eval_loss": 3.409712314605713,
+      "eval_runtime": 183.6676,
+      "eval_samples_per_second": 98.063,
+      "eval_steps_per_second": 6.131,
       "step": 49000
     },
     {
-      "epoch": 5.279302550855666,
-      "grad_norm": 0.6980060338973999,
-      "learning_rate": 0.000283663398340696,
-      "loss": 3.3392,
+      "epoch": 5.288409703504043,
+      "grad_norm": 0.7094541788101196,
+      "learning_rate": 0.00028311710739341604,
+      "loss": 3.3339,
       "step": 49050
     },
     {
-      "epoch": 5.284684102895275,
-      "grad_norm": 0.6563853621482849,
-      "learning_rate": 0.0002833401573106346,
-      "loss": 3.3385,
+      "epoch": 5.293800539083558,
+      "grad_norm": 0.6673089265823364,
+      "learning_rate": 0.00028279330814894764,
+      "loss": 3.3366,
       "step": 49100
     },
     {
-      "epoch": 5.2900656549348835,
-      "grad_norm": 0.7191060781478882,
-      "learning_rate": 0.0002830169162805732,
-      "loss": 3.3377,
+      "epoch": 5.2991913746630726,
+      "grad_norm": 0.7256841659545898,
+      "learning_rate": 0.0002824695089044792,
+      "loss": 3.3322,
       "step": 49150
     },
     {
-      "epoch": 5.295447206974491,
-      "grad_norm": 0.6789143085479736,
-      "learning_rate": 0.0002826936752505118,
-      "loss": 3.3321,
+      "epoch": 5.304582210242588,
+      "grad_norm": 0.7026752233505249,
+      "learning_rate": 0.00028214570966001075,
+      "loss": 3.3275,
       "step": 49200
     },
     {
-      "epoch": 5.3008287590141,
-      "grad_norm": 0.6590706706047058,
-      "learning_rate": 0.00028237043422045034,
-      "loss": 3.343,
+      "epoch": 5.309973045822103,
+      "grad_norm": 0.6675985455513,
+      "learning_rate": 0.00028182191041554235,
+      "loss": 3.309,
       "step": 49250
     },
     {
-      "epoch": 5.306210311053708,
-      "grad_norm": 0.6632896661758423,
-      "learning_rate": 0.00028204719319038893,
-      "loss": 3.314,
+      "epoch": 5.315363881401617,
+      "grad_norm": 0.6676135063171387,
+      "learning_rate": 0.0002814981111710739,
+      "loss": 3.3123,
       "step": 49300
     },
     {
-      "epoch": 5.311591863093316,
-      "grad_norm": 0.7267322540283203,
-      "learning_rate": 0.0002817239521603275,
-      "loss": 3.3426,
+      "epoch": 5.320754716981132,
+      "grad_norm": 0.6714651584625244,
+      "learning_rate": 0.0002811743119266055,
+      "loss": 3.3421,
       "step": 49350
     },
     {
-      "epoch": 5.316973415132924,
-      "grad_norm": 0.6905198693275452,
-      "learning_rate": 0.0002814007111302661,
-      "loss": 3.3185,
+      "epoch": 5.3261455525606465,
+      "grad_norm": 0.6427444815635681,
+      "learning_rate": 0.00028085051268213706,
+      "loss": 3.3131,
       "step": 49400
     },
     {
-      "epoch": 5.322354967172533,
-      "grad_norm": 0.6762885451316833,
-      "learning_rate": 0.0002810774701002047,
-      "loss": 3.3412,
+      "epoch": 5.331536388140162,
+      "grad_norm": 0.6853386163711548,
+      "learning_rate": 0.0002805267134376686,
+      "loss": 3.3307,
       "step": 49450
     },
     {
-      "epoch": 5.327736519212141,
-      "grad_norm": 0.7283837795257568,
-      "learning_rate": 0.00028075422907014325,
-      "loss": 3.3324,
+      "epoch": 5.336927223719677,
+      "grad_norm": 0.7251453995704651,
+      "learning_rate": 0.00028020291419320016,
+      "loss": 3.3406,
       "step": 49500
     },
     {
-      "epoch": 5.333118071251749,
-      "grad_norm": 0.7100688219070435,
-      "learning_rate": 0.00028043098804008185,
-      "loss": 3.3433,
+      "epoch": 5.342318059299191,
+      "grad_norm": 0.6374843120574951,
+      "learning_rate": 0.00027987911494873176,
+      "loss": 3.3277,
       "step": 49550
     },
     {
-      "epoch": 5.338499623291357,
-      "grad_norm": 0.6812211275100708,
-      "learning_rate": 0.00028010774701002044,
-      "loss": 3.3305,
+      "epoch": 5.347708894878706,
+      "grad_norm": 0.6895403861999512,
+      "learning_rate": 0.0002795553157042633,
+      "loss": 3.3241,
       "step": 49600
     },
     {
-      "epoch": 5.343881175330965,
-      "grad_norm": 0.6936942934989929,
-      "learning_rate": 0.00027978450597995904,
-      "loss": 3.3567,
+      "epoch": 5.353099730458221,
+      "grad_norm": 0.6677640676498413,
+      "learning_rate": 0.0002792315164597949,
+      "loss": 3.3268,
       "step": 49650
     },
     {
-      "epoch": 5.349262727370574,
-      "grad_norm": 0.6920641660690308,
-      "learning_rate": 0.0002794612649498976,
-      "loss": 3.3213,
+      "epoch": 5.3584905660377355,
+      "grad_norm": 0.6784765720367432,
+      "learning_rate": 0.00027890771721532647,
+      "loss": 3.3029,
       "step": 49700
     },
     {
-      "epoch": 5.354644279410182,
-      "grad_norm": 0.7323217988014221,
-      "learning_rate": 0.00027913802391983623,
-      "loss": 3.3312,
+      "epoch": 5.363881401617251,
+      "grad_norm": 0.642457902431488,
+      "learning_rate": 0.00027858391797085807,
+      "loss": 3.3357,
       "step": 49750
     },
     {
-      "epoch": 5.36002583144979,
-      "grad_norm": 0.6813651919364929,
-      "learning_rate": 0.00027881478288977477,
-      "loss": 3.3503,
+      "epoch": 5.369272237196766,
+      "grad_norm": 0.6742371320724487,
+      "learning_rate": 0.0002782601187263896,
+      "loss": 3.3265,
       "step": 49800
     },
     {
-      "epoch": 5.365407383489399,
-      "grad_norm": 0.697881281375885,
-      "learning_rate": 0.00027849154185971336,
-      "loss": 3.3306,
+      "epoch": 5.37466307277628,
+      "grad_norm": 0.6685590147972107,
+      "learning_rate": 0.0002779363194819212,
+      "loss": 3.3281,
       "step": 49850
     },
     {
-      "epoch": 5.370788935529006,
-      "grad_norm": 0.73151034116745,
-      "learning_rate": 0.00027816830082965196,
-      "loss": 3.3337,
+      "epoch": 5.380053908355795,
+      "grad_norm": 0.692225992679596,
+      "learning_rate": 0.0002776125202374527,
+      "loss": 3.3313,
       "step": 49900
     },
     {
-      "epoch": 5.376170487568615,
-      "grad_norm": 0.6968307495117188,
-      "learning_rate": 0.00027784505979959055,
-      "loss": 3.3471,
+      "epoch": 5.38544474393531,
+      "grad_norm": 0.686076283454895,
+      "learning_rate": 0.00027728872099298433,
+      "loss": 3.3262,
       "step": 49950
     },
     {
-      "epoch": 5.3815520396082235,
-      "grad_norm": 0.6323778033256531,
-      "learning_rate": 0.00027752181876952915,
-      "loss": 3.3325,
+      "epoch": 5.390835579514825,
+      "grad_norm": 0.6989368200302124,
+      "learning_rate": 0.0002769649217485159,
+      "loss": 3.3394,
       "step": 50000
     },
     {
-      "epoch": 5.3815520396082235,
-      "eval_accuracy": 0.3820180122584361,
-      "eval_loss": 3.4034645557403564,
-      "eval_runtime": 184.1386,
-      "eval_samples_per_second": 97.812,
-      "eval_steps_per_second": 6.115,
+      "epoch": 5.390835579514825,
+      "eval_accuracy": 0.38204593605060444,
+      "eval_loss": 3.4039955139160156,
+      "eval_runtime": 183.4202,
+      "eval_samples_per_second": 98.195,
+      "eval_steps_per_second": 6.139,
       "step": 50000
     },
     {
-      "epoch": 5.386933591647831,
-      "grad_norm": 0.6919659376144409,
-      "learning_rate": 0.0002771985777394677,
-      "loss": 3.3212,
+      "epoch": 5.39622641509434,
+      "grad_norm": 0.6822784543037415,
+      "learning_rate": 0.0002766411225040475,
+      "loss": 3.3313,
       "step": 50050
     },
     {
-      "epoch": 5.39231514368744,
-      "grad_norm": 0.6490268707275391,
-      "learning_rate": 0.0002768753367094063,
-      "loss": 3.3219,
+      "epoch": 5.401617250673855,
+      "grad_norm": 0.6851001381874084,
+      "learning_rate": 0.00027631732325957903,
+      "loss": 3.3418,
       "step": 50100
     },
     {
-      "epoch": 5.397696695727047,
-      "grad_norm": 0.7063648700714111,
-      "learning_rate": 0.0002765520956793449,
-      "loss": 3.3245,
+      "epoch": 5.407008086253369,
+      "grad_norm": 0.6685195565223694,
+      "learning_rate": 0.0002759935240151106,
+      "loss": 3.313,
       "step": 50150
     },
     {
-      "epoch": 5.403078247766656,
-      "grad_norm": 0.7237613201141357,
-      "learning_rate": 0.00027622885464928347,
-      "loss": 3.3175,
+      "epoch": 5.412398921832884,
+      "grad_norm": 0.7843024730682373,
+      "learning_rate": 0.0002756697247706422,
+      "loss": 3.3129,
       "step": 50200
     },
     {
-      "epoch": 5.4084597998062645,
-      "grad_norm": 0.6712755560874939,
-      "learning_rate": 0.000275905613619222,
-      "loss": 3.3331,
+      "epoch": 5.4177897574123985,
+      "grad_norm": 0.7111794352531433,
+      "learning_rate": 0.00027534592552617374,
+      "loss": 3.3371,
       "step": 50250
     },
     {
-      "epoch": 5.413841351845872,
-      "grad_norm": 0.7124558091163635,
-      "learning_rate": 0.00027558237258916066,
-      "loss": 3.3226,
+      "epoch": 5.423180592991914,
+      "grad_norm": 0.6508558988571167,
+      "learning_rate": 0.00027502212628170535,
+      "loss": 3.3195,
       "step": 50300
     },
     {
-      "epoch": 5.419222903885481,
-      "grad_norm": 0.7070554494857788,
-      "learning_rate": 0.0002752591315590992,
-      "loss": 3.3364,
+      "epoch": 5.428571428571429,
+      "grad_norm": 0.658149242401123,
+      "learning_rate": 0.0002746983270372369,
+      "loss": 3.3327,
       "step": 50350
     },
     {
-      "epoch": 5.424604455925088,
-      "grad_norm": 0.7024772763252258,
-      "learning_rate": 0.0002749358905290378,
-      "loss": 3.3117,
+      "epoch": 5.433962264150943,
+      "grad_norm": 0.6759116649627686,
+      "learning_rate": 0.00027437452779276845,
+      "loss": 3.3227,
       "step": 50400
     },
     {
-      "epoch": 5.429986007964697,
-      "grad_norm": 0.6279031038284302,
-      "learning_rate": 0.0002746126494989764,
-      "loss": 3.328,
+      "epoch": 5.439353099730458,
+      "grad_norm": 0.6558474898338318,
+      "learning_rate": 0.00027405072854830005,
+      "loss": 3.3348,
       "step": 50450
     },
     {
-      "epoch": 5.435367560004305,
-      "grad_norm": 0.6562391519546509,
-      "learning_rate": 0.000274289408468915,
-      "loss": 3.3276,
+      "epoch": 5.444743935309973,
+      "grad_norm": 0.6985653042793274,
+      "learning_rate": 0.0002737269293038316,
+      "loss": 3.335,
       "step": 50500
     },
     {
-      "epoch": 5.440749112043913,
-      "grad_norm": 0.7366046905517578,
-      "learning_rate": 0.0002739661674388535,
-      "loss": 3.3354,
+      "epoch": 5.450134770889488,
+      "grad_norm": 0.6962571144104004,
+      "learning_rate": 0.00027340313005936315,
+      "loss": 3.3645,
       "step": 50550
     },
     {
-      "epoch": 5.446130664083522,
-      "grad_norm": 0.6988213062286377,
-      "learning_rate": 0.0002736429264087921,
-      "loss": 3.335,
+      "epoch": 5.455525606469003,
+      "grad_norm": 0.6763041019439697,
+      "learning_rate": 0.00027307933081489476,
+      "loss": 3.3216,
       "step": 50600
     },
     {
-      "epoch": 5.45151221612313,
-      "grad_norm": 0.6984215974807739,
-      "learning_rate": 0.0002733196853787307,
-      "loss": 3.3239,
+      "epoch": 5.460916442048518,
+      "grad_norm": 0.6788709759712219,
+      "learning_rate": 0.00027276200755531565,
+      "loss": 3.3401,
       "step": 50650
     },
     {
-      "epoch": 5.456893768162738,
-      "grad_norm": 0.6875210404396057,
-      "learning_rate": 0.0002729964443486693,
-      "loss": 3.3494,
+      "epoch": 5.466307277628032,
+      "grad_norm": 0.6613984107971191,
+      "learning_rate": 0.00027243820831084726,
+      "loss": 3.3386,
       "step": 50700
     },
     {
-      "epoch": 5.462275320202346,
-      "grad_norm": 0.7105006575584412,
-      "learning_rate": 0.0002726732033186079,
-      "loss": 3.3254,
+      "epoch": 5.471698113207547,
+      "grad_norm": 0.7615039348602295,
+      "learning_rate": 0.0002721144090663788,
+      "loss": 3.3294,
       "step": 50750
     },
     {
-      "epoch": 5.467656872241955,
-      "grad_norm": 0.772357702255249,
-      "learning_rate": 0.00027234996228854644,
-      "loss": 3.3215,
+      "epoch": 5.4770889487870615,
+      "grad_norm": 0.6573368906974792,
+      "learning_rate": 0.00027179060982191036,
+      "loss": 3.3422,
       "step": 50800
     },
     {
-      "epoch": 5.473038424281563,
-      "grad_norm": 0.7547757625579834,
-      "learning_rate": 0.0002720267212584851,
-      "loss": 3.3237,
+      "epoch": 5.482479784366577,
+      "grad_norm": 0.6838216781616211,
+      "learning_rate": 0.00027146681057744197,
+      "loss": 3.3442,
       "step": 50850
     },
     {
-      "epoch": 5.478419976321171,
-      "grad_norm": 0.6670495271682739,
-      "learning_rate": 0.00027170348022842363,
-      "loss": 3.3262,
+      "epoch": 5.487870619946092,
+      "grad_norm": 0.6906365156173706,
+      "learning_rate": 0.0002711430113329735,
+      "loss": 3.3355,
       "step": 50900
     },
     {
-      "epoch": 5.483801528360779,
-      "grad_norm": 0.6717911958694458,
-      "learning_rate": 0.00027138023919836223,
-      "loss": 3.3375,
+      "epoch": 5.493261455525606,
+      "grad_norm": 0.6942813992500305,
+      "learning_rate": 0.0002708192120885051,
+      "loss": 3.3432,
       "step": 50950
     },
     {
-      "epoch": 5.489183080400387,
-      "grad_norm": 0.6847409009933472,
-      "learning_rate": 0.0002710569981683008,
-      "loss": 3.3373,
+      "epoch": 5.498652291105121,
+      "grad_norm": 0.6663623452186584,
+      "learning_rate": 0.00027049541284403667,
+      "loss": 3.3221,
       "step": 51000
     },
     {
-      "epoch": 5.489183080400387,
-      "eval_accuracy": 0.38237319854870166,
-      "eval_loss": 3.399523973464966,
-      "eval_runtime": 184.134,
-      "eval_samples_per_second": 97.815,
-      "eval_steps_per_second": 6.115,
+      "epoch": 5.498652291105121,
+      "eval_accuracy": 0.38235624869820256,
+      "eval_loss": 3.3973066806793213,
+      "eval_runtime": 183.6193,
+      "eval_samples_per_second": 98.089,
+      "eval_steps_per_second": 6.132,
       "step": 51000
     },
     {
-      "epoch": 5.494564632439996,
-      "grad_norm": 0.7280586957931519,
-      "learning_rate": 0.0002707337571382394,
-      "loss": 3.3331,
+      "epoch": 5.504043126684636,
+      "grad_norm": 0.7017855644226074,
+      "learning_rate": 0.0002701716135995683,
+      "loss": 3.341,
       "step": 51050
     },
     {
-      "epoch": 5.499946184479604,
-      "grad_norm": 0.7104876637458801,
-      "learning_rate": 0.00027041051610817796,
-      "loss": 3.346,
+      "epoch": 5.509433962264151,
+      "grad_norm": 0.6982112526893616,
+      "learning_rate": 0.0002698478143550998,
+      "loss": 3.3139,
       "step": 51100
     },
     {
-      "epoch": 5.505327736519212,
-      "grad_norm": 0.692229688167572,
-      "learning_rate": 0.00027008727507811655,
-      "loss": 3.3201,
+      "epoch": 5.514824797843666,
+      "grad_norm": 0.7699517011642456,
+      "learning_rate": 0.0002695240151106314,
+      "loss": 3.3275,
       "step": 51150
     },
     {
-      "epoch": 5.510709288558821,
-      "grad_norm": 0.658868134021759,
-      "learning_rate": 0.00026976403404805515,
-      "loss": 3.3207,
+      "epoch": 5.520215633423181,
+      "grad_norm": 0.703709065914154,
+      "learning_rate": 0.00026920021586616293,
+      "loss": 3.3352,
       "step": 51200
     },
     {
-      "epoch": 5.516090840598428,
-      "grad_norm": 0.698153555393219,
-      "learning_rate": 0.00026944079301799374,
-      "loss": 3.3452,
+      "epoch": 5.525606469002695,
+      "grad_norm": 0.6830039620399475,
+      "learning_rate": 0.00026887641662169453,
+      "loss": 3.3385,
       "step": 51250
     },
     {
-      "epoch": 5.521472392638037,
-      "grad_norm": 0.7251303791999817,
-      "learning_rate": 0.00026911755198793234,
-      "loss": 3.3477,
+      "epoch": 5.53099730458221,
+      "grad_norm": 0.7063279151916504,
+      "learning_rate": 0.0002685526173772261,
+      "loss": 3.3502,
       "step": 51300
     },
     {
-      "epoch": 5.5268539446776455,
-      "grad_norm": 0.6895780563354492,
-      "learning_rate": 0.0002687943109578709,
-      "loss": 3.3188,
+      "epoch": 5.536388140161725,
+      "grad_norm": 0.675649881362915,
+      "learning_rate": 0.0002682288181327577,
+      "loss": 3.3169,
       "step": 51350
     },
     {
-      "epoch": 5.532235496717253,
-      "grad_norm": 0.7204925417900085,
-      "learning_rate": 0.00026847106992780947,
-      "loss": 3.3279,
+      "epoch": 5.54177897574124,
+      "grad_norm": 0.8147872686386108,
+      "learning_rate": 0.00026790501888828924,
+      "loss": 3.338,
       "step": 51400
     },
     {
-      "epoch": 5.537617048756862,
-      "grad_norm": 0.7200186252593994,
-      "learning_rate": 0.00026814782889774807,
-      "loss": 3.3376,
+      "epoch": 5.547169811320755,
+      "grad_norm": 0.7457138895988464,
+      "learning_rate": 0.00026758121964382084,
+      "loss": 3.3357,
       "step": 51450
     },
     {
-      "epoch": 5.542998600796469,
-      "grad_norm": 0.7235830426216125,
-      "learning_rate": 0.00026782458786768666,
-      "loss": 3.3367,
+      "epoch": 5.55256064690027,
+      "grad_norm": 0.6662501692771912,
+      "learning_rate": 0.00026725742039935234,
+      "loss": 3.3231,
       "step": 51500
     },
     {
-      "epoch": 5.548380152836078,
-      "grad_norm": 0.6808749437332153,
-      "learning_rate": 0.0002675013468376252,
-      "loss": 3.3245,
+      "epoch": 5.557951482479784,
+      "grad_norm": 0.6656699776649475,
+      "learning_rate": 0.00026693362115488394,
+      "loss": 3.3468,
       "step": 51550
     },
     {
-      "epoch": 5.553761704875686,
-      "grad_norm": 0.7232927680015564,
-      "learning_rate": 0.00026717810580756385,
-      "loss": 3.3472,
+      "epoch": 5.563342318059299,
+      "grad_norm": 0.6908094882965088,
+      "learning_rate": 0.0002666098219104155,
+      "loss": 3.3343,
       "step": 51600
     },
     {
-      "epoch": 5.559143256915294,
-      "grad_norm": 0.7236559987068176,
-      "learning_rate": 0.0002668548647775024,
-      "loss": 3.3412,
+      "epoch": 5.568733153638814,
+      "grad_norm": 0.719346284866333,
+      "learning_rate": 0.0002662860226659471,
+      "loss": 3.3265,
       "step": 51650
     },
     {
-      "epoch": 5.564524808954903,
-      "grad_norm": 0.7104244232177734,
-      "learning_rate": 0.000266531623747441,
-      "loss": 3.3445,
+      "epoch": 5.574123989218329,
+      "grad_norm": 0.657321572303772,
+      "learning_rate": 0.00026596222342147865,
+      "loss": 3.3462,
       "step": 51700
     },
     {
-      "epoch": 5.569906360994511,
-      "grad_norm": 0.7107536792755127,
-      "learning_rate": 0.0002662083827173796,
-      "loss": 3.3265,
+      "epoch": 5.579514824797844,
+      "grad_norm": 0.6842498183250427,
+      "learning_rate": 0.00026563842417701026,
+      "loss": 3.3369,
       "step": 51750
     },
     {
-      "epoch": 5.575287913034119,
-      "grad_norm": 0.7243977189064026,
-      "learning_rate": 0.0002658851416873182,
-      "loss": 3.3212,
+      "epoch": 5.584905660377358,
+      "grad_norm": 0.7126151323318481,
+      "learning_rate": 0.0002653146249325418,
+      "loss": 3.3398,
       "step": 51800
     },
     {
-      "epoch": 5.580669465073727,
-      "grad_norm": 0.8111730217933655,
-      "learning_rate": 0.00026556190065725677,
-      "loss": 3.3283,
+      "epoch": 5.590296495956873,
+      "grad_norm": 0.6378708481788635,
+      "learning_rate": 0.00026499082568807336,
+      "loss": 3.3478,
       "step": 51850
     },
     {
-      "epoch": 5.586051017113336,
-      "grad_norm": 0.7091906070709229,
-      "learning_rate": 0.0002652386596271953,
-      "loss": 3.3271,
+      "epoch": 5.595687331536388,
+      "grad_norm": 0.7025904655456543,
+      "learning_rate": 0.00026466702644360496,
+      "loss": 3.3434,
       "step": 51900
     },
     {
-      "epoch": 5.591432569152944,
-      "grad_norm": 0.7432578802108765,
-      "learning_rate": 0.0002649154185971339,
-      "loss": 3.3401,
+      "epoch": 5.601078167115903,
+      "grad_norm": 0.6371275782585144,
+      "learning_rate": 0.0002643432271991365,
+      "loss": 3.3528,
       "step": 51950
     },
     {
-      "epoch": 5.596814121192552,
-      "grad_norm": 0.7412343621253967,
-      "learning_rate": 0.0002645921775670725,
-      "loss": 3.3506,
+      "epoch": 5.606469002695418,
+      "grad_norm": 0.714966356754303,
+      "learning_rate": 0.00026401942795466806,
+      "loss": 3.3428,
       "step": 52000
     },
     {
-      "epoch": 5.596814121192552,
-      "eval_accuracy": 0.38275261443294983,
-      "eval_loss": 3.394073486328125,
-      "eval_runtime": 183.9682,
-      "eval_samples_per_second": 97.903,
-      "eval_steps_per_second": 6.121,
+      "epoch": 5.606469002695418,
+      "eval_accuracy": 0.3825655141601333,
+      "eval_loss": 3.3958473205566406,
+      "eval_runtime": 183.6356,
+      "eval_samples_per_second": 98.08,
+      "eval_steps_per_second": 6.132,
       "step": 52000
     },
     {
-      "epoch": 5.60219567323216,
-      "grad_norm": 0.6942436695098877,
-      "learning_rate": 0.0002642689365370111,
-      "loss": 3.3277,
+      "epoch": 5.611859838274933,
+      "grad_norm": 0.8294630646705627,
+      "learning_rate": 0.00026369562871019967,
+      "loss": 3.3444,
       "step": 52050
     },
     {
-      "epoch": 5.607577225271768,
-      "grad_norm": 0.7447067499160767,
-      "learning_rate": 0.00026394569550694963,
-      "loss": 3.3234,
+      "epoch": 5.617250673854447,
+      "grad_norm": 0.6631956100463867,
+      "learning_rate": 0.0002633718294657312,
+      "loss": 3.3405,
       "step": 52100
     },
     {
-      "epoch": 5.612958777311377,
-      "grad_norm": 0.7501024603843689,
-      "learning_rate": 0.00026362245447688823,
-      "loss": 3.3183,
+      "epoch": 5.622641509433962,
+      "grad_norm": 0.689293622970581,
+      "learning_rate": 0.0002630480302212628,
+      "loss": 3.3411,
       "step": 52150
     },
     {
-      "epoch": 5.618340329350985,
-      "grad_norm": 0.6899356842041016,
-      "learning_rate": 0.0002632992134468268,
-      "loss": 3.3369,
+      "epoch": 5.628032345013477,
+      "grad_norm": 0.7070659399032593,
+      "learning_rate": 0.00026272423097679437,
+      "loss": 3.3278,
       "step": 52200
     },
     {
-      "epoch": 5.623721881390593,
-      "grad_norm": 0.7304120659828186,
-      "learning_rate": 0.0002629759724167654,
-      "loss": 3.329,
+      "epoch": 5.633423180592992,
+      "grad_norm": 0.7154769897460938,
+      "learning_rate": 0.0002624004317323259,
+      "loss": 3.3291,
       "step": 52250
     },
     {
-      "epoch": 5.629103433430201,
-      "grad_norm": 0.6875249147415161,
-      "learning_rate": 0.000262652731386704,
-      "loss": 3.3431,
+      "epoch": 5.638814016172507,
+      "grad_norm": 0.6829545497894287,
+      "learning_rate": 0.00026207663248785753,
+      "loss": 3.3489,
       "step": 52300
     },
     {
-      "epoch": 5.634484985469809,
-      "grad_norm": 0.770589292049408,
-      "learning_rate": 0.00026232949035664255,
-      "loss": 3.3448,
+      "epoch": 5.644204851752022,
+      "grad_norm": 0.7409862875938416,
+      "learning_rate": 0.0002617528332433891,
+      "loss": 3.3226,
       "step": 52350
     },
     {
-      "epoch": 5.639866537509418,
-      "grad_norm": 0.6867998838424683,
-      "learning_rate": 0.00026200624932658115,
-      "loss": 3.3254,
+      "epoch": 5.649595687331536,
+      "grad_norm": 0.6577402949333191,
+      "learning_rate": 0.00026142903399892063,
+      "loss": 3.3248,
       "step": 52400
     },
     {
-      "epoch": 5.645248089549026,
-      "grad_norm": 0.701833188533783,
-      "learning_rate": 0.00026168300829651974,
-      "loss": 3.3515,
+      "epoch": 5.654986522911051,
+      "grad_norm": 0.7112260460853577,
+      "learning_rate": 0.00026110523475445223,
+      "loss": 3.3286,
       "step": 52450
     },
     {
-      "epoch": 5.650629641588634,
-      "grad_norm": 0.6975635290145874,
-      "learning_rate": 0.00026135976726645834,
-      "loss": 3.3351,
+      "epoch": 5.660377358490566,
+      "grad_norm": 0.6959606409072876,
+      "learning_rate": 0.0002607814355099838,
+      "loss": 3.3431,
       "step": 52500
     },
     {
-      "epoch": 5.656011193628243,
-      "grad_norm": 0.7372641563415527,
-      "learning_rate": 0.0002610365262363969,
-      "loss": 3.329,
+      "epoch": 5.665768194070081,
+      "grad_norm": 0.7187190055847168,
+      "learning_rate": 0.00026045763626551534,
+      "loss": 3.3478,
       "step": 52550
     },
     {
-      "epoch": 5.66139274566785,
-      "grad_norm": 0.7018235921859741,
-      "learning_rate": 0.00026071328520633553,
-      "loss": 3.3242,
+      "epoch": 5.671159029649596,
+      "grad_norm": 0.7454319000244141,
+      "learning_rate": 0.00026013383702104694,
+      "loss": 3.3464,
       "step": 52600
     },
     {
-      "epoch": 5.666774297707459,
-      "grad_norm": 0.7098267674446106,
-      "learning_rate": 0.00026039004417627407,
-      "loss": 3.3422,
+      "epoch": 5.67654986522911,
+      "grad_norm": 0.6750582456588745,
+      "learning_rate": 0.0002598100377765785,
+      "loss": 3.3393,
       "step": 52650
     },
     {
-      "epoch": 5.672155849747067,
-      "grad_norm": 0.7544245719909668,
-      "learning_rate": 0.00026006680314621266,
-      "loss": 3.349,
+      "epoch": 5.681940700808625,
+      "grad_norm": 0.7068287134170532,
+      "learning_rate": 0.0002594862385321101,
+      "loss": 3.3378,
       "step": 52700
     },
     {
-      "epoch": 5.677537401786675,
-      "grad_norm": 0.7348368167877197,
-      "learning_rate": 0.00025974356211615126,
-      "loss": 3.33,
+      "epoch": 5.6873315363881405,
+      "grad_norm": 0.687258780002594,
+      "learning_rate": 0.00025916243928764165,
+      "loss": 3.335,
       "step": 52750
     },
     {
-      "epoch": 5.682918953826284,
-      "grad_norm": 0.6879133582115173,
-      "learning_rate": 0.00025942032108608985,
-      "loss": 3.3439,
+      "epoch": 5.692722371967655,
+      "grad_norm": 0.6578938364982605,
+      "learning_rate": 0.0002588386400431732,
+      "loss": 3.3315,
       "step": 52800
     },
     {
-      "epoch": 5.688300505865891,
-      "grad_norm": 0.7134944796562195,
-      "learning_rate": 0.00025909708005602845,
-      "loss": 3.3526,
+      "epoch": 5.69811320754717,
+      "grad_norm": 0.6887868642807007,
+      "learning_rate": 0.00025851484079870475,
+      "loss": 3.333,
       "step": 52850
     },
     {
-      "epoch": 5.6936820579055,
-      "grad_norm": 0.7157636880874634,
-      "learning_rate": 0.000258773839025967,
-      "loss": 3.3244,
+      "epoch": 5.703504043126685,
+      "grad_norm": 0.6832111477851868,
+      "learning_rate": 0.00025819104155423635,
+      "loss": 3.328,
       "step": 52900
     },
     {
-      "epoch": 5.699063609945108,
-      "grad_norm": 0.6918929815292358,
-      "learning_rate": 0.0002584505979959056,
-      "loss": 3.3486,
+      "epoch": 5.708894878706199,
+      "grad_norm": 0.6999159455299377,
+      "learning_rate": 0.0002578672423097679,
+      "loss": 3.3221,
       "step": 52950
     },
     {
-      "epoch": 5.704445161984716,
-      "grad_norm": 0.7037851214408875,
-      "learning_rate": 0.0002581273569658442,
-      "loss": 3.323,
+      "epoch": 5.714285714285714,
+      "grad_norm": 0.6764535307884216,
+      "learning_rate": 0.0002575434430652995,
+      "loss": 3.35,
       "step": 53000
     },
     {
-      "epoch": 5.704445161984716,
-      "eval_accuracy": 0.3833323862423279,
-      "eval_loss": 3.3892478942871094,
-      "eval_runtime": 184.0857,
-      "eval_samples_per_second": 97.84,
-      "eval_steps_per_second": 6.117,
+      "epoch": 5.714285714285714,
+      "eval_accuracy": 0.3829353685902538,
+      "eval_loss": 3.392476797103882,
+      "eval_runtime": 183.5207,
+      "eval_samples_per_second": 98.142,
+      "eval_steps_per_second": 6.136,
       "step": 53000
     },
     {
-      "epoch": 5.709826714024325,
-      "grad_norm": 0.730811595916748,
-      "learning_rate": 0.00025780411593578277,
-      "loss": 3.3242,
+      "epoch": 5.719676549865229,
+      "grad_norm": 0.7244840860366821,
+      "learning_rate": 0.00025721964382083106,
+      "loss": 3.3289,
       "step": 53050
     },
     {
-      "epoch": 5.715208266063933,
-      "grad_norm": 0.6601924300193787,
-      "learning_rate": 0.0002574873397263226,
-      "loss": 3.3345,
+      "epoch": 5.725067385444744,
+      "grad_norm": 0.6984117031097412,
+      "learning_rate": 0.00025689584457636266,
+      "loss": 3.3212,
       "step": 53100
     },
     {
-      "epoch": 5.720589818103541,
-      "grad_norm": 0.7023354768753052,
-      "learning_rate": 0.00025716409869626117,
-      "loss": 3.3334,
+      "epoch": 5.730458221024259,
+      "grad_norm": 0.6968520879745483,
+      "learning_rate": 0.0002565720453318942,
+      "loss": 3.3479,
       "step": 53150
     },
     {
-      "epoch": 5.725971370143149,
-      "grad_norm": 0.7079684734344482,
-      "learning_rate": 0.0002568408576661997,
-      "loss": 3.3024,
+      "epoch": 5.735849056603773,
+      "grad_norm": 0.6894665956497192,
+      "learning_rate": 0.00025624824608742576,
+      "loss": 3.3145,
       "step": 53200
     },
     {
-      "epoch": 5.731352922182758,
-      "grad_norm": 0.7269954681396484,
-      "learning_rate": 0.00025651761663613836,
-      "loss": 3.3144,
+      "epoch": 5.741239892183288,
+      "grad_norm": 0.7057806849479675,
+      "learning_rate": 0.0002559244468429573,
+      "loss": 3.3325,
       "step": 53250
     },
     {
-      "epoch": 5.736734474222366,
-      "grad_norm": 0.6841776967048645,
-      "learning_rate": 0.0002561943756060769,
-      "loss": 3.327,
+      "epoch": 5.7466307277628035,
+      "grad_norm": 0.6735665798187256,
+      "learning_rate": 0.0002556006475984889,
+      "loss": 3.3367,
       "step": 53300
     },
     {
-      "epoch": 5.742116026261974,
-      "grad_norm": Infinity,
-      "learning_rate": 0.0002558775993966167,
-      "loss": 3.359,
+      "epoch": 5.752021563342318,
+      "grad_norm": 0.6803260445594788,
+      "learning_rate": 0.00025527684835402047,
+      "loss": 3.3264,
       "step": 53350
     },
     {
-      "epoch": 5.747497578301582,
-      "grad_norm": 0.6885505318641663,
-      "learning_rate": 0.0002555543583665553,
-      "loss": 3.3354,
+      "epoch": 5.757412398921833,
+      "grad_norm": 0.6724193692207336,
+      "learning_rate": 0.0002549530491095521,
+      "loss": 3.3538,
       "step": 53400
     },
     {
-      "epoch": 5.75287913034119,
-      "grad_norm": 0.6598957180976868,
-      "learning_rate": 0.0002552311173364939,
-      "loss": 3.3288,
+      "epoch": 5.762803234501348,
+      "grad_norm": 0.6589038372039795,
+      "learning_rate": 0.0002546292498650836,
+      "loss": 3.3275,
       "step": 53450
     },
     {
-      "epoch": 5.758260682380799,
-      "grad_norm": 0.7532528638839722,
-      "learning_rate": 0.0002549078763064325,
-      "loss": 3.324,
+      "epoch": 5.768194070080862,
+      "grad_norm": 0.6979022026062012,
+      "learning_rate": 0.00025430545062061523,
+      "loss": 3.3243,
       "step": 53500
     },
     {
-      "epoch": 5.763642234420407,
-      "grad_norm": 0.7090945839881897,
-      "learning_rate": 0.00025458463527637103,
-      "loss": 3.3497,
+      "epoch": 5.773584905660377,
+      "grad_norm": 0.7001596093177795,
+      "learning_rate": 0.0002539816513761468,
+      "loss": 3.3527,
       "step": 53550
     },
     {
-      "epoch": 5.769023786460015,
-      "grad_norm": 0.7386473417282104,
-      "learning_rate": 0.0002542613942463097,
-      "loss": 3.3253,
+      "epoch": 5.7789757412398925,
+      "grad_norm": 0.6990877389907837,
+      "learning_rate": 0.00025365785213167833,
+      "loss": 3.3337,
       "step": 53600
     },
     {
-      "epoch": 5.774405338499624,
-      "grad_norm": 0.6926794052124023,
-      "learning_rate": 0.0002539381532162482,
-      "loss": 3.3503,
+      "epoch": 5.784366576819407,
+      "grad_norm": 0.716033935546875,
+      "learning_rate": 0.0002533340528872099,
+      "loss": 3.3509,
       "step": 53650
     },
     {
-      "epoch": 5.779786890539231,
-      "grad_norm": 0.6957927942276001,
-      "learning_rate": 0.0002536149121861868,
-      "loss": 3.3296,
+      "epoch": 5.789757412398922,
+      "grad_norm": 0.7423722147941589,
+      "learning_rate": 0.0002530102536427415,
+      "loss": 3.3522,
       "step": 53700
     },
     {
-      "epoch": 5.78516844257884,
-      "grad_norm": 0.6737018823623657,
-      "learning_rate": 0.0002532916711561254,
-      "loss": 3.3303,
+      "epoch": 5.795148247978437,
+      "grad_norm": 0.681389331817627,
+      "learning_rate": 0.00025268645439827304,
+      "loss": 3.3322,
       "step": 53750
     },
     {
-      "epoch": 5.790549994618448,
-      "grad_norm": 0.6735786199569702,
-      "learning_rate": 0.000252968430126064,
-      "loss": 3.3267,
+      "epoch": 5.800539083557951,
+      "grad_norm": 0.728777289390564,
+      "learning_rate": 0.00025236265515380464,
+      "loss": 3.3393,
       "step": 53800
     },
     {
-      "epoch": 5.795931546658056,
-      "grad_norm": 0.7254409790039062,
-      "learning_rate": 0.00025264518909600255,
-      "loss": 3.317,
+      "epoch": 5.8059299191374665,
+      "grad_norm": 0.6941030621528625,
+      "learning_rate": 0.0002520388559093362,
+      "loss": 3.3219,
       "step": 53850
     },
     {
-      "epoch": 5.801313098697665,
-      "grad_norm": 0.7492697834968567,
-      "learning_rate": 0.00025232194806594114,
-      "loss": 3.3346,
+      "epoch": 5.811320754716981,
+      "grad_norm": 0.6763263940811157,
+      "learning_rate": 0.00025171505666486774,
+      "loss": 3.3551,
       "step": 53900
     },
     {
-      "epoch": 5.806694650737272,
-      "grad_norm": 0.7044953107833862,
-      "learning_rate": 0.00025199870703587974,
-      "loss": 3.3272,
+      "epoch": 5.816711590296496,
+      "grad_norm": 0.6964560151100159,
+      "learning_rate": 0.00025139125742039935,
+      "loss": 3.3294,
       "step": 53950
     },
     {
-      "epoch": 5.812076202776881,
-      "grad_norm": 0.7152127623558044,
-      "learning_rate": 0.00025167546600581833,
-      "loss": 3.3388,
+      "epoch": 5.822102425876011,
+      "grad_norm": 0.6470865607261658,
+      "learning_rate": 0.0002510674581759309,
+      "loss": 3.3319,
       "step": 54000
     },
     {
-      "epoch": 5.812076202776881,
-      "eval_accuracy": 0.3838037224696671,
-      "eval_loss": 3.383706569671631,
-      "eval_runtime": 184.3385,
-      "eval_samples_per_second": 97.706,
-      "eval_steps_per_second": 6.108,
+      "epoch": 5.822102425876011,
+      "eval_accuracy": 0.38318559619089054,
+      "eval_loss": 3.387439727783203,
+      "eval_runtime": 185.3555,
+      "eval_samples_per_second": 97.17,
+      "eval_steps_per_second": 6.075,
       "step": 54000
     },
     {
-      "epoch": 5.817457754816489,
-      "grad_norm": 0.6760851740837097,
-      "learning_rate": 0.0002513522249757569,
-      "loss": 3.3252,
+      "epoch": 5.827493261455525,
+      "grad_norm": 0.7239425182342529,
+      "learning_rate": 0.0002507436589314625,
+      "loss": 3.3272,
       "step": 54050
     },
     {
-      "epoch": 5.822839306856097,
-      "grad_norm": 0.7422081232070923,
-      "learning_rate": 0.00025102898394569547,
-      "loss": 3.3384,
+      "epoch": 5.83288409703504,
+      "grad_norm": 0.6678457856178284,
+      "learning_rate": 0.00025041985968699405,
+      "loss": 3.3339,
       "step": 54100
     },
     {
-      "epoch": 5.828220858895706,
-      "grad_norm": 0.7860513925552368,
-      "learning_rate": 0.00025070574291563406,
-      "loss": 3.3348,
+      "epoch": 5.8382749326145555,
+      "grad_norm": 0.6725283265113831,
+      "learning_rate": 0.0002500960604425256,
+      "loss": 3.3496,
       "step": 54150
     },
     {
-      "epoch": 5.833602410935313,
-      "grad_norm": 0.705047070980072,
-      "learning_rate": 0.00025038250188557265,
-      "loss": 3.3267,
+      "epoch": 5.84366576819407,
+      "grad_norm": 0.7176863551139832,
+      "learning_rate": 0.00024977226119805715,
+      "loss": 3.3397,
       "step": 54200
     },
     {
-      "epoch": 5.838983962974922,
-      "grad_norm": 0.7171577215194702,
-      "learning_rate": 0.00025005926085551125,
-      "loss": 3.3154,
+      "epoch": 5.849056603773585,
+      "grad_norm": 0.6935498714447021,
+      "learning_rate": 0.00024944846195358876,
+      "loss": 3.3478,
       "step": 54250
     },
     {
-      "epoch": 5.84436551501453,
-      "grad_norm": 0.7100509405136108,
-      "learning_rate": 0.0002497360198254498,
-      "loss": 3.3315,
+      "epoch": 5.8544474393531,
+      "grad_norm": 0.7223532795906067,
+      "learning_rate": 0.0002491246627091203,
+      "loss": 3.3461,
       "step": 54300
     },
     {
-      "epoch": 5.849747067054138,
-      "grad_norm": 0.7151844501495361,
-      "learning_rate": 0.00024941277879538844,
-      "loss": 3.336,
+      "epoch": 5.859838274932614,
+      "grad_norm": 0.673150897026062,
+      "learning_rate": 0.0002488008634646519,
+      "loss": 3.3365,
       "step": 54350
     },
     {
-      "epoch": 5.855128619093747,
-      "grad_norm": 0.7535050511360168,
-      "learning_rate": 0.000249089537765327,
-      "loss": 3.3343,
+      "epoch": 5.8652291105121295,
+      "grad_norm": 0.6913243532180786,
+      "learning_rate": 0.00024847706422018346,
+      "loss": 3.3225,
       "step": 54400
     },
     {
-      "epoch": 5.860510171133355,
-      "grad_norm": 0.6949968934059143,
-      "learning_rate": 0.0002487662967352656,
-      "loss": 3.3246,
+      "epoch": 5.870619946091644,
+      "grad_norm": 0.6828725934028625,
+      "learning_rate": 0.00024815326497571507,
+      "loss": 3.318,
       "step": 54450
     },
     {
-      "epoch": 5.865891723172963,
-      "grad_norm": 0.7528766989707947,
-      "learning_rate": 0.00024844305570520417,
-      "loss": 3.3319,
+      "epoch": 5.876010781671159,
+      "grad_norm": 0.7479648590087891,
+      "learning_rate": 0.00024782946573124657,
+      "loss": 3.332,
       "step": 54500
     },
     {
-      "epoch": 5.871273275212571,
-      "grad_norm": 0.6865386962890625,
-      "learning_rate": 0.00024811981467514276,
-      "loss": 3.3407,
+      "epoch": 5.881401617250674,
+      "grad_norm": 0.7425312399864197,
+      "learning_rate": 0.00024750566648677817,
+      "loss": 3.3284,
       "step": 54550
     },
     {
-      "epoch": 5.87665482725218,
-      "grad_norm": 0.721444308757782,
-      "learning_rate": 0.00024779657364508136,
-      "loss": 3.3203,
+      "epoch": 5.886792452830189,
+      "grad_norm": 0.6899173855781555,
+      "learning_rate": 0.0002471818672423097,
+      "loss": 3.3596,
       "step": 54600
     },
     {
-      "epoch": 5.882036379291788,
-      "grad_norm": 0.7100372910499573,
-      "learning_rate": 0.0002474733326150199,
-      "loss": 3.3361,
+      "epoch": 5.892183288409703,
+      "grad_norm": 0.7049915194511414,
+      "learning_rate": 0.0002468645439827307,
+      "loss": 3.3453,
       "step": 54650
     },
     {
-      "epoch": 5.887417931331396,
-      "grad_norm": 0.6949239373207092,
-      "learning_rate": 0.0002471500915849585,
-      "loss": 3.3499,
+      "epoch": 5.8975741239892185,
+      "grad_norm": 0.7107455730438232,
+      "learning_rate": 0.0002465407447382623,
+      "loss": 3.3435,
       "step": 54700
     },
     {
-      "epoch": 5.892799483371004,
-      "grad_norm": 0.7012357115745544,
-      "learning_rate": 0.0002468268505548971,
-      "loss": 3.3406,
+      "epoch": 5.902964959568733,
+      "grad_norm": 0.7119571566581726,
+      "learning_rate": 0.00024621694549379383,
+      "loss": 3.3442,
       "step": 54750
     },
     {
-      "epoch": 5.898181035410612,
-      "grad_norm": 0.6897000074386597,
-      "learning_rate": 0.0002465036095248357,
-      "loss": 3.3447,
+      "epoch": 5.908355795148248,
+      "grad_norm": 0.6925951838493347,
+      "learning_rate": 0.00024589314624932543,
+      "loss": 3.3476,
       "step": 54800
     },
     {
-      "epoch": 5.903562587450221,
-      "grad_norm": 0.726473867893219,
-      "learning_rate": 0.0002461803684947742,
-      "loss": 3.3388,
+      "epoch": 5.913746630727763,
+      "grad_norm": 0.6913626790046692,
+      "learning_rate": 0.00024556934700485693,
+      "loss": 3.3439,
       "step": 54850
     },
     {
-      "epoch": 5.9089441394898286,
-      "grad_norm": 0.7095021605491638,
-      "learning_rate": 0.0002458571274647128,
-      "loss": 3.3416,
+      "epoch": 5.919137466307277,
+      "grad_norm": 0.7057418823242188,
+      "learning_rate": 0.00024524554776038853,
+      "loss": 3.3232,
       "step": 54900
     },
     {
-      "epoch": 5.914325691529437,
-      "grad_norm": 0.7249286770820618,
-      "learning_rate": 0.0002455338864346514,
-      "loss": 3.3427,
+      "epoch": 5.9245283018867925,
+      "grad_norm": 0.7266544699668884,
+      "learning_rate": 0.0002449217485159201,
+      "loss": 3.3287,
       "step": 54950
     },
     {
-      "epoch": 5.919707243569046,
-      "grad_norm": 0.7552708983421326,
-      "learning_rate": 0.00024521064540459,
-      "loss": 3.3396,
+      "epoch": 5.929919137466308,
+      "grad_norm": 0.661906898021698,
+      "learning_rate": 0.0002445979492714517,
+      "loss": 3.3318,
       "step": 55000
     },
     {
-      "epoch": 5.919707243569046,
-      "eval_accuracy": 0.38397126522267705,
-      "eval_loss": 3.3809568881988525,
-      "eval_runtime": 184.0462,
-      "eval_samples_per_second": 97.861,
-      "eval_steps_per_second": 6.118,
+      "epoch": 5.929919137466308,
+      "eval_accuracy": 0.3839206329769555,
+      "eval_loss": 3.3820478916168213,
+      "eval_runtime": 185.4571,
+      "eval_samples_per_second": 97.117,
+      "eval_steps_per_second": 6.071,
       "step": 55000
     },
     {
-      "epoch": 5.925088795608653,
-      "grad_norm": 0.7218322157859802,
-      "learning_rate": 0.0002448874043745286,
-      "loss": 3.3392,
+      "epoch": 5.935309973045822,
+      "grad_norm": 0.6756860017776489,
+      "learning_rate": 0.00024427415002698324,
+      "loss": 3.3462,
       "step": 55050
     },
     {
-      "epoch": 5.930470347648262,
-      "grad_norm": 0.7611395120620728,
-      "learning_rate": 0.00024456416334446714,
-      "loss": 3.3307,
+      "epoch": 5.940700808625337,
+      "grad_norm": 0.7302395105361938,
+      "learning_rate": 0.00024395035078251482,
+      "loss": 3.3318,
       "step": 55100
     },
     {
-      "epoch": 5.93585189968787,
-      "grad_norm": 0.7061910033226013,
-      "learning_rate": 0.00024424092231440574,
-      "loss": 3.3327,
+      "epoch": 5.946091644204852,
+      "grad_norm": 0.7301084399223328,
+      "learning_rate": 0.0002436265515380464,
+      "loss": 3.3298,
       "step": 55150
     },
     {
-      "epoch": 5.941233451727478,
-      "grad_norm": 0.6887156367301941,
-      "learning_rate": 0.00024391768128434436,
-      "loss": 3.35,
+      "epoch": 5.951482479784366,
+      "grad_norm": 0.7300922870635986,
+      "learning_rate": 0.00024330275229357797,
+      "loss": 3.3517,
       "step": 55200
     },
     {
-      "epoch": 5.946615003767087,
-      "grad_norm": 0.7248396277427673,
-      "learning_rate": 0.00024359444025428293,
-      "loss": 3.3488,
+      "epoch": 5.9568733153638815,
+      "grad_norm": 0.6984791159629822,
+      "learning_rate": 0.00024297895304910952,
+      "loss": 3.3235,
       "step": 55250
     },
     {
-      "epoch": 5.951996555806694,
-      "grad_norm": 0.7237182855606079,
-      "learning_rate": 0.0002432711992242215,
-      "loss": 3.3265,
+      "epoch": 5.962264150943396,
+      "grad_norm": 0.6925639510154724,
+      "learning_rate": 0.0002426551538046411,
+      "loss": 3.3236,
       "step": 55300
     },
     {
-      "epoch": 5.957378107846303,
-      "grad_norm": 0.7203229069709778,
-      "learning_rate": 0.0002429479581941601,
-      "loss": 3.3224,
+      "epoch": 5.967654986522911,
+      "grad_norm": 0.7360100746154785,
+      "learning_rate": 0.00024233135456017265,
+      "loss": 3.3322,
       "step": 55350
     },
     {
-      "epoch": 5.962759659885911,
-      "grad_norm": 0.7003002762794495,
-      "learning_rate": 0.00024262471716409868,
-      "loss": 3.3244,
+      "epoch": 5.973045822102426,
+      "grad_norm": 0.782108724117279,
+      "learning_rate": 0.00024200755531570423,
+      "loss": 3.337,
       "step": 55400
     },
     {
-      "epoch": 5.968141211925519,
-      "grad_norm": 0.725002110004425,
-      "learning_rate": 0.00024230147613403728,
-      "loss": 3.3313,
+      "epoch": 5.97843665768194,
+      "grad_norm": 0.6809326410293579,
+      "learning_rate": 0.0002416837560712358,
+      "loss": 3.3307,
       "step": 55450
     },
     {
-      "epoch": 5.973522763965128,
-      "grad_norm": 0.7069681286811829,
-      "learning_rate": 0.00024197823510397584,
-      "loss": 3.3428,
+      "epoch": 5.9838274932614555,
+      "grad_norm": 0.725308895111084,
+      "learning_rate": 0.00024135995682676739,
+      "loss": 3.3075,
       "step": 55500
     },
     {
-      "epoch": 5.978904316004736,
-      "grad_norm": 0.7072627544403076,
-      "learning_rate": 0.0002416549940739144,
+      "epoch": 5.989218328840971,
+      "grad_norm": 0.6809728741645813,
+      "learning_rate": 0.00024103615758229896,
       "loss": 3.3378,
       "step": 55550
     },
     {
-      "epoch": 5.984285868044344,
-      "grad_norm": 0.7657013535499573,
-      "learning_rate": 0.00024133175304385303,
-      "loss": 3.3343,
+      "epoch": 5.994609164420485,
+      "grad_norm": 0.7342197299003601,
+      "learning_rate": 0.00024071235833783054,
+      "loss": 3.3266,
       "step": 55600
     },
     {
-      "epoch": 5.989667420083952,
-      "grad_norm": 0.7375352382659912,
-      "learning_rate": 0.0002410085120137916,
-      "loss": 3.3289,
+      "epoch": 6.0,
+      "grad_norm": 1.4940729141235352,
+      "learning_rate": 0.00024038855909336212,
+      "loss": 3.3575,
       "step": 55650
     },
     {
-      "epoch": 5.995048972123561,
-      "grad_norm": 0.7755512595176697,
-      "learning_rate": 0.00024068527098373017,
-      "loss": 3.3562,
+      "epoch": 6.005390835579515,
+      "grad_norm": 0.6556679606437683,
+      "learning_rate": 0.00024006475984889364,
+      "loss": 3.2374,
       "step": 55700
     },
     {
-      "epoch": 6.000430524163169,
-      "grad_norm": 0.7189819812774658,
-      "learning_rate": 0.0002403620299536688,
-      "loss": 3.3257,
+      "epoch": 6.010781671159029,
+      "grad_norm": 0.7130705118179321,
+      "learning_rate": 0.00023974096060442522,
+      "loss": 3.2386,
       "step": 55750
     },
     {
-      "epoch": 6.005812076202777,
-      "grad_norm": 0.713076114654541,
-      "learning_rate": 0.00024003878892360736,
-      "loss": 3.2465,
+      "epoch": 6.0161725067385445,
+      "grad_norm": 0.7809520363807678,
+      "learning_rate": 0.0002394171613599568,
+      "loss": 3.251,
       "step": 55800
     },
     {
-      "epoch": 6.011193628242385,
-      "grad_norm": 0.7407640814781189,
-      "learning_rate": 0.00023971554789354593,
-      "loss": 3.2446,
+      "epoch": 6.02156334231806,
+      "grad_norm": 0.700687825679779,
+      "learning_rate": 0.00023909336211548837,
+      "loss": 3.2536,
       "step": 55850
     },
     {
-      "epoch": 6.016575180281993,
-      "grad_norm": 0.7104265093803406,
-      "learning_rate": 0.00023939230686348452,
-      "loss": 3.2488,
+      "epoch": 6.026954177897574,
+      "grad_norm": 0.7130185961723328,
+      "learning_rate": 0.00023876956287101995,
+      "loss": 3.2534,
       "step": 55900
     },
     {
-      "epoch": 6.021956732321602,
-      "grad_norm": 0.7195842266082764,
-      "learning_rate": 0.0002390690658334231,
-      "loss": 3.2401,
+      "epoch": 6.032345013477089,
+      "grad_norm": 0.7143113613128662,
+      "learning_rate": 0.00023844576362655153,
+      "loss": 3.2509,
       "step": 55950
     },
     {
-      "epoch": 6.0273382843612096,
-      "grad_norm": 0.7403207421302795,
-      "learning_rate": 0.00023874582480336168,
-      "loss": 3.237,
+      "epoch": 6.037735849056604,
+      "grad_norm": 0.7439408302307129,
+      "learning_rate": 0.0002381219643820831,
+      "loss": 3.2649,
       "step": 56000
     },
     {
-      "epoch": 6.0273382843612096,
-      "eval_accuracy": 0.3839623556858762,
-      "eval_loss": 3.3840363025665283,
-      "eval_runtime": 183.8506,
-      "eval_samples_per_second": 97.965,
-      "eval_steps_per_second": 6.125,
+      "epoch": 6.037735849056604,
+      "eval_accuracy": 0.383912158051706,
+      "eval_loss": 3.386408805847168,
+      "eval_runtime": 185.4382,
+      "eval_samples_per_second": 97.127,
+      "eval_steps_per_second": 6.072,
       "step": 56000
     },
     {
-      "epoch": 6.032719836400818,
-      "grad_norm": 0.6788797378540039,
-      "learning_rate": 0.00023842258377330028,
-      "loss": 3.2471,
+      "epoch": 6.0431266846361185,
+      "grad_norm": 0.7821211814880371,
+      "learning_rate": 0.00023779816513761466,
+      "loss": 3.2624,
       "step": 56050
     },
     {
-      "epoch": 6.038101388440427,
-      "grad_norm": 0.6888821125030518,
-      "learning_rate": 0.00023809934274323885,
-      "loss": 3.2575,
+      "epoch": 6.048517520215634,
+      "grad_norm": 0.6759442687034607,
+      "learning_rate": 0.00023748084187803558,
+      "loss": 3.2505,
       "step": 56100
     },
     {
-      "epoch": 6.043482940480034,
-      "grad_norm": 0.7463328838348389,
-      "learning_rate": 0.0002377761017131774,
-      "loss": 3.2439,
+      "epoch": 6.053908355795148,
+      "grad_norm": 0.7412766218185425,
+      "learning_rate": 0.00023715704263356716,
+      "loss": 3.2657,
       "step": 56150
     },
     {
-      "epoch": 6.048864492519643,
-      "grad_norm": 0.690376341342926,
-      "learning_rate": 0.00023745286068311603,
-      "loss": 3.2494,
+      "epoch": 6.059299191374663,
+      "grad_norm": 0.7447103261947632,
+      "learning_rate": 0.00023683324338909874,
+      "loss": 3.259,
       "step": 56200
     },
     {
-      "epoch": 6.0542460445592505,
-      "grad_norm": 0.7049649953842163,
-      "learning_rate": 0.0002371296196530546,
-      "loss": 3.2616,
+      "epoch": 6.064690026954178,
+      "grad_norm": 0.6834601759910583,
+      "learning_rate": 0.00023650944414463032,
+      "loss": 3.2416,
       "step": 56250
     },
     {
-      "epoch": 6.059627596598859,
-      "grad_norm": 0.7419145703315735,
-      "learning_rate": 0.00023680637862299317,
-      "loss": 3.2423,
+      "epoch": 6.070080862533692,
+      "grad_norm": 0.7556888461112976,
+      "learning_rate": 0.0002361856449001619,
+      "loss": 3.2681,
       "step": 56300
     },
     {
-      "epoch": 6.065009148638468,
-      "grad_norm": 0.732083261013031,
-      "learning_rate": 0.0002364831375929318,
-      "loss": 3.2486,
+      "epoch": 6.0754716981132075,
+      "grad_norm": 0.7148715853691101,
+      "learning_rate": 0.00023586184565569347,
+      "loss": 3.2538,
       "step": 56350
     },
     {
-      "epoch": 6.070390700678075,
-      "grad_norm": 0.7247629165649414,
-      "learning_rate": 0.00023615989656287036,
-      "loss": 3.2325,
+      "epoch": 6.080862533692723,
+      "grad_norm": 0.6975698471069336,
+      "learning_rate": 0.00023553804641122502,
+      "loss": 3.2559,
       "step": 56400
     },
     {
-      "epoch": 6.075772252717684,
-      "grad_norm": 0.7294711470603943,
-      "learning_rate": 0.00023583665553280895,
-      "loss": 3.2576,
+      "epoch": 6.086253369272237,
+      "grad_norm": 0.7379735112190247,
+      "learning_rate": 0.00023521424716675657,
+      "loss": 3.2518,
       "step": 56450
     },
     {
-      "epoch": 6.081153804757292,
-      "grad_norm": 0.6983347535133362,
-      "learning_rate": 0.00023551341450274752,
-      "loss": 3.2536,
+      "epoch": 6.091644204851752,
+      "grad_norm": 0.7769717574119568,
+      "learning_rate": 0.00023489044792228815,
+      "loss": 3.2717,
       "step": 56500
     },
     {
-      "epoch": 6.0865353567969,
-      "grad_norm": 0.7216185927391052,
-      "learning_rate": 0.00023519017347268612,
-      "loss": 3.2575,
+      "epoch": 6.097035040431267,
+      "grad_norm": 0.7235531806945801,
+      "learning_rate": 0.00023456664867781973,
+      "loss": 3.2488,
       "step": 56550
     },
     {
-      "epoch": 6.091916908836509,
-      "grad_norm": 0.738471269607544,
-      "learning_rate": 0.0002348669324426247,
-      "loss": 3.2558,
+      "epoch": 6.1024258760107815,
+      "grad_norm": 0.7254287600517273,
+      "learning_rate": 0.0002342428494333513,
+      "loss": 3.255,
       "step": 56600
     },
     {
-      "epoch": 6.097298460876116,
-      "grad_norm": 0.7062331438064575,
-      "learning_rate": 0.00023454369141256328,
-      "loss": 3.2474,
+      "epoch": 6.107816711590297,
+      "grad_norm": 0.7020230889320374,
+      "learning_rate": 0.00023391905018888288,
+      "loss": 3.254,
       "step": 56650
     },
     {
-      "epoch": 6.102680012915725,
-      "grad_norm": 0.7130937576293945,
-      "learning_rate": 0.00023422045038250185,
-      "loss": 3.2571,
+      "epoch": 6.113207547169812,
+      "grad_norm": 0.7128614187240601,
+      "learning_rate": 0.00023359525094441443,
+      "loss": 3.2763,
       "step": 56700
     },
     {
-      "epoch": 6.108061564955333,
-      "grad_norm": 0.7500676512718201,
-      "learning_rate": 0.00023389720935244047,
-      "loss": 3.2526,
+      "epoch": 6.118598382749326,
+      "grad_norm": 0.72292160987854,
+      "learning_rate": 0.000233271451699946,
+      "loss": 3.2677,
       "step": 56750
     },
     {
-      "epoch": 6.113443116994941,
-      "grad_norm": 0.7299767136573792,
-      "learning_rate": 0.00023357396832237903,
-      "loss": 3.2539,
+      "epoch": 6.123989218328841,
+      "grad_norm": 0.7195286750793457,
+      "learning_rate": 0.0002329476524554776,
+      "loss": 3.2702,
       "step": 56800
     },
     {
-      "epoch": 6.11882466903455,
-      "grad_norm": 0.7288519740104675,
-      "learning_rate": 0.0002332507272923176,
-      "loss": 3.286,
+      "epoch": 6.129380053908355,
+      "grad_norm": 0.727357804775238,
+      "learning_rate": 0.00023262385321100917,
+      "loss": 3.27,
       "step": 56850
     },
     {
-      "epoch": 6.124206221074158,
-      "grad_norm": 0.7326245903968811,
-      "learning_rate": 0.00023292748626225622,
-      "loss": 3.2707,
+      "epoch": 6.1347708894878705,
+      "grad_norm": 0.7259669899940491,
+      "learning_rate": 0.00023230005396654072,
+      "loss": 3.2769,
       "step": 56900
     },
     {
-      "epoch": 6.129587773113766,
-      "grad_norm": 0.7590323686599731,
-      "learning_rate": 0.0002326042452321948,
-      "loss": 3.2554,
+      "epoch": 6.140161725067386,
+      "grad_norm": 0.7762994766235352,
+      "learning_rate": 0.0002319762547220723,
+      "loss": 3.2559,
       "step": 56950
     },
     {
-      "epoch": 6.134969325153374,
-      "grad_norm": 0.732631266117096,
-      "learning_rate": 0.00023228100420213336,
-      "loss": 3.2561,
+      "epoch": 6.1455525606469,
+      "grad_norm": 0.7135004997253418,
+      "learning_rate": 0.00023165245547760387,
+      "loss": 3.2537,
       "step": 57000
     },
     {
-      "epoch": 6.134969325153374,
-      "eval_accuracy": 0.3845425621068056,
-      "eval_loss": 3.3839833736419678,
-      "eval_runtime": 184.6599,
-      "eval_samples_per_second": 97.536,
-      "eval_steps_per_second": 6.098,
+      "epoch": 6.1455525606469,
+      "eval_accuracy": 0.3839402991496499,
+      "eval_loss": 3.385230779647827,
+      "eval_runtime": 185.6484,
+      "eval_samples_per_second": 97.017,
+      "eval_steps_per_second": 6.065,
       "step": 57000
     },
     {
-      "epoch": 6.140350877192983,
-      "grad_norm": 0.7149685621261597,
-      "learning_rate": 0.00023195776317207195,
-      "loss": 3.2676,
+      "epoch": 6.150943396226415,
+      "grad_norm": 0.7256219983100891,
+      "learning_rate": 0.00023132865623313542,
+      "loss": 3.2828,
       "step": 57050
     },
     {
-      "epoch": 6.1457324292325906,
-      "grad_norm": 0.7450768351554871,
-      "learning_rate": 0.00023163452214201055,
-      "loss": 3.267,
+      "epoch": 6.15633423180593,
+      "grad_norm": 0.7467809319496155,
+      "learning_rate": 0.000231004856988667,
+      "loss": 3.2713,
       "step": 57100
     },
     {
-      "epoch": 6.151113981272199,
-      "grad_norm": 0.71061110496521,
-      "learning_rate": 0.00023131128111194912,
-      "loss": 3.2632,
+      "epoch": 6.1617250673854445,
+      "grad_norm": 0.6944103837013245,
+      "learning_rate": 0.00023068105774419858,
+      "loss": 3.2551,
       "step": 57150
     },
     {
-      "epoch": 6.156495533311807,
-      "grad_norm": 0.8081583976745605,
-      "learning_rate": 0.0002309880400818877,
-      "loss": 3.265,
+      "epoch": 6.16711590296496,
+      "grad_norm": 0.6952575445175171,
+      "learning_rate": 0.00023035725849973016,
+      "loss": 3.2618,
       "step": 57200
     },
     {
-      "epoch": 6.161877085351415,
-      "grad_norm": 0.6962241530418396,
-      "learning_rate": 0.00023066479905182628,
-      "loss": 3.2733,
+      "epoch": 6.172506738544475,
+      "grad_norm": 0.7093820571899414,
+      "learning_rate": 0.00023003345925526173,
+      "loss": 3.2645,
       "step": 57250
     },
     {
-      "epoch": 6.167258637391024,
-      "grad_norm": 0.7313778400421143,
-      "learning_rate": 0.00023034155802176487,
-      "loss": 3.2848,
+      "epoch": 6.177897574123989,
+      "grad_norm": 0.7516891360282898,
+      "learning_rate": 0.00022970966001079328,
+      "loss": 3.263,
       "step": 57300
     },
     {
-      "epoch": 6.1726401894306315,
-      "grad_norm": 0.7355571985244751,
-      "learning_rate": 0.00023001831699170347,
-      "loss": 3.2765,
+      "epoch": 6.183288409703504,
+      "grad_norm": 0.7763015627861023,
+      "learning_rate": 0.00022938586076632484,
+      "loss": 3.2802,
       "step": 57350
     },
     {
-      "epoch": 6.17802174147024,
-      "grad_norm": 0.7399037480354309,
-      "learning_rate": 0.00022970154078224327,
-      "loss": 3.2481,
+      "epoch": 6.188679245283019,
+      "grad_norm": 0.6774693131446838,
+      "learning_rate": 0.0002290620615218564,
+      "loss": 3.2703,
       "step": 57400
     },
     {
-      "epoch": 6.183403293509849,
-      "grad_norm": 0.7529656291007996,
-      "learning_rate": 0.00022937829975218187,
-      "loss": 3.2717,
+      "epoch": 6.1940700808625335,
+      "grad_norm": 0.7566379308700562,
+      "learning_rate": 0.000228738262277388,
+      "loss": 3.276,
       "step": 57450
     },
     {
-      "epoch": 6.188784845549456,
-      "grad_norm": 0.6955074071884155,
-      "learning_rate": 0.00022905505872212044,
-      "loss": 3.2703,
+      "epoch": 6.199460916442049,
+      "grad_norm": 0.7387679815292358,
+      "learning_rate": 0.00022841446303291957,
+      "loss": 3.273,
       "step": 57500
     },
     {
-      "epoch": 6.194166397589065,
-      "grad_norm": 0.704826831817627,
-      "learning_rate": 0.00022873181769205903,
-      "loss": 3.2711,
+      "epoch": 6.204851752021563,
+      "grad_norm": 0.7531148195266724,
+      "learning_rate": 0.00022809066378845115,
+      "loss": 3.2638,
       "step": 57550
     },
     {
-      "epoch": 6.1995479496286725,
-      "grad_norm": 0.75359708070755,
-      "learning_rate": 0.0002284085766619976,
-      "loss": 3.2582,
+      "epoch": 6.210242587601078,
+      "grad_norm": 0.8101642727851868,
+      "learning_rate": 0.00022776686454398272,
+      "loss": 3.2498,
       "step": 57600
     },
     {
-      "epoch": 6.204929501668281,
-      "grad_norm": 0.7706862688064575,
-      "learning_rate": 0.0002280853356319362,
-      "loss": 3.2637,
+      "epoch": 6.215633423180593,
+      "grad_norm": 0.7339323163032532,
+      "learning_rate": 0.0002274430652995143,
+      "loss": 3.2721,
       "step": 57650
     },
     {
-      "epoch": 6.21031105370789,
-      "grad_norm": 0.7804993391036987,
-      "learning_rate": 0.0002277620946018748,
-      "loss": 3.2706,
+      "epoch": 6.2210242587601075,
+      "grad_norm": 0.7086048126220703,
+      "learning_rate": 0.00022711926605504588,
+      "loss": 3.2757,
       "step": 57700
     },
     {
-      "epoch": 6.215692605747497,
-      "grad_norm": 0.7386913299560547,
-      "learning_rate": 0.00022743885357181336,
-      "loss": 3.2786,
+      "epoch": 6.226415094339623,
+      "grad_norm": 0.7538915276527405,
+      "learning_rate": 0.0002267954668105774,
+      "loss": 3.2926,
       "step": 57750
     },
     {
-      "epoch": 6.221074157787106,
-      "grad_norm": 0.7626097202301025,
-      "learning_rate": 0.00022711561254175192,
-      "loss": 3.2787,
+      "epoch": 6.231805929919138,
+      "grad_norm": 0.7328342199325562,
+      "learning_rate": 0.00022647166756610898,
+      "loss": 3.2866,
       "step": 57800
     },
     {
-      "epoch": 6.226455709826714,
-      "grad_norm": 0.7361026406288147,
-      "learning_rate": 0.00022679237151169054,
-      "loss": 3.2744,
+      "epoch": 6.237196765498652,
+      "grad_norm": 0.7042235136032104,
+      "learning_rate": 0.00022614786832164056,
+      "loss": 3.2587,
       "step": 57850
     },
     {
-      "epoch": 6.231837261866322,
-      "grad_norm": 0.7097683548927307,
-      "learning_rate": 0.0002264691304816291,
-      "loss": 3.2767,
+      "epoch": 6.242587601078167,
+      "grad_norm": 0.7145513296127319,
+      "learning_rate": 0.00022582406907717214,
+      "loss": 3.2803,
       "step": 57900
     },
     {
-      "epoch": 6.237218813905931,
-      "grad_norm": 0.792580783367157,
-      "learning_rate": 0.00022614588945156768,
-      "loss": 3.2917,
+      "epoch": 6.247978436657682,
+      "grad_norm": 0.7408495545387268,
+      "learning_rate": 0.0002255002698327037,
+      "loss": 3.2699,
       "step": 57950
     },
     {
-      "epoch": 6.242600365945538,
-      "grad_norm": 0.6779635548591614,
-      "learning_rate": 0.0002258226484215063,
-      "loss": 3.2553,
+      "epoch": 6.2533692722371965,
+      "grad_norm": 0.7923481464385986,
+      "learning_rate": 0.0002251764705882353,
+      "loss": 3.2566,
       "step": 58000
     },
     {
-      "epoch": 6.242600365945538,
-      "eval_accuracy": 0.3850510576217773,
-      "eval_loss": 3.3806025981903076,
-      "eval_runtime": 184.4045,
-      "eval_samples_per_second": 97.671,
-      "eval_steps_per_second": 6.106,
+      "epoch": 6.2533692722371965,
+      "eval_accuracy": 0.3843151516126099,
+      "eval_loss": 3.3821535110473633,
+      "eval_runtime": 185.4378,
+      "eval_samples_per_second": 97.127,
+      "eval_steps_per_second": 6.072,
       "step": 58000
     },
     {
-      "epoch": 6.247981917985147,
-      "grad_norm": 0.7419663667678833,
-      "learning_rate": 0.00022549940739144487,
-      "loss": 3.259,
+      "epoch": 6.258760107816712,
+      "grad_norm": 0.7456364631652832,
+      "learning_rate": 0.00022485267134376687,
+      "loss": 3.2721,
       "step": 58050
     },
     {
-      "epoch": 6.253363470024755,
-      "grad_norm": 0.7085452675819397,
-      "learning_rate": 0.00022517616636138344,
-      "loss": 3.2813,
+      "epoch": 6.264150943396227,
+      "grad_norm": 0.7139208912849426,
+      "learning_rate": 0.00022452887209929842,
+      "loss": 3.2846,
       "step": 58100
     },
     {
-      "epoch": 6.258745022064363,
-      "grad_norm": 0.7902224659919739,
-      "learning_rate": 0.00022485292533132203,
-      "loss": 3.2794,
+      "epoch": 6.269541778975741,
+      "grad_norm": 0.7725496292114258,
+      "learning_rate": 0.00022420507285482997,
+      "loss": 3.2611,
       "step": 58150
     },
     {
-      "epoch": 6.264126574103972,
-      "grad_norm": 0.7155488133430481,
-      "learning_rate": 0.00022452968430126063,
-      "loss": 3.2745,
+      "epoch": 6.274932614555256,
+      "grad_norm": 0.7034215927124023,
+      "learning_rate": 0.00022388127361036155,
+      "loss": 3.2786,
       "step": 58200
     },
     {
-      "epoch": 6.26950812614358,
-      "grad_norm": 1.2818570137023926,
-      "learning_rate": 0.00022420644327119922,
-      "loss": 3.268,
+      "epoch": 6.280323450134771,
+      "grad_norm": 0.7483360767364502,
+      "learning_rate": 0.0002235639503507825,
+      "loss": 3.2824,
       "step": 58250
     },
     {
-      "epoch": 6.274889678183188,
-      "grad_norm": 0.750627875328064,
-      "learning_rate": 0.0002238832022411378,
-      "loss": 3.2734,
+      "epoch": 6.285714285714286,
+      "grad_norm": 0.7298248410224915,
+      "learning_rate": 0.00022324015110631408,
+      "loss": 3.268,
       "step": 58300
     },
     {
-      "epoch": 6.280271230222796,
-      "grad_norm": 0.7265231013298035,
-      "learning_rate": 0.00022355996121107636,
-      "loss": 3.274,
+      "epoch": 6.291105121293801,
+      "grad_norm": 0.7724031805992126,
+      "learning_rate": 0.00022291635186184565,
+      "loss": 3.2586,
       "step": 58350
     },
     {
-      "epoch": 6.285652782262405,
-      "grad_norm": 0.679812490940094,
-      "learning_rate": 0.00022323672018101498,
-      "loss": 3.29,
+      "epoch": 6.296495956873315,
+      "grad_norm": 0.7413439750671387,
+      "learning_rate": 0.0002225925526173772,
+      "loss": 3.2683,
       "step": 58400
     },
     {
-      "epoch": 6.2910343343020125,
-      "grad_norm": 0.7458456754684448,
-      "learning_rate": 0.00022291347915095355,
-      "loss": 3.2672,
+      "epoch": 6.30188679245283,
+      "grad_norm": 0.7157818675041199,
+      "learning_rate": 0.00022226875337290878,
+      "loss": 3.2867,
       "step": 58450
     },
     {
-      "epoch": 6.296415886341621,
-      "grad_norm": 0.7149653434753418,
-      "learning_rate": 0.0002225902381208921,
-      "loss": 3.2806,
+      "epoch": 6.307277628032345,
+      "grad_norm": 0.688450276851654,
+      "learning_rate": 0.00022194495412844033,
+      "loss": 3.2592,
       "step": 58500
     },
     {
-      "epoch": 6.301797438381229,
-      "grad_norm": 0.7718719244003296,
-      "learning_rate": 0.00022226699709083073,
-      "loss": 3.2599,
+      "epoch": 6.3126684636118595,
+      "grad_norm": 0.7490639686584473,
+      "learning_rate": 0.0002216211548839719,
+      "loss": 3.2803,
       "step": 58550
     },
     {
-      "epoch": 6.307178990420837,
-      "grad_norm": 0.7340017557144165,
-      "learning_rate": 0.0002219437560607693,
-      "loss": 3.2805,
+      "epoch": 6.318059299191375,
+      "grad_norm": 0.7094032764434814,
+      "learning_rate": 0.0002212973556395035,
+      "loss": 3.287,
       "step": 58600
     },
     {
-      "epoch": 6.312560542460446,
-      "grad_norm": 0.7550706267356873,
-      "learning_rate": 0.00022162051503070787,
-      "loss": 3.2808,
+      "epoch": 6.32345013477089,
+      "grad_norm": 0.708122193813324,
+      "learning_rate": 0.00022097355639503507,
+      "loss": 3.2805,
       "step": 58650
     },
     {
-      "epoch": 6.3179420945000535,
-      "grad_norm": 0.7718273401260376,
-      "learning_rate": 0.00022129727400064646,
-      "loss": 3.2815,
+      "epoch": 6.328840970350404,
+      "grad_norm": 0.7124499678611755,
+      "learning_rate": 0.00022064975715056664,
+      "loss": 3.2957,
       "step": 58700
     },
     {
-      "epoch": 6.323323646539662,
-      "grad_norm": 0.7422057390213013,
-      "learning_rate": 0.00022097403297058506,
-      "loss": 3.2822,
+      "epoch": 6.334231805929919,
+      "grad_norm": 0.7175976037979126,
+      "learning_rate": 0.0002203259579060982,
+      "loss": 3.2933,
       "step": 58750
     },
     {
-      "epoch": 6.328705198579271,
-      "grad_norm": 0.7373623847961426,
-      "learning_rate": 0.00022065079194052363,
-      "loss": 3.2781,
+      "epoch": 6.339622641509434,
+      "grad_norm": 0.7456777095794678,
+      "learning_rate": 0.00022000215866162977,
+      "loss": 3.279,
       "step": 58800
     },
     {
-      "epoch": 6.334086750618878,
-      "grad_norm": 0.716949462890625,
-      "learning_rate": 0.00022032755091046222,
-      "loss": 3.2824,
+      "epoch": 6.345013477088949,
+      "grad_norm": 0.728626012802124,
+      "learning_rate": 0.00021967835941716135,
+      "loss": 3.2823,
       "step": 58850
     },
     {
-      "epoch": 6.339468302658487,
-      "grad_norm": 0.7594158053398132,
-      "learning_rate": 0.0002200043098804008,
-      "loss": 3.2713,
+      "epoch": 6.350404312668464,
+      "grad_norm": 0.7576169967651367,
+      "learning_rate": 0.0002193545601726929,
+      "loss": 3.2701,
       "step": 58900
     },
     {
-      "epoch": 6.344849854698095,
-      "grad_norm": 0.7525697350502014,
-      "learning_rate": 0.00021968106885033938,
-      "loss": 3.2712,
+      "epoch": 6.355795148247978,
+      "grad_norm": 0.6919223070144653,
+      "learning_rate": 0.00021903076092822448,
+      "loss": 3.2899,
       "step": 58950
     },
     {
-      "epoch": 6.350231406737703,
-      "grad_norm": 0.8482348918914795,
-      "learning_rate": 0.00021935782782027798,
-      "loss": 3.2965,
+      "epoch": 6.361185983827493,
+      "grad_norm": 0.7274947166442871,
+      "learning_rate": 0.00021870696168375606,
+      "loss": 3.2887,
       "step": 59000
     },
     {
-      "epoch": 6.350231406737703,
-      "eval_accuracy": 0.3853254061635089,
-      "eval_loss": 3.375702142715454,
-      "eval_runtime": 184.5908,
-      "eval_samples_per_second": 97.573,
-      "eval_steps_per_second": 6.1,
+      "epoch": 6.361185983827493,
+      "eval_accuracy": 0.3848371200856706,
+      "eval_loss": 3.378706932067871,
+      "eval_runtime": 183.6452,
+      "eval_samples_per_second": 98.075,
+      "eval_steps_per_second": 6.131,
       "step": 59000
     },
     {
-      "epoch": 6.355612958777312,
-      "grad_norm": 0.7227912545204163,
-      "learning_rate": 0.00021903458679021655,
-      "loss": 3.299,
+      "epoch": 6.366576819407008,
+      "grad_norm": 0.7238966822624207,
+      "learning_rate": 0.0002183831624392876,
+      "loss": 3.2926,
       "step": 59050
     },
     {
-      "epoch": 6.360994510816919,
-      "grad_norm": 0.7208841443061829,
-      "learning_rate": 0.0002187113457601551,
-      "loss": 3.2816,
+      "epoch": 6.3719676549865225,
+      "grad_norm": 0.7809830904006958,
+      "learning_rate": 0.00021805936319481918,
+      "loss": 3.2823,
       "step": 59100
     },
     {
-      "epoch": 6.366376062856528,
-      "grad_norm": 0.6990681290626526,
-      "learning_rate": 0.00021838810473009373,
-      "loss": 3.2922,
+      "epoch": 6.377358490566038,
+      "grad_norm": 0.7627469897270203,
+      "learning_rate": 0.00021773556395035076,
+      "loss": 3.2846,
       "step": 59150
     },
     {
-      "epoch": 6.371757614896136,
-      "grad_norm": 0.7942614555358887,
-      "learning_rate": 0.0002180648637000323,
-      "loss": 3.2885,
+      "epoch": 6.382749326145553,
+      "grad_norm": 0.7125155925750732,
+      "learning_rate": 0.00021741176470588234,
+      "loss": 3.3025,
       "step": 59200
     },
     {
-      "epoch": 6.377139166935744,
-      "grad_norm": 0.7575851678848267,
-      "learning_rate": 0.0002177416226699709,
-      "loss": 3.2699,
+      "epoch": 6.388140161725067,
+      "grad_norm": 0.7636107206344604,
+      "learning_rate": 0.00021708796546141392,
+      "loss": 3.2771,
       "step": 59250
     },
     {
-      "epoch": 6.382520718975353,
-      "grad_norm": 0.7590829133987427,
-      "learning_rate": 0.00021741838163990946,
-      "loss": 3.2947,
+      "epoch": 6.393530997304582,
+      "grad_norm": 0.7409188151359558,
+      "learning_rate": 0.0002167641662169455,
+      "loss": 3.2668,
       "step": 59300
     },
     {
-      "epoch": 6.387902271014961,
-      "grad_norm": 0.7357906103134155,
-      "learning_rate": 0.00021709514060984806,
-      "loss": 3.2785,
+      "epoch": 6.398921832884097,
+      "grad_norm": 0.7753366231918335,
+      "learning_rate": 0.00021644036697247702,
+      "loss": 3.2823,
       "step": 59350
     },
     {
-      "epoch": 6.393283823054569,
-      "grad_norm": 0.7962828278541565,
-      "learning_rate": 0.00021677189957978665,
-      "loss": 3.2822,
+      "epoch": 6.404312668463612,
+      "grad_norm": 0.7260041236877441,
+      "learning_rate": 0.0002161165677280086,
+      "loss": 3.3024,
       "step": 59400
     },
     {
-      "epoch": 6.398665375094177,
-      "grad_norm": 0.750723659992218,
-      "learning_rate": 0.00021645512337032643,
-      "loss": 3.295,
+      "epoch": 6.409703504043127,
+      "grad_norm": 0.7569220662117004,
+      "learning_rate": 0.00021579276848354017,
+      "loss": 3.2825,
       "step": 59450
     },
     {
-      "epoch": 6.404046927133785,
-      "grad_norm": 0.7317831516265869,
-      "learning_rate": 0.00021613188234026506,
-      "loss": 3.287,
+      "epoch": 6.415094339622642,
+      "grad_norm": 0.7487280964851379,
+      "learning_rate": 0.00021546896923907175,
+      "loss": 3.2704,
       "step": 59500
     },
     {
-      "epoch": 6.4094284791733935,
-      "grad_norm": 0.7471715211868286,
-      "learning_rate": 0.00021580864131020362,
-      "loss": 3.2852,
+      "epoch": 6.420485175202156,
+      "grad_norm": 0.7295806407928467,
+      "learning_rate": 0.00021514516999460333,
+      "loss": 3.3035,
       "step": 59550
     },
     {
-      "epoch": 6.414810031213002,
-      "grad_norm": 0.759760320186615,
-      "learning_rate": 0.0002154854002801422,
-      "loss": 3.2947,
+      "epoch": 6.425876010781671,
+      "grad_norm": 0.7379623055458069,
+      "learning_rate": 0.0002148213707501349,
+      "loss": 3.2726,
       "step": 59600
     },
     {
-      "epoch": 6.42019158325261,
-      "grad_norm": 0.8137605786323547,
-      "learning_rate": 0.0002151621592500808,
-      "loss": 3.2968,
+      "epoch": 6.431266846361186,
+      "grad_norm": 0.7402395009994507,
+      "learning_rate": 0.00021449757150566648,
+      "loss": 3.276,
       "step": 59650
     },
     {
-      "epoch": 6.425573135292218,
-      "grad_norm": 0.7391119599342346,
-      "learning_rate": 0.00021483891822001938,
-      "loss": 3.2797,
+      "epoch": 6.436657681940701,
+      "grad_norm": 0.7622630000114441,
+      "learning_rate": 0.00021417377226119806,
+      "loss": 3.3128,
       "step": 59700
     },
     {
-      "epoch": 6.430954687331827,
-      "grad_norm": 0.7140395641326904,
-      "learning_rate": 0.00021451567718995795,
-      "loss": 3.2755,
+      "epoch": 6.442048517520216,
+      "grad_norm": 0.7114800810813904,
+      "learning_rate": 0.0002138499730167296,
+      "loss": 3.294,
       "step": 59750
     },
     {
-      "epoch": 6.4363362393714345,
-      "grad_norm": 0.7186073064804077,
-      "learning_rate": 0.00021419243615989654,
-      "loss": 3.295,
+      "epoch": 6.44743935309973,
+      "grad_norm": 0.7696911692619324,
+      "learning_rate": 0.00021352617377226116,
+      "loss": 3.2828,
       "step": 59800
     },
     {
-      "epoch": 6.441717791411043,
-      "grad_norm": 0.7441525459289551,
-      "learning_rate": 0.00021386919512983514,
-      "loss": 3.2836,
+      "epoch": 6.452830188679245,
+      "grad_norm": 0.7608886361122131,
+      "learning_rate": 0.00021320237452779274,
+      "loss": 3.2768,
       "step": 59850
     },
     {
-      "epoch": 6.447099343450651,
-      "grad_norm": 0.743151843547821,
-      "learning_rate": 0.0002135459540997737,
-      "loss": 3.2637,
+      "epoch": 6.45822102425876,
+      "grad_norm": 0.7601315379142761,
+      "learning_rate": 0.00021287857528332432,
+      "loss": 3.2739,
       "step": 59900
     },
     {
-      "epoch": 6.452480895490259,
-      "grad_norm": 0.7524924278259277,
-      "learning_rate": 0.0002132227130697123,
-      "loss": 3.2825,
+      "epoch": 6.463611859838275,
+      "grad_norm": 0.719896674156189,
+      "learning_rate": 0.0002125547760388559,
+      "loss": 3.2707,
       "step": 59950
     },
     {
-      "epoch": 6.457862447529868,
-      "grad_norm": 0.747959554195404,
-      "learning_rate": 0.00021289947203965087,
-      "loss": 3.2712,
+      "epoch": 6.46900269541779,
+      "grad_norm": 0.7298495173454285,
+      "learning_rate": 0.00021223097679438747,
+      "loss": 3.2905,
       "step": 60000
     },
     {
-      "epoch": 6.457862447529868,
-      "eval_accuracy": 0.3854671981821068,
-      "eval_loss": 3.372908353805542,
-      "eval_runtime": 184.6808,
-      "eval_samples_per_second": 97.525,
-      "eval_steps_per_second": 6.097,
+      "epoch": 6.46900269541779,
+      "eval_accuracy": 0.385286725735447,
+      "eval_loss": 3.3724496364593506,
+      "eval_runtime": 184.0657,
+      "eval_samples_per_second": 97.851,
+      "eval_steps_per_second": 6.117,
       "step": 60000
     },
     {
-      "epoch": 6.4632439995694755,
-      "grad_norm": 0.8120265007019043,
-      "learning_rate": 0.0002125762310095895,
-      "loss": 3.263,
+      "epoch": 6.474393530997305,
+      "grad_norm": 0.7553102374076843,
+      "learning_rate": 0.00021190717754991905,
+      "loss": 3.2652,
       "step": 60050
     },
     {
-      "epoch": 6.468625551609084,
-      "grad_norm": 0.7335713505744934,
-      "learning_rate": 0.00021225298997952806,
-      "loss": 3.282,
+      "epoch": 6.479784366576819,
+      "grad_norm": 0.7708149552345276,
+      "learning_rate": 0.0002115833783054506,
+      "loss": 3.2889,
       "step": 60100
     },
     {
-      "epoch": 6.474007103648693,
-      "grad_norm": 0.744031548500061,
-      "learning_rate": 0.00021192974894946662,
-      "loss": 3.2892,
+      "epoch": 6.485175202156334,
+      "grad_norm": 0.754406213760376,
+      "learning_rate": 0.00021125957906098218,
+      "loss": 3.2904,
       "step": 60150
     },
     {
-      "epoch": 6.4793886556883,
-      "grad_norm": 0.7717245221138,
-      "learning_rate": 0.00021160650791940524,
-      "loss": 3.3058,
+      "epoch": 6.490566037735849,
+      "grad_norm": 0.7574535012245178,
+      "learning_rate": 0.00021093577981651373,
+      "loss": 3.2602,
       "step": 60200
     },
     {
-      "epoch": 6.484770207727909,
-      "grad_norm": 0.7391660213470459,
-      "learning_rate": 0.0002112832668893438,
-      "loss": 3.2783,
+      "epoch": 6.495956873315364,
+      "grad_norm": 0.7688772678375244,
+      "learning_rate": 0.0002106119805720453,
+      "loss": 3.2718,
       "step": 60250
     },
     {
-      "epoch": 6.490151759767517,
-      "grad_norm": 0.7157920598983765,
-      "learning_rate": 0.00021096002585928238,
-      "loss": 3.2907,
+      "epoch": 6.501347708894879,
+      "grad_norm": 0.7309618592262268,
+      "learning_rate": 0.00021028818132757689,
+      "loss": 3.3053,
       "step": 60300
     },
     {
-      "epoch": 6.495533311807125,
-      "grad_norm": 0.7572237253189087,
-      "learning_rate": 0.00021063678482922097,
-      "loss": 3.28,
+      "epoch": 6.506738544474393,
+      "grad_norm": 0.7908754944801331,
+      "learning_rate": 0.00020996438208310846,
+      "loss": 3.2638,
       "step": 60350
     },
     {
-      "epoch": 6.500914863846734,
-      "grad_norm": 0.7682546377182007,
-      "learning_rate": 0.00021031354379915957,
-      "loss": 3.2659,
+      "epoch": 6.512129380053908,
+      "grad_norm": 0.7500775456428528,
+      "learning_rate": 0.00020964058283864001,
+      "loss": 3.28,
       "step": 60400
     },
     {
-      "epoch": 6.506296415886341,
-      "grad_norm": 0.7918459177017212,
-      "learning_rate": 0.00020999030276909814,
-      "loss": 3.2761,
+      "epoch": 6.517520215633423,
+      "grad_norm": 0.7242320775985718,
+      "learning_rate": 0.0002093167835941716,
+      "loss": 3.2828,
       "step": 60450
     },
     {
-      "epoch": 6.51167796792595,
-      "grad_norm": 0.7500776052474976,
-      "learning_rate": 0.00020966706173903673,
-      "loss": 3.2804,
+      "epoch": 6.5229110512129385,
+      "grad_norm": 0.7319316267967224,
+      "learning_rate": 0.00020899946033459254,
+      "loss": 3.2878,
       "step": 60500
     },
     {
-      "epoch": 6.517059519965558,
-      "grad_norm": 0.7752882838249207,
-      "learning_rate": 0.0002093438207089753,
-      "loss": 3.287,
+      "epoch": 6.528301886792453,
+      "grad_norm": 0.7341902256011963,
+      "learning_rate": 0.0002086756610901241,
+      "loss": 3.2869,
       "step": 60550
     },
     {
-      "epoch": 6.522441072005166,
-      "grad_norm": 0.7291586399078369,
-      "learning_rate": 0.00020902057967891387,
-      "loss": 3.2863,
+      "epoch": 6.533692722371968,
+      "grad_norm": 0.729345440864563,
+      "learning_rate": 0.00020835186184565567,
+      "loss": 3.2844,
       "step": 60600
     },
     {
-      "epoch": 6.5278226240447745,
-      "grad_norm": 0.7302510738372803,
-      "learning_rate": 0.0002086973386488525,
-      "loss": 3.3077,
+      "epoch": 6.539083557951482,
+      "grad_norm": 0.7409000992774963,
+      "learning_rate": 0.00020802806260118725,
+      "loss": 3.2876,
       "step": 60650
     },
     {
-      "epoch": 6.533204176084383,
-      "grad_norm": 0.7242251038551331,
-      "learning_rate": 0.00020837409761879106,
-      "loss": 3.2746,
+      "epoch": 6.544474393530997,
+      "grad_norm": 0.7650253772735596,
+      "learning_rate": 0.00020770426335671883,
+      "loss": 3.2808,
       "step": 60700
     },
     {
-      "epoch": 6.538585728123991,
-      "grad_norm": 0.7295466661453247,
-      "learning_rate": 0.00020805085658872962,
-      "loss": 3.2869,
+      "epoch": 6.549865229110512,
+      "grad_norm": 0.8125891089439392,
+      "learning_rate": 0.00020738046411225038,
+      "loss": 3.2828,
       "step": 60750
     },
     {
-      "epoch": 6.543967280163599,
-      "grad_norm": 0.7399840354919434,
-      "learning_rate": 0.00020772761555866825,
-      "loss": 3.2958,
+      "epoch": 6.555256064690027,
+      "grad_norm": 0.7584858536720276,
+      "learning_rate": 0.00020705666486778196,
+      "loss": 3.2762,
       "step": 60800
     },
     {
-      "epoch": 6.549348832203208,
-      "grad_norm": 0.8592695593833923,
-      "learning_rate": 0.0002074043745286068,
-      "loss": 3.2937,
+      "epoch": 6.560646900269542,
+      "grad_norm": 0.7711278796195984,
+      "learning_rate": 0.00020673286562331353,
+      "loss": 3.2765,
       "step": 60850
     },
     {
-      "epoch": 6.5547303842428155,
-      "grad_norm": 0.7525405883789062,
-      "learning_rate": 0.00020708113349854538,
-      "loss": 3.2769,
+      "epoch": 6.566037735849057,
+      "grad_norm": 0.7544257044792175,
+      "learning_rate": 0.0002064090663788451,
+      "loss": 3.2738,
       "step": 60900
     },
     {
-      "epoch": 6.560111936282424,
-      "grad_norm": 0.719514787197113,
-      "learning_rate": 0.00020675789246848397,
-      "loss": 3.2716,
+      "epoch": 6.571428571428571,
+      "grad_norm": 0.7313734889030457,
+      "learning_rate": 0.00020608526713437666,
+      "loss": 3.2841,
       "step": 60950
     },
     {
-      "epoch": 6.565493488322032,
-      "grad_norm": 0.7651938796043396,
-      "learning_rate": 0.00020643465143842257,
-      "loss": 3.2857,
+      "epoch": 6.576819407008086,
+      "grad_norm": 0.7042865753173828,
+      "learning_rate": 0.00020576146788990824,
+      "loss": 3.2855,
       "step": 61000
     },
     {
-      "epoch": 6.565493488322032,
-      "eval_accuracy": 0.38612998079777516,
-      "eval_loss": 3.3699281215667725,
-      "eval_runtime": 184.6571,
-      "eval_samples_per_second": 97.538,
-      "eval_steps_per_second": 6.098,
+      "epoch": 6.576819407008086,
+      "eval_accuracy": 0.38551544006429644,
+      "eval_loss": 3.3711304664611816,
+      "eval_runtime": 183.4425,
+      "eval_samples_per_second": 98.183,
+      "eval_steps_per_second": 6.138,
       "step": 61000
     },
     {
-      "epoch": 6.57087504036164,
-      "grad_norm": 0.8320351243019104,
-      "learning_rate": 0.00020611141040836116,
-      "loss": 3.2741,
+      "epoch": 6.5822102425876015,
+      "grad_norm": 0.7999841570854187,
+      "learning_rate": 0.0002054376686454398,
+      "loss": 3.2767,
       "step": 61050
     },
     {
-      "epoch": 6.576256592401249,
-      "grad_norm": 0.7015548944473267,
-      "learning_rate": 0.00020578816937829973,
-      "loss": 3.3026,
+      "epoch": 6.587601078167116,
+      "grad_norm": 0.7251748442649841,
+      "learning_rate": 0.00020511386940097137,
+      "loss": 3.297,
       "step": 61100
     },
     {
-      "epoch": 6.5816381444408565,
-      "grad_norm": 0.7299136519432068,
-      "learning_rate": 0.0002054649283482383,
-      "loss": 3.2858,
+      "epoch": 6.592991913746631,
+      "grad_norm": 0.7549523115158081,
+      "learning_rate": 0.00020479007015650294,
+      "loss": 3.2981,
       "step": 61150
     },
     {
-      "epoch": 6.587019696480465,
-      "grad_norm": 0.7206780910491943,
-      "learning_rate": 0.00020514168731817692,
-      "loss": 3.2945,
+      "epoch": 6.598382749326145,
+      "grad_norm": 0.7553712725639343,
+      "learning_rate": 0.00020446627091203452,
+      "loss": 3.2911,
       "step": 61200
     },
     {
-      "epoch": 6.592401248520073,
-      "grad_norm": 0.7657489776611328,
-      "learning_rate": 0.0002048184462881155,
-      "loss": 3.2834,
+      "epoch": 6.60377358490566,
+      "grad_norm": 0.7162251472473145,
+      "learning_rate": 0.0002041424716675661,
+      "loss": 3.2905,
       "step": 61250
     },
     {
-      "epoch": 6.597782800559681,
-      "grad_norm": 0.786909282207489,
-      "learning_rate": 0.00020449520525805406,
-      "loss": 3.2806,
+      "epoch": 6.609164420485175,
+      "grad_norm": 0.7627036571502686,
+      "learning_rate": 0.00020381867242309768,
+      "loss": 3.2762,
       "step": 61300
     },
     {
-      "epoch": 6.60316435259929,
-      "grad_norm": 0.6916054487228394,
-      "learning_rate": 0.00020417196422799268,
-      "loss": 3.2892,
+      "epoch": 6.6145552560646905,
+      "grad_norm": 0.7106635570526123,
+      "learning_rate": 0.00020349487317862926,
+      "loss": 3.3057,
       "step": 61350
     },
     {
-      "epoch": 6.608545904638898,
-      "grad_norm": 0.8359130620956421,
-      "learning_rate": 0.00020384872319793125,
-      "loss": 3.2798,
+      "epoch": 6.619946091644205,
+      "grad_norm": 0.8031513094902039,
+      "learning_rate": 0.00020317107393416078,
+      "loss": 3.2917,
       "step": 61400
     },
     {
-      "epoch": 6.613927456678506,
-      "grad_norm": 0.7628897428512573,
-      "learning_rate": 0.00020353194698847105,
-      "loss": 3.2831,
+      "epoch": 6.62533692722372,
+      "grad_norm": 0.7408685088157654,
+      "learning_rate": 0.00020284727468969236,
+      "loss": 3.3023,
       "step": 61450
     },
     {
-      "epoch": 6.619309008718115,
-      "grad_norm": 0.7674877643585205,
-      "learning_rate": 0.00020321517077901086,
-      "loss": 3.2636,
+      "epoch": 6.630727762803234,
+      "grad_norm": 0.7966960668563843,
+      "learning_rate": 0.00020252347544522393,
+      "loss": 3.2897,
       "step": 61500
     },
     {
-      "epoch": 6.624690560757722,
-      "grad_norm": 0.7460737228393555,
-      "learning_rate": 0.00020289192974894945,
-      "loss": 3.2839,
+      "epoch": 6.636118598382749,
+      "grad_norm": 0.7547671794891357,
+      "learning_rate": 0.0002021996762007555,
+      "loss": 3.2943,
       "step": 61550
     },
     {
-      "epoch": 6.630072112797331,
-      "grad_norm": 0.7583357691764832,
-      "learning_rate": 0.00020256868871888802,
-      "loss": 3.2861,
+      "epoch": 6.6415094339622645,
+      "grad_norm": 0.7418311834335327,
+      "learning_rate": 0.0002018758769562871,
+      "loss": 3.2906,
       "step": 61600
     },
     {
-      "epoch": 6.635453664836939,
-      "grad_norm": 0.8294389843940735,
-      "learning_rate": 0.00020224544768882664,
-      "loss": 3.2915,
+      "epoch": 6.646900269541779,
+      "grad_norm": 0.7400493621826172,
+      "learning_rate": 0.00020155207771181867,
+      "loss": 3.2906,
       "step": 61650
     },
     {
-      "epoch": 6.640835216876547,
-      "grad_norm": 0.7348142266273499,
-      "learning_rate": 0.0002019222066587652,
-      "loss": 3.2809,
+      "epoch": 6.652291105121294,
+      "grad_norm": 0.7756510376930237,
+      "learning_rate": 0.00020122827846735024,
+      "loss": 3.2716,
       "step": 61700
     },
     {
-      "epoch": 6.6462167689161555,
-      "grad_norm": 0.726068913936615,
-      "learning_rate": 0.00020159896562870378,
-      "loss": 3.2742,
+      "epoch": 6.657681940700809,
+      "grad_norm": 0.7837494611740112,
+      "learning_rate": 0.00020090447922288182,
+      "loss": 3.2932,
       "step": 61750
     },
     {
-      "epoch": 6.651598320955763,
-      "grad_norm": 0.7640007734298706,
-      "learning_rate": 0.00020127572459864237,
-      "loss": 3.294,
+      "epoch": 6.663072776280323,
+      "grad_norm": 0.7266611456871033,
+      "learning_rate": 0.00020058067997841335,
+      "loss": 3.3023,
       "step": 61800
     },
     {
-      "epoch": 6.656979872995372,
-      "grad_norm": 0.7816165685653687,
-      "learning_rate": 0.00020095248356858097,
-      "loss": 3.2971,
+      "epoch": 6.668463611859838,
+      "grad_norm": 0.818396806716919,
+      "learning_rate": 0.00020025688073394492,
+      "loss": 3.2976,
       "step": 61850
     },
     {
-      "epoch": 6.66236142503498,
-      "grad_norm": 0.7704948782920837,
-      "learning_rate": 0.00020062924253851953,
-      "loss": 3.2863,
+      "epoch": 6.6738544474393535,
+      "grad_norm": 0.7452014088630676,
+      "learning_rate": 0.0001999330814894765,
+      "loss": 3.272,
       "step": 61900
     },
     {
-      "epoch": 6.667742977074588,
-      "grad_norm": 0.7741549611091614,
-      "learning_rate": 0.00020030600150845813,
-      "loss": 3.2754,
+      "epoch": 6.679245283018868,
+      "grad_norm": 0.7601395845413208,
+      "learning_rate": 0.00019960928224500808,
+      "loss": 3.2741,
       "step": 61950
     },
     {
-      "epoch": 6.6731245291141965,
-      "grad_norm": 0.7607588768005371,
-      "learning_rate": 0.0001999827604783967,
-      "loss": 3.2798,
+      "epoch": 6.684636118598383,
+      "grad_norm": 0.8315152525901794,
+      "learning_rate": 0.00019928548300053966,
+      "loss": 3.2744,
       "step": 62000
     },
     {
-      "epoch": 6.6731245291141965,
-      "eval_accuracy": 0.386048056520363,
-      "eval_loss": 3.3638477325439453,
-      "eval_runtime": 184.5657,
-      "eval_samples_per_second": 97.586,
-      "eval_steps_per_second": 6.101,
+      "epoch": 6.684636118598383,
+      "eval_accuracy": 0.3863460913916381,
+      "eval_loss": 3.3641045093536377,
+      "eval_runtime": 183.6686,
+      "eval_samples_per_second": 98.062,
+      "eval_steps_per_second": 6.131,
       "step": 62000
     },
     {
-      "epoch": 6.678506081153805,
-      "grad_norm": 0.7352211475372314,
-      "learning_rate": 0.0001996595194483353,
-      "loss": 3.2716,
+      "epoch": 6.690026954177897,
+      "grad_norm": 0.7635255455970764,
+      "learning_rate": 0.00019896168375607123,
+      "loss": 3.2993,
       "step": 62050
     },
     {
-      "epoch": 6.683887633193413,
-      "grad_norm": 0.7325161695480347,
-      "learning_rate": 0.00019933627841827389,
-      "loss": 3.2848,
+      "epoch": 6.695417789757412,
+      "grad_norm": 0.7203477621078491,
+      "learning_rate": 0.00019863788451160278,
+      "loss": 3.2907,
       "step": 62100
     },
     {
-      "epoch": 6.689269185233021,
-      "grad_norm": 0.7613785266876221,
-      "learning_rate": 0.00019901303738821245,
-      "loss": 3.2751,
+      "epoch": 6.7008086253369274,
+      "grad_norm": 0.7851729989051819,
+      "learning_rate": 0.00019831408526713436,
+      "loss": 3.278,
       "step": 62150
     },
     {
-      "epoch": 6.69465073727263,
-      "grad_norm": 0.7282025814056396,
-      "learning_rate": 0.00019868979635815102,
-      "loss": 3.2849,
+      "epoch": 6.706199460916442,
+      "grad_norm": 0.7568328380584717,
+      "learning_rate": 0.00019799028602266594,
+      "loss": 3.2825,
       "step": 62200
     },
     {
-      "epoch": 6.7000322893122375,
-      "grad_norm": 0.7329760789871216,
-      "learning_rate": 0.00019836655532808964,
-      "loss": 3.2808,
+      "epoch": 6.711590296495957,
+      "grad_norm": 0.8196280598640442,
+      "learning_rate": 0.0001976664867781975,
+      "loss": 3.2929,
       "step": 62250
     },
     {
-      "epoch": 6.705413841351846,
-      "grad_norm": 0.7544186115264893,
-      "learning_rate": 0.0001980433142980282,
-      "loss": 3.2799,
+      "epoch": 6.716981132075472,
+      "grad_norm": 0.7875428795814514,
+      "learning_rate": 0.00019734268753372907,
+      "loss": 3.2781,
       "step": 62300
     },
     {
-      "epoch": 6.710795393391454,
-      "grad_norm": 0.7261156439781189,
-      "learning_rate": 0.00019772007326796678,
-      "loss": 3.2821,
+      "epoch": 6.722371967654986,
+      "grad_norm": 0.7377816438674927,
+      "learning_rate": 0.00019701888828926065,
+      "loss": 3.2958,
       "step": 62350
     },
     {
-      "epoch": 6.716176945431062,
-      "grad_norm": 0.722864031791687,
-      "learning_rate": 0.0001973968322379054,
-      "loss": 3.2838,
+      "epoch": 6.727762803234501,
+      "grad_norm": 0.7832558155059814,
+      "learning_rate": 0.0001966950890447922,
+      "loss": 3.2779,
       "step": 62400
     },
     {
-      "epoch": 6.721558497470671,
-      "grad_norm": 0.7303403615951538,
-      "learning_rate": 0.00019707359120784397,
-      "loss": 3.307,
+      "epoch": 6.7331536388140165,
+      "grad_norm": 0.7229846715927124,
+      "learning_rate": 0.00019637128980032377,
+      "loss": 3.2829,
       "step": 62450
     },
     {
-      "epoch": 6.7269400495102785,
-      "grad_norm": 0.7062789797782898,
-      "learning_rate": 0.00019675035017778253,
-      "loss": 3.2812,
+      "epoch": 6.738544474393531,
+      "grad_norm": 0.7550538778305054,
+      "learning_rate": 0.00019604749055585535,
+      "loss": 3.2874,
       "step": 62500
     },
     {
-      "epoch": 6.732321601549887,
-      "grad_norm": 0.748625636100769,
-      "learning_rate": 0.00019642710914772113,
-      "loss": 3.2913,
+      "epoch": 6.743935309973046,
+      "grad_norm": 0.75005704164505,
+      "learning_rate": 0.00019572369131138693,
+      "loss": 3.2933,
       "step": 62550
     },
     {
-      "epoch": 6.737703153589496,
-      "grad_norm": 0.7621287703514099,
-      "learning_rate": 0.00019610386811765972,
-      "loss": 3.2881,
+      "epoch": 6.74932614555256,
+      "grad_norm": 0.7449986934661865,
+      "learning_rate": 0.0001953998920669185,
+      "loss": 3.2773,
       "step": 62600
     },
     {
-      "epoch": 6.743084705629103,
-      "grad_norm": 0.7435240149497986,
-      "learning_rate": 0.00019578062708759832,
-      "loss": 3.2807,
+      "epoch": 6.754716981132075,
+      "grad_norm": 0.7422336339950562,
+      "learning_rate": 0.00019507609282245006,
+      "loss": 3.2822,
       "step": 62650
     },
     {
-      "epoch": 6.748466257668712,
-      "grad_norm": 0.7503770589828491,
-      "learning_rate": 0.00019545738605753689,
-      "loss": 3.3027,
+      "epoch": 6.7601078167115904,
+      "grad_norm": 0.754895806312561,
+      "learning_rate": 0.00019475229357798164,
+      "loss": 3.2715,
       "step": 62700
     },
     {
-      "epoch": 6.75384780970832,
-      "grad_norm": 0.7563770413398743,
-      "learning_rate": 0.00019513414502747545,
-      "loss": 3.2875,
+      "epoch": 6.765498652291106,
+      "grad_norm": 0.7931267619132996,
+      "learning_rate": 0.00019442849433351319,
+      "loss": 3.3003,
       "step": 62750
     },
     {
-      "epoch": 6.759229361747928,
-      "grad_norm": 0.8158665299415588,
-      "learning_rate": 0.00019481090399741408,
-      "loss": 3.2725,
+      "epoch": 6.77088948787062,
+      "grad_norm": 0.783345639705658,
+      "learning_rate": 0.00019410469508904476,
+      "loss": 3.2854,
       "step": 62800
     },
     {
-      "epoch": 6.7646109137875365,
-      "grad_norm": 0.7590814828872681,
-      "learning_rate": 0.00019448766296735264,
-      "loss": 3.3002,
+      "epoch": 6.776280323450135,
+      "grad_norm": 0.7650436162948608,
+      "learning_rate": 0.00019378089584457634,
+      "loss": 3.2992,
       "step": 62850
     },
     {
-      "epoch": 6.769992465827144,
-      "grad_norm": 0.7146130800247192,
-      "learning_rate": 0.0001941644219372912,
-      "loss": 3.2843,
+      "epoch": 6.781671159029649,
+      "grad_norm": 0.7753744125366211,
+      "learning_rate": 0.00019345709660010792,
+      "loss": 3.2989,
       "step": 62900
     },
     {
-      "epoch": 6.775374017866753,
-      "grad_norm": 0.7704887986183167,
-      "learning_rate": 0.00019384118090722983,
-      "loss": 3.2762,
+      "epoch": 6.787061994609164,
+      "grad_norm": 0.7236669063568115,
+      "learning_rate": 0.0001931332973556395,
+      "loss": 3.272,
       "step": 62950
     },
     {
-      "epoch": 6.780755569906361,
-      "grad_norm": 0.748637855052948,
-      "learning_rate": 0.0001935179398771684,
-      "loss": 3.2641,
+      "epoch": 6.7924528301886795,
+      "grad_norm": 0.7842093110084534,
+      "learning_rate": 0.00019280949811117107,
+      "loss": 3.2813,
       "step": 63000
     },
     {
-      "epoch": 6.780755569906361,
-      "eval_accuracy": 0.3869664993810588,
-      "eval_loss": 3.359816551208496,
-      "eval_runtime": 184.8195,
-      "eval_samples_per_second": 97.452,
-      "eval_steps_per_second": 6.092,
+      "epoch": 6.7924528301886795,
+      "eval_accuracy": 0.3865612241095108,
+      "eval_loss": 3.3622243404388428,
+      "eval_runtime": 183.9001,
+      "eval_samples_per_second": 97.939,
+      "eval_steps_per_second": 6.123,
       "step": 63000
     },
     {
-      "epoch": 6.786137121945969,
-      "grad_norm": 0.8034060597419739,
-      "learning_rate": 0.00019319469884710697,
-      "loss": 3.2973,
+      "epoch": 6.797843665768194,
+      "grad_norm": 0.7585256099700928,
+      "learning_rate": 0.00019248569886670265,
+      "loss": 3.2818,
       "step": 63050
     },
     {
-      "epoch": 6.7915186739855775,
-      "grad_norm": 0.7926182150840759,
-      "learning_rate": 0.00019287145781704556,
-      "loss": 3.2821,
+      "epoch": 6.803234501347709,
+      "grad_norm": 0.7767728567123413,
+      "learning_rate": 0.00019216189962223418,
+      "loss": 3.2819,
       "step": 63100
     },
     {
-      "epoch": 6.796900226025185,
-      "grad_norm": 0.7208576798439026,
-      "learning_rate": 0.00019254821678698416,
-      "loss": 3.283,
+      "epoch": 6.808625336927224,
+      "grad_norm": 0.7501540184020996,
+      "learning_rate": 0.00019183810037776575,
+      "loss": 3.2829,
       "step": 63150
     },
     {
-      "epoch": 6.802281778064794,
-      "grad_norm": 0.7681335210800171,
-      "learning_rate": 0.00019222497575692272,
-      "loss": 3.2823,
+      "epoch": 6.814016172506738,
+      "grad_norm": 0.7633841037750244,
+      "learning_rate": 0.00019151430113329733,
+      "loss": 3.2847,
       "step": 63200
     },
     {
-      "epoch": 6.807663330104402,
-      "grad_norm": 0.7635934352874756,
-      "learning_rate": 0.00019190173472686132,
-      "loss": 3.3013,
+      "epoch": 6.819407008086253,
+      "grad_norm": 0.7208578586578369,
+      "learning_rate": 0.0001911905018888289,
+      "loss": 3.2791,
       "step": 63250
     },
     {
-      "epoch": 6.813044882144011,
-      "grad_norm": 0.7212764024734497,
-      "learning_rate": 0.0001915784936967999,
-      "loss": 3.299,
+      "epoch": 6.824797843665769,
+      "grad_norm": 0.7506759166717529,
+      "learning_rate": 0.00019086670264436049,
+      "loss": 3.269,
       "step": 63300
     },
     {
-      "epoch": 6.8184264341836185,
-      "grad_norm": 0.7291616201400757,
-      "learning_rate": 0.00019125525266673845,
-      "loss": 3.2863,
+      "epoch": 6.830188679245283,
+      "grad_norm": 0.7481265664100647,
+      "learning_rate": 0.00019054290339989206,
+      "loss": 3.2873,
       "step": 63350
     },
     {
-      "epoch": 6.823807986223227,
-      "grad_norm": 0.7780003547668457,
-      "learning_rate": 0.00019093201163667708,
-      "loss": 3.2753,
+      "epoch": 6.835579514824798,
+      "grad_norm": 0.7677626609802246,
+      "learning_rate": 0.00019021910415542364,
+      "loss": 3.277,
       "step": 63400
     },
     {
-      "epoch": 6.829189538262835,
-      "grad_norm": 0.7213042378425598,
-      "learning_rate": 0.00019060877060661564,
-      "loss": 3.2894,
+      "epoch": 6.840970350404312,
+      "grad_norm": 0.85811847448349,
+      "learning_rate": 0.0001898953049109552,
+      "loss": 3.2981,
       "step": 63450
     },
     {
-      "epoch": 6.834571090302443,
-      "grad_norm": 0.7561571002006531,
-      "learning_rate": 0.0001902855295765542,
-      "loss": 3.2773,
+      "epoch": 6.846361185983827,
+      "grad_norm": 0.7475490570068359,
+      "learning_rate": 0.00018957150566648677,
+      "loss": 3.2839,
       "step": 63500
     },
     {
-      "epoch": 6.839952642342052,
-      "grad_norm": 0.8355714678764343,
-      "learning_rate": 0.00018996228854649283,
-      "loss": 3.2936,
+      "epoch": 6.8517520215633425,
+      "grad_norm": 0.7986608147621155,
+      "learning_rate": 0.00018924770642201832,
+      "loss": 3.2864,
       "step": 63550
     },
     {
-      "epoch": 6.8453341943816595,
-      "grad_norm": 0.7631604075431824,
-      "learning_rate": 0.0001896390475164314,
-      "loss": 3.2791,
+      "epoch": 6.857142857142857,
+      "grad_norm": 0.8158076405525208,
+      "learning_rate": 0.0001889239071775499,
+      "loss": 3.28,
       "step": 63600
     },
     {
-      "epoch": 6.850715746421268,
-      "grad_norm": 0.81952303647995,
-      "learning_rate": 0.0001893222713069712,
-      "loss": 3.2909,
+      "epoch": 6.862533692722372,
+      "grad_norm": 0.7751656174659729,
+      "learning_rate": 0.00018860010793308148,
+      "loss": 3.2886,
       "step": 63650
     },
     {
-      "epoch": 6.856097298460876,
-      "grad_norm": 0.7389697432518005,
-      "learning_rate": 0.0001889990302769098,
-      "loss": 3.292,
+      "epoch": 6.867924528301887,
+      "grad_norm": 0.7646748423576355,
+      "learning_rate": 0.00018827630868861305,
+      "loss": 3.2876,
       "step": 63700
     },
     {
-      "epoch": 6.861478850500484,
-      "grad_norm": 0.7638382315635681,
-      "learning_rate": 0.0001886757892468484,
-      "loss": 3.2677,
+      "epoch": 6.873315363881401,
+      "grad_norm": 0.7480393648147583,
+      "learning_rate": 0.0001879525094441446,
+      "loss": 3.2891,
       "step": 63750
     },
     {
-      "epoch": 6.866860402540093,
-      "grad_norm": 0.7346477508544922,
-      "learning_rate": 0.00018835254821678696,
-      "loss": 3.2803,
+      "epoch": 6.878706199460916,
+      "grad_norm": 0.7870710492134094,
+      "learning_rate": 0.00018762871019967618,
+      "loss": 3.2732,
       "step": 63800
     },
     {
-      "epoch": 6.8722419545797,
-      "grad_norm": 0.7897524237632751,
-      "learning_rate": 0.00018802930718672553,
-      "loss": 3.2859,
+      "epoch": 6.884097035040432,
+      "grad_norm": 0.7654228210449219,
+      "learning_rate": 0.00018730491095520776,
+      "loss": 3.2743,
       "step": 63850
     },
     {
-      "epoch": 6.877623506619309,
-      "grad_norm": 0.791483461856842,
-      "learning_rate": 0.00018770606615666415,
-      "loss": 3.2651,
+      "epoch": 6.889487870619946,
+      "grad_norm": 0.7906088829040527,
+      "learning_rate": 0.00018698111171073934,
+      "loss": 3.3025,
       "step": 63900
     },
     {
-      "epoch": 6.8830050586589175,
-      "grad_norm": 0.7509163022041321,
-      "learning_rate": 0.00018738282512660272,
-      "loss": 3.2808,
+      "epoch": 6.894878706199461,
+      "grad_norm": 0.7547808289527893,
+      "learning_rate": 0.0001866573124662709,
+      "loss": 3.2894,
       "step": 63950
     },
     {
-      "epoch": 6.888386610698525,
-      "grad_norm": 0.8272793889045715,
-      "learning_rate": 0.0001870595840965413,
-      "loss": 3.2795,
+      "epoch": 6.900269541778976,
+      "grad_norm": 0.7568020224571228,
+      "learning_rate": 0.00018633351322180246,
+      "loss": 3.2983,
       "step": 64000
     },
     {
-      "epoch": 6.888386610698525,
-      "eval_accuracy": 0.3871891291481907,
-      "eval_loss": 3.356895923614502,
-      "eval_runtime": 184.5849,
-      "eval_samples_per_second": 97.576,
-      "eval_steps_per_second": 6.1,
+      "epoch": 6.900269541778976,
+      "eval_accuracy": 0.3869987692887397,
+      "eval_loss": 3.3588764667510986,
+      "eval_runtime": 183.5221,
+      "eval_samples_per_second": 98.141,
+      "eval_steps_per_second": 6.136,
       "step": 64000
     },
     {
-      "epoch": 6.893768162738134,
-      "grad_norm": 0.7592813968658447,
-      "learning_rate": 0.0001867363430664799,
-      "loss": 3.2996,
+      "epoch": 6.90566037735849,
+      "grad_norm": 0.7180277109146118,
+      "learning_rate": 0.00018600971397733404,
+      "loss": 3.282,
       "step": 64050
     },
     {
-      "epoch": 6.899149714777742,
-      "grad_norm": 0.7349041700363159,
-      "learning_rate": 0.00018641310203641848,
-      "loss": 3.2695,
+      "epoch": 6.9110512129380055,
+      "grad_norm": 0.8041961193084717,
+      "learning_rate": 0.0001856859147328656,
+      "loss": 3.2673,
       "step": 64100
     },
     {
-      "epoch": 6.90453126681735,
-      "grad_norm": 0.7781025767326355,
-      "learning_rate": 0.00018608986100635705,
-      "loss": 3.2748,
+      "epoch": 6.916442048517521,
+      "grad_norm": 0.732753574848175,
+      "learning_rate": 0.00018536859147328655,
+      "loss": 3.2776,
       "step": 64150
     },
     {
-      "epoch": 6.9099128188569585,
-      "grad_norm": 0.7788296937942505,
-      "learning_rate": 0.00018576661997629564,
-      "loss": 3.2818,
+      "epoch": 6.921832884097035,
+      "grad_norm": 0.8041583299636841,
+      "learning_rate": 0.00018504479222881812,
+      "loss": 3.2846,
       "step": 64200
     },
     {
-      "epoch": 6.915294370896566,
-      "grad_norm": 0.8603556752204895,
-      "learning_rate": 0.00018544337894623423,
-      "loss": 3.2798,
+      "epoch": 6.92722371967655,
+      "grad_norm": 0.7492255568504333,
+      "learning_rate": 0.0001847209929843497,
+      "loss": 3.2804,
       "step": 64250
     },
     {
-      "epoch": 6.920675922936175,
-      "grad_norm": 0.7542293667793274,
-      "learning_rate": 0.0001851201379161728,
-      "loss": 3.2944,
+      "epoch": 6.932614555256064,
+      "grad_norm": 0.8601753115653992,
+      "learning_rate": 0.00018439719373988125,
+      "loss": 3.2752,
       "step": 64300
     },
     {
-      "epoch": 6.926057474975783,
-      "grad_norm": 0.7522438764572144,
-      "learning_rate": 0.0001847968968861114,
-      "loss": 3.3009,
+      "epoch": 6.938005390835579,
+      "grad_norm": 0.7451220154762268,
+      "learning_rate": 0.00018407339449541283,
+      "loss": 3.2889,
       "step": 64350
     },
     {
-      "epoch": 6.931439027015391,
-      "grad_norm": 0.7493849992752075,
-      "learning_rate": 0.00018447365585604996,
-      "loss": 3.2968,
+      "epoch": 6.943396226415095,
+      "grad_norm": 0.778103768825531,
+      "learning_rate": 0.00018374959525094438,
+      "loss": 3.2795,
       "step": 64400
     },
     {
-      "epoch": 6.9368205790549995,
-      "grad_norm": 0.7522703409194946,
-      "learning_rate": 0.00018415041482598859,
-      "loss": 3.2705,
+      "epoch": 6.948787061994609,
+      "grad_norm": 0.7740045785903931,
+      "learning_rate": 0.00018342579600647596,
+      "loss": 3.2769,
       "step": 64450
     },
     {
-      "epoch": 6.942202131094608,
-      "grad_norm": 0.7539739012718201,
-      "learning_rate": 0.00018382717379592715,
-      "loss": 3.2899,
+      "epoch": 6.954177897574124,
+      "grad_norm": 0.737163782119751,
+      "learning_rate": 0.00018310199676200753,
+      "loss": 3.2916,
       "step": 64500
     },
     {
-      "epoch": 6.947583683134216,
-      "grad_norm": 0.8078305125236511,
-      "learning_rate": 0.00018350393276586572,
-      "loss": 3.3006,
+      "epoch": 6.959568733153639,
+      "grad_norm": 0.7356468439102173,
+      "learning_rate": 0.0001827781975175391,
+      "loss": 3.2844,
       "step": 64550
     },
     {
-      "epoch": 6.952965235173824,
-      "grad_norm": 0.7467326521873474,
-      "learning_rate": 0.00018318069173580434,
-      "loss": 3.2725,
+      "epoch": 6.964959568733153,
+      "grad_norm": 0.7236778736114502,
+      "learning_rate": 0.0001824543982730707,
+      "loss": 3.2871,
       "step": 64600
     },
     {
-      "epoch": 6.958346787213433,
-      "grad_norm": 0.7241435050964355,
-      "learning_rate": 0.0001828574507057429,
-      "loss": 3.29,
+      "epoch": 6.9703504043126685,
+      "grad_norm": 0.7288910150527954,
+      "learning_rate": 0.00018213059902860227,
+      "loss": 3.2924,
       "step": 64650
     },
     {
-      "epoch": 6.9637283392530405,
-      "grad_norm": 0.7517207860946655,
-      "learning_rate": 0.00018253420967568148,
-      "loss": 3.2657,
+      "epoch": 6.975741239892184,
+      "grad_norm": 0.7726614475250244,
+      "learning_rate": 0.00018180679978413382,
+      "loss": 3.2981,
       "step": 64700
     },
     {
-      "epoch": 6.969109891292649,
-      "grad_norm": 0.8167835474014282,
-      "learning_rate": 0.00018221096864562007,
-      "loss": 3.291,
+      "epoch": 6.981132075471698,
+      "grad_norm": 0.8178704977035522,
+      "learning_rate": 0.00018148300053966537,
+      "loss": 3.2902,
       "step": 64750
     },
     {
-      "epoch": 6.974491443332257,
-      "grad_norm": 0.7494839429855347,
-      "learning_rate": 0.00018188772761555867,
-      "loss": 3.3016,
+      "epoch": 6.986522911051213,
+      "grad_norm": 0.8220640420913696,
+      "learning_rate": 0.00018115920129519695,
+      "loss": 3.288,
       "step": 64800
     },
     {
-      "epoch": 6.979872995371865,
-      "grad_norm": 0.7137006521224976,
-      "learning_rate": 0.00018156448658549723,
-      "loss": 3.2964,
+      "epoch": 6.991913746630727,
+      "grad_norm": 0.7409705519676208,
+      "learning_rate": 0.00018083540205072852,
+      "loss": 3.2857,
       "step": 64850
     },
     {
-      "epoch": 6.985254547411474,
-      "grad_norm": 0.7802633047103882,
-      "learning_rate": 0.00018124124555543583,
-      "loss": 3.2759,
+      "epoch": 6.997304582210242,
+      "grad_norm": 0.79615318775177,
+      "learning_rate": 0.0001805116028062601,
+      "loss": 3.2914,
       "step": 64900
     },
     {
-      "epoch": 6.990636099451081,
-      "grad_norm": 0.7316725254058838,
-      "learning_rate": 0.0001809180045253744,
-      "loss": 3.2743,
+      "epoch": 7.002695417789758,
+      "grad_norm": 0.835076093673706,
+      "learning_rate": 0.00018018780356179168,
+      "loss": 3.2366,
       "step": 64950
     },
     {
-      "epoch": 6.99601765149069,
-      "grad_norm": 0.7353840470314026,
-      "learning_rate": 0.00018059476349531296,
-      "loss": 3.2994,
+      "epoch": 7.008086253369272,
+      "grad_norm": 0.794971764087677,
+      "learning_rate": 0.00017986400431732326,
+      "loss": 3.2014,
       "step": 65000
     },
     {
-      "epoch": 6.99601765149069,
-      "eval_accuracy": 0.38757897570966904,
-      "eval_loss": 3.3525335788726807,
-      "eval_runtime": 184.903,
-      "eval_samples_per_second": 97.408,
-      "eval_steps_per_second": 6.09,
+      "epoch": 7.008086253369272,
+      "eval_accuracy": 0.3871302392829952,
+      "eval_loss": 3.3600101470947266,
+      "eval_runtime": 183.7138,
+      "eval_samples_per_second": 98.038,
+      "eval_steps_per_second": 6.129,
       "step": 65000
     },
     {
-      "epoch": 7.0013992035302985,
-      "grad_norm": 0.7722090482711792,
-      "learning_rate": 0.00018027152246525159,
-      "loss": 3.2508,
+      "epoch": 7.013477088948787,
+      "grad_norm": 0.7871414422988892,
+      "learning_rate": 0.00017954020507285483,
+      "loss": 3.1982,
       "step": 65050
     },
     {
-      "epoch": 7.006780755569906,
-      "grad_norm": 0.745133638381958,
-      "learning_rate": 0.00017994828143519015,
-      "loss": 3.185,
+      "epoch": 7.018867924528302,
+      "grad_norm": 0.8027901649475098,
+      "learning_rate": 0.0001792164058283864,
+      "loss": 3.1985,
       "step": 65100
     },
     {
-      "epoch": 7.012162307609515,
-      "grad_norm": 0.7634187936782837,
-      "learning_rate": 0.00017962504040512872,
-      "loss": 3.185,
+      "epoch": 7.024258760107816,
+      "grad_norm": 0.767401933670044,
+      "learning_rate": 0.00017889260658391794,
+      "loss": 3.2055,
       "step": 65150
     },
     {
-      "epoch": 7.017543859649122,
-      "grad_norm": 0.7494111657142639,
-      "learning_rate": 0.00017930179937506734,
-      "loss": 3.21,
+      "epoch": 7.0296495956873315,
+      "grad_norm": 0.7891222834587097,
+      "learning_rate": 0.00017856880733944951,
+      "loss": 3.184,
       "step": 65200
     },
     {
-      "epoch": 7.022925411688731,
-      "grad_norm": 0.798454999923706,
-      "learning_rate": 0.0001789785583450059,
-      "loss": 3.2116,
+      "epoch": 7.035040431266847,
+      "grad_norm": 0.8046509027481079,
+      "learning_rate": 0.0001782450080949811,
+      "loss": 3.1978,
       "step": 65250
     },
     {
-      "epoch": 7.0283069637283395,
-      "grad_norm": 0.7529196739196777,
-      "learning_rate": 0.00017865531731494448,
-      "loss": 3.2022,
+      "epoch": 7.040431266846361,
+      "grad_norm": 0.791054904460907,
+      "learning_rate": 0.00017792120885051267,
+      "loss": 3.2013,
       "step": 65300
     },
     {
-      "epoch": 7.033688515767947,
-      "grad_norm": 0.7898669242858887,
-      "learning_rate": 0.00017833207628488307,
-      "loss": 3.1969,
+      "epoch": 7.045822102425876,
+      "grad_norm": 0.763992965221405,
+      "learning_rate": 0.00017759740960604425,
+      "loss": 3.1919,
       "step": 65350
     },
     {
-      "epoch": 7.039070067807556,
-      "grad_norm": 0.7746036052703857,
-      "learning_rate": 0.00017800883525482167,
-      "loss": 3.1942,
+      "epoch": 7.051212938005391,
+      "grad_norm": 0.889952540397644,
+      "learning_rate": 0.00017727361036157582,
+      "loss": 3.2239,
       "step": 65400
     },
     {
-      "epoch": 7.044451619847164,
-      "grad_norm": 0.8117377161979675,
-      "learning_rate": 0.00017768559422476026,
-      "loss": 3.2071,
+      "epoch": 7.056603773584905,
+      "grad_norm": 0.7865025401115417,
+      "learning_rate": 0.00017694981111710737,
+      "loss": 3.2114,
       "step": 65450
     },
     {
-      "epoch": 7.049833171886772,
-      "grad_norm": 0.7499861121177673,
-      "learning_rate": 0.00017736235319469883,
-      "loss": 3.2166,
+      "epoch": 7.061994609164421,
+      "grad_norm": 0.7957040667533875,
+      "learning_rate": 0.00017662601187263895,
+      "loss": 3.202,
       "step": 65500
     },
     {
-      "epoch": 7.0552147239263805,
-      "grad_norm": 0.7656523585319519,
-      "learning_rate": 0.0001770391121646374,
-      "loss": 3.1925,
+      "epoch": 7.067385444743936,
+      "grad_norm": 0.7598467469215393,
+      "learning_rate": 0.0001763022126281705,
+      "loss": 3.2151,
       "step": 65550
     },
     {
-      "epoch": 7.060596275965988,
-      "grad_norm": 0.7908286452293396,
-      "learning_rate": 0.00017671587113457602,
-      "loss": 3.2114,
+      "epoch": 7.07277628032345,
+      "grad_norm": 0.7556450963020325,
+      "learning_rate": 0.00017597841338370208,
+      "loss": 3.1908,
       "step": 65600
     },
     {
-      "epoch": 7.065977828005597,
-      "grad_norm": 0.7967495322227478,
-      "learning_rate": 0.0001763926301045146,
-      "loss": 3.2132,
+      "epoch": 7.078167115902965,
+      "grad_norm": 0.7661687135696411,
+      "learning_rate": 0.00017565461413923366,
+      "loss": 3.215,
       "step": 65650
     },
     {
-      "epoch": 7.071359380045205,
-      "grad_norm": 0.7433943748474121,
-      "learning_rate": 0.00017606938907445315,
-      "loss": 3.2086,
+      "epoch": 7.083557951482479,
+      "grad_norm": 0.7638933658599854,
+      "learning_rate": 0.00017533081489476524,
+      "loss": 3.2194,
       "step": 65700
     },
     {
-      "epoch": 7.076740932084813,
-      "grad_norm": 0.7751272320747375,
-      "learning_rate": 0.00017574614804439178,
-      "loss": 3.2388,
+      "epoch": 7.0889487870619945,
+      "grad_norm": 0.8166995644569397,
+      "learning_rate": 0.0001750070156502968,
+      "loss": 3.2183,
       "step": 65750
     },
     {
-      "epoch": 7.0821224841244215,
-      "grad_norm": 0.7907826900482178,
-      "learning_rate": 0.00017542290701433034,
-      "loss": 3.2292,
+      "epoch": 7.09433962264151,
+      "grad_norm": 0.832233190536499,
+      "learning_rate": 0.00017468321640582836,
+      "loss": 3.2143,
       "step": 65800
     },
     {
-      "epoch": 7.08750403616403,
-      "grad_norm": 0.7536311745643616,
-      "learning_rate": 0.0001750996659842689,
-      "loss": 3.2051,
+      "epoch": 7.099730458221024,
+      "grad_norm": 0.7524721026420593,
+      "learning_rate": 0.00017435941716135994,
+      "loss": 3.198,
       "step": 65850
     },
     {
-      "epoch": 7.092885588203638,
-      "grad_norm": 0.8473614454269409,
-      "learning_rate": 0.0001747764249542075,
-      "loss": 3.1845,
+      "epoch": 7.105121293800539,
+      "grad_norm": 0.7654711008071899,
+      "learning_rate": 0.00017403561791689152,
+      "loss": 3.2277,
       "step": 65900
     },
     {
-      "epoch": 7.098267140243246,
-      "grad_norm": 0.764302134513855,
-      "learning_rate": 0.0001744531839241461,
-      "loss": 3.193,
+      "epoch": 7.110512129380054,
+      "grad_norm": 0.7651508450508118,
+      "learning_rate": 0.0001737118186724231,
+      "loss": 3.1887,
       "step": 65950
     },
     {
-      "epoch": 7.103648692282855,
-      "grad_norm": 0.7599833607673645,
-      "learning_rate": 0.00017412994289408467,
-      "loss": 3.2226,
+      "epoch": 7.115902964959568,
+      "grad_norm": 0.7957179546356201,
+      "learning_rate": 0.00017338801942795465,
+      "loss": 3.2214,
       "step": 66000
     },
     {
-      "epoch": 7.103648692282855,
-      "eval_accuracy": 0.38751063304323374,
-      "eval_loss": 3.3595468997955322,
-      "eval_runtime": 184.5489,
-      "eval_samples_per_second": 97.595,
-      "eval_steps_per_second": 6.101,
+      "epoch": 7.115902964959568,
+      "eval_accuracy": 0.3870713494177998,
+      "eval_loss": 3.360347032546997,
+      "eval_runtime": 183.8356,
+      "eval_samples_per_second": 97.973,
+      "eval_steps_per_second": 6.125,
       "step": 66000
     },
     {
-      "epoch": 7.109030244322462,
-      "grad_norm": 0.7426865100860596,
-      "learning_rate": 0.00017380670186402326,
-      "loss": 3.2087,
+      "epoch": 7.121293800539084,
+      "grad_norm": 0.7835813164710999,
+      "learning_rate": 0.00017306422018348623,
+      "loss": 3.2142,
       "step": 66050
     },
     {
-      "epoch": 7.114411796362071,
-      "grad_norm": 0.7286989688873291,
-      "learning_rate": 0.00017348346083396183,
-      "loss": 3.2345,
+      "epoch": 7.126684636118599,
+      "grad_norm": 0.7825093865394592,
+      "learning_rate": 0.00017274042093901778,
+      "loss": 3.2036,
       "step": 66100
     },
     {
-      "epoch": 7.119793348401679,
-      "grad_norm": 0.7323970794677734,
-      "learning_rate": 0.00017316021980390042,
-      "loss": 3.2154,
+      "epoch": 7.132075471698113,
+      "grad_norm": 0.7593418955802917,
+      "learning_rate": 0.00017241662169454935,
+      "loss": 3.2382,
       "step": 66150
     },
     {
-      "epoch": 7.125174900441287,
-      "grad_norm": 0.8056437373161316,
-      "learning_rate": 0.00017283697877383902,
-      "loss": 3.2244,
+      "epoch": 7.137466307277628,
+      "grad_norm": 0.8279274702072144,
+      "learning_rate": 0.00017209282245008093,
+      "loss": 3.2337,
       "step": 66200
     },
     {
-      "epoch": 7.130556452480896,
-      "grad_norm": 0.8104016184806824,
-      "learning_rate": 0.0001725137377437776,
-      "loss": 3.2237,
+      "epoch": 7.142857142857143,
+      "grad_norm": 0.7875446677207947,
+      "learning_rate": 0.0001717690232056125,
+      "loss": 3.223,
       "step": 66250
     },
     {
-      "epoch": 7.135938004520503,
-      "grad_norm": 0.7950676679611206,
-      "learning_rate": 0.00017219049671371615,
-      "loss": 3.2161,
+      "epoch": 7.1482479784366575,
+      "grad_norm": 0.822761058807373,
+      "learning_rate": 0.0001714452239611441,
+      "loss": 3.2112,
       "step": 66300
     },
     {
-      "epoch": 7.141319556560112,
-      "grad_norm": 0.7405356764793396,
-      "learning_rate": 0.00017186725568365478,
-      "loss": 3.2312,
+      "epoch": 7.153638814016173,
+      "grad_norm": 0.7847926616668701,
+      "learning_rate": 0.000171127900701565,
+      "loss": 3.2183,
       "step": 66350
     },
     {
-      "epoch": 7.1467011085997205,
-      "grad_norm": 0.7578369379043579,
-      "learning_rate": 0.00017154401465359334,
-      "loss": 3.2037,
+      "epoch": 7.159029649595688,
+      "grad_norm": 0.7854703664779663,
+      "learning_rate": 0.0001708041014570966,
+      "loss": 3.2161,
       "step": 66400
     },
     {
-      "epoch": 7.152082660639328,
-      "grad_norm": 0.7615239024162292,
-      "learning_rate": 0.00017122077362353194,
-      "loss": 3.2061,
+      "epoch": 7.164420485175202,
+      "grad_norm": 3.3990447521209717,
+      "learning_rate": 0.00017048030221262814,
+      "loss": 3.2148,
       "step": 66450
     },
     {
-      "epoch": 7.157464212678937,
-      "grad_norm": 0.7853385210037231,
-      "learning_rate": 0.00017089753259347053,
-      "loss": 3.2221,
+      "epoch": 7.169811320754717,
+      "grad_norm": 0.7549009323120117,
+      "learning_rate": 0.00017015650296815972,
+      "loss": 3.2194,
       "step": 66500
     },
     {
-      "epoch": 7.162845764718545,
-      "grad_norm": 0.7812248468399048,
-      "learning_rate": 0.0001705742915634091,
-      "loss": 3.2089,
+      "epoch": 7.175202156334231,
+      "grad_norm": 0.764918863773346,
+      "learning_rate": 0.0001698327037236913,
+      "loss": 3.2173,
       "step": 66550
     },
     {
-      "epoch": 7.168227316758153,
-      "grad_norm": 0.79538494348526,
-      "learning_rate": 0.0001702510505333477,
-      "loss": 3.2266,
+      "epoch": 7.180592991913747,
+      "grad_norm": 0.7957922220230103,
+      "learning_rate": 0.00016950890447922287,
+      "loss": 3.2251,
       "step": 66600
     },
     {
-      "epoch": 7.1736088687977615,
-      "grad_norm": 0.7920806407928467,
-      "learning_rate": 0.00016992780950328626,
-      "loss": 3.2159,
+      "epoch": 7.185983827493262,
+      "grad_norm": 0.7517008185386658,
+      "learning_rate": 0.00016918510523475445,
+      "loss": 3.224,
       "step": 66650
     },
     {
-      "epoch": 7.178990420837369,
-      "grad_norm": 0.8788477182388306,
-      "learning_rate": 0.00016960456847322486,
-      "loss": 3.2352,
+      "epoch": 7.191374663072776,
+      "grad_norm": 0.7847917079925537,
+      "learning_rate": 0.00016886130599028603,
+      "loss": 3.2224,
       "step": 66700
     },
     {
-      "epoch": 7.184371972876978,
-      "grad_norm": 0.787079930305481,
-      "learning_rate": 0.00016928132744316345,
-      "loss": 3.2172,
+      "epoch": 7.196765498652291,
+      "grad_norm": 0.8399714231491089,
+      "learning_rate": 0.00016853750674581755,
+      "loss": 3.2064,
       "step": 66750
     },
     {
-      "epoch": 7.189753524916586,
-      "grad_norm": 0.7809692025184631,
-      "learning_rate": 0.00016895808641310202,
-      "loss": 3.2311,
+      "epoch": 7.202156334231806,
+      "grad_norm": 0.7885647416114807,
+      "learning_rate": 0.00016821370750134913,
+      "loss": 3.215,
       "step": 66800
     },
     {
-      "epoch": 7.195135076956194,
-      "grad_norm": 0.7488144636154175,
-      "learning_rate": 0.0001686348453830406,
-      "loss": 3.2273,
+      "epoch": 7.2075471698113205,
+      "grad_norm": 0.7742317914962769,
+      "learning_rate": 0.0001678899082568807,
+      "loss": 3.2136,
       "step": 66850
     },
     {
-      "epoch": 7.2005166289958025,
-      "grad_norm": 0.7899309396743774,
-      "learning_rate": 0.0001683116043529792,
-      "loss": 3.2404,
+      "epoch": 7.212938005390836,
+      "grad_norm": 0.7558757662773132,
+      "learning_rate": 0.00016756610901241228,
+      "loss": 3.2134,
       "step": 66900
     },
     {
-      "epoch": 7.205898181035411,
-      "grad_norm": 0.7522909045219421,
-      "learning_rate": 0.00016798836332291778,
-      "loss": 3.2386,
+      "epoch": 7.218328840970351,
+      "grad_norm": 0.8128474950790405,
+      "learning_rate": 0.00016724230976794386,
+      "loss": 3.2153,
       "step": 66950
     },
     {
-      "epoch": 7.211279733075019,
-      "grad_norm": 0.7979075312614441,
-      "learning_rate": 0.00016766512229285634,
-      "loss": 3.236,
+      "epoch": 7.223719676549865,
+      "grad_norm": 0.7827127575874329,
+      "learning_rate": 0.00016691851052347544,
+      "loss": 3.2243,
       "step": 67000
     },
     {
-      "epoch": 7.211279733075019,
-      "eval_accuracy": 0.3877005582911335,
-      "eval_loss": 3.3552801609039307,
-      "eval_runtime": 184.2064,
-      "eval_samples_per_second": 97.776,
-      "eval_steps_per_second": 6.113,
+      "epoch": 7.223719676549865,
+      "eval_accuracy": 0.38756995751998047,
+      "eval_loss": 3.358355760574341,
+      "eval_runtime": 183.444,
+      "eval_samples_per_second": 98.183,
+      "eval_steps_per_second": 6.138,
       "step": 67000
     },
     {
-      "epoch": 7.216661285114627,
-      "grad_norm": 0.7617997527122498,
-      "learning_rate": 0.00016734188126279494,
-      "loss": 3.2404,
+      "epoch": 7.22911051212938,
+      "grad_norm": 0.8113185167312622,
+      "learning_rate": 0.00016659471127900702,
+      "loss": 3.2343,
       "step": 67050
     },
     {
-      "epoch": 7.222042837154235,
-      "grad_norm": 0.7634185552597046,
-      "learning_rate": 0.00016701864023273353,
-      "loss": 3.2352,
+      "epoch": 7.234501347708895,
+      "grad_norm": 0.7938553690910339,
+      "learning_rate": 0.0001662709120345386,
+      "loss": 3.2358,
       "step": 67100
     },
     {
-      "epoch": 7.2274243891938434,
-      "grad_norm": 0.8232079148292542,
-      "learning_rate": 0.0001666953992026721,
-      "loss": 3.2292,
+      "epoch": 7.2398921832884096,
+      "grad_norm": 0.8004481196403503,
+      "learning_rate": 0.00016594711279007015,
+      "loss": 3.1933,
       "step": 67150
     },
     {
-      "epoch": 7.232805941233452,
-      "grad_norm": 0.8213094472885132,
-      "learning_rate": 0.0001663721581726107,
-      "loss": 3.2231,
+      "epoch": 7.245283018867925,
+      "grad_norm": 0.8019503951072693,
+      "learning_rate": 0.0001656233135456017,
+      "loss": 3.2262,
       "step": 67200
     },
     {
-      "epoch": 7.23818749327306,
-      "grad_norm": 0.7654265761375427,
-      "learning_rate": 0.00016604891714254926,
-      "loss": 3.2287,
+      "epoch": 7.250673854447439,
+      "grad_norm": 0.756632387638092,
+      "learning_rate": 0.00016529951430113327,
+      "loss": 3.2188,
       "step": 67250
     },
     {
-      "epoch": 7.243569045312668,
-      "grad_norm": 0.7495639324188232,
-      "learning_rate": 0.00016572567611248786,
-      "loss": 3.2287,
+      "epoch": 7.256064690026954,
+      "grad_norm": 0.7605134844779968,
+      "learning_rate": 0.00016497571505666485,
+      "loss": 3.2257,
       "step": 67300
     },
     {
-      "epoch": 7.248950597352277,
-      "grad_norm": 0.7579506039619446,
-      "learning_rate": 0.00016540243508242645,
-      "loss": 3.2304,
+      "epoch": 7.261455525606469,
+      "grad_norm": 0.7907736897468567,
+      "learning_rate": 0.00016465191581219643,
+      "loss": 3.241,
       "step": 67350
     },
     {
-      "epoch": 7.254332149391884,
-      "grad_norm": 0.7823368906974792,
-      "learning_rate": 0.00016507919405236502,
-      "loss": 3.209,
+      "epoch": 7.2668463611859835,
+      "grad_norm": 0.7794830799102783,
+      "learning_rate": 0.000164328116567728,
+      "loss": 3.2353,
       "step": 67400
     },
     {
-      "epoch": 7.259713701431493,
-      "grad_norm": 0.7844114303588867,
-      "learning_rate": 0.00016475595302230364,
-      "loss": 3.2216,
+      "epoch": 7.272237196765499,
+      "grad_norm": 0.8242986798286438,
+      "learning_rate": 0.00016400431732325956,
+      "loss": 3.2464,
       "step": 67450
     },
     {
-      "epoch": 7.265095253471101,
-      "grad_norm": 0.7908133864402771,
-      "learning_rate": 0.0001644327119922422,
-      "loss": 3.2328,
+      "epoch": 7.277628032345014,
+      "grad_norm": 0.7823339700698853,
+      "learning_rate": 0.00016368051807879114,
+      "loss": 3.2166,
       "step": 67500
     },
     {
-      "epoch": 7.270476805510709,
-      "grad_norm": 0.8082903623580933,
-      "learning_rate": 0.00016410947096218078,
-      "loss": 3.2432,
+      "epoch": 7.283018867924528,
+      "grad_norm": 0.7602329254150391,
+      "learning_rate": 0.0001633567188343227,
+      "loss": 3.2096,
       "step": 67550
     },
     {
-      "epoch": 7.275858357550318,
-      "grad_norm": 0.7870227694511414,
-      "learning_rate": 0.00016378622993211937,
-      "loss": 3.2239,
+      "epoch": 7.288409703504043,
+      "grad_norm": 0.7734456062316895,
+      "learning_rate": 0.00016303291958985426,
+      "loss": 3.2457,
       "step": 67600
     },
     {
-      "epoch": 7.281239909589925,
-      "grad_norm": 0.7916932702064514,
-      "learning_rate": 0.00016346298890205797,
-      "loss": 3.2237,
+      "epoch": 7.293800539083558,
+      "grad_norm": 0.8424938917160034,
+      "learning_rate": 0.00016270912034538584,
+      "loss": 3.2314,
       "step": 67650
     },
     {
-      "epoch": 7.286621461629534,
-      "grad_norm": 0.7457194924354553,
-      "learning_rate": 0.00016314621269259777,
-      "loss": 3.2412,
+      "epoch": 7.2991913746630726,
+      "grad_norm": 0.7849023342132568,
+      "learning_rate": 0.00016238532110091742,
+      "loss": 3.2246,
       "step": 67700
     },
     {
-      "epoch": 7.2920030136691425,
-      "grad_norm": 0.8166563510894775,
-      "learning_rate": 0.00016282297166253634,
-      "loss": 3.2098,
+      "epoch": 7.304582210242588,
+      "grad_norm": 0.7933061122894287,
+      "learning_rate": 0.000162061521856449,
+      "loss": 3.2295,
       "step": 67750
     },
     {
-      "epoch": 7.29738456570875,
-      "grad_norm": 0.8328801393508911,
-      "learning_rate": 0.00016249973063247494,
-      "loss": 3.2359,
+      "epoch": 7.309973045822103,
+      "grad_norm": 0.8396109938621521,
+      "learning_rate": 0.00016173772261198055,
+      "loss": 3.2416,
       "step": 67800
     },
     {
-      "epoch": 7.302766117748359,
-      "grad_norm": 0.777811586856842,
-      "learning_rate": 0.00016217648960241353,
-      "loss": 3.2321,
+      "epoch": 7.315363881401617,
+      "grad_norm": 0.8306365013122559,
+      "learning_rate": 0.00016141392336751212,
+      "loss": 3.2286,
       "step": 67850
     },
     {
-      "epoch": 7.308147669787967,
-      "grad_norm": 0.8480938673019409,
-      "learning_rate": 0.0001618532485723521,
-      "loss": 3.2132,
+      "epoch": 7.320754716981132,
+      "grad_norm": 0.8018426895141602,
+      "learning_rate": 0.0001610901241230437,
+      "loss": 3.2235,
       "step": 67900
     },
     {
-      "epoch": 7.313529221827575,
-      "grad_norm": 0.7613167762756348,
-      "learning_rate": 0.00016153000754229067,
-      "loss": 3.235,
+      "epoch": 7.3261455525606465,
+      "grad_norm": 0.7814752459526062,
+      "learning_rate": 0.00016076632487857528,
+      "loss": 3.2307,
       "step": 67950
     },
     {
-      "epoch": 7.3189107738671835,
-      "grad_norm": 0.7387501001358032,
-      "learning_rate": 0.0001612067665122293,
-      "loss": 3.2177,
+      "epoch": 7.331536388140162,
+      "grad_norm": 0.8125154376029968,
+      "learning_rate": 0.00016044252563410686,
+      "loss": 3.226,
       "step": 68000
     },
     {
-      "epoch": 7.3189107738671835,
-      "eval_accuracy": 0.3881735243117899,
-      "eval_loss": 3.3549270629882812,
-      "eval_runtime": 183.9194,
-      "eval_samples_per_second": 97.929,
-      "eval_steps_per_second": 6.122,
+      "epoch": 7.331536388140162,
+      "eval_accuracy": 0.38801782472355173,
+      "eval_loss": 3.355308771133423,
+      "eval_runtime": 183.676,
+      "eval_samples_per_second": 98.059,
+      "eval_steps_per_second": 6.13,
       "step": 68000
     },
     {
-      "epoch": 7.324292325906791,
-      "grad_norm": 0.8005468845367432,
-      "learning_rate": 0.00016088352548216785,
-      "loss": 3.224,
+      "epoch": 7.336927223719677,
+      "grad_norm": 0.7748324275016785,
+      "learning_rate": 0.0001601187263896384,
+      "loss": 3.223,
       "step": 68050
     },
     {
-      "epoch": 7.3296738779464,
-      "grad_norm": 0.7966600060462952,
-      "learning_rate": 0.00016056028445210642,
-      "loss": 3.2397,
+      "epoch": 7.342318059299191,
+      "grad_norm": 0.7646241188049316,
+      "learning_rate": 0.00015979492714516996,
+      "loss": 3.2285,
       "step": 68100
     },
     {
-      "epoch": 7.335055429986008,
-      "grad_norm": 0.727301836013794,
-      "learning_rate": 0.00016023704342204504,
-      "loss": 3.2308,
+      "epoch": 7.347708894878706,
+      "grad_norm": 0.7891027331352234,
+      "learning_rate": 0.00015947112790070154,
+      "loss": 3.2304,
       "step": 68150
     },
     {
-      "epoch": 7.340436982025616,
-      "grad_norm": 0.7933337688446045,
-      "learning_rate": 0.0001599138023919836,
-      "loss": 3.2197,
+      "epoch": 7.353099730458221,
+      "grad_norm": 0.8173832297325134,
+      "learning_rate": 0.00015914732865623311,
+      "loss": 3.2261,
       "step": 68200
     },
     {
-      "epoch": 7.3458185340652244,
-      "grad_norm": 0.77813321352005,
-      "learning_rate": 0.0001595905613619222,
-      "loss": 3.2116,
+      "epoch": 7.3584905660377355,
+      "grad_norm": 0.8146997690200806,
+      "learning_rate": 0.0001588235294117647,
+      "loss": 3.2288,
       "step": 68250
     },
     {
-      "epoch": 7.351200086104833,
-      "grad_norm": 0.8122656345367432,
-      "learning_rate": 0.00015926732033186077,
-      "loss": 3.2054,
+      "epoch": 7.363881401617251,
+      "grad_norm": 0.8188638687133789,
+      "learning_rate": 0.00015849973016729627,
+      "loss": 3.2446,
       "step": 68300
     },
     {
-      "epoch": 7.356581638144441,
-      "grad_norm": 0.7639948725700378,
-      "learning_rate": 0.00015894407930179934,
-      "loss": 3.2372,
+      "epoch": 7.369272237196766,
+      "grad_norm": 0.7957025170326233,
+      "learning_rate": 0.00015817593092282785,
+      "loss": 3.2231,
       "step": 68350
     },
     {
-      "epoch": 7.361963190184049,
-      "grad_norm": 0.7615376710891724,
-      "learning_rate": 0.00015862083827173796,
-      "loss": 3.2182,
+      "epoch": 7.37466307277628,
+      "grad_norm": 0.7923418879508972,
+      "learning_rate": 0.00015785213167835942,
+      "loss": 3.2206,
       "step": 68400
     },
     {
-      "epoch": 7.367344742223658,
-      "grad_norm": 0.7874367833137512,
-      "learning_rate": 0.00015829759724167653,
-      "loss": 3.2262,
+      "epoch": 7.380053908355795,
+      "grad_norm": 0.839530885219574,
+      "learning_rate": 0.00015752833243389095,
+      "loss": 3.2353,
       "step": 68450
     },
     {
-      "epoch": 7.372726294263265,
-      "grad_norm": 0.7851904630661011,
-      "learning_rate": 0.0001579743562116151,
-      "loss": 3.2176,
+      "epoch": 7.38544474393531,
+      "grad_norm": 0.9345459938049316,
+      "learning_rate": 0.00015720453318942253,
+      "loss": 3.2093,
       "step": 68500
     },
     {
-      "epoch": 7.378107846302874,
-      "grad_norm": 0.7756659388542175,
-      "learning_rate": 0.00015765111518155372,
-      "loss": 3.2207,
+      "epoch": 7.390835579514825,
+      "grad_norm": 0.782017707824707,
+      "learning_rate": 0.0001568807339449541,
+      "loss": 3.201,
       "step": 68550
     },
     {
-      "epoch": 7.383489398342482,
-      "grad_norm": 0.7584941387176514,
-      "learning_rate": 0.0001573278741514923,
-      "loss": 3.2037,
+      "epoch": 7.39622641509434,
+      "grad_norm": 0.803848147392273,
+      "learning_rate": 0.00015655693470048568,
+      "loss": 3.2296,
       "step": 68600
     },
     {
-      "epoch": 7.38887095038209,
-      "grad_norm": 0.7656226754188538,
-      "learning_rate": 0.00015700463312143085,
-      "loss": 3.241,
+      "epoch": 7.401617250673855,
+      "grad_norm": 0.8133928775787354,
+      "learning_rate": 0.00015623313545601726,
+      "loss": 3.2408,
       "step": 68650
     },
     {
-      "epoch": 7.394252502421699,
-      "grad_norm": 0.814250648021698,
-      "learning_rate": 0.00015668139209136945,
-      "loss": 3.2298,
+      "epoch": 7.407008086253369,
+      "grad_norm": 0.7830129861831665,
+      "learning_rate": 0.00015590933621154884,
+      "loss": 3.2185,
       "step": 68700
     },
     {
-      "epoch": 7.399634054461306,
-      "grad_norm": 0.8023512363433838,
-      "learning_rate": 0.00015635815106130804,
-      "loss": 3.2192,
+      "epoch": 7.412398921832884,
+      "grad_norm": 0.8253718018531799,
+      "learning_rate": 0.00015558553696708041,
+      "loss": 3.2344,
       "step": 68750
     },
     {
-      "epoch": 7.405015606500915,
-      "grad_norm": 0.845595121383667,
-      "learning_rate": 0.0001560349100312466,
-      "loss": 3.2426,
+      "epoch": 7.4177897574123985,
+      "grad_norm": 0.8411740660667419,
+      "learning_rate": 0.00015526173772261196,
+      "loss": 3.225,
       "step": 68800
     },
     {
-      "epoch": 7.4103971585405235,
-      "grad_norm": 0.7891848087310791,
-      "learning_rate": 0.0001557116690011852,
-      "loss": 3.214,
+      "epoch": 7.423180592991914,
+      "grad_norm": 0.8184799551963806,
+      "learning_rate": 0.00015493793847814354,
+      "loss": 3.2399,
       "step": 68850
     },
     {
-      "epoch": 7.415778710580131,
-      "grad_norm": 0.8064113855361938,
-      "learning_rate": 0.00015538842797112377,
-      "loss": 3.2318,
+      "epoch": 7.428571428571429,
+      "grad_norm": 0.8034173250198364,
+      "learning_rate": 0.0001546141392336751,
+      "loss": 3.2341,
       "step": 68900
     },
     {
-      "epoch": 7.42116026261974,
-      "grad_norm": 0.7896233797073364,
-      "learning_rate": 0.00015506518694106237,
-      "loss": 3.2226,
+      "epoch": 7.433962264150943,
+      "grad_norm": 0.8255000114440918,
+      "learning_rate": 0.00015429033998920667,
+      "loss": 3.2394,
       "step": 68950
     },
     {
-      "epoch": 7.426541814659347,
-      "grad_norm": 0.7546834945678711,
-      "learning_rate": 0.00015474194591100096,
-      "loss": 3.2373,
+      "epoch": 7.439353099730458,
+      "grad_norm": 0.8404207825660706,
+      "learning_rate": 0.00015396654074473825,
+      "loss": 3.2498,
       "step": 69000
     },
     {
-      "epoch": 7.426541814659347,
-      "eval_accuracy": 0.3881545100564224,
-      "eval_loss": 3.350152015686035,
-      "eval_runtime": 183.5756,
-      "eval_samples_per_second": 98.112,
-      "eval_steps_per_second": 6.134,
+      "epoch": 7.439353099730458,
+      "eval_accuracy": 0.3881027912818227,
+      "eval_loss": 3.3517191410064697,
+      "eval_runtime": 183.9439,
+      "eval_samples_per_second": 97.916,
+      "eval_steps_per_second": 6.121,
       "step": 69000
     },
     {
-      "epoch": 7.431923366698956,
-      "grad_norm": 0.8021217584609985,
-      "learning_rate": 0.00015441870488093953,
-      "loss": 3.2293,
+      "epoch": 7.444743935309973,
+      "grad_norm": 0.8004981279373169,
+      "learning_rate": 0.00015364274150026983,
+      "loss": 3.2569,
       "step": 69050
     },
     {
-      "epoch": 7.4373049187385645,
-      "grad_norm": 0.7711283564567566,
-      "learning_rate": 0.0001540954638508781,
-      "loss": 3.2068,
+      "epoch": 7.450134770889488,
+      "grad_norm": 0.8153196573257446,
+      "learning_rate": 0.0001533189422558014,
+      "loss": 3.2388,
       "step": 69100
     },
     {
-      "epoch": 7.442686470778172,
-      "grad_norm": 0.7406951189041138,
-      "learning_rate": 0.00015377222282081672,
-      "loss": 3.2363,
+      "epoch": 7.455525606469003,
+      "grad_norm": 0.8217753767967224,
+      "learning_rate": 0.00015299514301133295,
+      "loss": 3.2183,
       "step": 69150
     },
     {
-      "epoch": 7.448068022817781,
-      "grad_norm": 0.7828607559204102,
-      "learning_rate": 0.0001534489817907553,
-      "loss": 3.23,
+      "epoch": 7.460916442048518,
+      "grad_norm": 0.8331478834152222,
+      "learning_rate": 0.00015267134376686453,
+      "loss": 3.2407,
       "step": 69200
     },
     {
-      "epoch": 7.453449574857389,
-      "grad_norm": 0.7775204181671143,
-      "learning_rate": 0.00015312574076069388,
-      "loss": 3.2188,
+      "epoch": 7.466307277628032,
+      "grad_norm": 0.8319252133369446,
+      "learning_rate": 0.0001523475445223961,
+      "loss": 3.234,
       "step": 69250
     },
     {
-      "epoch": 7.458831126896997,
-      "grad_norm": 0.8648480176925659,
-      "learning_rate": 0.00015280249973063248,
-      "loss": 3.2287,
+      "epoch": 7.471698113207547,
+      "grad_norm": 0.8192378878593445,
+      "learning_rate": 0.00015202374527792766,
+      "loss": 3.2284,
       "step": 69300
     },
     {
-      "epoch": 7.4642126789366054,
-      "grad_norm": 0.7523376941680908,
-      "learning_rate": 0.00015247925870057104,
-      "loss": 3.2364,
+      "epoch": 7.4770889487870615,
+      "grad_norm": 0.8338528871536255,
+      "learning_rate": 0.00015169994603345924,
+      "loss": 3.229,
       "step": 69350
     },
     {
-      "epoch": 7.469594230976213,
-      "grad_norm": 0.7940405607223511,
-      "learning_rate": 0.00015215601767050964,
-      "loss": 3.2329,
+      "epoch": 7.482479784366577,
+      "grad_norm": 0.8317149877548218,
+      "learning_rate": 0.00015137614678899082,
+      "loss": 3.2566,
       "step": 69400
     },
     {
-      "epoch": 7.474975783015822,
-      "grad_norm": 0.8218305706977844,
-      "learning_rate": 0.0001518327766404482,
-      "loss": 3.2279,
+      "epoch": 7.487870619946092,
+      "grad_norm": 0.8038789629936218,
+      "learning_rate": 0.00015105234754452237,
+      "loss": 3.2393,
       "step": 69450
     },
     {
-      "epoch": 7.48035733505543,
-      "grad_norm": 0.8601292967796326,
-      "learning_rate": 0.0001515095356103868,
-      "loss": 3.2287,
+      "epoch": 7.493261455525606,
+      "grad_norm": 0.789286732673645,
+      "learning_rate": 0.00015072854830005394,
+      "loss": 3.2284,
       "step": 69500
     },
     {
-      "epoch": 7.485738887095038,
-      "grad_norm": 0.7894765138626099,
-      "learning_rate": 0.0001511862945803254,
-      "loss": 3.2431,
+      "epoch": 7.498652291105121,
+      "grad_norm": 0.8185145258903503,
+      "learning_rate": 0.00015040474905558552,
+      "loss": 3.2413,
       "step": 69550
     },
     {
-      "epoch": 7.491120439134646,
-      "grad_norm": 0.7626713514328003,
-      "learning_rate": 0.00015086305355026396,
-      "loss": 3.239,
+      "epoch": 7.504043126684636,
+      "grad_norm": 0.8178480267524719,
+      "learning_rate": 0.0001500809498111171,
+      "loss": 3.2342,
       "step": 69600
     },
     {
-      "epoch": 7.496501991174255,
-      "grad_norm": 0.7660843133926392,
-      "learning_rate": 0.00015053981252020253,
-      "loss": 3.2452,
+      "epoch": 7.509433962264151,
+      "grad_norm": 0.7696956992149353,
+      "learning_rate": 0.00014975715056664865,
+      "loss": 3.2172,
       "step": 69650
     },
     {
-      "epoch": 7.501883543213863,
-      "grad_norm": 0.7898152470588684,
-      "learning_rate": 0.00015021657149014115,
-      "loss": 3.2364,
+      "epoch": 7.514824797843666,
+      "grad_norm": 0.792907178401947,
+      "learning_rate": 0.00014943335132218023,
+      "loss": 3.2548,
       "step": 69700
     },
     {
-      "epoch": 7.507265095253471,
-      "grad_norm": 0.7465683221817017,
-      "learning_rate": 0.00014989333046007972,
-      "loss": 3.2378,
+      "epoch": 7.520215633423181,
+      "grad_norm": 0.7962068915367126,
+      "learning_rate": 0.0001491095520777118,
+      "loss": 3.2337,
       "step": 69750
     },
     {
-      "epoch": 7.51264664729308,
-      "grad_norm": 0.8163168430328369,
-      "learning_rate": 0.00014957008943001832,
-      "loss": 3.2374,
+      "epoch": 7.525606469002695,
+      "grad_norm": 0.8143125176429749,
+      "learning_rate": 0.00014878575283324338,
+      "loss": 3.2427,
       "step": 69800
     },
     {
-      "epoch": 7.518028199332687,
-      "grad_norm": 0.7777751684188843,
-      "learning_rate": 0.00014925331322055812,
-      "loss": 3.2407,
+      "epoch": 7.53099730458221,
+      "grad_norm": 0.7968950271606445,
+      "learning_rate": 0.00014846195358877493,
+      "loss": 3.227,
       "step": 69850
     },
     {
-      "epoch": 7.523409751372296,
-      "grad_norm": 0.7856535911560059,
-      "learning_rate": 0.0001489300721904967,
-      "loss": 3.2332,
+      "epoch": 7.536388140161725,
+      "grad_norm": 0.8375698328018188,
+      "learning_rate": 0.0001481381543443065,
+      "loss": 3.2473,
       "step": 69900
     },
     {
-      "epoch": 7.528791303411904,
-      "grad_norm": 0.8025381565093994,
-      "learning_rate": 0.00014860683116043528,
-      "loss": 3.2268,
+      "epoch": 7.54177897574124,
+      "grad_norm": 0.8069729208946228,
+      "learning_rate": 0.0001478143550998381,
+      "loss": 3.2388,
       "step": 69950
     },
     {
-      "epoch": 7.534172855451512,
-      "grad_norm": 0.768328070640564,
-      "learning_rate": 0.00014828359013037385,
-      "loss": 3.2271,
+      "epoch": 7.547169811320755,
+      "grad_norm": 0.8065479397773743,
+      "learning_rate": 0.00014749055585536967,
+      "loss": 3.2323,
       "step": 70000
     },
     {
-      "epoch": 7.534172855451512,
-      "eval_accuracy": 0.3889133418249187,
-      "eval_loss": 3.347104549407959,
-      "eval_runtime": 186.753,
-      "eval_samples_per_second": 96.443,
-      "eval_steps_per_second": 6.029,
+      "epoch": 7.547169811320755,
+      "eval_accuracy": 0.3886169367469608,
+      "eval_loss": 3.3482158184051514,
+      "eval_runtime": 183.4915,
+      "eval_samples_per_second": 98.157,
+      "eval_steps_per_second": 6.137,
       "step": 70000
     },
     {
-      "epoch": 7.539554407491121,
-      "grad_norm": 0.7838550209999084,
-      "learning_rate": 0.00014796034910031245,
-      "loss": 3.2358,
+      "epoch": 7.55256064690027,
+      "grad_norm": 0.7838699221611023,
+      "learning_rate": 0.00014716675661090122,
+      "loss": 3.2376,
       "step": 70050
     },
     {
-      "epoch": 7.544935959530728,
-      "grad_norm": 0.8263210654258728,
-      "learning_rate": 0.00014763710807025104,
-      "loss": 3.2473,
+      "epoch": 7.557951482479784,
+      "grad_norm": 0.8455720543861389,
+      "learning_rate": 0.0001468429573664328,
+      "loss": 3.246,
       "step": 70100
     },
     {
-      "epoch": 7.550317511570337,
-      "grad_norm": 0.7420403361320496,
-      "learning_rate": 0.0001473138670401896,
-      "loss": 3.2405,
+      "epoch": 7.563342318059299,
+      "grad_norm": 0.8139489889144897,
+      "learning_rate": 0.00014651915812196437,
+      "loss": 3.2397,
       "step": 70150
     },
     {
-      "epoch": 7.5556990636099455,
-      "grad_norm": 0.8509701490402222,
-      "learning_rate": 0.0001469906260101282,
-      "loss": 3.2202,
+      "epoch": 7.568733153638814,
+      "grad_norm": 0.8418183326721191,
+      "learning_rate": 0.00014619535887749595,
+      "loss": 3.2382,
       "step": 70200
     },
     {
-      "epoch": 7.561080615649553,
-      "grad_norm": 0.7671039700508118,
-      "learning_rate": 0.0001466673849800668,
-      "loss": 3.2382,
+      "epoch": 7.574123989218329,
+      "grad_norm": 0.7932751178741455,
+      "learning_rate": 0.00014587155963302753,
+      "loss": 3.2411,
       "step": 70250
     },
     {
-      "epoch": 7.566462167689162,
-      "grad_norm": 0.7579676508903503,
-      "learning_rate": 0.0001463441439500054,
-      "loss": 3.2368,
+      "epoch": 7.579514824797844,
+      "grad_norm": 0.990576446056366,
+      "learning_rate": 0.00014554776038855908,
+      "loss": 3.2422,
       "step": 70300
     },
     {
-      "epoch": 7.57184371972877,
-      "grad_norm": 0.7678229212760925,
-      "learning_rate": 0.00014602090291994396,
-      "loss": 3.2319,
+      "epoch": 7.584905660377358,
+      "grad_norm": 0.8364869356155396,
+      "learning_rate": 0.00014523043712898003,
+      "loss": 3.2565,
       "step": 70350
     },
     {
-      "epoch": 7.577225271768378,
-      "grad_norm": 0.7729119658470154,
-      "learning_rate": 0.00014569766188988255,
-      "loss": 3.2431,
+      "epoch": 7.590296495956873,
+      "grad_norm": 0.8097447752952576,
+      "learning_rate": 0.00014490663788451158,
+      "loss": 3.2301,
       "step": 70400
     },
     {
-      "epoch": 7.5826068238079865,
-      "grad_norm": 0.8020555973052979,
-      "learning_rate": 0.00014537442085982112,
-      "loss": 3.2512,
+      "epoch": 7.595687331536388,
+      "grad_norm": 0.8373438119888306,
+      "learning_rate": 0.00014458283864004316,
+      "loss": 3.2483,
       "step": 70450
     },
     {
-      "epoch": 7.587988375847594,
-      "grad_norm": 0.8227038383483887,
-      "learning_rate": 0.00014505117982975972,
-      "loss": 3.2352,
+      "epoch": 7.601078167115903,
+      "grad_norm": 0.813815712928772,
+      "learning_rate": 0.00014425903939557474,
+      "loss": 3.2568,
       "step": 70500
     },
     {
-      "epoch": 7.593369927887203,
-      "grad_norm": 0.7413498163223267,
-      "learning_rate": 0.00014472793879969828,
-      "loss": 3.226,
+      "epoch": 7.606469002695418,
+      "grad_norm": 0.8149926662445068,
+      "learning_rate": 0.00014393524015110631,
+      "loss": 3.2454,
       "step": 70550
     },
     {
-      "epoch": 7.598751479926811,
-      "grad_norm": 0.8386573791503906,
-      "learning_rate": 0.00014440469776963688,
-      "loss": 3.2471,
+      "epoch": 7.611859838274933,
+      "grad_norm": 0.866629958152771,
+      "learning_rate": 0.00014361144090663786,
+      "loss": 3.2318,
       "step": 70600
     },
     {
-      "epoch": 7.604133031966419,
-      "grad_norm": 0.7619134783744812,
-      "learning_rate": 0.00014408145673957545,
-      "loss": 3.213,
+      "epoch": 7.617250673854447,
+      "grad_norm": 0.8428345918655396,
+      "learning_rate": 0.00014328764166216944,
+      "loss": 3.2296,
       "step": 70650
     },
     {
-      "epoch": 7.609514584006027,
-      "grad_norm": 0.7510246634483337,
-      "learning_rate": 0.00014375821570951404,
-      "loss": 3.2313,
+      "epoch": 7.622641509433962,
+      "grad_norm": 0.8193896412849426,
+      "learning_rate": 0.00014296384241770102,
+      "loss": 3.2552,
       "step": 70700
     },
     {
-      "epoch": 7.614896136045635,
-      "grad_norm": 0.795792818069458,
-      "learning_rate": 0.00014343497467945264,
-      "loss": 3.2269,
+      "epoch": 7.628032345013477,
+      "grad_norm": 0.794182538986206,
+      "learning_rate": 0.0001426400431732326,
+      "loss": 3.2357,
       "step": 70750
     },
     {
-      "epoch": 7.620277688085244,
-      "grad_norm": 0.803733766078949,
-      "learning_rate": 0.00014311173364939123,
-      "loss": 3.2285,
+      "epoch": 7.633423180592992,
+      "grad_norm": 0.8212337493896484,
+      "learning_rate": 0.00014231624392876417,
+      "loss": 3.2446,
       "step": 70800
     },
     {
-      "epoch": 7.625659240124852,
-      "grad_norm": 0.7954763770103455,
-      "learning_rate": 0.0001427884926193298,
-      "loss": 3.2346,
+      "epoch": 7.638814016172507,
+      "grad_norm": 0.778418242931366,
+      "learning_rate": 0.00014199244468429573,
+      "loss": 3.2301,
       "step": 70850
     },
     {
-      "epoch": 7.63104079216446,
-      "grad_norm": 0.7582803964614868,
-      "learning_rate": 0.0001424652515892684,
-      "loss": 3.2259,
+      "epoch": 7.644204851752022,
+      "grad_norm": 0.847078263759613,
+      "learning_rate": 0.0001416686454398273,
+      "loss": 3.2439,
       "step": 70900
     },
     {
-      "epoch": 7.636422344204068,
-      "grad_norm": 0.8451318740844727,
-      "learning_rate": 0.000142142010559207,
-      "loss": 3.2392,
+      "epoch": 7.649595687331536,
+      "grad_norm": 0.786853551864624,
+      "learning_rate": 0.00014134484619535888,
+      "loss": 3.2243,
       "step": 70950
     },
     {
-      "epoch": 7.641803896243677,
-      "grad_norm": 0.7700170278549194,
-      "learning_rate": 0.00014181876952914555,
-      "loss": 3.2571,
+      "epoch": 7.654986522911051,
+      "grad_norm": 0.7910891771316528,
+      "learning_rate": 0.00014102104695089043,
+      "loss": 3.2291,
       "step": 71000
     },
     {
-      "epoch": 7.641803896243677,
-      "eval_accuracy": 0.38935566773121144,
-      "eval_loss": 3.3431193828582764,
-      "eval_runtime": 184.6174,
-      "eval_samples_per_second": 97.559,
-      "eval_steps_per_second": 6.099,
+      "epoch": 7.654986522911051,
+      "eval_accuracy": 0.3887969745820694,
+      "eval_loss": 3.3445234298706055,
+      "eval_runtime": 183.6631,
+      "eval_samples_per_second": 98.065,
+      "eval_steps_per_second": 6.131,
       "step": 71000
     },
     {
-      "epoch": 7.647185448283285,
-      "grad_norm": 0.7822245955467224,
-      "learning_rate": 0.00014149552849908415,
-      "loss": 3.237,
+      "epoch": 7.660377358490566,
+      "grad_norm": 0.8126778602600098,
+      "learning_rate": 0.000140697247706422,
+      "loss": 3.2577,
       "step": 71050
     },
     {
-      "epoch": 7.652567000322893,
-      "grad_norm": 0.8479968905448914,
-      "learning_rate": 0.00014117228746902272,
-      "loss": 3.2428,
+      "epoch": 7.665768194070081,
+      "grad_norm": 0.8710419535636902,
+      "learning_rate": 0.0001403734484619536,
+      "loss": 3.2308,
       "step": 71100
     },
     {
-      "epoch": 7.657948552362502,
-      "grad_norm": 0.818569004535675,
-      "learning_rate": 0.0001408490464389613,
-      "loss": 3.2402,
+      "epoch": 7.671159029649596,
+      "grad_norm": 0.8106386065483093,
+      "learning_rate": 0.00014004964921748514,
+      "loss": 3.2225,
       "step": 71150
     },
     {
-      "epoch": 7.663330104402109,
-      "grad_norm": 0.8019593954086304,
-      "learning_rate": 0.00014052580540889988,
-      "loss": 3.2463,
+      "epoch": 7.67654986522911,
+      "grad_norm": 0.839568018913269,
+      "learning_rate": 0.00013972584997301671,
+      "loss": 3.2321,
       "step": 71200
     },
     {
-      "epoch": 7.668711656441718,
-      "grad_norm": 0.7790815234184265,
-      "learning_rate": 0.00014020256437883847,
-      "loss": 3.2355,
+      "epoch": 7.681940700808625,
+      "grad_norm": 0.8566945791244507,
+      "learning_rate": 0.0001394020507285483,
+      "loss": 3.2389,
       "step": 71250
     },
     {
-      "epoch": 7.674093208481326,
-      "grad_norm": 0.7565615177154541,
-      "learning_rate": 0.00013987932334877707,
-      "loss": 3.2197,
+      "epoch": 7.6873315363881405,
+      "grad_norm": 0.8562785387039185,
+      "learning_rate": 0.00013907825148407984,
+      "loss": 3.2319,
       "step": 71300
     },
     {
-      "epoch": 7.679474760520934,
-      "grad_norm": 0.782861590385437,
-      "learning_rate": 0.00013955608231871564,
-      "loss": 3.2444,
+      "epoch": 7.692722371967655,
+      "grad_norm": 0.8187358975410461,
+      "learning_rate": 0.00013875445223961142,
+      "loss": 3.2427,
       "step": 71350
     },
     {
-      "epoch": 7.684856312560543,
-      "grad_norm": 0.8096931576728821,
-      "learning_rate": 0.00013923284128865423,
-      "loss": 3.2483,
+      "epoch": 7.69811320754717,
+      "grad_norm": 0.8082362413406372,
+      "learning_rate": 0.000138430652995143,
+      "loss": 3.2448,
       "step": 71400
     },
     {
-      "epoch": 7.69023786460015,
-      "grad_norm": 0.7934433817863464,
-      "learning_rate": 0.00013890960025859283,
-      "loss": 3.2454,
+      "epoch": 7.703504043126685,
+      "grad_norm": 0.8942475318908691,
+      "learning_rate": 0.00013810685375067455,
+      "loss": 3.2331,
       "step": 71450
     },
     {
-      "epoch": 7.695619416639759,
-      "grad_norm": 0.7780919671058655,
-      "learning_rate": 0.0001385863592285314,
-      "loss": 3.236,
+      "epoch": 7.708894878706199,
+      "grad_norm": 0.8236886858940125,
+      "learning_rate": 0.00013778305450620613,
+      "loss": 3.2444,
       "step": 71500
     },
     {
-      "epoch": 7.7010009686793675,
-      "grad_norm": 0.7714535593986511,
-      "learning_rate": 0.00013826311819847,
-      "loss": 3.2354,
+      "epoch": 7.714285714285714,
+      "grad_norm": 0.860549807548523,
+      "learning_rate": 0.0001374592552617377,
+      "loss": 3.2318,
       "step": 71550
     },
     {
-      "epoch": 7.706382520718975,
-      "grad_norm": 0.7559123039245605,
-      "learning_rate": 0.00013793987716840858,
-      "loss": 3.2452,
+      "epoch": 7.719676549865229,
+      "grad_norm": 0.8225796818733215,
+      "learning_rate": 0.00013713545601726928,
+      "loss": 3.2357,
       "step": 71600
     },
     {
-      "epoch": 7.711764072758584,
-      "grad_norm": 0.7606051564216614,
-      "learning_rate": 0.00013761663613834715,
-      "loss": 3.2283,
+      "epoch": 7.725067385444744,
+      "grad_norm": 0.8761296272277832,
+      "learning_rate": 0.00013681165677280086,
+      "loss": 3.2266,
       "step": 71650
     },
     {
-      "epoch": 7.717145624798192,
-      "grad_norm": 0.7872258424758911,
-      "learning_rate": 0.00013729339510828572,
-      "loss": 3.2319,
+      "epoch": 7.730458221024259,
+      "grad_norm": 0.8161090612411499,
+      "learning_rate": 0.0001364878575283324,
+      "loss": 3.2535,
       "step": 71700
     },
     {
-      "epoch": 7.7225271768378,
-      "grad_norm": 0.7712429761886597,
-      "learning_rate": 0.0001369701540782243,
-      "loss": 3.2328,
+      "epoch": 7.735849056603773,
+      "grad_norm": 0.7882575392723083,
+      "learning_rate": 0.000136164058283864,
+      "loss": 3.2397,
       "step": 71750
     },
     {
-      "epoch": 7.727908728877408,
-      "grad_norm": 0.8140832185745239,
-      "learning_rate": 0.0001366469130481629,
-      "loss": 3.2216,
+      "epoch": 7.741239892183288,
+      "grad_norm": 0.8809931874275208,
+      "learning_rate": 0.00013584025903939557,
+      "loss": 3.2278,
       "step": 71800
     },
     {
-      "epoch": 7.733290280917016,
-      "grad_norm": 0.8168127536773682,
-      "learning_rate": 0.00013632367201810147,
-      "loss": 3.2295,
+      "epoch": 7.7466307277628035,
+      "grad_norm": 0.8188114762306213,
+      "learning_rate": 0.00013551645979492714,
+      "loss": 3.2244,
       "step": 71850
     },
     {
-      "epoch": 7.738671832956625,
-      "grad_norm": 0.8033936619758606,
-      "learning_rate": 0.0001360068958086413,
-      "loss": 3.2374,
+      "epoch": 7.752021563342318,
+      "grad_norm": 0.8257802128791809,
+      "learning_rate": 0.0001351926605504587,
+      "loss": 3.2463,
       "step": 71900
     },
     {
-      "epoch": 7.744053384996233,
-      "grad_norm": 0.9045777320861816,
-      "learning_rate": 0.00013568365477857988,
-      "loss": 3.2282,
+      "epoch": 7.757412398921833,
+      "grad_norm": 0.874371349811554,
+      "learning_rate": 0.00013486886130599027,
+      "loss": 3.24,
       "step": 71950
     },
     {
-      "epoch": 7.749434937035841,
-      "grad_norm": 0.7644913792610168,
-      "learning_rate": 0.00013536041374851847,
-      "loss": 3.2354,
+      "epoch": 7.762803234501348,
+      "grad_norm": 0.8590646982192993,
+      "learning_rate": 0.00013454506206152185,
+      "loss": 3.2425,
       "step": 72000
     },
     {
-      "epoch": 7.749434937035841,
-      "eval_accuracy": 0.38948757233701825,
-      "eval_loss": 3.3385231494903564,
-      "eval_runtime": 185.7025,
-      "eval_samples_per_second": 96.988,
-      "eval_steps_per_second": 6.063,
+      "epoch": 7.762803234501348,
+      "eval_accuracy": 0.38919529606879727,
+      "eval_loss": 3.338242292404175,
+      "eval_runtime": 183.9612,
+      "eval_samples_per_second": 97.907,
+      "eval_steps_per_second": 6.121,
       "step": 72000
     },
     {
-      "epoch": 7.754816489075449,
-      "grad_norm": 0.7797622680664062,
-      "learning_rate": 0.00013503717271845706,
-      "loss": 3.2282,
+      "epoch": 7.768194070080862,
+      "grad_norm": 0.800114631652832,
+      "learning_rate": 0.00013422126281705343,
+      "loss": 3.2321,
       "step": 72050
     },
     {
-      "epoch": 7.760198041115058,
-      "grad_norm": 0.7901070713996887,
-      "learning_rate": 0.00013471393168839563,
-      "loss": 3.2469,
+      "epoch": 7.773584905660377,
+      "grad_norm": 0.8323007822036743,
+      "learning_rate": 0.00013389746357258498,
+      "loss": 3.242,
       "step": 72100
     },
     {
-      "epoch": 7.765579593154666,
-      "grad_norm": 0.8164810538291931,
-      "learning_rate": 0.00013439069065833423,
-      "loss": 3.2479,
+      "epoch": 7.7789757412398925,
+      "grad_norm": 0.7954844236373901,
+      "learning_rate": 0.00013357366432811656,
+      "loss": 3.2409,
       "step": 72150
     },
     {
-      "epoch": 7.770961145194274,
-      "grad_norm": 0.8166574835777283,
-      "learning_rate": 0.0001340674496282728,
-      "loss": 3.2366,
+      "epoch": 7.784366576819407,
+      "grad_norm": 0.830142617225647,
+      "learning_rate": 0.00013324986508364813,
+      "loss": 3.2646,
       "step": 72200
     },
     {
-      "epoch": 7.776342697233883,
-      "grad_norm": 0.7946748733520508,
-      "learning_rate": 0.0001337442085982114,
-      "loss": 3.2279,
+      "epoch": 7.789757412398922,
+      "grad_norm": 0.8352818489074707,
+      "learning_rate": 0.0001329260658391797,
+      "loss": 3.2187,
       "step": 72250
     },
     {
-      "epoch": 7.78172424927349,
-      "grad_norm": 0.8163143396377563,
-      "learning_rate": 0.00013342096756814996,
-      "loss": 3.2278,
+      "epoch": 7.795148247978437,
+      "grad_norm": 0.8565665483474731,
+      "learning_rate": 0.00013260226659471126,
+      "loss": 3.2451,
       "step": 72300
     },
     {
-      "epoch": 7.787105801313099,
-      "grad_norm": 0.7848666906356812,
-      "learning_rate": 0.00013309772653808855,
-      "loss": 3.2493,
+      "epoch": 7.800539083557951,
+      "grad_norm": 0.8777026534080505,
+      "learning_rate": 0.00013227846735024284,
+      "loss": 3.238,
       "step": 72350
     },
     {
-      "epoch": 7.792487353352707,
-      "grad_norm": 0.7951918840408325,
-      "learning_rate": 0.00013277448550802715,
-      "loss": 3.2208,
+      "epoch": 7.8059299191374665,
+      "grad_norm": 0.9060974717140198,
+      "learning_rate": 0.00013195466810577442,
+      "loss": 3.2376,
       "step": 72400
     },
     {
-      "epoch": 7.797868905392315,
-      "grad_norm": 0.7784159779548645,
-      "learning_rate": 0.0001324512444779657,
-      "loss": 3.2259,
+      "epoch": 7.811320754716981,
+      "grad_norm": 0.7934736609458923,
+      "learning_rate": 0.000131630868861306,
+      "loss": 3.2418,
       "step": 72450
     },
     {
-      "epoch": 7.803250457431924,
-      "grad_norm": 0.8353852033615112,
-      "learning_rate": 0.0001321280034479043,
-      "loss": 3.2519,
+      "epoch": 7.816711590296496,
+      "grad_norm": 0.8556714653968811,
+      "learning_rate": 0.00013130706961683754,
+      "loss": 3.2286,
       "step": 72500
     },
     {
-      "epoch": 7.808632009471531,
-      "grad_norm": 0.7999012470245361,
-      "learning_rate": 0.0001318047624178429,
-      "loss": 3.2405,
+      "epoch": 7.822102425876011,
+      "grad_norm": 0.8331558704376221,
+      "learning_rate": 0.00013098327037236912,
+      "loss": 3.2158,
       "step": 72550
     },
     {
-      "epoch": 7.81401356151114,
-      "grad_norm": 0.8065581321716309,
-      "learning_rate": 0.0001314815213877815,
-      "loss": 3.2377,
+      "epoch": 7.827493261455525,
+      "grad_norm": 0.79069584608078,
+      "learning_rate": 0.0001306594711279007,
+      "loss": 3.2468,
       "step": 72600
     },
     {
-      "epoch": 7.819395113550748,
-      "grad_norm": 0.7664511799812317,
-      "learning_rate": 0.00013115828035772007,
-      "loss": 3.2396,
+      "epoch": 7.83288409703504,
+      "grad_norm": 0.8664618134498596,
+      "learning_rate": 0.00013033567188343225,
+      "loss": 3.2358,
       "step": 72650
     },
     {
-      "epoch": 7.824776665590356,
-      "grad_norm": 0.8212724924087524,
-      "learning_rate": 0.00013083503932765866,
-      "loss": 3.2414,
+      "epoch": 7.8382749326145555,
+      "grad_norm": 0.8485777378082275,
+      "learning_rate": 0.00013001187263896383,
+      "loss": 3.2413,
       "step": 72700
     },
     {
-      "epoch": 7.830158217629965,
-      "grad_norm": 0.8192585110664368,
-      "learning_rate": 0.00013051179829759723,
-      "loss": 3.2474,
+      "epoch": 7.84366576819407,
+      "grad_norm": 0.9091561436653137,
+      "learning_rate": 0.0001296880733944954,
+      "loss": 3.2381,
       "step": 72750
     },
     {
-      "epoch": 7.835539769669572,
-      "grad_norm": 0.7615439295768738,
-      "learning_rate": 0.00013018855726753582,
-      "loss": 3.2471,
+      "epoch": 7.849056603773585,
+      "grad_norm": 0.7862276434898376,
+      "learning_rate": 0.00012936427415002696,
+      "loss": 3.2335,
       "step": 72800
     },
     {
-      "epoch": 7.840921321709181,
-      "grad_norm": 0.8273650407791138,
-      "learning_rate": 0.0001298653162374744,
-      "loss": 3.2322,
+      "epoch": 7.8544474393531,
+      "grad_norm": 0.8281474113464355,
+      "learning_rate": 0.00012904047490555853,
+      "loss": 3.2286,
       "step": 72850
     },
     {
-      "epoch": 7.846302873748789,
-      "grad_norm": 0.8271594047546387,
-      "learning_rate": 0.00012954207520741298,
-      "loss": 3.2416,
+      "epoch": 7.859838274932614,
+      "grad_norm": 0.7945317625999451,
+      "learning_rate": 0.0001287166756610901,
+      "loss": 3.2301,
       "step": 72900
     },
     {
-      "epoch": 7.851684425788397,
-      "grad_norm": 0.7770468592643738,
-      "learning_rate": 0.00012921883417735155,
-      "loss": 3.2437,
+      "epoch": 7.8652291105121295,
+      "grad_norm": 0.8411368727684021,
+      "learning_rate": 0.0001283928764166217,
+      "loss": 3.2244,
       "step": 72950
     },
     {
-      "epoch": 7.857065977828006,
-      "grad_norm": 0.8396730422973633,
-      "learning_rate": 0.00012889559314729015,
-      "loss": 3.2343,
+      "epoch": 7.870619946091644,
+      "grad_norm": 0.8305996060371399,
+      "learning_rate": 0.00012806907717215324,
+      "loss": 3.2255,
       "step": 73000
     },
     {
-      "epoch": 7.857065977828006,
-      "eval_accuracy": 0.3899009965751523,
-      "eval_loss": 3.3356707096099854,
-      "eval_runtime": 185.9451,
-      "eval_samples_per_second": 96.862,
-      "eval_steps_per_second": 6.056,
+      "epoch": 7.870619946091644,
+      "eval_accuracy": 0.3896523987678328,
+      "eval_loss": 3.3369224071502686,
+      "eval_runtime": 183.4047,
+      "eval_samples_per_second": 98.204,
+      "eval_steps_per_second": 6.139,
       "step": 73000
     },
     {
-      "epoch": 7.862447529867614,
-      "grad_norm": 0.8383583426475525,
-      "learning_rate": 0.00012857235211722874,
+      "epoch": 7.876010781671159,
+      "grad_norm": 0.8752392530441284,
+      "learning_rate": 0.00012774527792768482,
       "loss": 3.2385,
       "step": 73050
     },
     {
-      "epoch": 7.867829081907222,
-      "grad_norm": 0.8135746121406555,
-      "learning_rate": 0.00012824911108716734,
-      "loss": 3.2319,
+      "epoch": 7.881401617250674,
+      "grad_norm": 0.9205421209335327,
+      "learning_rate": 0.0001274214786832164,
+      "loss": 3.2175,
       "step": 73100
     },
     {
-      "epoch": 7.87321063394683,
-      "grad_norm": 0.7756822109222412,
-      "learning_rate": 0.0001279258700571059,
-      "loss": 3.2414,
+      "epoch": 7.886792452830189,
+      "grad_norm": 0.8184296488761902,
+      "learning_rate": 0.00012709767943874795,
+      "loss": 3.2264,
       "step": 73150
     },
     {
-      "epoch": 7.878592185986438,
-      "grad_norm": 0.7989129424095154,
-      "learning_rate": 0.0001276026290270445,
-      "loss": 3.2258,
+      "epoch": 7.892183288409703,
+      "grad_norm": 0.84311443567276,
+      "learning_rate": 0.00012677388019427952,
+      "loss": 3.243,
       "step": 73200
     },
     {
-      "epoch": 7.883973738026047,
-      "grad_norm": 0.7954966425895691,
-      "learning_rate": 0.00012727938799698307,
-      "loss": 3.2598,
+      "epoch": 7.8975741239892185,
+      "grad_norm": 0.7987281084060669,
+      "learning_rate": 0.0001264500809498111,
+      "loss": 3.2351,
       "step": 73250
     },
     {
-      "epoch": 7.889355290065655,
-      "grad_norm": 0.8074511885643005,
-      "learning_rate": 0.00012696261178752287,
-      "loss": 3.2205,
+      "epoch": 7.902964959568733,
+      "grad_norm": 0.8131510615348816,
+      "learning_rate": 0.00012612628170534268,
+      "loss": 3.2291,
       "step": 73300
     },
     {
-      "epoch": 7.894736842105263,
-      "grad_norm": 0.8159090876579285,
-      "learning_rate": 0.00012663937075746147,
-      "loss": 3.2612,
+      "epoch": 7.908355795148248,
+      "grad_norm": 0.8380711674690247,
+      "learning_rate": 0.00012580248246087426,
+      "loss": 3.2148,
       "step": 73350
     },
     {
-      "epoch": 7.900118394144871,
-      "grad_norm": 0.8149001598358154,
-      "learning_rate": 0.00012631612972740006,
-      "loss": 3.2642,
+      "epoch": 7.913746630727763,
+      "grad_norm": 0.8311099410057068,
+      "learning_rate": 0.00012548515920129518,
+      "loss": 3.259,
       "step": 73400
     },
     {
-      "epoch": 7.90549994618448,
-      "grad_norm": 0.7865789532661438,
-      "learning_rate": 0.00012599288869733863,
-      "loss": 3.2372,
+      "epoch": 7.919137466307277,
+      "grad_norm": 0.8204582333564758,
+      "learning_rate": 0.00012516135995682676,
+      "loss": 3.2398,
       "step": 73450
     },
     {
-      "epoch": 7.910881498224088,
-      "grad_norm": 0.805672824382782,
-      "learning_rate": 0.00012566964766727722,
-      "loss": 3.232,
+      "epoch": 7.9245283018867925,
+      "grad_norm": 0.8196551203727722,
+      "learning_rate": 0.0001248375607123583,
+      "loss": 3.2513,
       "step": 73500
     },
     {
-      "epoch": 7.916263050263696,
-      "grad_norm": 0.7899633049964905,
-      "learning_rate": 0.00012534640663721582,
-      "loss": 3.2377,
+      "epoch": 7.929919137466308,
+      "grad_norm": 0.8708674311637878,
+      "learning_rate": 0.0001245137614678899,
+      "loss": 3.2424,
       "step": 73550
     },
     {
-      "epoch": 7.921644602303305,
-      "grad_norm": 0.795754611492157,
-      "learning_rate": 0.00012502316560715439,
-      "loss": 3.2179,
+      "epoch": 7.935309973045822,
+      "grad_norm": 0.8819312453269958,
+      "learning_rate": 0.00012418996222342147,
+      "loss": 3.241,
       "step": 73600
     },
     {
-      "epoch": 7.927026154342912,
-      "grad_norm": 0.7938458323478699,
-      "learning_rate": 0.00012469992457709298,
-      "loss": 3.2383,
+      "epoch": 7.940700808625337,
+      "grad_norm": 0.8691334128379822,
+      "learning_rate": 0.00012386616297895304,
+      "loss": 3.251,
       "step": 73650
     },
     {
-      "epoch": 7.932407706382521,
-      "grad_norm": 0.7996332049369812,
-      "learning_rate": 0.00012437668354703158,
-      "loss": 3.2522,
+      "epoch": 7.946091644204852,
+      "grad_norm": 0.8479558825492859,
+      "learning_rate": 0.0001235423637344846,
+      "loss": 3.2569,
       "step": 73700
     },
     {
-      "epoch": 7.937789258422129,
-      "grad_norm": 0.7663394808769226,
-      "learning_rate": 0.00012405344251697014,
-      "loss": 3.2147,
+      "epoch": 7.951482479784366,
+      "grad_norm": 0.824360191822052,
+      "learning_rate": 0.00012321856449001617,
+      "loss": 3.2141,
       "step": 73750
     },
     {
-      "epoch": 7.943170810461737,
-      "grad_norm": 0.7848943471908569,
-      "learning_rate": 0.00012373020148690874,
-      "loss": 3.2357,
+      "epoch": 7.9568733153638815,
+      "grad_norm": 0.8288909196853638,
+      "learning_rate": 0.00012289476524554775,
+      "loss": 3.2363,
       "step": 73800
     },
     {
-      "epoch": 7.948552362501346,
-      "grad_norm": 0.7804843187332153,
-      "learning_rate": 0.0001234069604568473,
-      "loss": 3.2358,
+      "epoch": 7.962264150943396,
+      "grad_norm": 0.8791062235832214,
+      "learning_rate": 0.00012257096600107933,
+      "loss": 3.2223,
       "step": 73850
     },
     {
-      "epoch": 7.953933914540953,
-      "grad_norm": 0.81059730052948,
-      "learning_rate": 0.0001230837194267859,
-      "loss": 3.2335,
+      "epoch": 7.967654986522911,
+      "grad_norm": 0.8680914044380188,
+      "learning_rate": 0.0001222471667566109,
+      "loss": 3.2292,
       "step": 73900
     },
     {
-      "epoch": 7.959315466580562,
-      "grad_norm": 0.8302901387214661,
-      "learning_rate": 0.00012276047839672447,
-      "loss": 3.2246,
+      "epoch": 7.973045822102426,
+      "grad_norm": 0.8580226898193359,
+      "learning_rate": 0.00012192336751214245,
+      "loss": 3.2387,
       "step": 73950
     },
     {
-      "epoch": 7.96469701862017,
-      "grad_norm": 0.793449878692627,
-      "learning_rate": 0.00012243723736666306,
-      "loss": 3.2224,
+      "epoch": 7.97843665768194,
+      "grad_norm": 0.8580893278121948,
+      "learning_rate": 0.00012159956826767403,
+      "loss": 3.2364,
       "step": 74000
     },
     {
-      "epoch": 7.96469701862017,
-      "eval_accuracy": 0.3902945373348164,
-      "eval_loss": 3.3304708003997803,
-      "eval_runtime": 185.3875,
-      "eval_samples_per_second": 97.153,
-      "eval_steps_per_second": 6.074,
+      "epoch": 7.97843665768194,
+      "eval_accuracy": 0.3899765103321834,
+      "eval_loss": 3.333406925201416,
+      "eval_runtime": 183.9362,
+      "eval_samples_per_second": 97.92,
+      "eval_steps_per_second": 6.122,
       "step": 74000
     },
     {
-      "epoch": 7.970078570659778,
-      "grad_norm": 0.8431436419487,
-      "learning_rate": 0.00012211399633660166,
-      "loss": 3.2265,
+      "epoch": 7.9838274932614555,
+      "grad_norm": 0.843823254108429,
+      "learning_rate": 0.00012127576902320561,
+      "loss": 3.2387,
       "step": 74050
     },
     {
-      "epoch": 7.975460122699387,
-      "grad_norm": 0.8346486687660217,
-      "learning_rate": 0.00012179075530654022,
-      "loss": 3.2429,
+      "epoch": 7.989218328840971,
+      "grad_norm": 0.8297356963157654,
+      "learning_rate": 0.00012095196977873717,
+      "loss": 3.2414,
       "step": 74100
     },
     {
-      "epoch": 7.980841674738995,
-      "grad_norm": 0.8443086743354797,
-      "learning_rate": 0.00012146751427647882,
-      "loss": 3.2296,
+      "epoch": 7.994609164420485,
+      "grad_norm": 0.8512176871299744,
+      "learning_rate": 0.00012062817053426874,
+      "loss": 3.2302,
       "step": 74150
     },
     {
-      "epoch": 7.986223226778603,
-      "grad_norm": 0.8074975609779358,
-      "learning_rate": 0.0001211442732464174,
-      "loss": 3.2325,
+      "epoch": 8.0,
+      "grad_norm": 1.710868239402771,
+      "learning_rate": 0.00012030437128980032,
+      "loss": 3.2452,
       "step": 74200
     },
     {
-      "epoch": 7.991604778818211,
-      "grad_norm": 0.7959332466125488,
-      "learning_rate": 0.00012082103221635598,
-      "loss": 3.2252,
+      "epoch": 8.005390835579515,
+      "grad_norm": 0.8137375712394714,
+      "learning_rate": 0.00011998057204533188,
+      "loss": 3.1626,
       "step": 74250
     },
     {
-      "epoch": 7.996986330857819,
-      "grad_norm": 0.7904613018035889,
-      "learning_rate": 0.00012049779118629456,
-      "loss": 3.2217,
+      "epoch": 8.01078167115903,
+      "grad_norm": 0.8332458138465881,
+      "learning_rate": 0.00011965677280086346,
+      "loss": 3.1565,
       "step": 74300
     },
     {
-      "epoch": 8.002367882897428,
-      "grad_norm": 0.830864429473877,
-      "learning_rate": 0.00012017455015623316,
-      "loss": 3.2092,
+      "epoch": 8.016172506738544,
+      "grad_norm": 0.8537135720252991,
+      "learning_rate": 0.00011933297355639502,
+      "loss": 3.1415,
       "step": 74350
     },
     {
-      "epoch": 8.007749434937036,
-      "grad_norm": 0.8585742712020874,
-      "learning_rate": 0.00011985130912617175,
-      "loss": 3.151,
+      "epoch": 8.021563342318059,
+      "grad_norm": 0.8167550563812256,
+      "learning_rate": 0.00011900917431192659,
+      "loss": 3.1471,
       "step": 74400
     },
     {
-      "epoch": 8.013130986976645,
-      "grad_norm": 0.7843531370162964,
-      "learning_rate": 0.00011952806809611032,
-      "loss": 3.1576,
+      "epoch": 8.026954177897574,
+      "grad_norm": 0.7993401288986206,
+      "learning_rate": 0.00011868537506745816,
+      "loss": 3.1652,
       "step": 74450
     },
     {
-      "epoch": 8.018512539016251,
-      "grad_norm": 0.8142103552818298,
-      "learning_rate": 0.00011920482706604891,
-      "loss": 3.1468,
+      "epoch": 8.032345013477089,
+      "grad_norm": 0.8277669548988342,
+      "learning_rate": 0.00011836157582298974,
+      "loss": 3.1505,
       "step": 74500
     },
     {
-      "epoch": 8.02389409105586,
-      "grad_norm": 0.7832826972007751,
-      "learning_rate": 0.0001188815860359875,
-      "loss": 3.1542,
+      "epoch": 8.037735849056604,
+      "grad_norm": 0.8260676860809326,
+      "learning_rate": 0.00011803777657852132,
+      "loss": 3.1681,
       "step": 74550
     },
     {
-      "epoch": 8.029275643095469,
-      "grad_norm": 0.825363278388977,
-      "learning_rate": 0.00011855834500592608,
-      "loss": 3.1713,
+      "epoch": 8.04312668463612,
+      "grad_norm": 0.8829268217086792,
+      "learning_rate": 0.00011771397733405287,
+      "loss": 3.1606,
       "step": 74600
     },
     {
-      "epoch": 8.034657195135077,
-      "grad_norm": 0.8069754242897034,
-      "learning_rate": 0.00011823510397586466,
-      "loss": 3.1709,
+      "epoch": 8.048517520215633,
+      "grad_norm": 0.8338451385498047,
+      "learning_rate": 0.00011739017808958445,
+      "loss": 3.1549,
       "step": 74650
     },
     {
-      "epoch": 8.040038747174686,
-      "grad_norm": 0.8154868483543396,
-      "learning_rate": 0.00011791186294580325,
-      "loss": 3.1677,
+      "epoch": 8.053908355795148,
+      "grad_norm": 0.8860074281692505,
+      "learning_rate": 0.00011706637884511602,
+      "loss": 3.1636,
       "step": 74700
     },
     {
-      "epoch": 8.045420299214294,
-      "grad_norm": 0.8420593738555908,
-      "learning_rate": 0.00011758862191574182,
-      "loss": 3.1661,
+      "epoch": 8.059299191374663,
+      "grad_norm": 0.8655481338500977,
+      "learning_rate": 0.0001167425796006476,
+      "loss": 3.1584,
       "step": 74750
     },
     {
-      "epoch": 8.050801851253901,
-      "grad_norm": 0.7894124388694763,
-      "learning_rate": 0.00011726538088568041,
-      "loss": 3.1623,
+      "epoch": 8.064690026954178,
+      "grad_norm": 0.8550511002540588,
+      "learning_rate": 0.00011641878035617915,
+      "loss": 3.1629,
       "step": 74800
     },
     {
-      "epoch": 8.05618340329351,
-      "grad_norm": 0.7732349038124084,
-      "learning_rate": 0.000116942139855619,
-      "loss": 3.1625,
+      "epoch": 8.070080862533693,
+      "grad_norm": 0.8494965434074402,
+      "learning_rate": 0.00011609498111171073,
+      "loss": 3.1723,
       "step": 74850
     },
     {
-      "epoch": 8.061564955333118,
-      "grad_norm": 0.7911288738250732,
-      "learning_rate": 0.00011661889882555759,
-      "loss": 3.1792,
+      "epoch": 8.075471698113208,
+      "grad_norm": 0.8265563249588013,
+      "learning_rate": 0.00011577118186724231,
+      "loss": 3.1505,
       "step": 74900
     },
     {
-      "epoch": 8.066946507372727,
-      "grad_norm": 0.7990174293518066,
-      "learning_rate": 0.00011629565779549616,
-      "loss": 3.1663,
+      "epoch": 8.080862533692722,
+      "grad_norm": 0.8405255675315857,
+      "learning_rate": 0.00011544738262277387,
+      "loss": 3.1527,
       "step": 74950
     },
     {
-      "epoch": 8.072328059412335,
-      "grad_norm": 0.8296521902084351,
-      "learning_rate": 0.00011597241676543475,
-      "loss": 3.1624,
+      "epoch": 8.086253369272237,
+      "grad_norm": 0.8647146821022034,
+      "learning_rate": 0.00011512358337830544,
+      "loss": 3.1645,
       "step": 75000
     },
     {
-      "epoch": 8.072328059412335,
-      "eval_accuracy": 0.390195771859793,
-      "eval_loss": 3.338721513748169,
-      "eval_runtime": 185.4374,
-      "eval_samples_per_second": 97.127,
-      "eval_steps_per_second": 6.072,
+      "epoch": 8.086253369272237,
+      "eval_accuracy": 0.3898099454551638,
+      "eval_loss": 3.3394393920898438,
+      "eval_runtime": 183.6112,
+      "eval_samples_per_second": 98.093,
+      "eval_steps_per_second": 6.133,
       "step": 75000
     },
     {
-      "epoch": 8.077709611451942,
-      "grad_norm": 0.8073539733886719,
-      "learning_rate": 0.00011564917573537335,
-      "loss": 3.1724,
+      "epoch": 8.091644204851752,
+      "grad_norm": 0.9160317182540894,
+      "learning_rate": 0.00011479978413383701,
+      "loss": 3.1715,
       "step": 75050
     },
     {
-      "epoch": 8.08309116349155,
-      "grad_norm": 0.8138803839683533,
-      "learning_rate": 0.00011532593470531191,
-      "loss": 3.1767,
+      "epoch": 8.097035040431267,
+      "grad_norm": 0.7973936200141907,
+      "learning_rate": 0.00011447598488936858,
+      "loss": 3.1577,
       "step": 75100
     },
     {
-      "epoch": 8.088472715531159,
-      "grad_norm": 0.8345963358879089,
-      "learning_rate": 0.0001150026936752505,
-      "loss": 3.1701,
+      "epoch": 8.102425876010782,
+      "grad_norm": 0.825989842414856,
+      "learning_rate": 0.00011415218564490016,
+      "loss": 3.178,
       "step": 75150
     },
     {
-      "epoch": 8.093854267570768,
-      "grad_norm": 0.8031169176101685,
-      "learning_rate": 0.00011467945264518909,
-      "loss": 3.1608,
+      "epoch": 8.107816711590296,
+      "grad_norm": 0.8714901208877563,
+      "learning_rate": 0.00011382838640043172,
+      "loss": 3.1699,
       "step": 75200
     },
     {
-      "epoch": 8.099235819610376,
-      "grad_norm": 0.8811976909637451,
-      "learning_rate": 0.00011435621161512766,
-      "loss": 3.1759,
+      "epoch": 8.11320754716981,
+      "grad_norm": 0.8460976481437683,
+      "learning_rate": 0.00011350458715596328,
+      "loss": 3.1599,
       "step": 75250
     },
     {
-      "epoch": 8.104617371649983,
-      "grad_norm": 0.8665459156036377,
-      "learning_rate": 0.00011403297058506625,
-      "loss": 3.1649,
+      "epoch": 8.118598382749326,
+      "grad_norm": 0.8299224972724915,
+      "learning_rate": 0.00011318078791149486,
+      "loss": 3.1705,
       "step": 75300
     },
     {
-      "epoch": 8.109998923689592,
-      "grad_norm": 0.8429995775222778,
-      "learning_rate": 0.00011370972955500485,
-      "loss": 3.1501,
+      "epoch": 8.123989218328841,
+      "grad_norm": 0.878569483757019,
+      "learning_rate": 0.00011285698866702644,
+      "loss": 3.1724,
       "step": 75350
     },
     {
-      "epoch": 8.1153804757292,
-      "grad_norm": 0.7925933599472046,
-      "learning_rate": 0.00011338648852494343,
-      "loss": 3.1473,
+      "epoch": 8.129380053908356,
+      "grad_norm": 0.8617807030677795,
+      "learning_rate": 0.00011253318942255802,
+      "loss": 3.1723,
       "step": 75400
     },
     {
-      "epoch": 8.120762027768809,
-      "grad_norm": 0.806531548500061,
-      "learning_rate": 0.00011306324749488201,
-      "loss": 3.1832,
+      "epoch": 8.134770889487871,
+      "grad_norm": 0.8983737826347351,
+      "learning_rate": 0.00011220939017808957,
+      "loss": 3.1759,
       "step": 75450
     },
     {
-      "epoch": 8.126143579808417,
-      "grad_norm": 0.8357629179954529,
-      "learning_rate": 0.00011274000646482059,
-      "loss": 3.1748,
+      "epoch": 8.140161725067385,
+      "grad_norm": 0.8343617916107178,
+      "learning_rate": 0.00011188559093362115,
+      "loss": 3.1481,
       "step": 75500
     },
     {
-      "epoch": 8.131525131848026,
-      "grad_norm": 0.8737144470214844,
-      "learning_rate": 0.00011241676543475918,
-      "loss": 3.1739,
+      "epoch": 8.1455525606469,
+      "grad_norm": 0.8640062808990479,
+      "learning_rate": 0.00011156179168915272,
+      "loss": 3.1828,
       "step": 75550
     },
     {
-      "epoch": 8.136906683887632,
-      "grad_norm": 0.8199858665466309,
-      "learning_rate": 0.00011209352440469775,
-      "loss": 3.1759,
+      "epoch": 8.150943396226415,
+      "grad_norm": 0.8575279116630554,
+      "learning_rate": 0.00011123799244468429,
+      "loss": 3.1583,
       "step": 75600
     },
     {
-      "epoch": 8.142288235927241,
-      "grad_norm": 0.7963452339172363,
-      "learning_rate": 0.00011177028337463635,
-      "loss": 3.1703,
+      "epoch": 8.15633423180593,
+      "grad_norm": 0.8382391929626465,
+      "learning_rate": 0.00011091419320021585,
+      "loss": 3.1735,
       "step": 75650
     },
     {
-      "epoch": 8.14766978796685,
-      "grad_norm": 0.8108802437782288,
-      "learning_rate": 0.00011144704234457493,
-      "loss": 3.1827,
+      "epoch": 8.161725067385445,
+      "grad_norm": 0.8406264185905457,
+      "learning_rate": 0.00011059039395574743,
+      "loss": 3.1957,
       "step": 75700
     },
     {
-      "epoch": 8.153051340006458,
-      "grad_norm": 0.8074478507041931,
-      "learning_rate": 0.00011112380131451351,
-      "loss": 3.1549,
+      "epoch": 8.167115902964959,
+      "grad_norm": 0.8683697581291199,
+      "learning_rate": 0.00011026659471127899,
+      "loss": 3.1723,
       "step": 75750
     },
     {
-      "epoch": 8.158432892046067,
-      "grad_norm": 0.8079409599304199,
-      "learning_rate": 0.00011080056028445209,
-      "loss": 3.1839,
+      "epoch": 8.172506738544474,
+      "grad_norm": 0.8310422897338867,
+      "learning_rate": 0.00010994279546681057,
+      "loss": 3.1679,
       "step": 75800
     },
     {
-      "epoch": 8.163814444085673,
-      "grad_norm": 0.8951064944267273,
-      "learning_rate": 0.00011047731925439068,
-      "loss": 3.1682,
+      "epoch": 8.177897574123989,
+      "grad_norm": 0.8195992708206177,
+      "learning_rate": 0.00010961899622234213,
+      "loss": 3.1575,
       "step": 75850
     },
     {
-      "epoch": 8.169195996125282,
-      "grad_norm": 0.8033889532089233,
-      "learning_rate": 0.00011015407822432928,
-      "loss": 3.1732,
+      "epoch": 8.183288409703504,
+      "grad_norm": 0.8489004373550415,
+      "learning_rate": 0.00010929519697787371,
+      "loss": 3.1935,
       "step": 75900
     },
     {
-      "epoch": 8.17457754816489,
-      "grad_norm": 0.8481862545013428,
-      "learning_rate": 0.00010983083719426785,
-      "loss": 3.1711,
+      "epoch": 8.18867924528302,
+      "grad_norm": 0.8514471054077148,
+      "learning_rate": 0.00010897139773340528,
+      "loss": 3.153,
       "step": 75950
     },
     {
-      "epoch": 8.1799591002045,
-      "grad_norm": 0.8201789855957031,
-      "learning_rate": 0.00010950759616420644,
-      "loss": 3.1815,
+      "epoch": 8.194070080862534,
+      "grad_norm": 0.8518810868263245,
+      "learning_rate": 0.00010864759848893685,
+      "loss": 3.1706,
       "step": 76000
     },
     {
-      "epoch": 8.1799591002045,
-      "eval_accuracy": 0.39044654272486884,
-      "eval_loss": 3.3347246646881104,
-      "eval_runtime": 185.7806,
-      "eval_samples_per_second": 96.948,
-      "eval_steps_per_second": 6.061,
+      "epoch": 8.194070080862534,
+      "eval_accuracy": 0.39030138246674867,
+      "eval_loss": 3.336826801300049,
+      "eval_runtime": 183.7013,
+      "eval_samples_per_second": 98.045,
+      "eval_steps_per_second": 6.13,
       "step": 76000
     },
     {
-      "epoch": 8.185340652244108,
-      "grad_norm": 0.7705520391464233,
-      "learning_rate": 0.00010918435513414502,
-      "loss": 3.1902,
+      "epoch": 8.199460916442048,
+      "grad_norm": 0.8850810527801514,
+      "learning_rate": 0.00010832379924446842,
+      "loss": 3.1794,
       "step": 76050
     },
     {
-      "epoch": 8.190722204283716,
-      "grad_norm": 0.8561078310012817,
-      "learning_rate": 0.00010886111410408359,
-      "loss": 3.188,
+      "epoch": 8.204851752021563,
+      "grad_norm": 0.8909291625022888,
+      "learning_rate": 0.00010799999999999998,
+      "loss": 3.1893,
       "step": 76100
     },
     {
-      "epoch": 8.196103756323323,
-      "grad_norm": 0.8287132382392883,
-      "learning_rate": 0.00010853787307402218,
-      "loss": 3.176,
+      "epoch": 8.210242587601078,
+      "grad_norm": 0.862022876739502,
+      "learning_rate": 0.00010767620075553156,
+      "loss": 3.1766,
       "step": 76150
     },
     {
-      "epoch": 8.201485308362932,
-      "grad_norm": 0.8220933675765991,
-      "learning_rate": 0.00010821463204396078,
-      "loss": 3.1587,
+      "epoch": 8.215633423180593,
+      "grad_norm": 0.8716922402381897,
+      "learning_rate": 0.00010735240151106314,
+      "loss": 3.1806,
       "step": 76200
     },
     {
-      "epoch": 8.20686686040254,
-      "grad_norm": 0.8482598066329956,
-      "learning_rate": 0.00010789139101389935,
-      "loss": 3.1814,
+      "epoch": 8.221024258760108,
+      "grad_norm": 0.8505301475524902,
+      "learning_rate": 0.00010702860226659472,
+      "loss": 3.1685,
       "step": 76250
     },
     {
-      "epoch": 8.212248412442149,
-      "grad_norm": 0.7827437520027161,
-      "learning_rate": 0.00010757461480443917,
-      "loss": 3.184,
+      "epoch": 8.226415094339623,
+      "grad_norm": 0.8627468943595886,
+      "learning_rate": 0.00010670480302212627,
+      "loss": 3.1764,
       "step": 76300
     },
     {
-      "epoch": 8.217629964481757,
-      "grad_norm": 0.8484346270561218,
-      "learning_rate": 0.00010725137377437776,
-      "loss": 3.176,
+      "epoch": 8.231805929919137,
+      "grad_norm": 0.8449823260307312,
+      "learning_rate": 0.00010638100377765784,
+      "loss": 3.1821,
       "step": 76350
     },
     {
-      "epoch": 8.223011516521364,
-      "grad_norm": 0.8320658206939697,
-      "learning_rate": 0.00010692813274431633,
-      "loss": 3.1593,
+      "epoch": 8.237196765498652,
+      "grad_norm": 0.9091751575469971,
+      "learning_rate": 0.00010605720453318942,
+      "loss": 3.19,
       "step": 76400
     },
     {
-      "epoch": 8.228393068560973,
-      "grad_norm": 0.8569603562355042,
-      "learning_rate": 0.00010660489171425492,
-      "loss": 3.1722,
+      "epoch": 8.242587601078167,
+      "grad_norm": 0.8580686450004578,
+      "learning_rate": 0.00010573340528872099,
+      "loss": 3.1607,
       "step": 76450
     },
     {
-      "epoch": 8.233774620600581,
-      "grad_norm": 0.8456915020942688,
-      "learning_rate": 0.0001062816506841935,
-      "loss": 3.1646,
+      "epoch": 8.247978436657682,
+      "grad_norm": 0.8650367856025696,
+      "learning_rate": 0.00010540960604425255,
+      "loss": 3.1667,
       "step": 76500
     },
     {
-      "epoch": 8.23915617264019,
-      "grad_norm": 0.8972082138061523,
-      "learning_rate": 0.00010595840965413209,
-      "loss": 3.1839,
+      "epoch": 8.253369272237197,
+      "grad_norm": 0.8627279996871948,
+      "learning_rate": 0.00010508580679978413,
+      "loss": 3.1607,
       "step": 76550
     },
     {
-      "epoch": 8.244537724679798,
-      "grad_norm": 0.7962077260017395,
-      "learning_rate": 0.00010563516862407067,
-      "loss": 3.1742,
+      "epoch": 8.25876010781671,
+      "grad_norm": 0.8591924905776978,
+      "learning_rate": 0.00010476200755531569,
+      "loss": 3.1841,
       "step": 76600
     },
     {
-      "epoch": 8.249919276719407,
-      "grad_norm": 0.8209040760993958,
-      "learning_rate": 0.00010531192759400926,
-      "loss": 3.1883,
+      "epoch": 8.264150943396226,
+      "grad_norm": 0.8910982012748718,
+      "learning_rate": 0.00010443820831084727,
+      "loss": 3.1651,
       "step": 76650
     },
     {
-      "epoch": 8.255300828759013,
-      "grad_norm": 0.8176620602607727,
-      "learning_rate": 0.00010498868656394784,
-      "loss": 3.1758,
+      "epoch": 8.269541778975741,
+      "grad_norm": 0.8738256692886353,
+      "learning_rate": 0.00010411440906637883,
+      "loss": 3.1976,
       "step": 76700
     },
     {
-      "epoch": 8.260682380798622,
-      "grad_norm": 0.8269286751747131,
-      "learning_rate": 0.00010466544553388642,
-      "loss": 3.1552,
+      "epoch": 8.274932614555256,
+      "grad_norm": 0.8481124639511108,
+      "learning_rate": 0.0001037906098219104,
+      "loss": 3.1799,
       "step": 76750
     },
     {
-      "epoch": 8.26606393283823,
-      "grad_norm": 0.8178252577781677,
-      "learning_rate": 0.000104342204503825,
-      "loss": 3.1911,
+      "epoch": 8.280323450134771,
+      "grad_norm": 0.8235103487968445,
+      "learning_rate": 0.00010346681057744197,
+      "loss": 3.1631,
       "step": 76800
     },
     {
-      "epoch": 8.27144548487784,
-      "grad_norm": 0.8506616950035095,
-      "learning_rate": 0.0001040189634737636,
-      "loss": 3.1894,
+      "epoch": 8.285714285714286,
+      "grad_norm": 0.8965120911598206,
+      "learning_rate": 0.00010314301133297355,
+      "loss": 3.2055,
       "step": 76850
     },
     {
-      "epoch": 8.276827036917448,
-      "grad_norm": 0.8493346571922302,
-      "learning_rate": 0.00010369572244370217,
-      "loss": 3.179,
+      "epoch": 8.2911051212938,
+      "grad_norm": 0.8587255477905273,
+      "learning_rate": 0.0001028192120885051,
+      "loss": 3.1898,
       "step": 76900
     },
     {
-      "epoch": 8.282208588957054,
-      "grad_norm": 0.826920211315155,
-      "learning_rate": 0.00010337248141364076,
-      "loss": 3.176,
+      "epoch": 8.296495956873315,
+      "grad_norm": 0.8255211114883423,
+      "learning_rate": 0.00010249541284403668,
+      "loss": 3.174,
       "step": 76950
     },
     {
-      "epoch": 8.287590140996663,
-      "grad_norm": 0.7999362349510193,
-      "learning_rate": 0.00010304924038357936,
-      "loss": 3.1758,
+      "epoch": 8.30188679245283,
+      "grad_norm": 0.8812569379806519,
+      "learning_rate": 0.00010217161359956826,
+      "loss": 3.184,
       "step": 77000
     },
     {
-      "epoch": 8.287590140996663,
-      "eval_accuracy": 0.39078890797437227,
-      "eval_loss": 3.3327534198760986,
-      "eval_runtime": 185.3503,
-      "eval_samples_per_second": 97.173,
-      "eval_steps_per_second": 6.075,
+      "epoch": 8.30188679245283,
+      "eval_accuracy": 0.39023510420518187,
+      "eval_loss": 3.3336145877838135,
+      "eval_runtime": 183.6293,
+      "eval_samples_per_second": 98.083,
+      "eval_steps_per_second": 6.132,
       "step": 77000
     },
     {
-      "epoch": 8.292971693036272,
-      "grad_norm": 0.9111582040786743,
-      "learning_rate": 0.00010272599935351792,
+      "epoch": 8.307277628032345,
+      "grad_norm": 0.8371959924697876,
+      "learning_rate": 0.00010184781435509984,
       "loss": 3.178,
       "step": 77050
     },
     {
-      "epoch": 8.29835324507588,
-      "grad_norm": 0.8611065745353699,
-      "learning_rate": 0.00010240275832345652,
-      "loss": 3.1736,
+      "epoch": 8.31266846361186,
+      "grad_norm": 0.8691743016242981,
+      "learning_rate": 0.00010152401511063141,
+      "loss": 3.1812,
       "step": 77100
     },
     {
-      "epoch": 8.303734797115489,
-      "grad_norm": 0.7752180099487305,
-      "learning_rate": 0.0001020795172933951,
-      "loss": 3.1778,
+      "epoch": 8.318059299191376,
+      "grad_norm": 0.8494551181793213,
+      "learning_rate": 0.00010120021586616296,
+      "loss": 3.1846,
       "step": 77150
     },
     {
-      "epoch": 8.309116349155097,
-      "grad_norm": 0.7995375394821167,
-      "learning_rate": 0.0001017562762633337,
-      "loss": 3.2014,
+      "epoch": 8.323450134770889,
+      "grad_norm": 0.8436992168426514,
+      "learning_rate": 0.00010087641662169454,
+      "loss": 3.1926,
       "step": 77200
     },
     {
-      "epoch": 8.314497901194704,
-      "grad_norm": 0.8144875168800354,
-      "learning_rate": 0.00010143303523327226,
-      "loss": 3.1808,
+      "epoch": 8.328840970350404,
+      "grad_norm": 0.8713855147361755,
+      "learning_rate": 0.00010055261737722612,
+      "loss": 3.1935,
       "step": 77250
     },
     {
-      "epoch": 8.319879453234313,
-      "grad_norm": 0.8051859140396118,
-      "learning_rate": 0.00010110979420321086,
-      "loss": 3.1826,
+      "epoch": 8.33423180592992,
+      "grad_norm": 0.8606102466583252,
+      "learning_rate": 0.00010022881813275768,
+      "loss": 3.1791,
       "step": 77300
     },
     {
-      "epoch": 8.325261005273921,
-      "grad_norm": 0.7994803786277771,
-      "learning_rate": 0.00010078655317314944,
-      "loss": 3.1801,
+      "epoch": 8.339622641509434,
+      "grad_norm": 0.8840133547782898,
+      "learning_rate": 9.990501888828925e-05,
+      "loss": 3.1774,
       "step": 77350
     },
     {
-      "epoch": 8.33064255731353,
-      "grad_norm": 0.8555433750152588,
-      "learning_rate": 0.00010046331214308802,
-      "loss": 3.166,
+      "epoch": 8.34501347708895,
+      "grad_norm": 0.8631689548492432,
+      "learning_rate": 9.958121964382083e-05,
+      "loss": 3.1958,
       "step": 77400
     },
     {
-      "epoch": 8.336024109353138,
-      "grad_norm": 0.8833298087120056,
-      "learning_rate": 0.0001001400711130266,
-      "loss": 3.2049,
+      "epoch": 8.350404312668463,
+      "grad_norm": 0.8530083298683167,
+      "learning_rate": 9.926389638424175e-05,
+      "loss": 3.1735,
       "step": 77450
     },
     {
-      "epoch": 8.341405661392745,
-      "grad_norm": 0.8662601113319397,
-      "learning_rate": 9.98168300829652e-05,
-      "loss": 3.1979,
+      "epoch": 8.355795148247978,
+      "grad_norm": 0.8499566316604614,
+      "learning_rate": 9.894009713977333e-05,
+      "loss": 3.1802,
       "step": 77500
     },
     {
-      "epoch": 8.346787213432354,
-      "grad_norm": 0.8390443325042725,
-      "learning_rate": 9.949358905290376e-05,
-      "loss": 3.1907,
+      "epoch": 8.361185983827493,
+      "grad_norm": 0.825239360332489,
+      "learning_rate": 9.86162978953049e-05,
+      "loss": 3.1909,
       "step": 77550
     },
     {
-      "epoch": 8.352168765471962,
-      "grad_norm": 0.8560362458229065,
-      "learning_rate": 9.917034802284236e-05,
-      "loss": 3.1891,
+      "epoch": 8.366576819407008,
+      "grad_norm": 0.8787322640419006,
+      "learning_rate": 9.829249865083647e-05,
+      "loss": 3.1879,
       "step": 77600
     },
     {
-      "epoch": 8.35755031751157,
-      "grad_norm": 0.8235760927200317,
-      "learning_rate": 9.884710699278094e-05,
-      "loss": 3.1708,
+      "epoch": 8.371967654986523,
+      "grad_norm": 0.903548538684845,
+      "learning_rate": 9.796869940636805e-05,
+      "loss": 3.178,
       "step": 77650
     },
     {
-      "epoch": 8.36293186955118,
-      "grad_norm": 0.8120622634887695,
-      "learning_rate": 9.852386596271953e-05,
-      "loss": 3.1958,
+      "epoch": 8.377358490566039,
+      "grad_norm": 0.8681001663208008,
+      "learning_rate": 9.764490016189961e-05,
+      "loss": 3.1936,
       "step": 77700
     },
     {
-      "epoch": 8.368313421590786,
-      "grad_norm": 0.7914016246795654,
-      "learning_rate": 9.82006249326581e-05,
-      "loss": 3.1867,
+      "epoch": 8.382749326145552,
+      "grad_norm": 0.8955147862434387,
+      "learning_rate": 9.732110091743119e-05,
+      "loss": 3.1737,
       "step": 77750
     },
     {
-      "epoch": 8.373694973630395,
-      "grad_norm": 0.86472088098526,
-      "learning_rate": 9.78773839025967e-05,
-      "loss": 3.1636,
+      "epoch": 8.388140161725067,
+      "grad_norm": 0.861054539680481,
+      "learning_rate": 9.699730167296275e-05,
+      "loss": 3.1939,
       "step": 77800
     },
     {
-      "epoch": 8.379076525670003,
-      "grad_norm": 0.8104182481765747,
-      "learning_rate": 9.755414287253529e-05,
-      "loss": 3.1801,
+      "epoch": 8.393530997304582,
+      "grad_norm": 0.90186607837677,
+      "learning_rate": 9.667350242849433e-05,
+      "loss": 3.1821,
       "step": 77850
     },
     {
-      "epoch": 8.384458077709612,
-      "grad_norm": 0.8004899621009827,
-      "learning_rate": 9.723090184247386e-05,
-      "loss": 3.2015,
+      "epoch": 8.398921832884097,
+      "grad_norm": 0.8513303399085999,
+      "learning_rate": 9.63497031840259e-05,
+      "loss": 3.184,
       "step": 77900
     },
     {
-      "epoch": 8.38983962974922,
-      "grad_norm": 0.8116545081138611,
-      "learning_rate": 9.690766081241245e-05,
-      "loss": 3.1839,
+      "epoch": 8.404312668463612,
+      "grad_norm": 0.8518136739730835,
+      "learning_rate": 9.602590393955746e-05,
+      "loss": 3.1806,
       "step": 77950
     },
     {
-      "epoch": 8.395221181788829,
-      "grad_norm": 0.8094545006752014,
-      "learning_rate": 9.658441978235103e-05,
-      "loss": 3.1992,
+      "epoch": 8.409703504043126,
+      "grad_norm": 0.8534890413284302,
+      "learning_rate": 9.570210469508904e-05,
+      "loss": 3.1718,
       "step": 78000
     },
     {
-      "epoch": 8.395221181788829,
-      "eval_accuracy": 0.3910330510132914,
-      "eval_loss": 3.3307039737701416,
-      "eval_runtime": 184.6192,
-      "eval_samples_per_second": 97.558,
-      "eval_steps_per_second": 6.099,
+      "epoch": 8.409703504043126,
+      "eval_accuracy": 0.3906564601041264,
+      "eval_loss": 3.3315069675445557,
+      "eval_runtime": 183.818,
+      "eval_samples_per_second": 97.983,
+      "eval_steps_per_second": 6.126,
       "step": 78000
     },
     {
-      "epoch": 8.400602733828435,
-      "grad_norm": 0.8134550452232361,
-      "learning_rate": 9.626117875228961e-05,
-      "loss": 3.1925,
+      "epoch": 8.415094339622641,
+      "grad_norm": 0.8671660423278809,
+      "learning_rate": 9.537830545062061e-05,
+      "loss": 3.1825,
       "step": 78050
     },
     {
-      "epoch": 8.405984285868044,
-      "grad_norm": 0.8060505390167236,
-      "learning_rate": 9.59379377222282e-05,
-      "loss": 3.1681,
+      "epoch": 8.420485175202156,
+      "grad_norm": 0.9098289012908936,
+      "learning_rate": 9.505450620615217e-05,
+      "loss": 3.1816,
       "step": 78100
     },
     {
-      "epoch": 8.411365837907653,
-      "grad_norm": 0.8342357873916626,
-      "learning_rate": 9.561469669216679e-05,
-      "loss": 3.1893,
+      "epoch": 8.425876010781671,
+      "grad_norm": 0.9419325590133667,
+      "learning_rate": 9.473070696168374e-05,
+      "loss": 3.1814,
       "step": 78150
     },
     {
-      "epoch": 8.416747389947261,
-      "grad_norm": 0.803632378578186,
-      "learning_rate": 9.529145566210537e-05,
-      "loss": 3.1694,
+      "epoch": 8.431266846361186,
+      "grad_norm": 0.8998332619667053,
+      "learning_rate": 9.440690771721532e-05,
+      "loss": 3.1702,
       "step": 78200
     },
     {
-      "epoch": 8.42212894198687,
-      "grad_norm": 0.8132495284080505,
-      "learning_rate": 9.496821463204395e-05,
-      "loss": 3.181,
+      "epoch": 8.436657681940702,
+      "grad_norm": 0.8600930571556091,
+      "learning_rate": 9.40831084727469e-05,
+      "loss": 3.1846,
       "step": 78250
     },
     {
-      "epoch": 8.427510494026476,
-      "grad_norm": 0.7954928874969482,
-      "learning_rate": 9.464497360198253e-05,
-      "loss": 3.1813,
+      "epoch": 8.442048517520215,
+      "grad_norm": 0.8776827454566956,
+      "learning_rate": 9.375930922827845e-05,
+      "loss": 3.1775,
       "step": 78300
     },
     {
-      "epoch": 8.432892046066085,
-      "grad_norm": 0.8573002815246582,
-      "learning_rate": 9.432173257192113e-05,
-      "loss": 3.1867,
+      "epoch": 8.44743935309973,
+      "grad_norm": 0.8739205002784729,
+      "learning_rate": 9.343550998381003e-05,
+      "loss": 3.2013,
       "step": 78350
     },
     {
-      "epoch": 8.438273598105694,
-      "grad_norm": 0.8724850416183472,
-      "learning_rate": 9.39984915418597e-05,
-      "loss": 3.1723,
+      "epoch": 8.452830188679245,
+      "grad_norm": 0.8829511404037476,
+      "learning_rate": 9.31117107393416e-05,
+      "loss": 3.1902,
       "step": 78400
     },
     {
-      "epoch": 8.443655150145302,
-      "grad_norm": 0.825176477432251,
-      "learning_rate": 9.367525051179829e-05,
-      "loss": 3.1915,
+      "epoch": 8.45822102425876,
+      "grad_norm": 0.8830351829528809,
+      "learning_rate": 9.278791149487317e-05,
+      "loss": 3.1743,
       "step": 78450
     },
     {
-      "epoch": 8.44903670218491,
-      "grad_norm": 0.8631423115730286,
-      "learning_rate": 9.335200948173688e-05,
-      "loss": 3.189,
+      "epoch": 8.463611859838275,
+      "grad_norm": 0.8151218891143799,
+      "learning_rate": 9.246411225040475e-05,
+      "loss": 3.2046,
       "step": 78500
     },
     {
-      "epoch": 8.45441825422452,
-      "grad_norm": 0.8290612697601318,
-      "learning_rate": 9.302876845167545e-05,
-      "loss": 3.1749,
+      "epoch": 8.46900269541779,
+      "grad_norm": 0.8532697558403015,
+      "learning_rate": 9.214031300593631e-05,
+      "loss": 3.1741,
       "step": 78550
     },
     {
-      "epoch": 8.459799806264126,
-      "grad_norm": 0.8389089703559875,
-      "learning_rate": 9.270552742161403e-05,
-      "loss": 3.1706,
+      "epoch": 8.474393530997304,
+      "grad_norm": 0.8145749568939209,
+      "learning_rate": 9.181651376146787e-05,
+      "loss": 3.1797,
       "step": 78600
     },
     {
-      "epoch": 8.465181358303735,
-      "grad_norm": 0.8537444472312927,
-      "learning_rate": 9.238228639155263e-05,
-      "loss": 3.1814,
+      "epoch": 8.479784366576819,
+      "grad_norm": 0.8687123656272888,
+      "learning_rate": 9.149271451699945e-05,
+      "loss": 3.179,
       "step": 78650
     },
     {
-      "epoch": 8.470562910343343,
-      "grad_norm": 0.8286445140838623,
-      "learning_rate": 9.205904536149122e-05,
-      "loss": 3.1737,
+      "epoch": 8.485175202156334,
+      "grad_norm": 0.8689339756965637,
+      "learning_rate": 9.116891527253103e-05,
+      "loss": 3.2056,
       "step": 78700
     },
     {
-      "epoch": 8.475944462382952,
-      "grad_norm": 0.8667829632759094,
-      "learning_rate": 9.173580433142979e-05,
-      "loss": 3.1634,
+      "epoch": 8.49056603773585,
+      "grad_norm": 0.8794228434562683,
+      "learning_rate": 9.084511602806258e-05,
+      "loss": 3.1902,
       "step": 78750
     },
     {
-      "epoch": 8.48132601442256,
-      "grad_norm": 0.8203412294387817,
-      "learning_rate": 9.141256330136838e-05,
-      "loss": 3.1728,
+      "epoch": 8.495956873315365,
+      "grad_norm": 0.8810113072395325,
+      "learning_rate": 9.052131678359416e-05,
+      "loss": 3.1825,
       "step": 78800
     },
     {
-      "epoch": 8.486707566462167,
-      "grad_norm": 0.8149620294570923,
-      "learning_rate": 9.108932227130697e-05,
-      "loss": 3.1789,
+      "epoch": 8.501347708894878,
+      "grad_norm": 0.8873330354690552,
+      "learning_rate": 9.019751753912574e-05,
+      "loss": 3.1851,
       "step": 78850
     },
     {
-      "epoch": 8.492089118501776,
-      "grad_norm": 0.8609354496002197,
-      "learning_rate": 9.076608124124555e-05,
-      "loss": 3.1664,
+      "epoch": 8.506738544474393,
+      "grad_norm": 0.8767502307891846,
+      "learning_rate": 8.987371829465731e-05,
+      "loss": 3.18,
       "step": 78900
     },
     {
-      "epoch": 8.497470670541384,
-      "grad_norm": 0.8484876751899719,
-      "learning_rate": 9.044284021118413e-05,
-      "loss": 3.1736,
+      "epoch": 8.512129380053908,
+      "grad_norm": 0.8605179786682129,
+      "learning_rate": 8.954991905018886e-05,
+      "loss": 3.201,
       "step": 78950
     },
     {
-      "epoch": 8.502852222580993,
-      "grad_norm": 0.817091166973114,
-      "learning_rate": 9.011959918112272e-05,
-      "loss": 3.183,
+      "epoch": 8.517520215633423,
+      "grad_norm": 0.8646233677864075,
+      "learning_rate": 8.922611980572044e-05,
+      "loss": 3.1856,
       "step": 79000
     },
     {
-      "epoch": 8.502852222580993,
-      "eval_accuracy": 0.3913540116438954,
-      "eval_loss": 3.326340913772583,
-      "eval_runtime": 183.8032,
-      "eval_samples_per_second": 97.991,
-      "eval_steps_per_second": 6.126,
+      "epoch": 8.517520215633423,
+      "eval_accuracy": 0.3910653209209723,
+      "eval_loss": 3.3277745246887207,
+      "eval_runtime": 183.8981,
+      "eval_samples_per_second": 97.94,
+      "eval_steps_per_second": 6.123,
       "step": 79000
     },
     {
-      "epoch": 8.508233774620601,
-      "grad_norm": 0.846644401550293,
-      "learning_rate": 8.979635815106129e-05,
-      "loss": 3.1844,
+      "epoch": 8.522911051212938,
+      "grad_norm": 0.8412624001502991,
+      "learning_rate": 8.890232056125202e-05,
+      "loss": 3.1946,
       "step": 79050
     },
     {
-      "epoch": 8.513615326660208,
-      "grad_norm": 0.8385283350944519,
-      "learning_rate": 8.947311712099989e-05,
-      "loss": 3.196,
+      "epoch": 8.528301886792454,
+      "grad_norm": 0.8897676467895508,
+      "learning_rate": 8.85785213167836e-05,
+      "loss": 3.2009,
       "step": 79100
     },
     {
-      "epoch": 8.518996878699816,
-      "grad_norm": 0.8298741579055786,
-      "learning_rate": 8.914987609093847e-05,
-      "loss": 3.1814,
+      "epoch": 8.533692722371967,
+      "grad_norm": 0.8410210013389587,
+      "learning_rate": 8.825472207231516e-05,
+      "loss": 3.1751,
       "step": 79150
     },
     {
-      "epoch": 8.524378430739425,
-      "grad_norm": 0.8278105854988098,
-      "learning_rate": 8.882663506087706e-05,
-      "loss": 3.1983,
+      "epoch": 8.539083557951482,
+      "grad_norm": 0.8973972201347351,
+      "learning_rate": 8.793092282784672e-05,
+      "loss": 3.1806,
       "step": 79200
     },
     {
-      "epoch": 8.529759982779034,
-      "grad_norm": 0.8638647198677063,
-      "learning_rate": 8.850339403081563e-05,
-      "loss": 3.1806,
+      "epoch": 8.544474393530997,
+      "grad_norm": 0.8442065715789795,
+      "learning_rate": 8.76071235833783e-05,
+      "loss": 3.1718,
       "step": 79250
     },
     {
-      "epoch": 8.535141534818642,
-      "grad_norm": 0.8197671175003052,
-      "learning_rate": 8.818015300075422e-05,
-      "loss": 3.1808,
+      "epoch": 8.549865229110512,
+      "grad_norm": 0.926730751991272,
+      "learning_rate": 8.728332433890987e-05,
+      "loss": 3.1895,
       "step": 79300
     },
     {
-      "epoch": 8.54052308685825,
-      "grad_norm": 0.8648247122764587,
-      "learning_rate": 8.785691197069282e-05,
-      "loss": 3.2081,
+      "epoch": 8.555256064690028,
+      "grad_norm": 0.8862857222557068,
+      "learning_rate": 8.695952509444144e-05,
+      "loss": 3.1906,
       "step": 79350
     },
     {
-      "epoch": 8.545904638897857,
-      "grad_norm": 0.8894992470741272,
-      "learning_rate": 8.753367094063139e-05,
-      "loss": 3.1711,
+      "epoch": 8.560646900269543,
+      "grad_norm": 0.8473867774009705,
+      "learning_rate": 8.663572584997301e-05,
+      "loss": 3.1616,
       "step": 79400
     },
     {
-      "epoch": 8.551286190937466,
-      "grad_norm": 0.8495647311210632,
-      "learning_rate": 8.721042991056998e-05,
-      "loss": 3.1824,
+      "epoch": 8.566037735849056,
+      "grad_norm": 0.8909599184989929,
+      "learning_rate": 8.631840259039395e-05,
+      "loss": 3.195,
       "step": 79450
     },
     {
-      "epoch": 8.556667742977075,
-      "grad_norm": 0.8385526537895203,
-      "learning_rate": 8.688718888050856e-05,
-      "loss": 3.1831,
+      "epoch": 8.571428571428571,
+      "grad_norm": 0.8786565065383911,
+      "learning_rate": 8.599460334592551e-05,
+      "loss": 3.1852,
       "step": 79500
     },
     {
-      "epoch": 8.562049295016683,
-      "grad_norm": 0.8313244581222534,
-      "learning_rate": 8.656394785044713e-05,
-      "loss": 3.1923,
+      "epoch": 8.576819407008086,
+      "grad_norm": 0.8280052542686462,
+      "learning_rate": 8.567080410145709e-05,
+      "loss": 3.1833,
       "step": 79550
     },
     {
-      "epoch": 8.567430847056292,
-      "grad_norm": 0.7992787957191467,
-      "learning_rate": 8.624070682038572e-05,
-      "loss": 3.1883,
+      "epoch": 8.582210242587601,
+      "grad_norm": 0.9174842834472656,
+      "learning_rate": 8.534700485698867e-05,
+      "loss": 3.1853,
       "step": 79600
     },
     {
-      "epoch": 8.572812399095898,
-      "grad_norm": 0.8515611290931702,
-      "learning_rate": 8.591746579032432e-05,
-      "loss": 3.1855,
+      "epoch": 8.587601078167117,
+      "grad_norm": 0.9096667170524597,
+      "learning_rate": 8.502320561252023e-05,
+      "loss": 3.1649,
       "step": 79650
     },
     {
-      "epoch": 8.578193951135507,
-      "grad_norm": 0.8336965441703796,
-      "learning_rate": 8.55942247602629e-05,
-      "loss": 3.1975,
+      "epoch": 8.59299191374663,
+      "grad_norm": 0.8842190504074097,
+      "learning_rate": 8.469940636805181e-05,
+      "loss": 3.1896,
       "step": 79700
     },
     {
-      "epoch": 8.583575503175116,
-      "grad_norm": 0.8350300192832947,
-      "learning_rate": 8.527098373020148e-05,
-      "loss": 3.1877,
+      "epoch": 8.598382749326145,
+      "grad_norm": 0.9274055361747742,
+      "learning_rate": 8.437560712358337e-05,
+      "loss": 3.1992,
       "step": 79750
     },
     {
-      "epoch": 8.588957055214724,
-      "grad_norm": 0.8040255308151245,
-      "learning_rate": 8.494774270014006e-05,
-      "loss": 3.1902,
+      "epoch": 8.60377358490566,
+      "grad_norm": 0.8860254883766174,
+      "learning_rate": 8.405180787911494e-05,
+      "loss": 3.1794,
       "step": 79800
     },
     {
-      "epoch": 8.594338607254333,
-      "grad_norm": 0.8291066884994507,
-      "learning_rate": 8.462450167007866e-05,
-      "loss": 3.1798,
+      "epoch": 8.609164420485175,
+      "grad_norm": 0.8451575040817261,
+      "learning_rate": 8.372800863464651e-05,
+      "loss": 3.1644,
       "step": 79850
     },
     {
-      "epoch": 8.599720159293941,
-      "grad_norm": 0.8915792107582092,
-      "learning_rate": 8.430126064001722e-05,
-      "loss": 3.1781,
+      "epoch": 8.61455525606469,
+      "grad_norm": 0.8715153932571411,
+      "learning_rate": 8.340420939017809e-05,
+      "loss": 3.1715,
       "step": 79900
     },
     {
-      "epoch": 8.605101711333548,
-      "grad_norm": 0.8477835655212402,
-      "learning_rate": 8.397801960995582e-05,
-      "loss": 3.1845,
+      "epoch": 8.619946091644206,
+      "grad_norm": 0.8940669894218445,
+      "learning_rate": 8.308041014570964e-05,
+      "loss": 3.1943,
       "step": 79950
     },
     {
-      "epoch": 8.610483263373157,
-      "grad_norm": 0.8423581123352051,
-      "learning_rate": 8.36547785798944e-05,
-      "loss": 3.1571,
+      "epoch": 8.625336927223719,
+      "grad_norm": 0.8656378984451294,
+      "learning_rate": 8.275661090124122e-05,
+      "loss": 3.183,
       "step": 80000
     },
     {
-      "epoch": 8.610483263373157,
-      "eval_accuracy": 0.39168312124108545,
-      "eval_loss": 3.323802947998047,
-      "eval_runtime": 184.6741,
-      "eval_samples_per_second": 97.529,
-      "eval_steps_per_second": 6.097,
+      "epoch": 8.625336927223719,
+      "eval_accuracy": 0.3916346620531202,
+      "eval_loss": 3.3251895904541016,
+      "eval_runtime": 183.6111,
+      "eval_samples_per_second": 98.093,
+      "eval_steps_per_second": 6.133,
       "step": 80000
     }
   ],
   "logging_steps": 50,
-  "max_steps": 92910,
+  "max_steps": 92750,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 10,
   "save_steps": 10000,
@@ -11946,7 +11946,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 6.68874156539904e+17,
+  "total_flos": 6.688553435136e+17,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null