diff --git "a/cost_to_drop_frequency_3591/checkpoint-40000/trainer_state.json" "b/cost_to_drop_frequency_3591/checkpoint-40000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/cost_to_drop_frequency_3591/checkpoint-40000/trainer_state.json"
@@ -0,0 +1,6003 @@
+{
+  "best_global_step": 40000,
+  "best_metric": 3.5466434955596924,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_drop_frequency_3591/checkpoint-40000",
+  "epoch": 11.644982819870712,
+  "eval_steps": 1000,
+  "global_step": 40000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.014559431599790344,
+      "grad_norm": 1.4049561023712158,
+      "learning_rate": 0.000294,
+      "loss": 8.4124,
+      "step": 50
+    },
+    {
+      "epoch": 0.029118863199580687,
+      "grad_norm": 0.8360756635665894,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7276,
+      "step": 100
+    },
+    {
+      "epoch": 0.043678294799371034,
+      "grad_norm": 0.42227354645729065,
+      "learning_rate": 0.0005995714285714286,
+      "loss": 6.3402,
+      "step": 150
+    },
+    {
+      "epoch": 0.058237726399161374,
+      "grad_norm": 0.9324970841407776,
+      "learning_rate": 0.0005991341107871719,
+      "loss": 6.1609,
+      "step": 200
+    },
+    {
+      "epoch": 0.07279715799895171,
+      "grad_norm": 0.4407173991203308,
+      "learning_rate": 0.0005986967930029154,
+      "loss": 6.0089,
+      "step": 250
+    },
+    {
+      "epoch": 0.08735658959874207,
+      "grad_norm": 0.4540535807609558,
+      "learning_rate": 0.0005982594752186589,
+      "loss": 5.8627,
+      "step": 300
+    },
+    {
+      "epoch": 0.10191602119853241,
+      "grad_norm": 0.4887666404247284,
+      "learning_rate": 0.0005978221574344022,
+      "loss": 5.74,
+      "step": 350
+    },
+    {
+      "epoch": 0.11647545279832275,
+      "grad_norm": 0.5809242129325867,
+      "learning_rate": 0.0005973848396501457,
+      "loss": 5.6281,
+      "step": 400
+    },
+    {
+      "epoch": 0.1310348843981131,
+      "grad_norm": 0.4683547914028168,
+      "learning_rate": 0.0005969475218658892,
+      "loss": 5.5265,
+      "step": 450
+    },
+    {
+      "epoch": 0.14559431599790343,
+      "grad_norm": 0.4089968502521515,
+      "learning_rate": 0.0005965102040816326,
+      "loss": 5.4214,
+      "step": 500
+    },
+    {
+      "epoch": 0.1601537475976938,
+      "grad_norm": 0.5466117858886719,
+      "learning_rate": 0.000596072886297376,
+      "loss": 5.3411,
+      "step": 550
+    },
+    {
+      "epoch": 0.17471317919748414,
+      "grad_norm": 0.3846788704395294,
+      "learning_rate": 0.0005956355685131195,
+      "loss": 5.2665,
+      "step": 600
+    },
+    {
+      "epoch": 0.18927261079727448,
+      "grad_norm": 0.4610619843006134,
+      "learning_rate": 0.0005951982507288629,
+      "loss": 5.2078,
+      "step": 650
+    },
+    {
+      "epoch": 0.20383204239706482,
+      "grad_norm": 0.41991209983825684,
+      "learning_rate": 0.0005947609329446064,
+      "loss": 5.1301,
+      "step": 700
+    },
+    {
+      "epoch": 0.21839147399685516,
+      "grad_norm": 0.4753279685974121,
+      "learning_rate": 0.0005943236151603498,
+      "loss": 5.0702,
+      "step": 750
+    },
+    {
+      "epoch": 0.2329509055966455,
+      "grad_norm": 0.4781185984611511,
+      "learning_rate": 0.0005938862973760932,
+      "loss": 5.0195,
+      "step": 800
+    },
+    {
+      "epoch": 0.24751033719643586,
+      "grad_norm": 0.41803014278411865,
+      "learning_rate": 0.0005934489795918367,
+      "loss": 4.971,
+      "step": 850
+    },
+    {
+      "epoch": 0.2620697687962262,
+      "grad_norm": 0.444289892911911,
+      "learning_rate": 0.0005930116618075802,
+      "loss": 4.9305,
+      "step": 900
+    },
+    {
+      "epoch": 0.2766292003960165,
+      "grad_norm": 0.4531804025173187,
+      "learning_rate": 0.0005925743440233235,
+      "loss": 4.8862,
+      "step": 950
+    },
+    {
+      "epoch": 0.29118863199580686,
+      "grad_norm": 0.4998404085636139,
+      "learning_rate": 0.000592137026239067,
+      "loss": 4.8266,
+      "step": 1000
+    },
+    {
+      "epoch": 0.29118863199580686,
+      "eval_accuracy": 0.25396983481710367,
+      "eval_loss": 4.760892868041992,
+      "eval_runtime": 179.1934,
+      "eval_samples_per_second": 92.877,
+      "eval_steps_per_second": 5.809,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30574806359559725,
+      "grad_norm": 0.44976159930229187,
+      "learning_rate": 0.0005916997084548104,
+      "loss": 4.7891,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3203074951953876,
+      "grad_norm": 0.38453996181488037,
+      "learning_rate": 0.0005912623906705539,
+      "loss": 4.7294,
+      "step": 1100
+    },
+    {
+      "epoch": 0.33486692679517793,
+      "grad_norm": 0.4332706928253174,
+      "learning_rate": 0.0005908250728862974,
+      "loss": 4.7002,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3494263583949683,
+      "grad_norm": 0.42371395230293274,
+      "learning_rate": 0.0005903877551020407,
+      "loss": 4.6808,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3639857899947586,
+      "grad_norm": 0.45705753564834595,
+      "learning_rate": 0.0005899504373177842,
+      "loss": 4.6327,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37854522159454895,
+      "grad_norm": 0.42063650488853455,
+      "learning_rate": 0.0005895131195335277,
+      "loss": 4.6117,
+      "step": 1300
+    },
+    {
+      "epoch": 0.3931046531943393,
+      "grad_norm": 0.43308427929878235,
+      "learning_rate": 0.0005890758017492711,
+      "loss": 4.5751,
+      "step": 1350
+    },
+    {
+      "epoch": 0.40766408479412963,
+      "grad_norm": 0.43480074405670166,
+      "learning_rate": 0.0005886384839650145,
+      "loss": 4.5591,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42222351639392,
+      "grad_norm": 0.45868223905563354,
+      "learning_rate": 0.000588201166180758,
+      "loss": 4.5263,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4367829479937103,
+      "grad_norm": 0.41984814405441284,
+      "learning_rate": 0.0005877638483965014,
+      "loss": 4.5044,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45134237959350065,
+      "grad_norm": 0.4139959216117859,
+      "learning_rate": 0.0005873265306122449,
+      "loss": 4.4968,
+      "step": 1550
+    },
+    {
+      "epoch": 0.465901811193291,
+      "grad_norm": 0.38750138878822327,
+      "learning_rate": 0.0005868892128279882,
+      "loss": 4.4646,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48046124279308133,
+      "grad_norm": 0.41930243372917175,
+      "learning_rate": 0.0005864518950437317,
+      "loss": 4.4529,
+      "step": 1650
+    },
+    {
+      "epoch": 0.49502067439287173,
+      "grad_norm": 0.41106143593788147,
+      "learning_rate": 0.0005860145772594752,
+      "loss": 4.4362,
+      "step": 1700
+    },
+    {
+      "epoch": 0.509580105992662,
+      "grad_norm": 0.39897602796554565,
+      "learning_rate": 0.0005855772594752186,
+      "loss": 4.4112,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5241395375924524,
+      "grad_norm": 0.4214461147785187,
+      "learning_rate": 0.000585139941690962,
+      "loss": 4.404,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5386989691922427,
+      "grad_norm": 0.3887820541858673,
+      "learning_rate": 0.0005847026239067055,
+      "loss": 4.3787,
+      "step": 1850
+    },
+    {
+      "epoch": 0.553258400792033,
+      "grad_norm": 0.3768806755542755,
+      "learning_rate": 0.0005842653061224489,
+      "loss": 4.3711,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5678178323918234,
+      "grad_norm": 0.3779532313346863,
+      "learning_rate": 0.0005838279883381924,
+      "loss": 4.3456,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5823772639916137,
+      "grad_norm": 0.3921726942062378,
+      "learning_rate": 0.0005833906705539359,
+      "loss": 4.3399,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5823772639916137,
+      "eval_accuracy": 0.2996934707950652,
+      "eval_loss": 4.28386926651001,
+      "eval_runtime": 179.6428,
+      "eval_samples_per_second": 92.645,
+      "eval_steps_per_second": 5.795,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5969366955914042,
+      "grad_norm": 0.38071900606155396,
+      "learning_rate": 0.0005829533527696792,
+      "loss": 4.3206,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6114961271911945,
+      "grad_norm": 0.4333866536617279,
+      "learning_rate": 0.0005825160349854227,
+      "loss": 4.316,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6260555587909848,
+      "grad_norm": 0.3910558223724365,
+      "learning_rate": 0.0005820787172011661,
+      "loss": 4.2961,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6406149903907752,
+      "grad_norm": 0.3819257318973541,
+      "learning_rate": 0.0005816413994169096,
+      "loss": 4.2951,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6551744219905655,
+      "grad_norm": 0.4080394506454468,
+      "learning_rate": 0.000581204081632653,
+      "loss": 4.2756,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6697338535903559,
+      "grad_norm": 0.37072518467903137,
+      "learning_rate": 0.0005807667638483965,
+      "loss": 4.2638,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6842932851901462,
+      "grad_norm": 0.3981825113296509,
+      "learning_rate": 0.0005803294460641399,
+      "loss": 4.2662,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6988527167899365,
+      "grad_norm": 0.384818971157074,
+      "learning_rate": 0.0005798921282798834,
+      "loss": 4.2509,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7134121483897269,
+      "grad_norm": 0.43530362844467163,
+      "learning_rate": 0.0005794548104956267,
+      "loss": 4.2352,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7279715799895172,
+      "grad_norm": 0.3544856607913971,
+      "learning_rate": 0.0005790174927113702,
+      "loss": 4.2268,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7425310115893076,
+      "grad_norm": 0.38703247904777527,
+      "learning_rate": 0.0005785801749271137,
+      "loss": 4.2107,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7570904431890979,
+      "grad_norm": 0.37904635071754456,
+      "learning_rate": 0.000578142857142857,
+      "loss": 4.1982,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7716498747888882,
+      "grad_norm": 0.41309526562690735,
+      "learning_rate": 0.0005777055393586005,
+      "loss": 4.1833,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7862093063886786,
+      "grad_norm": 0.42821475863456726,
+      "learning_rate": 0.000577268221574344,
+      "loss": 4.1892,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8007687379884689,
+      "grad_norm": 0.4209707975387573,
+      "learning_rate": 0.0005768309037900874,
+      "loss": 4.1834,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8153281695882593,
+      "grad_norm": 0.3531130254268646,
+      "learning_rate": 0.0005763935860058308,
+      "loss": 4.1801,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8298876011880496,
+      "grad_norm": 0.34633395075798035,
+      "learning_rate": 0.0005759562682215744,
+      "loss": 4.1681,
+      "step": 2850
+    },
+    {
+      "epoch": 0.84444703278784,
+      "grad_norm": 0.3938649892807007,
+      "learning_rate": 0.0005755189504373177,
+      "loss": 4.1636,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8590064643876303,
+      "grad_norm": 0.3613823652267456,
+      "learning_rate": 0.0005750816326530612,
+      "loss": 4.1578,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8735658959874206,
+      "grad_norm": 0.3491958677768707,
+      "learning_rate": 0.0005746443148688046,
+      "loss": 4.1452,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8735658959874206,
+      "eval_accuracy": 0.31544864157201075,
+      "eval_loss": 4.095163822174072,
+      "eval_runtime": 179.6171,
+      "eval_samples_per_second": 92.658,
+      "eval_steps_per_second": 5.796,
+      "step": 3000
+    },
+    {
+      "epoch": 0.888125327587211,
+      "grad_norm": 0.3552567958831787,
+      "learning_rate": 0.000574206997084548,
+      "loss": 4.1285,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9026847591870013,
+      "grad_norm": 0.35991519689559937,
+      "learning_rate": 0.0005737696793002915,
+      "loss": 4.132,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9172441907867916,
+      "grad_norm": 0.3861224949359894,
+      "learning_rate": 0.000573332361516035,
+      "loss": 4.1214,
+      "step": 3150
+    },
+    {
+      "epoch": 0.931803622386582,
+      "grad_norm": 0.3921383023262024,
+      "learning_rate": 0.0005728950437317784,
+      "loss": 4.1157,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9463630539863723,
+      "grad_norm": 0.3566656708717346,
+      "learning_rate": 0.0005724577259475218,
+      "loss": 4.1088,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9609224855861627,
+      "grad_norm": 0.3769164979457855,
+      "learning_rate": 0.0005720204081632652,
+      "loss": 4.0963,
+      "step": 3300
+    },
+    {
+      "epoch": 0.975481917185953,
+      "grad_norm": 0.3577769100666046,
+      "learning_rate": 0.0005715830903790087,
+      "loss": 4.1067,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9900413487857435,
+      "grad_norm": 0.35588538646698,
+      "learning_rate": 0.0005711457725947522,
+      "loss": 4.0912,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0043678294799372,
+      "grad_norm": 0.3580274283885956,
+      "learning_rate": 0.0005707084548104955,
+      "loss": 4.0849,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0189272610797275,
+      "grad_norm": 0.3520485758781433,
+      "learning_rate": 0.000570271137026239,
+      "loss": 4.0188,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0334866926795179,
+      "grad_norm": 0.3421690762042999,
+      "learning_rate": 0.0005698338192419825,
+      "loss": 4.0128,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0480461242793082,
+      "grad_norm": 0.3418625593185425,
+      "learning_rate": 0.0005693965014577259,
+      "loss": 4.0056,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0626055558790986,
+      "grad_norm": 0.34526926279067993,
+      "learning_rate": 0.0005689591836734693,
+      "loss": 4.0084,
+      "step": 3650
+    },
+    {
+      "epoch": 1.077164987478889,
+      "grad_norm": 0.35390642285346985,
+      "learning_rate": 0.0005685218658892128,
+      "loss": 4.0061,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0917244190786792,
+      "grad_norm": 0.3431430459022522,
+      "learning_rate": 0.0005680845481049562,
+      "loss": 3.9994,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1062838506784696,
+      "grad_norm": 0.357334166765213,
+      "learning_rate": 0.0005676472303206997,
+      "loss": 4.0071,
+      "step": 3800
+    },
+    {
+      "epoch": 1.12084328227826,
+      "grad_norm": 0.3587090075016022,
+      "learning_rate": 0.000567209912536443,
+      "loss": 3.985,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1354027138780503,
+      "grad_norm": 0.3586151599884033,
+      "learning_rate": 0.0005667725947521865,
+      "loss": 4.0047,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1499621454778406,
+      "grad_norm": 0.37636685371398926,
+      "learning_rate": 0.00056633527696793,
+      "loss": 3.9987,
+      "step": 3950
+    },
+    {
+      "epoch": 1.164521577077631,
+      "grad_norm": 0.35518568754196167,
+      "learning_rate": 0.0005658979591836735,
+      "loss": 3.9904,
+      "step": 4000
+    },
+    {
+      "epoch": 1.164521577077631,
+      "eval_accuracy": 0.32538388464653073,
+      "eval_loss": 3.9894351959228516,
+      "eval_runtime": 179.567,
+      "eval_samples_per_second": 92.684,
+      "eval_steps_per_second": 5.797,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1790810086774213,
+      "grad_norm": 0.3445068895816803,
+      "learning_rate": 0.0005654606413994169,
+      "loss": 3.9831,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1936404402772116,
+      "grad_norm": 0.3411754369735718,
+      "learning_rate": 0.0005650233236151603,
+      "loss": 3.9741,
+      "step": 4100
+    },
+    {
+      "epoch": 1.208199871877002,
+      "grad_norm": 0.3622643053531647,
+      "learning_rate": 0.0005645860058309037,
+      "loss": 3.9812,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2227593034767923,
+      "grad_norm": 0.35340210795402527,
+      "learning_rate": 0.0005641486880466472,
+      "loss": 3.9853,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2373187350765826,
+      "grad_norm": 0.34644776582717896,
+      "learning_rate": 0.0005637113702623907,
+      "loss": 3.9733,
+      "step": 4250
+    },
+    {
+      "epoch": 1.251878166676373,
+      "grad_norm": 0.33221983909606934,
+      "learning_rate": 0.000563274052478134,
+      "loss": 3.9601,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2664375982761633,
+      "grad_norm": 0.3372167646884918,
+      "learning_rate": 0.0005628367346938775,
+      "loss": 3.9708,
+      "step": 4350
+    },
+    {
+      "epoch": 1.2809970298759537,
+      "grad_norm": 0.3629266321659088,
+      "learning_rate": 0.0005623994169096209,
+      "loss": 3.9556,
+      "step": 4400
+    },
+    {
+      "epoch": 1.295556461475744,
+      "grad_norm": 0.31815558671951294,
+      "learning_rate": 0.0005619620991253644,
+      "loss": 3.9644,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3101158930755343,
+      "grad_norm": 0.3518199622631073,
+      "learning_rate": 0.0005615247813411078,
+      "loss": 3.9551,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3246753246753247,
+      "grad_norm": 0.3197888135910034,
+      "learning_rate": 0.0005610874635568513,
+      "loss": 3.9556,
+      "step": 4550
+    },
+    {
+      "epoch": 1.339234756275115,
+      "grad_norm": 0.35236433148384094,
+      "learning_rate": 0.0005606501457725947,
+      "loss": 3.9573,
+      "step": 4600
+    },
+    {
+      "epoch": 1.3537941878749054,
+      "grad_norm": 0.3366566002368927,
+      "learning_rate": 0.0005602128279883382,
+      "loss": 3.9619,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3683536194746957,
+      "grad_norm": 0.3635067939758301,
+      "learning_rate": 0.0005597755102040816,
+      "loss": 3.9568,
+      "step": 4700
+    },
+    {
+      "epoch": 1.382913051074486,
+      "grad_norm": 0.3495481610298157,
+      "learning_rate": 0.000559338192419825,
+      "loss": 3.935,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3974724826742764,
+      "grad_norm": 0.34598347544670105,
+      "learning_rate": 0.0005589008746355685,
+      "loss": 3.9463,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4120319142740667,
+      "grad_norm": 0.32707110047340393,
+      "learning_rate": 0.0005584635568513118,
+      "loss": 3.9388,
+      "step": 4850
+    },
+    {
+      "epoch": 1.426591345873857,
+      "grad_norm": 0.35207509994506836,
+      "learning_rate": 0.0005580262390670554,
+      "loss": 3.9363,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4411507774736474,
+      "grad_norm": 0.33082953095436096,
+      "learning_rate": 0.0005575889212827988,
+      "loss": 3.9443,
+      "step": 4950
+    },
+    {
+      "epoch": 1.4557102090734377,
+      "grad_norm": 0.36195048689842224,
+      "learning_rate": 0.0005571516034985422,
+      "loss": 3.934,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4557102090734377,
+      "eval_accuracy": 0.3320231362585752,
+      "eval_loss": 3.9117023944854736,
+      "eval_runtime": 179.5118,
+      "eval_samples_per_second": 92.713,
+      "eval_steps_per_second": 5.799,
+      "step": 5000
+    },
+    {
+      "epoch": 1.470269640673228,
+      "grad_norm": 0.3603370487689972,
+      "learning_rate": 0.0005567142857142856,
+      "loss": 3.9232,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4848290722730184,
+      "grad_norm": 0.3303501307964325,
+      "learning_rate": 0.0005562769679300292,
+      "loss": 3.929,
+      "step": 5100
+    },
+    {
+      "epoch": 1.4993885038728088,
+      "grad_norm": 0.34812071919441223,
+      "learning_rate": 0.0005558396501457725,
+      "loss": 3.9186,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5139479354725993,
+      "grad_norm": 0.3245297372341156,
+      "learning_rate": 0.000555402332361516,
+      "loss": 3.9281,
+      "step": 5200
+    },
+    {
+      "epoch": 1.5285073670723897,
+      "grad_norm": 0.32848072052001953,
+      "learning_rate": 0.0005549650145772595,
+      "loss": 3.9081,
+      "step": 5250
+    },
+    {
+      "epoch": 1.54306679867218,
+      "grad_norm": 0.3524268865585327,
+      "learning_rate": 0.0005545276967930028,
+      "loss": 3.9169,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5576262302719703,
+      "grad_norm": 0.3273775279521942,
+      "learning_rate": 0.0005540903790087463,
+      "loss": 3.9057,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5721856618717607,
+      "grad_norm": 0.33142444491386414,
+      "learning_rate": 0.0005536530612244898,
+      "loss": 3.9117,
+      "step": 5400
+    },
+    {
+      "epoch": 1.586745093471551,
+      "grad_norm": 0.35404613614082336,
+      "learning_rate": 0.0005532157434402332,
+      "loss": 3.9,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6013045250713414,
+      "grad_norm": 0.3326050341129303,
+      "learning_rate": 0.0005527784256559766,
+      "loss": 3.9023,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6158639566711317,
+      "grad_norm": 0.32253944873809814,
+      "learning_rate": 0.00055234110787172,
+      "loss": 3.9036,
+      "step": 5550
+    },
+    {
+      "epoch": 1.630423388270922,
+      "grad_norm": 0.40896502137184143,
+      "learning_rate": 0.0005519037900874635,
+      "loss": 3.892,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6449828198707124,
+      "grad_norm": 0.33099985122680664,
+      "learning_rate": 0.000551466472303207,
+      "loss": 3.8921,
+      "step": 5650
+    },
+    {
+      "epoch": 1.6595422514705027,
+      "grad_norm": 0.3134934902191162,
+      "learning_rate": 0.0005510291545189503,
+      "loss": 3.8986,
+      "step": 5700
+    },
+    {
+      "epoch": 1.674101683070293,
+      "grad_norm": 0.32286426424980164,
+      "learning_rate": 0.0005505918367346938,
+      "loss": 3.8705,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6886611146700834,
+      "grad_norm": 0.3152390122413635,
+      "learning_rate": 0.0005501545189504373,
+      "loss": 3.8843,
+      "step": 5800
+    },
+    {
+      "epoch": 1.7032205462698737,
+      "grad_norm": 0.3241208493709564,
+      "learning_rate": 0.0005497172011661807,
+      "loss": 3.8915,
+      "step": 5850
+    },
+    {
+      "epoch": 1.717779977869664,
+      "grad_norm": 0.3297117054462433,
+      "learning_rate": 0.0005492798833819241,
+      "loss": 3.8959,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7323394094694544,
+      "grad_norm": 0.34585368633270264,
+      "learning_rate": 0.0005488425655976676,
+      "loss": 3.8631,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7468988410692448,
+      "grad_norm": 0.32093173265457153,
+      "learning_rate": 0.000548405247813411,
+      "loss": 3.8774,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7468988410692448,
+      "eval_accuracy": 0.3372265721042079,
+      "eval_loss": 3.8566107749938965,
+      "eval_runtime": 179.5862,
+      "eval_samples_per_second": 92.674,
+      "eval_steps_per_second": 5.797,
+      "step": 6000
+    },
+    {
+      "epoch": 1.761458272669035,
+      "grad_norm": 0.3342028260231018,
+      "learning_rate": 0.0005479679300291545,
+      "loss": 3.8767,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7760177042688254,
+      "grad_norm": 0.331476628780365,
+      "learning_rate": 0.000547530612244898,
+      "loss": 3.8741,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7905771358686158,
+      "grad_norm": 0.3178947865962982,
+      "learning_rate": 0.0005470932944606413,
+      "loss": 3.8753,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8051365674684061,
+      "grad_norm": 0.33139607310295105,
+      "learning_rate": 0.0005466559766763848,
+      "loss": 3.8686,
+      "step": 6200
+    },
+    {
+      "epoch": 1.8196959990681965,
+      "grad_norm": 0.35270482301712036,
+      "learning_rate": 0.0005462186588921283,
+      "loss": 3.8577,
+      "step": 6250
+    },
+    {
+      "epoch": 1.8342554306679868,
+      "grad_norm": 0.3247964382171631,
+      "learning_rate": 0.0005457813411078717,
+      "loss": 3.8574,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8488148622677771,
+      "grad_norm": 0.33985435962677,
+      "learning_rate": 0.0005453440233236151,
+      "loss": 3.8546,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8633742938675675,
+      "grad_norm": 0.33400237560272217,
+      "learning_rate": 0.0005449067055393585,
+      "loss": 3.8636,
+      "step": 6400
+    },
+    {
+      "epoch": 1.8779337254673578,
+      "grad_norm": 0.3367692232131958,
+      "learning_rate": 0.0005444693877551019,
+      "loss": 3.8718,
+      "step": 6450
+    },
+    {
+      "epoch": 1.8924931570671482,
+      "grad_norm": 0.3267197608947754,
+      "learning_rate": 0.0005440320699708455,
+      "loss": 3.8507,
+      "step": 6500
+    },
+    {
+      "epoch": 1.9070525886669385,
+      "grad_norm": 0.3389538824558258,
+      "learning_rate": 0.0005435947521865888,
+      "loss": 3.8546,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9216120202667288,
+      "grad_norm": 0.32694804668426514,
+      "learning_rate": 0.0005431574344023323,
+      "loss": 3.8391,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9361714518665192,
+      "grad_norm": 0.3353123366832733,
+      "learning_rate": 0.0005427201166180758,
+      "loss": 3.8435,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9507308834663095,
+      "grad_norm": 0.32406482100486755,
+      "learning_rate": 0.0005422827988338192,
+      "loss": 3.8409,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9652903150660999,
+      "grad_norm": 0.3334747850894928,
+      "learning_rate": 0.0005418454810495626,
+      "loss": 3.8506,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9798497466658902,
+      "grad_norm": 0.33217740058898926,
+      "learning_rate": 0.0005414081632653061,
+      "loss": 3.8396,
+      "step": 6800
+    },
+    {
+      "epoch": 1.9944091782656805,
+      "grad_norm": 0.33468008041381836,
+      "learning_rate": 0.0005409708454810495,
+      "loss": 3.8407,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0087356589598744,
+      "grad_norm": 0.3196060359477997,
+      "learning_rate": 0.0005405335276967929,
+      "loss": 3.7913,
+      "step": 6900
+    },
+    {
+      "epoch": 2.0232950905596647,
+      "grad_norm": 0.3573300540447235,
+      "learning_rate": 0.0005400962099125365,
+      "loss": 3.7409,
+      "step": 6950
+    },
+    {
+      "epoch": 2.037854522159455,
+      "grad_norm": 0.3402981460094452,
+      "learning_rate": 0.0005396588921282798,
+      "loss": 3.7556,
+      "step": 7000
+    },
+    {
+      "epoch": 2.037854522159455,
+      "eval_accuracy": 0.34190017535271905,
+      "eval_loss": 3.809979200363159,
+      "eval_runtime": 179.6501,
+      "eval_samples_per_second": 92.641,
+      "eval_steps_per_second": 5.795,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0524139537592454,
+      "grad_norm": 0.3510541319847107,
+      "learning_rate": 0.0005392215743440233,
+      "loss": 3.7422,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0669733853590357,
+      "grad_norm": 0.31116750836372375,
+      "learning_rate": 0.0005387842565597666,
+      "loss": 3.7475,
+      "step": 7100
+    },
+    {
+      "epoch": 2.081532816958826,
+      "grad_norm": 0.3254874050617218,
+      "learning_rate": 0.0005383469387755102,
+      "loss": 3.7546,
+      "step": 7150
+    },
+    {
+      "epoch": 2.0960922485586164,
+      "grad_norm": 0.3147241771221161,
+      "learning_rate": 0.0005379096209912536,
+      "loss": 3.7518,
+      "step": 7200
+    },
+    {
+      "epoch": 2.1106516801584068,
+      "grad_norm": 0.3199782073497772,
+      "learning_rate": 0.000537472303206997,
+      "loss": 3.7659,
+      "step": 7250
+    },
+    {
+      "epoch": 2.125211111758197,
+      "grad_norm": 0.3094785809516907,
+      "learning_rate": 0.0005370349854227405,
+      "loss": 3.7481,
+      "step": 7300
+    },
+    {
+      "epoch": 2.1397705433579874,
+      "grad_norm": 0.3172190189361572,
+      "learning_rate": 0.0005365976676384839,
+      "loss": 3.7408,
+      "step": 7350
+    },
+    {
+      "epoch": 2.154329974957778,
+      "grad_norm": 0.3381129801273346,
+      "learning_rate": 0.0005361603498542273,
+      "loss": 3.7448,
+      "step": 7400
+    },
+    {
+      "epoch": 2.168889406557568,
+      "grad_norm": 0.3302014470100403,
+      "learning_rate": 0.0005357230320699708,
+      "loss": 3.7451,
+      "step": 7450
+    },
+    {
+      "epoch": 2.1834488381573585,
+      "grad_norm": 0.34532982110977173,
+      "learning_rate": 0.0005352857142857143,
+      "loss": 3.7459,
+      "step": 7500
+    },
+    {
+      "epoch": 2.198008269757149,
+      "grad_norm": 0.3262939751148224,
+      "learning_rate": 0.0005348483965014576,
+      "loss": 3.7466,
+      "step": 7550
+    },
+    {
+      "epoch": 2.212567701356939,
+      "grad_norm": 0.33892711997032166,
+      "learning_rate": 0.0005344110787172011,
+      "loss": 3.7505,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2271271329567295,
+      "grad_norm": 0.3445602059364319,
+      "learning_rate": 0.0005339737609329446,
+      "loss": 3.7429,
+      "step": 7650
+    },
+    {
+      "epoch": 2.24168656455652,
+      "grad_norm": 0.3161507248878479,
+      "learning_rate": 0.000533536443148688,
+      "loss": 3.7541,
+      "step": 7700
+    },
+    {
+      "epoch": 2.25624599615631,
+      "grad_norm": 0.31178775429725647,
+      "learning_rate": 0.0005330991253644314,
+      "loss": 3.7447,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2708054277561005,
+      "grad_norm": 0.3178870975971222,
+      "learning_rate": 0.0005326618075801749,
+      "loss": 3.7506,
+      "step": 7800
+    },
+    {
+      "epoch": 2.285364859355891,
+      "grad_norm": 0.3333457112312317,
+      "learning_rate": 0.0005322244897959183,
+      "loss": 3.7494,
+      "step": 7850
+    },
+    {
+      "epoch": 2.299924290955681,
+      "grad_norm": 0.3204410672187805,
+      "learning_rate": 0.0005317871720116618,
+      "loss": 3.7474,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3144837225554715,
+      "grad_norm": 0.31767410039901733,
+      "learning_rate": 0.0005313498542274051,
+      "loss": 3.7368,
+      "step": 7950
+    },
+    {
+      "epoch": 2.329043154155262,
+      "grad_norm": 0.33374062180519104,
+      "learning_rate": 0.0005309125364431486,
+      "loss": 3.7524,
+      "step": 8000
+    },
+    {
+      "epoch": 2.329043154155262,
+      "eval_accuracy": 0.34463388108962084,
+      "eval_loss": 3.7798807621002197,
+      "eval_runtime": 179.8167,
+      "eval_samples_per_second": 92.555,
+      "eval_steps_per_second": 5.789,
+      "step": 8000
+    },
+    {
+      "epoch": 2.343602585755052,
+      "grad_norm": 0.32286617159843445,
+      "learning_rate": 0.0005304752186588921,
+      "loss": 3.7492,
+      "step": 8050
+    },
+    {
+      "epoch": 2.3581620173548425,
+      "grad_norm": 0.33228906989097595,
+      "learning_rate": 0.0005300379008746355,
+      "loss": 3.764,
+      "step": 8100
+    },
+    {
+      "epoch": 2.372721448954633,
+      "grad_norm": 0.33857783675193787,
+      "learning_rate": 0.000529600583090379,
+      "loss": 3.759,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3872808805544232,
+      "grad_norm": 0.3177933394908905,
+      "learning_rate": 0.0005291632653061224,
+      "loss": 3.7536,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4018403121542136,
+      "grad_norm": 0.3171054720878601,
+      "learning_rate": 0.0005287259475218658,
+      "loss": 3.7403,
+      "step": 8250
+    },
+    {
+      "epoch": 2.416399743754004,
+      "grad_norm": 0.32724741101264954,
+      "learning_rate": 0.0005282886297376093,
+      "loss": 3.7446,
+      "step": 8300
+    },
+    {
+      "epoch": 2.4309591753537942,
+      "grad_norm": 0.3406330347061157,
+      "learning_rate": 0.0005278513119533528,
+      "loss": 3.7441,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4455186069535846,
+      "grad_norm": 0.3245644271373749,
+      "learning_rate": 0.0005274139941690961,
+      "loss": 3.7317,
+      "step": 8400
+    },
+    {
+      "epoch": 2.460078038553375,
+      "grad_norm": 0.3408276438713074,
+      "learning_rate": 0.0005269766763848396,
+      "loss": 3.7373,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4746374701531653,
+      "grad_norm": 0.31394264101982117,
+      "learning_rate": 0.0005265393586005831,
+      "loss": 3.732,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4891969017529556,
+      "grad_norm": 0.3347412645816803,
+      "learning_rate": 0.0005261020408163265,
+      "loss": 3.7266,
+      "step": 8550
+    },
+    {
+      "epoch": 2.503756333352746,
+      "grad_norm": 0.32223114371299744,
+      "learning_rate": 0.0005256647230320699,
+      "loss": 3.7293,
+      "step": 8600
+    },
+    {
+      "epoch": 2.5183157649525363,
+      "grad_norm": 0.3145173490047455,
+      "learning_rate": 0.0005252274052478134,
+      "loss": 3.7471,
+      "step": 8650
+    },
+    {
+      "epoch": 2.5328751965523266,
+      "grad_norm": 0.31143006682395935,
+      "learning_rate": 0.0005247900874635568,
+      "loss": 3.7394,
+      "step": 8700
+    },
+    {
+      "epoch": 2.547434628152117,
+      "grad_norm": 0.3238007724285126,
+      "learning_rate": 0.0005243527696793003,
+      "loss": 3.7283,
+      "step": 8750
+    },
+    {
+      "epoch": 2.5619940597519073,
+      "grad_norm": 0.3301667869091034,
+      "learning_rate": 0.0005239154518950436,
+      "loss": 3.7463,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5765534913516976,
+      "grad_norm": 0.32847797870635986,
+      "learning_rate": 0.0005234781341107871,
+      "loss": 3.7397,
+      "step": 8850
+    },
+    {
+      "epoch": 2.591112922951488,
+      "grad_norm": 0.32561489939689636,
+      "learning_rate": 0.0005230408163265306,
+      "loss": 3.7437,
+      "step": 8900
+    },
+    {
+      "epoch": 2.6056723545512783,
+      "grad_norm": 0.30937111377716064,
+      "learning_rate": 0.000522603498542274,
+      "loss": 3.7399,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6202317861510687,
+      "grad_norm": 0.32154905796051025,
+      "learning_rate": 0.0005221661807580175,
+      "loss": 3.7339,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6202317861510687,
+      "eval_accuracy": 0.3472925683629266,
+      "eval_loss": 3.7506699562072754,
+      "eval_runtime": 179.8206,
+      "eval_samples_per_second": 92.553,
+      "eval_steps_per_second": 5.789,
+      "step": 9000
+    },
+    {
+      "epoch": 2.634791217750859,
+      "grad_norm": 0.317490816116333,
+      "learning_rate": 0.0005217288629737609,
+      "loss": 3.7263,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6493506493506493,
+      "grad_norm": 0.32750970125198364,
+      "learning_rate": 0.0005212915451895043,
+      "loss": 3.7324,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6639100809504397,
+      "grad_norm": 0.3290070593357086,
+      "learning_rate": 0.0005208542274052477,
+      "loss": 3.7314,
+      "step": 9150
+    },
+    {
+      "epoch": 2.67846951255023,
+      "grad_norm": 0.34482887387275696,
+      "learning_rate": 0.0005204169096209913,
+      "loss": 3.7192,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6930289441500204,
+      "grad_norm": 0.31812381744384766,
+      "learning_rate": 0.0005199795918367346,
+      "loss": 3.7308,
+      "step": 9250
+    },
+    {
+      "epoch": 2.7075883757498107,
+      "grad_norm": 0.33570706844329834,
+      "learning_rate": 0.0005195422740524781,
+      "loss": 3.7338,
+      "step": 9300
+    },
+    {
+      "epoch": 2.722147807349601,
+      "grad_norm": 0.3004995584487915,
+      "learning_rate": 0.0005191049562682216,
+      "loss": 3.7224,
+      "step": 9350
+    },
+    {
+      "epoch": 2.7367072389493914,
+      "grad_norm": 0.3277261555194855,
+      "learning_rate": 0.000518667638483965,
+      "loss": 3.7313,
+      "step": 9400
+    },
+    {
+      "epoch": 2.7512666705491817,
+      "grad_norm": 0.3260866701602936,
+      "learning_rate": 0.0005182303206997084,
+      "loss": 3.7252,
+      "step": 9450
+    },
+    {
+      "epoch": 2.765826102148972,
+      "grad_norm": 0.30772513151168823,
+      "learning_rate": 0.0005177930029154519,
+      "loss": 3.7263,
+      "step": 9500
+    },
+    {
+      "epoch": 2.7803855337487624,
+      "grad_norm": 0.3158465027809143,
+      "learning_rate": 0.0005173556851311953,
+      "loss": 3.728,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7949449653485527,
+      "grad_norm": 0.31197673082351685,
+      "learning_rate": 0.0005169183673469387,
+      "loss": 3.7135,
+      "step": 9600
+    },
+    {
+      "epoch": 2.809504396948343,
+      "grad_norm": 0.33720263838768005,
+      "learning_rate": 0.0005164810495626821,
+      "loss": 3.7205,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8240638285481334,
+      "grad_norm": 0.3222922086715698,
+      "learning_rate": 0.0005160437317784256,
+      "loss": 3.7212,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8386232601479238,
+      "grad_norm": 0.32163000106811523,
+      "learning_rate": 0.0005156064139941691,
+      "loss": 3.7303,
+      "step": 9750
+    },
+    {
+      "epoch": 2.853182691747714,
+      "grad_norm": 0.29815468192100525,
+      "learning_rate": 0.0005151690962099124,
+      "loss": 3.7143,
+      "step": 9800
+    },
+    {
+      "epoch": 2.8677421233475044,
+      "grad_norm": 0.3258896768093109,
+      "learning_rate": 0.000514731778425656,
+      "loss": 3.7076,
+      "step": 9850
+    },
+    {
+      "epoch": 2.882301554947295,
+      "grad_norm": 0.32969552278518677,
+      "learning_rate": 0.0005142944606413994,
+      "loss": 3.7269,
+      "step": 9900
+    },
+    {
+      "epoch": 2.896860986547085,
+      "grad_norm": 0.31835922598838806,
+      "learning_rate": 0.0005138571428571428,
+      "loss": 3.7207,
+      "step": 9950
+    },
+    {
+      "epoch": 2.9114204181468755,
+      "grad_norm": 0.3245142102241516,
+      "learning_rate": 0.0005134198250728862,
+      "loss": 3.7167,
+      "step": 10000
+    },
+    {
+      "epoch": 2.9114204181468755,
+      "eval_accuracy": 0.3495557037372717,
+      "eval_loss": 3.7238857746124268,
+      "eval_runtime": 179.8397,
+      "eval_samples_per_second": 92.544,
+      "eval_steps_per_second": 5.788,
+      "step": 10000
+    },
+    {
+      "epoch": 2.925979849746666,
+      "grad_norm": 0.32630476355552673,
+      "learning_rate": 0.0005129825072886297,
+      "loss": 3.7083,
+      "step": 10050
+    },
+    {
+      "epoch": 2.940539281346456,
+      "grad_norm": 0.3315964341163635,
+      "learning_rate": 0.0005125451895043731,
+      "loss": 3.7064,
+      "step": 10100
+    },
+    {
+      "epoch": 2.9550987129462465,
+      "grad_norm": 0.31410086154937744,
+      "learning_rate": 0.0005121078717201166,
+      "loss": 3.7236,
+      "step": 10150
+    },
+    {
+      "epoch": 2.969658144546037,
+      "grad_norm": 0.33839717507362366,
+      "learning_rate": 0.0005116705539358601,
+      "loss": 3.7078,
+      "step": 10200
+    },
+    {
+      "epoch": 2.984217576145827,
+      "grad_norm": 0.32319313287734985,
+      "learning_rate": 0.0005112332361516034,
+      "loss": 3.7141,
+      "step": 10250
+    },
+    {
+      "epoch": 2.9987770077456175,
+      "grad_norm": 0.3235074579715729,
+      "learning_rate": 0.0005107959183673469,
+      "loss": 3.7063,
+      "step": 10300
+    },
+    {
+      "epoch": 3.0131034884398114,
+      "grad_norm": 0.314828097820282,
+      "learning_rate": 0.0005103586005830903,
+      "loss": 3.6245,
+      "step": 10350
+    },
+    {
+      "epoch": 3.0276629200396017,
+      "grad_norm": 0.31607604026794434,
+      "learning_rate": 0.0005099212827988338,
+      "loss": 3.6112,
+      "step": 10400
+    },
+    {
+      "epoch": 3.042222351639392,
+      "grad_norm": 0.35359737277030945,
+      "learning_rate": 0.0005094839650145772,
+      "loss": 3.609,
+      "step": 10450
+    },
+    {
+      "epoch": 3.0567817832391824,
+      "grad_norm": 0.32654085755348206,
+      "learning_rate": 0.0005090466472303206,
+      "loss": 3.6166,
+      "step": 10500
+    },
+    {
+      "epoch": 3.0713412148389727,
+      "grad_norm": 0.3420456051826477,
+      "learning_rate": 0.0005086093294460641,
+      "loss": 3.6039,
+      "step": 10550
+    },
+    {
+      "epoch": 3.085900646438763,
+      "grad_norm": 0.32927215099334717,
+      "learning_rate": 0.0005081720116618076,
+      "loss": 3.6076,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1004600780385534,
+      "grad_norm": 0.32174116373062134,
+      "learning_rate": 0.0005077346938775509,
+      "loss": 3.62,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1150195096383437,
+      "grad_norm": 0.32081031799316406,
+      "learning_rate": 0.0005072973760932944,
+      "loss": 3.6198,
+      "step": 10700
+    },
+    {
+      "epoch": 3.129578941238134,
+      "grad_norm": 0.3233294188976288,
+      "learning_rate": 0.0005068600583090379,
+      "loss": 3.6221,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1441383728379244,
+      "grad_norm": 0.3179484009742737,
+      "learning_rate": 0.0005064227405247813,
+      "loss": 3.6265,
+      "step": 10800
+    },
+    {
+      "epoch": 3.1586978044377148,
+      "grad_norm": 0.3125128746032715,
+      "learning_rate": 0.0005059854227405247,
+      "loss": 3.6316,
+      "step": 10850
+    },
+    {
+      "epoch": 3.173257236037505,
+      "grad_norm": 0.32463568449020386,
+      "learning_rate": 0.0005055481049562682,
+      "loss": 3.6245,
+      "step": 10900
+    },
+    {
+      "epoch": 3.1878166676372954,
+      "grad_norm": 0.31310543417930603,
+      "learning_rate": 0.0005051107871720116,
+      "loss": 3.6185,
+      "step": 10950
+    },
+    {
+      "epoch": 3.2023760992370858,
+      "grad_norm": 0.3464823067188263,
+      "learning_rate": 0.0005046734693877551,
+      "loss": 3.6204,
+      "step": 11000
+    },
+    {
+      "epoch": 3.2023760992370858,
+      "eval_accuracy": 0.3516197697403503,
+      "eval_loss": 3.7100203037261963,
+      "eval_runtime": 180.2504,
+      "eval_samples_per_second": 92.333,
+      "eval_steps_per_second": 5.775,
+      "step": 11000
+    },
+    {
+      "epoch": 3.216935530836876,
+      "grad_norm": 0.3277588486671448,
+      "learning_rate": 0.0005042361516034986,
+      "loss": 3.6268,
+      "step": 11050
+    },
+    {
+      "epoch": 3.2314949624366665,
+      "grad_norm": 0.32706061005592346,
+      "learning_rate": 0.0005037988338192419,
+      "loss": 3.6153,
+      "step": 11100
+    },
+    {
+      "epoch": 3.246054394036457,
+      "grad_norm": 0.31766435503959656,
+      "learning_rate": 0.0005033615160349854,
+      "loss": 3.6336,
+      "step": 11150
+    },
+    {
+      "epoch": 3.260613825636247,
+      "grad_norm": 0.3006264269351959,
+      "learning_rate": 0.0005029241982507288,
+      "loss": 3.6275,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2751732572360375,
+      "grad_norm": 0.32919037342071533,
+      "learning_rate": 0.0005024868804664723,
+      "loss": 3.6301,
+      "step": 11250
+    },
+    {
+      "epoch": 3.289732688835828,
+      "grad_norm": 0.3155740797519684,
+      "learning_rate": 0.0005020495626822157,
+      "loss": 3.6203,
+      "step": 11300
+    },
+    {
+      "epoch": 3.304292120435618,
+      "grad_norm": 0.3527681529521942,
+      "learning_rate": 0.0005016122448979591,
+      "loss": 3.6288,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3188515520354085,
+      "grad_norm": 0.3135804831981659,
+      "learning_rate": 0.0005011749271137026,
+      "loss": 3.6415,
+      "step": 11400
+    },
+    {
+      "epoch": 3.333410983635199,
+      "grad_norm": 0.3078667223453522,
+      "learning_rate": 0.0005007376093294461,
+      "loss": 3.6284,
+      "step": 11450
+    },
+    {
+      "epoch": 3.347970415234989,
+      "grad_norm": 0.319755494594574,
+      "learning_rate": 0.0005003002915451894,
+      "loss": 3.6314,
+      "step": 11500
+    },
+    {
+      "epoch": 3.3625298468347795,
+      "grad_norm": 0.32641854882240295,
+      "learning_rate": 0.0004998629737609329,
+      "loss": 3.629,
+      "step": 11550
+    },
+    {
+      "epoch": 3.37708927843457,
+      "grad_norm": 0.3268803060054779,
+      "learning_rate": 0.0004994256559766764,
+      "loss": 3.6372,
+      "step": 11600
+    },
+    {
+      "epoch": 3.39164871003436,
+      "grad_norm": 0.32382065057754517,
+      "learning_rate": 0.0004989883381924198,
+      "loss": 3.6286,
+      "step": 11650
+    },
+    {
+      "epoch": 3.4062081416341505,
+      "grad_norm": 0.3158361613750458,
+      "learning_rate": 0.0004985510204081632,
+      "loss": 3.6329,
+      "step": 11700
+    },
+    {
+      "epoch": 3.420767573233941,
+      "grad_norm": 0.31245240569114685,
+      "learning_rate": 0.0004981137026239067,
+      "loss": 3.6428,
+      "step": 11750
+    },
+    {
+      "epoch": 3.435327004833731,
+      "grad_norm": 0.3362303078174591,
+      "learning_rate": 0.0004976763848396501,
+      "loss": 3.6369,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4498864364335216,
+      "grad_norm": 0.3208737373352051,
+      "learning_rate": 0.0004972390670553935,
+      "loss": 3.6428,
+      "step": 11850
+    },
+    {
+      "epoch": 3.464445868033312,
+      "grad_norm": 0.3163570761680603,
+      "learning_rate": 0.000496801749271137,
+      "loss": 3.6239,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4790052996331022,
+      "grad_norm": 0.3181529641151428,
+      "learning_rate": 0.0004963644314868804,
+      "loss": 3.6303,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4935647312328926,
+      "grad_norm": 0.33231833577156067,
+      "learning_rate": 0.0004959271137026239,
+      "loss": 3.6358,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4935647312328926,
+      "eval_accuracy": 0.35356460577150667,
+      "eval_loss": 3.6901774406433105,
+      "eval_runtime": 180.1137,
+      "eval_samples_per_second": 92.403,
+      "eval_steps_per_second": 5.78,
+      "step": 12000
+    },
+    {
+      "epoch": 3.508124162832683,
+      "grad_norm": 0.3368173837661743,
+      "learning_rate": 0.0004954897959183672,
+      "loss": 3.6389,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5226835944324733,
+      "grad_norm": 0.33402830362319946,
+      "learning_rate": 0.0004950524781341108,
+      "loss": 3.645,
+      "step": 12100
+    },
+    {
+      "epoch": 3.5372430260322636,
+      "grad_norm": 0.33064502477645874,
+      "learning_rate": 0.0004946151603498542,
+      "loss": 3.6336,
+      "step": 12150
+    },
+    {
+      "epoch": 3.551802457632054,
+      "grad_norm": 0.31694450974464417,
+      "learning_rate": 0.0004941778425655976,
+      "loss": 3.6325,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5663618892318443,
+      "grad_norm": 0.3069068193435669,
+      "learning_rate": 0.0004937405247813411,
+      "loss": 3.6246,
+      "step": 12250
+    },
+    {
+      "epoch": 3.5809213208316346,
+      "grad_norm": 0.3142222464084625,
+      "learning_rate": 0.0004933032069970845,
+      "loss": 3.6453,
+      "step": 12300
+    },
+    {
+      "epoch": 3.595480752431425,
+      "grad_norm": 0.3237994909286499,
+      "learning_rate": 0.0004928658892128279,
+      "loss": 3.6295,
+      "step": 12350
+    },
+    {
+      "epoch": 3.6100401840312153,
+      "grad_norm": 0.30255311727523804,
+      "learning_rate": 0.0004924285714285714,
+      "loss": 3.6468,
+      "step": 12400
+    },
+    {
+      "epoch": 3.6245996156310056,
+      "grad_norm": 0.3128635883331299,
+      "learning_rate": 0.0004919912536443149,
+      "loss": 3.6346,
+      "step": 12450
+    },
+    {
+      "epoch": 3.639159047230796,
+      "grad_norm": 0.31057000160217285,
+      "learning_rate": 0.0004915539358600582,
+      "loss": 3.622,
+      "step": 12500
+    },
+    {
+      "epoch": 3.6537184788305863,
+      "grad_norm": 0.3453236520290375,
+      "learning_rate": 0.0004911166180758017,
+      "loss": 3.6354,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6682779104303767,
+      "grad_norm": 0.3358878493309021,
+      "learning_rate": 0.0004906793002915452,
+      "loss": 3.6276,
+      "step": 12600
+    },
+    {
+      "epoch": 3.682837342030167,
+      "grad_norm": 0.3207370638847351,
+      "learning_rate": 0.0004902419825072886,
+      "loss": 3.6358,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6973967736299573,
+      "grad_norm": 0.31057843565940857,
+      "learning_rate": 0.000489804664723032,
+      "loss": 3.6433,
+      "step": 12700
+    },
+    {
+      "epoch": 3.7119562052297477,
+      "grad_norm": 0.32829779386520386,
+      "learning_rate": 0.0004893673469387754,
+      "loss": 3.6282,
+      "step": 12750
+    },
+    {
+      "epoch": 3.726515636829538,
+      "grad_norm": 0.32469305396080017,
+      "learning_rate": 0.0004889300291545189,
+      "loss": 3.6353,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7410750684293284,
+      "grad_norm": 0.32968953251838684,
+      "learning_rate": 0.0004884927113702624,
+      "loss": 3.6307,
+      "step": 12850
+    },
+    {
+      "epoch": 3.755634500029119,
+      "grad_norm": 0.3125181198120117,
+      "learning_rate": 0.0004880553935860058,
+      "loss": 3.6321,
+      "step": 12900
+    },
+    {
+      "epoch": 3.770193931628909,
+      "grad_norm": 0.31494152545928955,
+      "learning_rate": 0.0004876180758017492,
+      "loss": 3.6359,
+      "step": 12950
+    },
+    {
+      "epoch": 3.7847533632287,
+      "grad_norm": 0.32235443592071533,
+      "learning_rate": 0.0004871807580174927,
+      "loss": 3.6319,
+      "step": 13000
+    },
+    {
+      "epoch": 3.7847533632287,
+      "eval_accuracy": 0.35539115250113085,
+      "eval_loss": 3.6755480766296387,
+      "eval_runtime": 179.9398,
+      "eval_samples_per_second": 92.492,
+      "eval_steps_per_second": 5.785,
+      "step": 13000
+    },
+    {
+      "epoch": 3.7993127948284897,
+      "grad_norm": 0.3097991645336151,
+      "learning_rate": 0.00048674344023323613,
+      "loss": 3.6393,
+      "step": 13050
+    },
+    {
+      "epoch": 3.8138722264282805,
+      "grad_norm": 0.3186699450016022,
+      "learning_rate": 0.00048630612244897955,
+      "loss": 3.6318,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8284316580280704,
+      "grad_norm": 0.3037383556365967,
+      "learning_rate": 0.00048586880466472296,
+      "loss": 3.6293,
+      "step": 13150
+    },
+    {
+      "epoch": 3.842991089627861,
+      "grad_norm": 0.32788893580436707,
+      "learning_rate": 0.0004854314868804664,
+      "loss": 3.6152,
+      "step": 13200
+    },
+    {
+      "epoch": 3.857550521227651,
+      "grad_norm": 0.3229829967021942,
+      "learning_rate": 0.0004849941690962099,
+      "loss": 3.6483,
+      "step": 13250
+    },
+    {
+      "epoch": 3.872109952827442,
+      "grad_norm": 0.3292683959007263,
+      "learning_rate": 0.0004845568513119533,
+      "loss": 3.6381,
+      "step": 13300
+    },
+    {
+      "epoch": 3.8866693844272318,
+      "grad_norm": 0.3210625648498535,
+      "learning_rate": 0.00048411953352769677,
+      "loss": 3.6269,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9012288160270225,
+      "grad_norm": 0.31549862027168274,
+      "learning_rate": 0.0004836822157434402,
+      "loss": 3.6213,
+      "step": 13400
+    },
+    {
+      "epoch": 3.9157882476268124,
+      "grad_norm": 0.30793866515159607,
+      "learning_rate": 0.00048324489795918365,
+      "loss": 3.6309,
+      "step": 13450
+    },
+    {
+      "epoch": 3.930347679226603,
+      "grad_norm": 0.3356075882911682,
+      "learning_rate": 0.00048280758017492706,
+      "loss": 3.6262,
+      "step": 13500
+    },
+    {
+      "epoch": 3.944907110826393,
+      "grad_norm": 0.32913827896118164,
+      "learning_rate": 0.00048237026239067053,
+      "loss": 3.6213,
+      "step": 13550
+    },
+    {
+      "epoch": 3.959466542426184,
+      "grad_norm": 0.3327690362930298,
+      "learning_rate": 0.00048193294460641394,
+      "loss": 3.6438,
+      "step": 13600
+    },
+    {
+      "epoch": 3.974025974025974,
+      "grad_norm": 0.3101835250854492,
+      "learning_rate": 0.00048149562682215735,
+      "loss": 3.6296,
+      "step": 13650
+    },
+    {
+      "epoch": 3.9885854056257646,
+      "grad_norm": 0.327761709690094,
+      "learning_rate": 0.0004810583090379009,
+      "loss": 3.6235,
+      "step": 13700
+    },
+    {
+      "epoch": 4.002911886319958,
+      "grad_norm": 0.32982325553894043,
+      "learning_rate": 0.0004806209912536443,
+      "loss": 3.6143,
+      "step": 13750
+    },
+    {
+      "epoch": 4.017471317919749,
+      "grad_norm": 0.32604551315307617,
+      "learning_rate": 0.0004801836734693877,
+      "loss": 3.511,
+      "step": 13800
+    },
+    {
+      "epoch": 4.032030749519539,
+      "grad_norm": 0.3375633955001831,
+      "learning_rate": 0.0004797463556851311,
+      "loss": 3.5208,
+      "step": 13850
+    },
+    {
+      "epoch": 4.046590181119329,
+      "grad_norm": 0.3174295127391815,
+      "learning_rate": 0.00047930903790087463,
+      "loss": 3.5217,
+      "step": 13900
+    },
+    {
+      "epoch": 4.061149612719119,
+      "grad_norm": 0.3297431170940399,
+      "learning_rate": 0.00047887172011661805,
+      "loss": 3.5218,
+      "step": 13950
+    },
+    {
+      "epoch": 4.07570904431891,
+      "grad_norm": 0.33495378494262695,
+      "learning_rate": 0.00047843440233236146,
+      "loss": 3.5213,
+      "step": 14000
+    },
+    {
+      "epoch": 4.07570904431891,
+      "eval_accuracy": 0.35668316328168387,
+      "eval_loss": 3.6667861938476562,
+      "eval_runtime": 179.8727,
+      "eval_samples_per_second": 92.527,
+      "eval_steps_per_second": 5.787,
+      "step": 14000
+    },
+    {
+      "epoch": 4.0902684759187,
+      "grad_norm": 0.3054860234260559,
+      "learning_rate": 0.0004779970845481049,
+      "loss": 3.5277,
+      "step": 14050
+    },
+    {
+      "epoch": 4.104827907518491,
+      "grad_norm": 0.3263727128505707,
+      "learning_rate": 0.00047755976676384834,
+      "loss": 3.5327,
+      "step": 14100
+    },
+    {
+      "epoch": 4.119387339118281,
+      "grad_norm": 0.3170093894004822,
+      "learning_rate": 0.0004771224489795918,
+      "loss": 3.5418,
+      "step": 14150
+    },
+    {
+      "epoch": 4.1339467707180715,
+      "grad_norm": 0.33194735646247864,
+      "learning_rate": 0.00047668513119533527,
+      "loss": 3.5364,
+      "step": 14200
+    },
+    {
+      "epoch": 4.148506202317861,
+      "grad_norm": 0.32043886184692383,
+      "learning_rate": 0.0004762478134110787,
+      "loss": 3.544,
+      "step": 14250
+    },
+    {
+      "epoch": 4.163065633917652,
+      "grad_norm": 0.32483235001564026,
+      "learning_rate": 0.0004758104956268221,
+      "loss": 3.5385,
+      "step": 14300
+    },
+    {
+      "epoch": 4.177625065517442,
+      "grad_norm": 0.3203752934932709,
+      "learning_rate": 0.0004753731778425656,
+      "loss": 3.5636,
+      "step": 14350
+    },
+    {
+      "epoch": 4.192184497117233,
+      "grad_norm": 0.3080170452594757,
+      "learning_rate": 0.00047493586005830903,
+      "loss": 3.5453,
+      "step": 14400
+    },
+    {
+      "epoch": 4.206743928717023,
+      "grad_norm": 0.3298153877258301,
+      "learning_rate": 0.00047449854227405244,
+      "loss": 3.5491,
+      "step": 14450
+    },
+    {
+      "epoch": 4.2213033603168135,
+      "grad_norm": 0.3529611825942993,
+      "learning_rate": 0.00047406122448979585,
+      "loss": 3.544,
+      "step": 14500
+    },
+    {
+      "epoch": 4.235862791916603,
+      "grad_norm": 0.3273563086986542,
+      "learning_rate": 0.00047362390670553926,
+      "loss": 3.5498,
+      "step": 14550
+    },
+    {
+      "epoch": 4.250422223516394,
+      "grad_norm": 0.313999205827713,
+      "learning_rate": 0.0004731865889212828,
+      "loss": 3.5526,
+      "step": 14600
+    },
+    {
+      "epoch": 4.264981655116184,
+      "grad_norm": 0.30790430307388306,
+      "learning_rate": 0.0004727492711370262,
+      "loss": 3.5544,
+      "step": 14650
+    },
+    {
+      "epoch": 4.279541086715975,
+      "grad_norm": 0.33186236023902893,
+      "learning_rate": 0.0004723119533527696,
+      "loss": 3.5506,
+      "step": 14700
+    },
+    {
+      "epoch": 4.294100518315765,
+      "grad_norm": 0.32786890864372253,
+      "learning_rate": 0.0004718746355685131,
+      "loss": 3.5475,
+      "step": 14750
+    },
+    {
+      "epoch": 4.308659949915556,
+      "grad_norm": 0.3234544098377228,
+      "learning_rate": 0.0004714373177842565,
+      "loss": 3.5549,
+      "step": 14800
+    },
+    {
+      "epoch": 4.3232193815153455,
+      "grad_norm": 0.31056949496269226,
+      "learning_rate": 0.00047099999999999996,
+      "loss": 3.5447,
+      "step": 14850
+    },
+    {
+      "epoch": 4.337778813115136,
+      "grad_norm": 0.3284071087837219,
+      "learning_rate": 0.0004705626822157434,
+      "loss": 3.5585,
+      "step": 14900
+    },
+    {
+      "epoch": 4.352338244714926,
+      "grad_norm": 0.32166486978530884,
+      "learning_rate": 0.00047012536443148683,
+      "loss": 3.5546,
+      "step": 14950
+    },
+    {
+      "epoch": 4.366897676314717,
+      "grad_norm": 0.3296414613723755,
+      "learning_rate": 0.00046968804664723025,
+      "loss": 3.5562,
+      "step": 15000
+    },
+    {
+      "epoch": 4.366897676314717,
+      "eval_accuracy": 0.3575879706129867,
+      "eval_loss": 3.6574151515960693,
+      "eval_runtime": 179.9318,
+      "eval_samples_per_second": 92.496,
+      "eval_steps_per_second": 5.786,
+      "step": 15000
+    },
+    {
+      "epoch": 4.381457107914507,
+      "grad_norm": 0.31862205266952515,
+      "learning_rate": 0.00046925072886297377,
+      "loss": 3.5609,
+      "step": 15050
+    },
+    {
+      "epoch": 4.396016539514298,
+      "grad_norm": 0.321135938167572,
+      "learning_rate": 0.0004688134110787172,
+      "loss": 3.5592,
+      "step": 15100
+    },
+    {
+      "epoch": 4.4105759711140875,
+      "grad_norm": 0.34049704670906067,
+      "learning_rate": 0.0004683760932944606,
+      "loss": 3.5666,
+      "step": 15150
+    },
+    {
+      "epoch": 4.425135402713878,
+      "grad_norm": 0.32759514451026917,
+      "learning_rate": 0.000467938775510204,
+      "loss": 3.5645,
+      "step": 15200
+    },
+    {
+      "epoch": 4.439694834313668,
+      "grad_norm": 0.31559038162231445,
+      "learning_rate": 0.00046750145772594747,
+      "loss": 3.5424,
+      "step": 15250
+    },
+    {
+      "epoch": 4.454254265913459,
+      "grad_norm": 0.31429657340049744,
+      "learning_rate": 0.00046706413994169094,
+      "loss": 3.5577,
+      "step": 15300
+    },
+    {
+      "epoch": 4.468813697513249,
+      "grad_norm": 0.32119688391685486,
+      "learning_rate": 0.00046662682215743435,
+      "loss": 3.5645,
+      "step": 15350
+    },
+    {
+      "epoch": 4.48337312911304,
+      "grad_norm": 0.32725510001182556,
+      "learning_rate": 0.0004661895043731778,
+      "loss": 3.558,
+      "step": 15400
+    },
+    {
+      "epoch": 4.4979325607128295,
+      "grad_norm": 0.3302425742149353,
+      "learning_rate": 0.00046575218658892123,
+      "loss": 3.5645,
+      "step": 15450
+    },
+    {
+      "epoch": 4.51249199231262,
+      "grad_norm": 0.33752188086509705,
+      "learning_rate": 0.0004653148688046647,
+      "loss": 3.5654,
+      "step": 15500
+    },
+    {
+      "epoch": 4.52705142391241,
+      "grad_norm": 0.3348866105079651,
+      "learning_rate": 0.0004648775510204081,
+      "loss": 3.5587,
+      "step": 15550
+    },
+    {
+      "epoch": 4.541610855512201,
+      "grad_norm": 0.33069008588790894,
+      "learning_rate": 0.0004644402332361516,
+      "loss": 3.5564,
+      "step": 15600
+    },
+    {
+      "epoch": 4.556170287111991,
+      "grad_norm": 0.36258620023727417,
+      "learning_rate": 0.000464002915451895,
+      "loss": 3.5586,
+      "step": 15650
+    },
+    {
+      "epoch": 4.570729718711782,
+      "grad_norm": 0.3146510422229767,
+      "learning_rate": 0.0004635655976676384,
+      "loss": 3.5612,
+      "step": 15700
+    },
+    {
+      "epoch": 4.585289150311572,
+      "grad_norm": 0.3268812298774719,
+      "learning_rate": 0.0004631282798833819,
+      "loss": 3.5536,
+      "step": 15750
+    },
+    {
+      "epoch": 4.599848581911362,
+      "grad_norm": 0.31493905186653137,
+      "learning_rate": 0.00046269096209912533,
+      "loss": 3.5717,
+      "step": 15800
+    },
+    {
+      "epoch": 4.614408013511152,
+      "grad_norm": 0.3173486590385437,
+      "learning_rate": 0.00046225364431486875,
+      "loss": 3.5678,
+      "step": 15850
+    },
+    {
+      "epoch": 4.628967445110943,
+      "grad_norm": 0.32398083806037903,
+      "learning_rate": 0.00046181632653061216,
+      "loss": 3.557,
+      "step": 15900
+    },
+    {
+      "epoch": 4.643526876710733,
+      "grad_norm": 0.31683549284935,
+      "learning_rate": 0.0004613790087463557,
+      "loss": 3.5652,
+      "step": 15950
+    },
+    {
+      "epoch": 4.658086308310524,
+      "grad_norm": 0.3226284682750702,
+      "learning_rate": 0.0004609416909620991,
+      "loss": 3.5583,
+      "step": 16000
+    },
+    {
+      "epoch": 4.658086308310524,
+      "eval_accuracy": 0.35886363724551484,
+      "eval_loss": 3.641108989715576,
+      "eval_runtime": 179.9045,
+      "eval_samples_per_second": 92.51,
+      "eval_steps_per_second": 5.786,
+      "step": 16000
+    },
+    {
+      "epoch": 4.672645739910314,
+      "grad_norm": 0.3244362771511078,
+      "learning_rate": 0.0004605043731778425,
+      "loss": 3.5653,
+      "step": 16050
+    },
+    {
+      "epoch": 4.687205171510104,
+      "grad_norm": 0.3218280076980591,
+      "learning_rate": 0.00046006705539358597,
+      "loss": 3.5573,
+      "step": 16100
+    },
+    {
+      "epoch": 4.701764603109894,
+      "grad_norm": 0.31557270884513855,
+      "learning_rate": 0.0004596297376093294,
+      "loss": 3.5697,
+      "step": 16150
+    },
+    {
+      "epoch": 4.716324034709685,
+      "grad_norm": 0.32409724593162537,
+      "learning_rate": 0.00045919241982507285,
+      "loss": 3.5727,
+      "step": 16200
+    },
+    {
+      "epoch": 4.730883466309475,
+      "grad_norm": 0.32196715474128723,
+      "learning_rate": 0.0004587551020408163,
+      "loss": 3.5696,
+      "step": 16250
+    },
+    {
+      "epoch": 4.745442897909266,
+      "grad_norm": 0.3190127909183502,
+      "learning_rate": 0.00045831778425655973,
+      "loss": 3.5589,
+      "step": 16300
+    },
+    {
+      "epoch": 4.760002329509056,
+      "grad_norm": 0.3492906391620636,
+      "learning_rate": 0.00045788046647230314,
+      "loss": 3.576,
+      "step": 16350
+    },
+    {
+      "epoch": 4.7745617611088464,
+      "grad_norm": 0.3227944076061249,
+      "learning_rate": 0.00045744314868804666,
+      "loss": 3.5511,
+      "step": 16400
+    },
+    {
+      "epoch": 4.789121192708636,
+      "grad_norm": 0.3169122040271759,
+      "learning_rate": 0.0004570058309037901,
+      "loss": 3.5684,
+      "step": 16450
+    },
+    {
+      "epoch": 4.803680624308427,
+      "grad_norm": 0.31213343143463135,
+      "learning_rate": 0.0004565685131195335,
+      "loss": 3.5667,
+      "step": 16500
+    },
+    {
+      "epoch": 4.818240055908217,
+      "grad_norm": 0.32593971490859985,
+      "learning_rate": 0.0004561311953352769,
+      "loss": 3.5661,
+      "step": 16550
+    },
+    {
+      "epoch": 4.832799487508008,
+      "grad_norm": 0.33425310254096985,
+      "learning_rate": 0.0004556938775510203,
+      "loss": 3.5688,
+      "step": 16600
+    },
+    {
+      "epoch": 4.847358919107798,
+      "grad_norm": 0.32003140449523926,
+      "learning_rate": 0.00045525655976676383,
+      "loss": 3.5601,
+      "step": 16650
+    },
+    {
+      "epoch": 4.8619183507075885,
+      "grad_norm": 0.3596481382846832,
+      "learning_rate": 0.00045481924198250724,
+      "loss": 3.5688,
+      "step": 16700
+    },
+    {
+      "epoch": 4.876477782307378,
+      "grad_norm": 0.3375333547592163,
+      "learning_rate": 0.00045438192419825066,
+      "loss": 3.5685,
+      "step": 16750
+    },
+    {
+      "epoch": 4.891037213907169,
+      "grad_norm": 0.31676721572875977,
+      "learning_rate": 0.0004539446064139941,
+      "loss": 3.5581,
+      "step": 16800
+    },
+    {
+      "epoch": 4.905596645506959,
+      "grad_norm": 0.3257509469985962,
+      "learning_rate": 0.0004535072886297376,
+      "loss": 3.5537,
+      "step": 16850
+    },
+    {
+      "epoch": 4.92015607710675,
+      "grad_norm": 0.3176610767841339,
+      "learning_rate": 0.000453069970845481,
+      "loss": 3.5678,
+      "step": 16900
+    },
+    {
+      "epoch": 4.93471550870654,
+      "grad_norm": 0.3168198764324188,
+      "learning_rate": 0.00045263265306122447,
+      "loss": 3.5499,
+      "step": 16950
+    },
+    {
+      "epoch": 4.9492749403063305,
+      "grad_norm": 0.31883013248443604,
+      "learning_rate": 0.0004521953352769679,
+      "loss": 3.5668,
+      "step": 17000
+    },
+    {
+      "epoch": 4.9492749403063305,
+      "eval_accuracy": 0.360385171601208,
+      "eval_loss": 3.626793146133423,
+      "eval_runtime": 180.0874,
+      "eval_samples_per_second": 92.416,
+      "eval_steps_per_second": 5.781,
+      "step": 17000
+    },
+    {
+      "epoch": 4.96383437190612,
+      "grad_norm": 0.3429825007915497,
+      "learning_rate": 0.0004517580174927113,
+      "loss": 3.5693,
+      "step": 17050
+    },
+    {
+      "epoch": 4.978393803505911,
+      "grad_norm": 0.31468144059181213,
+      "learning_rate": 0.0004513206997084548,
+      "loss": 3.5648,
+      "step": 17100
+    },
+    {
+      "epoch": 4.992953235105701,
+      "grad_norm": 0.3186092972755432,
+      "learning_rate": 0.0004508833819241982,
+      "loss": 3.5611,
+      "step": 17150
+    },
+    {
+      "epoch": 5.007279715799895,
+      "grad_norm": 0.32911449670791626,
+      "learning_rate": 0.00045044606413994164,
+      "loss": 3.5003,
+      "step": 17200
+    },
+    {
+      "epoch": 5.021839147399685,
+      "grad_norm": 0.32932335138320923,
+      "learning_rate": 0.00045000874635568505,
+      "loss": 3.4462,
+      "step": 17250
+    },
+    {
+      "epoch": 5.036398578999476,
+      "grad_norm": 0.3199908435344696,
+      "learning_rate": 0.00044957142857142857,
+      "loss": 3.4569,
+      "step": 17300
+    },
+    {
+      "epoch": 5.050958010599266,
+      "grad_norm": 0.33716824650764465,
+      "learning_rate": 0.000449134110787172,
+      "loss": 3.4669,
+      "step": 17350
+    },
+    {
+      "epoch": 5.065517442199057,
+      "grad_norm": 0.32985949516296387,
+      "learning_rate": 0.0004486967930029154,
+      "loss": 3.4787,
+      "step": 17400
+    },
+    {
+      "epoch": 5.080076873798847,
+      "grad_norm": 0.3227981925010681,
+      "learning_rate": 0.00044825947521865886,
+      "loss": 3.46,
+      "step": 17450
+    },
+    {
+      "epoch": 5.094636305398637,
+      "grad_norm": 0.32830196619033813,
+      "learning_rate": 0.0004478221574344023,
+      "loss": 3.4714,
+      "step": 17500
+    },
+    {
+      "epoch": 5.109195736998427,
+      "grad_norm": 0.33184128999710083,
+      "learning_rate": 0.00044738483965014574,
+      "loss": 3.4636,
+      "step": 17550
+    },
+    {
+      "epoch": 5.123755168598218,
+      "grad_norm": 0.3285403251647949,
+      "learning_rate": 0.00044694752186588915,
+      "loss": 3.4711,
+      "step": 17600
+    },
+    {
+      "epoch": 5.138314600198008,
+      "grad_norm": 0.3541177809238434,
+      "learning_rate": 0.0004465102040816326,
+      "loss": 3.4806,
+      "step": 17650
+    },
+    {
+      "epoch": 5.152874031797799,
+      "grad_norm": 0.3223034143447876,
+      "learning_rate": 0.00044607288629737603,
+      "loss": 3.4813,
+      "step": 17700
+    },
+    {
+      "epoch": 5.167433463397589,
+      "grad_norm": 0.3218257427215576,
+      "learning_rate": 0.0004456355685131195,
+      "loss": 3.4826,
+      "step": 17750
+    },
+    {
+      "epoch": 5.1819928949973795,
+      "grad_norm": 0.3309643566608429,
+      "learning_rate": 0.00044519825072886297,
+      "loss": 3.4888,
+      "step": 17800
+    },
+    {
+      "epoch": 5.196552326597169,
+      "grad_norm": 0.3264036774635315,
+      "learning_rate": 0.0004447609329446064,
+      "loss": 3.483,
+      "step": 17850
+    },
+    {
+      "epoch": 5.21111175819696,
+      "grad_norm": 0.324790358543396,
+      "learning_rate": 0.0004443236151603498,
+      "loss": 3.4896,
+      "step": 17900
+    },
+    {
+      "epoch": 5.22567118979675,
+      "grad_norm": 0.3223564922809601,
+      "learning_rate": 0.0004438862973760932,
+      "loss": 3.4711,
+      "step": 17950
+    },
+    {
+      "epoch": 5.240230621396541,
+      "grad_norm": 0.33800962567329407,
+      "learning_rate": 0.0004434489795918367,
+      "loss": 3.4813,
+      "step": 18000
+    },
+    {
+      "epoch": 5.240230621396541,
+      "eval_accuracy": 0.36069406424049744,
+      "eval_loss": 3.6294045448303223,
+      "eval_runtime": 179.9519,
+      "eval_samples_per_second": 92.486,
+      "eval_steps_per_second": 5.785,
+      "step": 18000
+    },
+    {
+      "epoch": 5.254790052996331,
+      "grad_norm": 0.3145381510257721,
+      "learning_rate": 0.00044301166180758014,
+      "loss": 3.4916,
+      "step": 18050
+    },
+    {
+      "epoch": 5.2693494845961215,
+      "grad_norm": 0.33484500646591187,
+      "learning_rate": 0.00044257434402332355,
+      "loss": 3.5041,
+      "step": 18100
+    },
+    {
+      "epoch": 5.283908916195911,
+      "grad_norm": 0.3385532796382904,
+      "learning_rate": 0.000442137026239067,
+      "loss": 3.4968,
+      "step": 18150
+    },
+    {
+      "epoch": 5.298468347795702,
+      "grad_norm": 0.32390275597572327,
+      "learning_rate": 0.0004416997084548105,
+      "loss": 3.4945,
+      "step": 18200
+    },
+    {
+      "epoch": 5.313027779395492,
+      "grad_norm": 0.3384283483028412,
+      "learning_rate": 0.0004412623906705539,
+      "loss": 3.4902,
+      "step": 18250
+    },
+    {
+      "epoch": 5.327587210995283,
+      "grad_norm": 0.3334660530090332,
+      "learning_rate": 0.00044082507288629736,
+      "loss": 3.4961,
+      "step": 18300
+    },
+    {
+      "epoch": 5.342146642595073,
+      "grad_norm": 0.32754096388816833,
+      "learning_rate": 0.0004403877551020408,
+      "loss": 3.5051,
+      "step": 18350
+    },
+    {
+      "epoch": 5.3567060741948636,
+      "grad_norm": 0.3279802203178406,
+      "learning_rate": 0.0004399504373177842,
+      "loss": 3.5005,
+      "step": 18400
+    },
+    {
+      "epoch": 5.3712655057946534,
+      "grad_norm": 0.3342030346393585,
+      "learning_rate": 0.0004395131195335277,
+      "loss": 3.4931,
+      "step": 18450
+    },
+    {
+      "epoch": 5.385824937394444,
+      "grad_norm": 0.3162689805030823,
+      "learning_rate": 0.0004390758017492711,
+      "loss": 3.4952,
+      "step": 18500
+    },
+    {
+      "epoch": 5.400384368994234,
+      "grad_norm": 0.32114139199256897,
+      "learning_rate": 0.00043863848396501453,
+      "loss": 3.4999,
+      "step": 18550
+    },
+    {
+      "epoch": 5.414943800594025,
+      "grad_norm": 0.32540494203567505,
+      "learning_rate": 0.00043820116618075794,
+      "loss": 3.5038,
+      "step": 18600
+    },
+    {
+      "epoch": 5.429503232193815,
+      "grad_norm": 0.33477944135665894,
+      "learning_rate": 0.00043776384839650147,
+      "loss": 3.4941,
+      "step": 18650
+    },
+    {
+      "epoch": 5.444062663793606,
+      "grad_norm": 0.3286316990852356,
+      "learning_rate": 0.0004373265306122449,
+      "loss": 3.502,
+      "step": 18700
+    },
+    {
+      "epoch": 5.4586220953933955,
+      "grad_norm": 0.3312956690788269,
+      "learning_rate": 0.0004368892128279883,
+      "loss": 3.5054,
+      "step": 18750
+    },
+    {
+      "epoch": 5.473181526993186,
+      "grad_norm": 0.33420413732528687,
+      "learning_rate": 0.0004364518950437317,
+      "loss": 3.5176,
+      "step": 18800
+    },
+    {
+      "epoch": 5.487740958592976,
+      "grad_norm": 0.312959223985672,
+      "learning_rate": 0.00043601457725947517,
+      "loss": 3.5112,
+      "step": 18850
+    },
+    {
+      "epoch": 5.502300390192767,
+      "grad_norm": 0.31764182448387146,
+      "learning_rate": 0.00043557725947521864,
+      "loss": 3.4987,
+      "step": 18900
+    },
+    {
+      "epoch": 5.516859821792557,
+      "grad_norm": 0.3139015734195709,
+      "learning_rate": 0.00043513994169096205,
+      "loss": 3.4989,
+      "step": 18950
+    },
+    {
+      "epoch": 5.531419253392348,
+      "grad_norm": 0.32444003224372864,
+      "learning_rate": 0.0004347026239067055,
+      "loss": 3.4996,
+      "step": 19000
+    },
+    {
+      "epoch": 5.531419253392348,
+      "eval_accuracy": 0.3617237455660619,
+      "eval_loss": 3.6197915077209473,
+      "eval_runtime": 180.0468,
+      "eval_samples_per_second": 92.437,
+      "eval_steps_per_second": 5.782,
+      "step": 19000
+    },
+    {
+      "epoch": 5.5459786849921375,
+      "grad_norm": 0.32666343450546265,
+      "learning_rate": 0.0004342653061224489,
+      "loss": 3.5084,
+      "step": 19050
+    },
+    {
+      "epoch": 5.560538116591928,
+      "grad_norm": 0.32281461358070374,
+      "learning_rate": 0.0004338279883381924,
+      "loss": 3.5143,
+      "step": 19100
+    },
+    {
+      "epoch": 5.575097548191718,
+      "grad_norm": 0.3272330164909363,
+      "learning_rate": 0.00043339067055393586,
+      "loss": 3.5108,
+      "step": 19150
+    },
+    {
+      "epoch": 5.589656979791509,
+      "grad_norm": 0.31538012623786926,
+      "learning_rate": 0.00043295335276967927,
+      "loss": 3.504,
+      "step": 19200
+    },
+    {
+      "epoch": 5.604216411391299,
+      "grad_norm": 0.34619444608688354,
+      "learning_rate": 0.0004325160349854227,
+      "loss": 3.4962,
+      "step": 19250
+    },
+    {
+      "epoch": 5.61877584299109,
+      "grad_norm": 0.33601802587509155,
+      "learning_rate": 0.0004320787172011661,
+      "loss": 3.5107,
+      "step": 19300
+    },
+    {
+      "epoch": 5.6333352745908805,
+      "grad_norm": 0.32709893584251404,
+      "learning_rate": 0.0004316413994169096,
+      "loss": 3.5085,
+      "step": 19350
+    },
+    {
+      "epoch": 5.64789470619067,
+      "grad_norm": 0.332736611366272,
+      "learning_rate": 0.00043120408163265303,
+      "loss": 3.5027,
+      "step": 19400
+    },
+    {
+      "epoch": 5.66245413779046,
+      "grad_norm": 0.32013025879859924,
+      "learning_rate": 0.00043076676384839644,
+      "loss": 3.507,
+      "step": 19450
+    },
+    {
+      "epoch": 5.677013569390251,
+      "grad_norm": 0.34380871057510376,
+      "learning_rate": 0.0004303294460641399,
+      "loss": 3.5091,
+      "step": 19500
+    },
+    {
+      "epoch": 5.691573000990042,
+      "grad_norm": 0.3146701455116272,
+      "learning_rate": 0.0004298921282798834,
+      "loss": 3.5085,
+      "step": 19550
+    },
+    {
+      "epoch": 5.706132432589832,
+      "grad_norm": 0.3258221447467804,
+      "learning_rate": 0.0004294548104956268,
+      "loss": 3.513,
+      "step": 19600
+    },
+    {
+      "epoch": 5.720691864189622,
+      "grad_norm": 0.3335384726524353,
+      "learning_rate": 0.0004290174927113702,
+      "loss": 3.5065,
+      "step": 19650
+    },
+    {
+      "epoch": 5.735251295789412,
+      "grad_norm": 0.333322674036026,
+      "learning_rate": 0.00042858017492711367,
+      "loss": 3.5061,
+      "step": 19700
+    },
+    {
+      "epoch": 5.749810727389203,
+      "grad_norm": 0.3227587342262268,
+      "learning_rate": 0.0004281428571428571,
+      "loss": 3.5212,
+      "step": 19750
+    },
+    {
+      "epoch": 5.764370158988993,
+      "grad_norm": 0.3334672152996063,
+      "learning_rate": 0.00042770553935860055,
+      "loss": 3.4976,
+      "step": 19800
+    },
+    {
+      "epoch": 5.778929590588783,
+      "grad_norm": 0.3109551966190338,
+      "learning_rate": 0.000427268221574344,
+      "loss": 3.5159,
+      "step": 19850
+    },
+    {
+      "epoch": 5.793489022188574,
+      "grad_norm": 0.3229271471500397,
+      "learning_rate": 0.0004268309037900874,
+      "loss": 3.511,
+      "step": 19900
+    },
+    {
+      "epoch": 5.8080484537883645,
+      "grad_norm": 0.31595003604888916,
+      "learning_rate": 0.00042639358600583084,
+      "loss": 3.5083,
+      "step": 19950
+    },
+    {
+      "epoch": 5.822607885388154,
+      "grad_norm": 0.3313562273979187,
+      "learning_rate": 0.00042595626822157436,
+      "loss": 3.5032,
+      "step": 20000
+    },
+    {
+      "epoch": 5.822607885388154,
+      "eval_accuracy": 0.3630212827851557,
+      "eval_loss": 3.6066231727600098,
+      "eval_runtime": 180.0689,
+      "eval_samples_per_second": 92.426,
+      "eval_steps_per_second": 5.781,
+      "step": 20000
+    },
+    {
+      "epoch": 5.837167316987944,
+      "grad_norm": 0.32517609000205994,
+      "learning_rate": 0.00042551895043731777,
+      "loss": 3.5075,
+      "step": 20050
+    },
+    {
+      "epoch": 5.851726748587735,
+      "grad_norm": 0.3312103748321533,
+      "learning_rate": 0.0004250816326530612,
+      "loss": 3.5038,
+      "step": 20100
+    },
+    {
+      "epoch": 5.866286180187526,
+      "grad_norm": 0.3302570879459381,
+      "learning_rate": 0.0004246443148688046,
+      "loss": 3.5185,
+      "step": 20150
+    },
+    {
+      "epoch": 5.880845611787316,
+      "grad_norm": 0.3184104561805725,
+      "learning_rate": 0.00042420699708454806,
+      "loss": 3.5069,
+      "step": 20200
+    },
+    {
+      "epoch": 5.895405043387106,
+      "grad_norm": 0.31885817646980286,
+      "learning_rate": 0.00042376967930029153,
+      "loss": 3.5122,
+      "step": 20250
+    },
+    {
+      "epoch": 5.9099644749868965,
+      "grad_norm": 0.3231607973575592,
+      "learning_rate": 0.00042333236151603494,
+      "loss": 3.522,
+      "step": 20300
+    },
+    {
+      "epoch": 5.924523906586687,
+      "grad_norm": 0.3280011713504791,
+      "learning_rate": 0.0004228950437317784,
+      "loss": 3.5191,
+      "step": 20350
+    },
+    {
+      "epoch": 5.939083338186477,
+      "grad_norm": 0.32695943117141724,
+      "learning_rate": 0.0004224577259475218,
+      "loss": 3.5189,
+      "step": 20400
+    },
+    {
+      "epoch": 5.953642769786267,
+      "grad_norm": 0.31571418046951294,
+      "learning_rate": 0.0004220204081632653,
+      "loss": 3.506,
+      "step": 20450
+    },
+    {
+      "epoch": 5.968202201386058,
+      "grad_norm": 0.3223441243171692,
+      "learning_rate": 0.0004215830903790087,
+      "loss": 3.5298,
+      "step": 20500
+    },
+    {
+      "epoch": 5.982761632985849,
+      "grad_norm": 0.3090570569038391,
+      "learning_rate": 0.00042114577259475217,
+      "loss": 3.5086,
+      "step": 20550
+    },
+    {
+      "epoch": 5.9973210645856385,
+      "grad_norm": 0.32136136293411255,
+      "learning_rate": 0.0004207084548104956,
+      "loss": 3.5194,
+      "step": 20600
+    },
+    {
+      "epoch": 6.011647545279832,
+      "grad_norm": 0.33823925256729126,
+      "learning_rate": 0.000420271137026239,
+      "loss": 3.4225,
+      "step": 20650
+    },
+    {
+      "epoch": 6.026206976879623,
+      "grad_norm": 0.31168079376220703,
+      "learning_rate": 0.0004198338192419825,
+      "loss": 3.4097,
+      "step": 20700
+    },
+    {
+      "epoch": 6.040766408479413,
+      "grad_norm": 0.33379727602005005,
+      "learning_rate": 0.0004193965014577259,
+      "loss": 3.3974,
+      "step": 20750
+    },
+    {
+      "epoch": 6.055325840079203,
+      "grad_norm": 0.3196876645088196,
+      "learning_rate": 0.00041895918367346934,
+      "loss": 3.4086,
+      "step": 20800
+    },
+    {
+      "epoch": 6.069885271678993,
+      "grad_norm": 0.3263348937034607,
+      "learning_rate": 0.00041852186588921275,
+      "loss": 3.4206,
+      "step": 20850
+    },
+    {
+      "epoch": 6.084444703278784,
+      "grad_norm": 0.3413217067718506,
+      "learning_rate": 0.00041808454810495627,
+      "loss": 3.4143,
+      "step": 20900
+    },
+    {
+      "epoch": 6.099004134878574,
+      "grad_norm": 0.3205811381340027,
+      "learning_rate": 0.0004176472303206997,
+      "loss": 3.4236,
+      "step": 20950
+    },
+    {
+      "epoch": 6.113563566478365,
+      "grad_norm": 0.3402191996574402,
+      "learning_rate": 0.0004172099125364431,
+      "loss": 3.4196,
+      "step": 21000
+    },
+    {
+      "epoch": 6.113563566478365,
+      "eval_accuracy": 0.36323646113684954,
+      "eval_loss": 3.610100030899048,
+      "eval_runtime": 179.9878,
+      "eval_samples_per_second": 92.467,
+      "eval_steps_per_second": 5.784,
+      "step": 21000
+    },
+    {
+      "epoch": 6.128122998078155,
+      "grad_norm": 0.35015061497688293,
+      "learning_rate": 0.00041677259475218656,
+      "loss": 3.4323,
+      "step": 21050
+    },
+    {
+      "epoch": 6.142682429677945,
+      "grad_norm": 0.3365619480609894,
+      "learning_rate": 0.00041633527696792997,
+      "loss": 3.4316,
+      "step": 21100
+    },
+    {
+      "epoch": 6.157241861277735,
+      "grad_norm": 0.32558462023735046,
+      "learning_rate": 0.00041589795918367344,
+      "loss": 3.4261,
+      "step": 21150
+    },
+    {
+      "epoch": 6.171801292877526,
+      "grad_norm": 0.3229493498802185,
+      "learning_rate": 0.0004154606413994169,
+      "loss": 3.4273,
+      "step": 21200
+    },
+    {
+      "epoch": 6.186360724477316,
+      "grad_norm": 0.3373366594314575,
+      "learning_rate": 0.0004150233236151603,
+      "loss": 3.4241,
+      "step": 21250
+    },
+    {
+      "epoch": 6.200920156077107,
+      "grad_norm": 0.33470067381858826,
+      "learning_rate": 0.00041458600583090373,
+      "loss": 3.436,
+      "step": 21300
+    },
+    {
+      "epoch": 6.215479587676897,
+      "grad_norm": 0.33129194378852844,
+      "learning_rate": 0.00041414868804664725,
+      "loss": 3.4464,
+      "step": 21350
+    },
+    {
+      "epoch": 6.2300390192766875,
+      "grad_norm": 0.3305993676185608,
+      "learning_rate": 0.00041371137026239066,
+      "loss": 3.44,
+      "step": 21400
+    },
+    {
+      "epoch": 6.244598450876477,
+      "grad_norm": 0.3288079500198364,
+      "learning_rate": 0.0004132740524781341,
+      "loss": 3.4417,
+      "step": 21450
+    },
+    {
+      "epoch": 6.259157882476268,
+      "grad_norm": 0.33732712268829346,
+      "learning_rate": 0.0004128367346938775,
+      "loss": 3.445,
+      "step": 21500
+    },
+    {
+      "epoch": 6.273717314076059,
+      "grad_norm": 0.3398957848548889,
+      "learning_rate": 0.0004123994169096209,
+      "loss": 3.4473,
+      "step": 21550
+    },
+    {
+      "epoch": 6.288276745675849,
+      "grad_norm": 0.3353675305843353,
+      "learning_rate": 0.0004119620991253644,
+      "loss": 3.4428,
+      "step": 21600
+    },
+    {
+      "epoch": 6.302836177275639,
+      "grad_norm": 0.3312719464302063,
+      "learning_rate": 0.00041152478134110783,
+      "loss": 3.4346,
+      "step": 21650
+    },
+    {
+      "epoch": 6.3173956088754295,
+      "grad_norm": 0.32870662212371826,
+      "learning_rate": 0.00041108746355685125,
+      "loss": 3.4397,
+      "step": 21700
+    },
+    {
+      "epoch": 6.33195504047522,
+      "grad_norm": 0.3326077461242676,
+      "learning_rate": 0.0004106501457725947,
+      "loss": 3.4494,
+      "step": 21750
+    },
+    {
+      "epoch": 6.34651447207501,
+      "grad_norm": 0.32431626319885254,
+      "learning_rate": 0.0004102128279883382,
+      "loss": 3.435,
+      "step": 21800
+    },
+    {
+      "epoch": 6.3610739036748,
+      "grad_norm": 0.32606053352355957,
+      "learning_rate": 0.0004097755102040816,
+      "loss": 3.4515,
+      "step": 21850
+    },
+    {
+      "epoch": 6.375633335274591,
+      "grad_norm": 0.33837705850601196,
+      "learning_rate": 0.00040933819241982506,
+      "loss": 3.4578,
+      "step": 21900
+    },
+    {
+      "epoch": 6.390192766874382,
+      "grad_norm": 0.35296231508255005,
+      "learning_rate": 0.00040890087463556847,
+      "loss": 3.4563,
+      "step": 21950
+    },
+    {
+      "epoch": 6.4047521984741715,
+      "grad_norm": 0.3277094066143036,
+      "learning_rate": 0.0004084635568513119,
+      "loss": 3.4499,
+      "step": 22000
+    },
+    {
+      "epoch": 6.4047521984741715,
+      "eval_accuracy": 0.3639197405913266,
+      "eval_loss": 3.6030333042144775,
+      "eval_runtime": 180.1326,
+      "eval_samples_per_second": 92.393,
+      "eval_steps_per_second": 5.779,
+      "step": 22000
+    },
+    {
+      "epoch": 6.419311630073962,
+      "grad_norm": 0.3193458020687103,
+      "learning_rate": 0.0004080262390670554,
+      "loss": 3.4559,
+      "step": 22050
+    },
+    {
+      "epoch": 6.433871061673752,
+      "grad_norm": 0.3288237452507019,
+      "learning_rate": 0.0004075889212827988,
+      "loss": 3.4597,
+      "step": 22100
+    },
+    {
+      "epoch": 6.448430493273543,
+      "grad_norm": 0.3396027088165283,
+      "learning_rate": 0.00040715160349854223,
+      "loss": 3.4572,
+      "step": 22150
+    },
+    {
+      "epoch": 6.462989924873333,
+      "grad_norm": 0.3147648572921753,
+      "learning_rate": 0.00040671428571428564,
+      "loss": 3.4541,
+      "step": 22200
+    },
+    {
+      "epoch": 6.477549356473124,
+      "grad_norm": 0.31667134165763855,
+      "learning_rate": 0.00040627696793002916,
+      "loss": 3.4621,
+      "step": 22250
+    },
+    {
+      "epoch": 6.492108788072914,
+      "grad_norm": 0.34657663106918335,
+      "learning_rate": 0.0004058396501457726,
+      "loss": 3.4604,
+      "step": 22300
+    },
+    {
+      "epoch": 6.506668219672704,
+      "grad_norm": 0.32261285185813904,
+      "learning_rate": 0.000405402332361516,
+      "loss": 3.4612,
+      "step": 22350
+    },
+    {
+      "epoch": 6.521227651272494,
+      "grad_norm": 0.3422505855560303,
+      "learning_rate": 0.00040496501457725945,
+      "loss": 3.4559,
+      "step": 22400
+    },
+    {
+      "epoch": 6.535787082872285,
+      "grad_norm": 0.3316167891025543,
+      "learning_rate": 0.00040452769679300287,
+      "loss": 3.4496,
+      "step": 22450
+    },
+    {
+      "epoch": 6.550346514472075,
+      "grad_norm": 0.3352113962173462,
+      "learning_rate": 0.00040409037900874633,
+      "loss": 3.4656,
+      "step": 22500
+    },
+    {
+      "epoch": 6.564905946071866,
+      "grad_norm": 0.3302208185195923,
+      "learning_rate": 0.00040365306122448974,
+      "loss": 3.444,
+      "step": 22550
+    },
+    {
+      "epoch": 6.579465377671656,
+      "grad_norm": 0.3382601737976074,
+      "learning_rate": 0.0004032157434402332,
+      "loss": 3.4749,
+      "step": 22600
+    },
+    {
+      "epoch": 6.594024809271446,
+      "grad_norm": 0.32733502984046936,
+      "learning_rate": 0.0004027784256559766,
+      "loss": 3.4676,
+      "step": 22650
+    },
+    {
+      "epoch": 6.608584240871236,
+      "grad_norm": 0.3271522521972656,
+      "learning_rate": 0.0004023411078717201,
+      "loss": 3.4625,
+      "step": 22700
+    },
+    {
+      "epoch": 6.623143672471027,
+      "grad_norm": 0.35525378584861755,
+      "learning_rate": 0.00040190379008746356,
+      "loss": 3.47,
+      "step": 22750
+    },
+    {
+      "epoch": 6.637703104070817,
+      "grad_norm": 0.34130969643592834,
+      "learning_rate": 0.00040146647230320697,
+      "loss": 3.4531,
+      "step": 22800
+    },
+    {
+      "epoch": 6.652262535670608,
+      "grad_norm": 0.3281981647014618,
+      "learning_rate": 0.0004010291545189504,
+      "loss": 3.4675,
+      "step": 22850
+    },
+    {
+      "epoch": 6.666821967270398,
+      "grad_norm": 0.3403642475605011,
+      "learning_rate": 0.0004005918367346938,
+      "loss": 3.4779,
+      "step": 22900
+    },
+    {
+      "epoch": 6.6813813988701884,
+      "grad_norm": 0.35402730107307434,
+      "learning_rate": 0.0004001545189504373,
+      "loss": 3.4748,
+      "step": 22950
+    },
+    {
+      "epoch": 6.695940830469978,
+      "grad_norm": 0.3444899618625641,
+      "learning_rate": 0.0003997172011661807,
+      "loss": 3.4726,
+      "step": 23000
+    },
+    {
+      "epoch": 6.695940830469978,
+      "eval_accuracy": 0.36462571371896035,
+      "eval_loss": 3.593517541885376,
+      "eval_runtime": 180.0181,
+      "eval_samples_per_second": 92.452,
+      "eval_steps_per_second": 5.783,
+      "step": 23000
+    },
+    {
+      "epoch": 6.710500262069769,
+      "grad_norm": 0.3251613676548004,
+      "learning_rate": 0.00039927988338192414,
+      "loss": 3.4531,
+      "step": 23050
+    },
+    {
+      "epoch": 6.725059693669559,
+      "grad_norm": 0.3224335312843323,
+      "learning_rate": 0.0003988425655976676,
+      "loss": 3.4575,
+      "step": 23100
+    },
+    {
+      "epoch": 6.73961912526935,
+      "grad_norm": 0.3301301598548889,
+      "learning_rate": 0.00039840524781341107,
+      "loss": 3.4608,
+      "step": 23150
+    },
+    {
+      "epoch": 6.75417855686914,
+      "grad_norm": 0.3267367482185364,
+      "learning_rate": 0.0003979679300291545,
+      "loss": 3.4722,
+      "step": 23200
+    },
+    {
+      "epoch": 6.7687379884689305,
+      "grad_norm": 0.34791019558906555,
+      "learning_rate": 0.00039753061224489795,
+      "loss": 3.4737,
+      "step": 23250
+    },
+    {
+      "epoch": 6.78329742006872,
+      "grad_norm": 0.3289180099964142,
+      "learning_rate": 0.00039709329446064136,
+      "loss": 3.4727,
+      "step": 23300
+    },
+    {
+      "epoch": 6.797856851668511,
+      "grad_norm": 0.326933890581131,
+      "learning_rate": 0.0003966559766763848,
+      "loss": 3.4499,
+      "step": 23350
+    },
+    {
+      "epoch": 6.812416283268301,
+      "grad_norm": 0.3207686245441437,
+      "learning_rate": 0.0003962186588921283,
+      "loss": 3.4833,
+      "step": 23400
+    },
+    {
+      "epoch": 6.826975714868092,
+      "grad_norm": 0.34312567114830017,
+      "learning_rate": 0.0003957813411078717,
+      "loss": 3.4743,
+      "step": 23450
+    },
+    {
+      "epoch": 6.841535146467882,
+      "grad_norm": 0.32261767983436584,
+      "learning_rate": 0.0003953440233236151,
+      "loss": 3.4717,
+      "step": 23500
+    },
+    {
+      "epoch": 6.8560945780676725,
+      "grad_norm": 0.3239823579788208,
+      "learning_rate": 0.00039490670553935853,
+      "loss": 3.4745,
+      "step": 23550
+    },
+    {
+      "epoch": 6.870654009667462,
+      "grad_norm": 0.3287251889705658,
+      "learning_rate": 0.00039446938775510195,
+      "loss": 3.4706,
+      "step": 23600
+    },
+    {
+      "epoch": 6.885213441267253,
+      "grad_norm": 0.33744481205940247,
+      "learning_rate": 0.00039403206997084547,
+      "loss": 3.4712,
+      "step": 23650
+    },
+    {
+      "epoch": 6.899772872867043,
+      "grad_norm": 0.33784157037734985,
+      "learning_rate": 0.0003935947521865889,
+      "loss": 3.4744,
+      "step": 23700
+    },
+    {
+      "epoch": 6.914332304466834,
+      "grad_norm": 0.33039554953575134,
+      "learning_rate": 0.0003931574344023323,
+      "loss": 3.4635,
+      "step": 23750
+    },
+    {
+      "epoch": 6.928891736066624,
+      "grad_norm": 0.3252994418144226,
+      "learning_rate": 0.00039272011661807576,
+      "loss": 3.4699,
+      "step": 23800
+    },
+    {
+      "epoch": 6.943451167666415,
+      "grad_norm": 0.33139488101005554,
+      "learning_rate": 0.0003922827988338192,
+      "loss": 3.4763,
+      "step": 23850
+    },
+    {
+      "epoch": 6.9580105992662045,
+      "grad_norm": 0.3324434757232666,
+      "learning_rate": 0.00039184548104956264,
+      "loss": 3.467,
+      "step": 23900
+    },
+    {
+      "epoch": 6.972570030865995,
+      "grad_norm": 0.32796579599380493,
+      "learning_rate": 0.0003914081632653061,
+      "loss": 3.4705,
+      "step": 23950
+    },
+    {
+      "epoch": 6.987129462465785,
+      "grad_norm": 0.32173773646354675,
+      "learning_rate": 0.0003909708454810495,
+      "loss": 3.4734,
+      "step": 24000
+    },
+    {
+      "epoch": 6.987129462465785,
+      "eval_accuracy": 0.36566327315904046,
+      "eval_loss": 3.5826520919799805,
+      "eval_runtime": 179.8608,
+      "eval_samples_per_second": 92.533,
+      "eval_steps_per_second": 5.788,
+      "step": 24000
+    },
+    {
+      "epoch": 7.001455943159979,
+      "grad_norm": 0.3372614085674286,
+      "learning_rate": 0.00039053352769679293,
+      "loss": 3.4591,
+      "step": 24050
+    },
+    {
+      "epoch": 7.016015374759769,
+      "grad_norm": 0.3403565585613251,
+      "learning_rate": 0.00039009620991253645,
+      "loss": 3.3491,
+      "step": 24100
+    },
+    {
+      "epoch": 7.03057480635956,
+      "grad_norm": 0.34883126616477966,
+      "learning_rate": 0.00038965889212827986,
+      "loss": 3.3559,
+      "step": 24150
+    },
+    {
+      "epoch": 7.04513423795935,
+      "grad_norm": 0.3384236693382263,
+      "learning_rate": 0.0003892215743440233,
+      "loss": 3.3671,
+      "step": 24200
+    },
+    {
+      "epoch": 7.059693669559141,
+      "grad_norm": 0.33237436413764954,
+      "learning_rate": 0.0003887842565597667,
+      "loss": 3.3692,
+      "step": 24250
+    },
+    {
+      "epoch": 7.074253101158931,
+      "grad_norm": 0.35221633315086365,
+      "learning_rate": 0.0003883469387755102,
+      "loss": 3.3795,
+      "step": 24300
+    },
+    {
+      "epoch": 7.0888125327587215,
+      "grad_norm": 0.33727243542671204,
+      "learning_rate": 0.0003879096209912536,
+      "loss": 3.3848,
+      "step": 24350
+    },
+    {
+      "epoch": 7.103371964358511,
+      "grad_norm": 0.34708696603775024,
+      "learning_rate": 0.00038747230320699703,
+      "loss": 3.3819,
+      "step": 24400
+    },
+    {
+      "epoch": 7.117931395958302,
+      "grad_norm": 0.3307049572467804,
+      "learning_rate": 0.0003870349854227405,
+      "loss": 3.3914,
+      "step": 24450
+    },
+    {
+      "epoch": 7.132490827558092,
+      "grad_norm": 0.3246367871761322,
+      "learning_rate": 0.0003865976676384839,
+      "loss": 3.3867,
+      "step": 24500
+    },
+    {
+      "epoch": 7.147050259157883,
+      "grad_norm": 0.33385294675827026,
+      "learning_rate": 0.0003861603498542274,
+      "loss": 3.3878,
+      "step": 24550
+    },
+    {
+      "epoch": 7.161609690757673,
+      "grad_norm": 0.35358157753944397,
+      "learning_rate": 0.0003857230320699708,
+      "loss": 3.3848,
+      "step": 24600
+    },
+    {
+      "epoch": 7.1761691223574635,
+      "grad_norm": 0.3381134271621704,
+      "learning_rate": 0.00038528571428571426,
+      "loss": 3.381,
+      "step": 24650
+    },
+    {
+      "epoch": 7.190728553957253,
+      "grad_norm": 0.33539456129074097,
+      "learning_rate": 0.00038484839650145767,
+      "loss": 3.391,
+      "step": 24700
+    },
+    {
+      "epoch": 7.205287985557044,
+      "grad_norm": 0.3288535475730896,
+      "learning_rate": 0.00038441107871720114,
+      "loss": 3.3846,
+      "step": 24750
+    },
+    {
+      "epoch": 7.219847417156834,
+      "grad_norm": 0.3503969609737396,
+      "learning_rate": 0.0003839737609329446,
+      "loss": 3.3942,
+      "step": 24800
+    },
+    {
+      "epoch": 7.234406848756625,
+      "grad_norm": 0.34089216589927673,
+      "learning_rate": 0.000383536443148688,
+      "loss": 3.4039,
+      "step": 24850
+    },
+    {
+      "epoch": 7.248966280356415,
+      "grad_norm": 0.33822911977767944,
+      "learning_rate": 0.00038309912536443143,
+      "loss": 3.3938,
+      "step": 24900
+    },
+    {
+      "epoch": 7.2635257119562056,
+      "grad_norm": 0.34553372859954834,
+      "learning_rate": 0.00038266180758017484,
+      "loss": 3.4007,
+      "step": 24950
+    },
+    {
+      "epoch": 7.2780851435559955,
+      "grad_norm": 0.34171566367149353,
+      "learning_rate": 0.00038222448979591836,
+      "loss": 3.4074,
+      "step": 25000
+    },
+    {
+      "epoch": 7.2780851435559955,
+      "eval_accuracy": 0.3654761973352454,
+      "eval_loss": 3.590606212615967,
+      "eval_runtime": 179.9086,
+      "eval_samples_per_second": 92.508,
+      "eval_steps_per_second": 5.786,
+      "step": 25000
+    },
+    {
+      "epoch": 7.292644575155786,
+      "grad_norm": 0.33524560928344727,
+      "learning_rate": 0.00038178717201166177,
+      "loss": 3.4011,
+      "step": 25050
+    },
+    {
+      "epoch": 7.307204006755576,
+      "grad_norm": 0.3223832845687866,
+      "learning_rate": 0.0003813498542274052,
+      "loss": 3.4039,
+      "step": 25100
+    },
+    {
+      "epoch": 7.321763438355367,
+      "grad_norm": 0.33097726106643677,
+      "learning_rate": 0.00038091253644314865,
+      "loss": 3.4045,
+      "step": 25150
+    },
+    {
+      "epoch": 7.336322869955157,
+      "grad_norm": 0.3478914201259613,
+      "learning_rate": 0.0003804752186588921,
+      "loss": 3.4058,
+      "step": 25200
+    },
+    {
+      "epoch": 7.350882301554948,
+      "grad_norm": 0.36241742968559265,
+      "learning_rate": 0.00038003790087463553,
+      "loss": 3.4074,
+      "step": 25250
+    },
+    {
+      "epoch": 7.3654417331547375,
+      "grad_norm": 0.3258671164512634,
+      "learning_rate": 0.000379600583090379,
+      "loss": 3.4111,
+      "step": 25300
+    },
+    {
+      "epoch": 7.380001164754528,
+      "grad_norm": 0.34953588247299194,
+      "learning_rate": 0.0003791632653061224,
+      "loss": 3.4106,
+      "step": 25350
+    },
+    {
+      "epoch": 7.394560596354318,
+      "grad_norm": 0.33248066902160645,
+      "learning_rate": 0.0003787259475218658,
+      "loss": 3.4044,
+      "step": 25400
+    },
+    {
+      "epoch": 7.409120027954109,
+      "grad_norm": 0.3584959805011749,
+      "learning_rate": 0.00037828862973760934,
+      "loss": 3.4181,
+      "step": 25450
+    },
+    {
+      "epoch": 7.423679459553899,
+      "grad_norm": 0.3388174772262573,
+      "learning_rate": 0.00037785131195335276,
+      "loss": 3.4183,
+      "step": 25500
+    },
+    {
+      "epoch": 7.43823889115369,
+      "grad_norm": 0.3261428773403168,
+      "learning_rate": 0.00037741399416909617,
+      "loss": 3.421,
+      "step": 25550
+    },
+    {
+      "epoch": 7.4527983227534795,
+      "grad_norm": 0.33287757635116577,
+      "learning_rate": 0.0003769766763848396,
+      "loss": 3.4161,
+      "step": 25600
+    },
+    {
+      "epoch": 7.46735775435327,
+      "grad_norm": 0.3487517237663269,
+      "learning_rate": 0.0003765393586005831,
+      "loss": 3.4124,
+      "step": 25650
+    },
+    {
+      "epoch": 7.48191718595306,
+      "grad_norm": 0.3360964357852936,
+      "learning_rate": 0.0003761020408163265,
+      "loss": 3.4114,
+      "step": 25700
+    },
+    {
+      "epoch": 7.496476617552851,
+      "grad_norm": 0.339276522397995,
+      "learning_rate": 0.0003756647230320699,
+      "loss": 3.4243,
+      "step": 25750
+    },
+    {
+      "epoch": 7.511036049152641,
+      "grad_norm": 0.34367257356643677,
+      "learning_rate": 0.00037522740524781334,
+      "loss": 3.422,
+      "step": 25800
+    },
+    {
+      "epoch": 7.525595480752432,
+      "grad_norm": 0.33407968282699585,
+      "learning_rate": 0.0003747900874635568,
+      "loss": 3.4223,
+      "step": 25850
+    },
+    {
+      "epoch": 7.540154912352222,
+      "grad_norm": 0.3418211042881012,
+      "learning_rate": 0.00037435276967930027,
+      "loss": 3.4312,
+      "step": 25900
+    },
+    {
+      "epoch": 7.554714343952012,
+      "grad_norm": 0.3525235950946808,
+      "learning_rate": 0.0003739154518950437,
+      "loss": 3.436,
+      "step": 25950
+    },
+    {
+      "epoch": 7.569273775551802,
+      "grad_norm": 0.35437750816345215,
+      "learning_rate": 0.00037347813411078715,
+      "loss": 3.4173,
+      "step": 26000
+    },
+    {
+      "epoch": 7.569273775551802,
+      "eval_accuracy": 0.36623061498796,
+      "eval_loss": 3.5847508907318115,
+      "eval_runtime": 179.8238,
+      "eval_samples_per_second": 92.552,
+      "eval_steps_per_second": 5.789,
+      "step": 26000
+    },
+    {
+      "epoch": 7.583833207151593,
+      "grad_norm": 0.36680832505226135,
+      "learning_rate": 0.00037304081632653056,
+      "loss": 3.4282,
+      "step": 26050
+    },
+    {
+      "epoch": 7.598392638751383,
+      "grad_norm": 0.32975292205810547,
+      "learning_rate": 0.00037260349854227403,
+      "loss": 3.4187,
+      "step": 26100
+    },
+    {
+      "epoch": 7.612952070351174,
+      "grad_norm": 0.33258336782455444,
+      "learning_rate": 0.0003721661807580175,
+      "loss": 3.419,
+      "step": 26150
+    },
+    {
+      "epoch": 7.627511501950964,
+      "grad_norm": 0.3520626723766327,
+      "learning_rate": 0.0003717288629737609,
+      "loss": 3.4317,
+      "step": 26200
+    },
+    {
+      "epoch": 7.642070933550754,
+      "grad_norm": 0.34615185856819153,
+      "learning_rate": 0.0003712915451895043,
+      "loss": 3.4397,
+      "step": 26250
+    },
+    {
+      "epoch": 7.656630365150544,
+      "grad_norm": 0.3472108542919159,
+      "learning_rate": 0.00037085422740524773,
+      "loss": 3.4183,
+      "step": 26300
+    },
+    {
+      "epoch": 7.671189796750335,
+      "grad_norm": 0.3401790261268616,
+      "learning_rate": 0.00037041690962099125,
+      "loss": 3.4274,
+      "step": 26350
+    },
+    {
+      "epoch": 7.685749228350125,
+      "grad_norm": 0.34616005420684814,
+      "learning_rate": 0.00036997959183673467,
+      "loss": 3.432,
+      "step": 26400
+    },
+    {
+      "epoch": 7.700308659949916,
+      "grad_norm": 0.35238298773765564,
+      "learning_rate": 0.0003695422740524781,
+      "loss": 3.4308,
+      "step": 26450
+    },
+    {
+      "epoch": 7.714868091549706,
+      "grad_norm": 0.3595859408378601,
+      "learning_rate": 0.00036910495626822154,
+      "loss": 3.4368,
+      "step": 26500
+    },
+    {
+      "epoch": 7.729427523149496,
+      "grad_norm": 0.3455177843570709,
+      "learning_rate": 0.000368667638483965,
+      "loss": 3.4389,
+      "step": 26550
+    },
+    {
+      "epoch": 7.743986954749286,
+      "grad_norm": 0.3548458516597748,
+      "learning_rate": 0.0003682303206997084,
+      "loss": 3.427,
+      "step": 26600
+    },
+    {
+      "epoch": 7.758546386349077,
+      "grad_norm": 0.34287944436073303,
+      "learning_rate": 0.00036779300291545184,
+      "loss": 3.4268,
+      "step": 26650
+    },
+    {
+      "epoch": 7.773105817948867,
+      "grad_norm": 0.3392084836959839,
+      "learning_rate": 0.0003673556851311953,
+      "loss": 3.4348,
+      "step": 26700
+    },
+    {
+      "epoch": 7.787665249548658,
+      "grad_norm": 0.3404330015182495,
+      "learning_rate": 0.0003669183673469387,
+      "loss": 3.4162,
+      "step": 26750
+    },
+    {
+      "epoch": 7.802224681148448,
+      "grad_norm": 0.34505772590637207,
+      "learning_rate": 0.0003664810495626822,
+      "loss": 3.4267,
+      "step": 26800
+    },
+    {
+      "epoch": 7.8167841127482385,
+      "grad_norm": 0.34470146894454956,
+      "learning_rate": 0.00036604373177842565,
+      "loss": 3.444,
+      "step": 26850
+    },
+    {
+      "epoch": 7.831343544348028,
+      "grad_norm": 0.33952096104621887,
+      "learning_rate": 0.00036560641399416906,
+      "loss": 3.4317,
+      "step": 26900
+    },
+    {
+      "epoch": 7.845902975947819,
+      "grad_norm": 0.35519036650657654,
+      "learning_rate": 0.0003651690962099125,
+      "loss": 3.4335,
+      "step": 26950
+    },
+    {
+      "epoch": 7.860462407547609,
+      "grad_norm": 0.3539314568042755,
+      "learning_rate": 0.000364731778425656,
+      "loss": 3.4318,
+      "step": 27000
+    },
+    {
+      "epoch": 7.860462407547609,
+      "eval_accuracy": 0.36703947393949116,
+      "eval_loss": 3.572652578353882,
+      "eval_runtime": 179.9477,
+      "eval_samples_per_second": 92.488,
+      "eval_steps_per_second": 5.785,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8750218391474,
+      "grad_norm": 0.3434327244758606,
+      "learning_rate": 0.0003642944606413994,
+      "loss": 3.4372,
+      "step": 27050
+    },
+    {
+      "epoch": 7.88958127074719,
+      "grad_norm": 0.3285406231880188,
+      "learning_rate": 0.0003638571428571428,
+      "loss": 3.4238,
+      "step": 27100
+    },
+    {
+      "epoch": 7.9041407023469805,
+      "grad_norm": 0.3453764021396637,
+      "learning_rate": 0.00036341982507288623,
+      "loss": 3.4369,
+      "step": 27150
+    },
+    {
+      "epoch": 7.91870013394677,
+      "grad_norm": 0.32807591557502747,
+      "learning_rate": 0.0003629825072886297,
+      "loss": 3.4333,
+      "step": 27200
+    },
+    {
+      "epoch": 7.933259565546561,
+      "grad_norm": 0.33627721667289734,
+      "learning_rate": 0.00036254518950437316,
+      "loss": 3.4378,
+      "step": 27250
+    },
+    {
+      "epoch": 7.947818997146351,
+      "grad_norm": 0.3372686207294464,
+      "learning_rate": 0.0003621078717201166,
+      "loss": 3.4311,
+      "step": 27300
+    },
+    {
+      "epoch": 7.962378428746142,
+      "grad_norm": 0.3440007269382477,
+      "learning_rate": 0.00036167055393586004,
+      "loss": 3.4387,
+      "step": 27350
+    },
+    {
+      "epoch": 7.976937860345932,
+      "grad_norm": 0.3503531813621521,
+      "learning_rate": 0.00036123323615160346,
+      "loss": 3.435,
+      "step": 27400
+    },
+    {
+      "epoch": 7.991497291945723,
+      "grad_norm": 0.32801827788352966,
+      "learning_rate": 0.0003607959183673469,
+      "loss": 3.4472,
+      "step": 27450
+    },
+    {
+      "epoch": 8.005823772639916,
+      "grad_norm": 0.35048505663871765,
+      "learning_rate": 0.0003603586005830904,
+      "loss": 3.3812,
+      "step": 27500
+    },
+    {
+      "epoch": 8.020383204239707,
+      "grad_norm": 0.3340410888195038,
+      "learning_rate": 0.0003599212827988338,
+      "loss": 3.3375,
+      "step": 27550
+    },
+    {
+      "epoch": 8.034942635839498,
+      "grad_norm": 0.34667739272117615,
+      "learning_rate": 0.0003594839650145772,
+      "loss": 3.3225,
+      "step": 27600
+    },
+    {
+      "epoch": 8.049502067439287,
+      "grad_norm": 0.3594329059123993,
+      "learning_rate": 0.0003590466472303206,
+      "loss": 3.335,
+      "step": 27650
+    },
+    {
+      "epoch": 8.064061499039077,
+      "grad_norm": 0.3574943244457245,
+      "learning_rate": 0.00035860932944606415,
+      "loss": 3.3357,
+      "step": 27700
+    },
+    {
+      "epoch": 8.078620930638868,
+      "grad_norm": 0.3481893539428711,
+      "learning_rate": 0.00035817201166180756,
+      "loss": 3.3318,
+      "step": 27750
+    },
+    {
+      "epoch": 8.093180362238659,
+      "grad_norm": 0.3395717144012451,
+      "learning_rate": 0.00035773469387755097,
+      "loss": 3.3532,
+      "step": 27800
+    },
+    {
+      "epoch": 8.107739793838448,
+      "grad_norm": 0.33678963780403137,
+      "learning_rate": 0.0003572973760932944,
+      "loss": 3.3457,
+      "step": 27850
+    },
+    {
+      "epoch": 8.122299225438239,
+      "grad_norm": 0.3517054617404938,
+      "learning_rate": 0.0003568600583090379,
+      "loss": 3.3469,
+      "step": 27900
+    },
+    {
+      "epoch": 8.13685865703803,
+      "grad_norm": 0.3345564603805542,
+      "learning_rate": 0.0003564227405247813,
+      "loss": 3.3386,
+      "step": 27950
+    },
+    {
+      "epoch": 8.15141808863782,
+      "grad_norm": 0.3494488596916199,
+      "learning_rate": 0.00035598542274052473,
+      "loss": 3.3639,
+      "step": 28000
+    },
+    {
+      "epoch": 8.15141808863782,
+      "eval_accuracy": 0.3670094900708125,
+      "eval_loss": 3.580695867538452,
+      "eval_runtime": 179.627,
+      "eval_samples_per_second": 92.653,
+      "eval_steps_per_second": 5.795,
+      "step": 28000
+    },
+    {
+      "epoch": 8.16597752023761,
+      "grad_norm": 0.33706334233283997,
+      "learning_rate": 0.0003555481049562682,
+      "loss": 3.369,
+      "step": 28050
+    },
+    {
+      "epoch": 8.1805369518374,
+      "grad_norm": 0.34689971804618835,
+      "learning_rate": 0.0003551107871720116,
+      "loss": 3.3533,
+      "step": 28100
+    },
+    {
+      "epoch": 8.19509638343719,
+      "grad_norm": 0.34161534905433655,
+      "learning_rate": 0.0003546734693877551,
+      "loss": 3.375,
+      "step": 28150
+    },
+    {
+      "epoch": 8.209655815036982,
+      "grad_norm": 0.36119794845581055,
+      "learning_rate": 0.00035423615160349854,
+      "loss": 3.3611,
+      "step": 28200
+    },
+    {
+      "epoch": 8.22421524663677,
+      "grad_norm": 0.3473355174064636,
+      "learning_rate": 0.00035379883381924195,
+      "loss": 3.3731,
+      "step": 28250
+    },
+    {
+      "epoch": 8.238774678236561,
+      "grad_norm": 0.33798742294311523,
+      "learning_rate": 0.00035336151603498537,
+      "loss": 3.3692,
+      "step": 28300
+    },
+    {
+      "epoch": 8.253334109836352,
+      "grad_norm": 0.3432019352912903,
+      "learning_rate": 0.0003529241982507289,
+      "loss": 3.3687,
+      "step": 28350
+    },
+    {
+      "epoch": 8.267893541436143,
+      "grad_norm": 0.35700732469558716,
+      "learning_rate": 0.0003524868804664723,
+      "loss": 3.3723,
+      "step": 28400
+    },
+    {
+      "epoch": 8.282452973035932,
+      "grad_norm": 0.348431795835495,
+      "learning_rate": 0.0003520495626822157,
+      "loss": 3.3593,
+      "step": 28450
+    },
+    {
+      "epoch": 8.297012404635723,
+      "grad_norm": 0.34419500827789307,
+      "learning_rate": 0.0003516122448979591,
+      "loss": 3.3778,
+      "step": 28500
+    },
+    {
+      "epoch": 8.311571836235514,
+      "grad_norm": 0.34864479303359985,
+      "learning_rate": 0.0003511749271137026,
+      "loss": 3.3809,
+      "step": 28550
+    },
+    {
+      "epoch": 8.326131267835304,
+      "grad_norm": 0.35667717456817627,
+      "learning_rate": 0.00035073760932944606,
+      "loss": 3.363,
+      "step": 28600
+    },
+    {
+      "epoch": 8.340690699435093,
+      "grad_norm": 0.3501654863357544,
+      "learning_rate": 0.00035030029154518947,
+      "loss": 3.378,
+      "step": 28650
+    },
+    {
+      "epoch": 8.355250131034884,
+      "grad_norm": 0.3490404486656189,
+      "learning_rate": 0.0003498629737609329,
+      "loss": 3.3855,
+      "step": 28700
+    },
+    {
+      "epoch": 8.369809562634675,
+      "grad_norm": 0.358019083738327,
+      "learning_rate": 0.00034942565597667635,
+      "loss": 3.3784,
+      "step": 28750
+    },
+    {
+      "epoch": 8.384368994234466,
+      "grad_norm": 0.33226025104522705,
+      "learning_rate": 0.0003489883381924198,
+      "loss": 3.3714,
+      "step": 28800
+    },
+    {
+      "epoch": 8.398928425834255,
+      "grad_norm": 0.3402322828769684,
+      "learning_rate": 0.00034855102040816323,
+      "loss": 3.3785,
+      "step": 28850
+    },
+    {
+      "epoch": 8.413487857434045,
+      "grad_norm": 0.36141642928123474,
+      "learning_rate": 0.0003481137026239067,
+      "loss": 3.3745,
+      "step": 28900
+    },
+    {
+      "epoch": 8.428047289033836,
+      "grad_norm": 0.36371850967407227,
+      "learning_rate": 0.0003476763848396501,
+      "loss": 3.3874,
+      "step": 28950
+    },
+    {
+      "epoch": 8.442606720633627,
+      "grad_norm": 0.3497146666049957,
+      "learning_rate": 0.0003472390670553935,
+      "loss": 3.3846,
+      "step": 29000
+    },
+    {
+      "epoch": 8.442606720633627,
+      "eval_accuracy": 0.3678150566759789,
+      "eval_loss": 3.572382688522339,
+      "eval_runtime": 179.6973,
+      "eval_samples_per_second": 92.617,
+      "eval_steps_per_second": 5.793,
+      "step": 29000
+    },
+    {
+      "epoch": 8.457166152233416,
+      "grad_norm": 0.3523021936416626,
+      "learning_rate": 0.00034680174927113704,
+      "loss": 3.3833,
+      "step": 29050
+    },
+    {
+      "epoch": 8.471725583833207,
+      "grad_norm": 0.3318672180175781,
+      "learning_rate": 0.00034636443148688045,
+      "loss": 3.3856,
+      "step": 29100
+    },
+    {
+      "epoch": 8.486285015432998,
+      "grad_norm": 0.34436580538749695,
+      "learning_rate": 0.00034592711370262386,
+      "loss": 3.392,
+      "step": 29150
+    },
+    {
+      "epoch": 8.500844447032788,
+      "grad_norm": 0.3374488651752472,
+      "learning_rate": 0.0003454897959183673,
+      "loss": 3.3784,
+      "step": 29200
+    },
+    {
+      "epoch": 8.515403878632577,
+      "grad_norm": 0.3651833236217499,
+      "learning_rate": 0.0003450524781341108,
+      "loss": 3.3695,
+      "step": 29250
+    },
+    {
+      "epoch": 8.529963310232368,
+      "grad_norm": 0.33650752902030945,
+      "learning_rate": 0.0003446151603498542,
+      "loss": 3.3818,
+      "step": 29300
+    },
+    {
+      "epoch": 8.544522741832159,
+      "grad_norm": 0.3391404449939728,
+      "learning_rate": 0.0003441778425655976,
+      "loss": 3.3997,
+      "step": 29350
+    },
+    {
+      "epoch": 8.55908217343195,
+      "grad_norm": 0.3535376787185669,
+      "learning_rate": 0.0003437405247813411,
+      "loss": 3.3867,
+      "step": 29400
+    },
+    {
+      "epoch": 8.573641605031739,
+      "grad_norm": 0.3420208692550659,
+      "learning_rate": 0.0003433032069970845,
+      "loss": 3.3834,
+      "step": 29450
+    },
+    {
+      "epoch": 8.58820103663153,
+      "grad_norm": 0.3331069052219391,
+      "learning_rate": 0.00034286588921282797,
+      "loss": 3.3903,
+      "step": 29500
+    },
+    {
+      "epoch": 8.60276046823132,
+      "grad_norm": 0.3587231934070587,
+      "learning_rate": 0.00034242857142857143,
+      "loss": 3.3888,
+      "step": 29550
+    },
+    {
+      "epoch": 8.617319899831111,
+      "grad_norm": 0.35839417576789856,
+      "learning_rate": 0.00034199125364431485,
+      "loss": 3.404,
+      "step": 29600
+    },
+    {
+      "epoch": 8.6318793314309,
+      "grad_norm": 0.3896600902080536,
+      "learning_rate": 0.00034155393586005826,
+      "loss": 3.3906,
+      "step": 29650
+    },
+    {
+      "epoch": 8.646438763030691,
+      "grad_norm": 0.35471850633621216,
+      "learning_rate": 0.0003411166180758017,
+      "loss": 3.3922,
+      "step": 29700
+    },
+    {
+      "epoch": 8.660998194630482,
+      "grad_norm": 0.3513423800468445,
+      "learning_rate": 0.0003406793002915452,
+      "loss": 3.3873,
+      "step": 29750
+    },
+    {
+      "epoch": 8.675557626230272,
+      "grad_norm": 0.34752732515335083,
+      "learning_rate": 0.0003402419825072886,
+      "loss": 3.3855,
+      "step": 29800
+    },
+    {
+      "epoch": 8.690117057830061,
+      "grad_norm": 0.32745492458343506,
+      "learning_rate": 0.000339804664723032,
+      "loss": 3.3934,
+      "step": 29850
+    },
+    {
+      "epoch": 8.704676489429852,
+      "grad_norm": 0.3485073745250702,
+      "learning_rate": 0.00033936734693877543,
+      "loss": 3.3878,
+      "step": 29900
+    },
+    {
+      "epoch": 8.719235921029643,
+      "grad_norm": 0.3374342620372772,
+      "learning_rate": 0.00033893002915451895,
+      "loss": 3.388,
+      "step": 29950
+    },
+    {
+      "epoch": 8.733795352629434,
+      "grad_norm": 0.3508179187774658,
+      "learning_rate": 0.00033849271137026236,
+      "loss": 3.3893,
+      "step": 30000
+    },
+    {
+      "epoch": 8.733795352629434,
+      "eval_accuracy": 0.3683345418988114,
+      "eval_loss": 3.5642998218536377,
+      "eval_runtime": 179.3326,
+      "eval_samples_per_second": 92.805,
+      "eval_steps_per_second": 5.805,
+      "step": 30000
+    },
+    {
+      "epoch": 8.748354784229225,
+      "grad_norm": 0.3522128760814667,
+      "learning_rate": 0.0003380553935860058,
+      "loss": 3.4013,
+      "step": 30050
+    },
+    {
+      "epoch": 8.762914215829014,
+      "grad_norm": 0.3406279385089874,
+      "learning_rate": 0.00033761807580174924,
+      "loss": 3.4014,
+      "step": 30100
+    },
+    {
+      "epoch": 8.777473647428804,
+      "grad_norm": 0.33040550351142883,
+      "learning_rate": 0.0003371807580174927,
+      "loss": 3.392,
+      "step": 30150
+    },
+    {
+      "epoch": 8.792033079028595,
+      "grad_norm": 0.35470637679100037,
+      "learning_rate": 0.0003367434402332361,
+      "loss": 3.3986,
+      "step": 30200
+    },
+    {
+      "epoch": 8.806592510628384,
+      "grad_norm": 0.35664665699005127,
+      "learning_rate": 0.0003363061224489796,
+      "loss": 3.4054,
+      "step": 30250
+    },
+    {
+      "epoch": 8.821151942228175,
+      "grad_norm": 0.35443365573883057,
+      "learning_rate": 0.000335868804664723,
+      "loss": 3.3916,
+      "step": 30300
+    },
+    {
+      "epoch": 8.835711373827966,
+      "grad_norm": 0.3552112579345703,
+      "learning_rate": 0.0003354314868804664,
+      "loss": 3.4106,
+      "step": 30350
+    },
+    {
+      "epoch": 8.850270805427757,
+      "grad_norm": 0.3517363667488098,
+      "learning_rate": 0.00033499416909620993,
+      "loss": 3.3959,
+      "step": 30400
+    },
+    {
+      "epoch": 8.864830237027547,
+      "grad_norm": 0.3412357568740845,
+      "learning_rate": 0.00033455685131195335,
+      "loss": 3.3977,
+      "step": 30450
+    },
+    {
+      "epoch": 8.879389668627336,
+      "grad_norm": 0.3659086227416992,
+      "learning_rate": 0.00033411953352769676,
+      "loss": 3.4041,
+      "step": 30500
+    },
+    {
+      "epoch": 8.893949100227127,
+      "grad_norm": 0.3394777476787567,
+      "learning_rate": 0.00033368221574344017,
+      "loss": 3.3925,
+      "step": 30550
+    },
+    {
+      "epoch": 8.908508531826918,
+      "grad_norm": 0.3358438014984131,
+      "learning_rate": 0.0003332448979591837,
+      "loss": 3.3954,
+      "step": 30600
+    },
+    {
+      "epoch": 8.923067963426707,
+      "grad_norm": 0.3618221879005432,
+      "learning_rate": 0.0003328075801749271,
+      "loss": 3.3963,
+      "step": 30650
+    },
+    {
+      "epoch": 8.937627395026498,
+      "grad_norm": 0.35156282782554626,
+      "learning_rate": 0.0003323702623906705,
+      "loss": 3.3973,
+      "step": 30700
+    },
+    {
+      "epoch": 8.952186826626289,
+      "grad_norm": 0.3404799997806549,
+      "learning_rate": 0.00033193294460641393,
+      "loss": 3.4064,
+      "step": 30750
+    },
+    {
+      "epoch": 8.96674625822608,
+      "grad_norm": 0.3573434352874756,
+      "learning_rate": 0.0003314956268221574,
+      "loss": 3.3962,
+      "step": 30800
+    },
+    {
+      "epoch": 8.98130568982587,
+      "grad_norm": 0.3326402008533478,
+      "learning_rate": 0.00033105830903790086,
+      "loss": 3.3904,
+      "step": 30850
+    },
+    {
+      "epoch": 8.995865121425659,
+      "grad_norm": 0.3333438038825989,
+      "learning_rate": 0.0003306209912536443,
+      "loss": 3.4072,
+      "step": 30900
+    },
+    {
+      "epoch": 9.010191602119853,
+      "grad_norm": 0.35445913672447205,
+      "learning_rate": 0.00033018367346938774,
+      "loss": 3.3354,
+      "step": 30950
+    },
+    {
+      "epoch": 9.024751033719644,
+      "grad_norm": 0.3602832555770874,
+      "learning_rate": 0.00032974635568513115,
+      "loss": 3.2997,
+      "step": 31000
+    },
+    {
+      "epoch": 9.024751033719644,
+      "eval_accuracy": 0.3683871018568481,
+      "eval_loss": 3.56929087638855,
+      "eval_runtime": 180.7113,
+      "eval_samples_per_second": 92.097,
+      "eval_steps_per_second": 5.761,
+      "step": 31000
+    },
+    {
+      "epoch": 9.039310465319433,
+      "grad_norm": 0.33829203248023987,
+      "learning_rate": 0.0003293090379008746,
+      "loss": 3.2994,
+      "step": 31050
+    },
+    {
+      "epoch": 9.053869896919224,
+      "grad_norm": 0.36634117364883423,
+      "learning_rate": 0.0003288717201166181,
+      "loss": 3.3039,
+      "step": 31100
+    },
+    {
+      "epoch": 9.068429328519015,
+      "grad_norm": 0.34743866324424744,
+      "learning_rate": 0.0003284344023323615,
+      "loss": 3.3133,
+      "step": 31150
+    },
+    {
+      "epoch": 9.082988760118806,
+      "grad_norm": 0.3573026657104492,
+      "learning_rate": 0.0003279970845481049,
+      "loss": 3.3067,
+      "step": 31200
+    },
+    {
+      "epoch": 9.097548191718595,
+      "grad_norm": 0.3499259650707245,
+      "learning_rate": 0.0003275597667638483,
+      "loss": 3.311,
+      "step": 31250
+    },
+    {
+      "epoch": 9.112107623318385,
+      "grad_norm": 0.3550528287887573,
+      "learning_rate": 0.00032712244897959184,
+      "loss": 3.3146,
+      "step": 31300
+    },
+    {
+      "epoch": 9.126667054918176,
+      "grad_norm": 0.3766951262950897,
+      "learning_rate": 0.00032668513119533526,
+      "loss": 3.3203,
+      "step": 31350
+    },
+    {
+      "epoch": 9.141226486517967,
+      "grad_norm": 0.3506350517272949,
+      "learning_rate": 0.00032624781341107867,
+      "loss": 3.3383,
+      "step": 31400
+    },
+    {
+      "epoch": 9.155785918117756,
+      "grad_norm": 0.36587440967559814,
+      "learning_rate": 0.00032581049562682213,
+      "loss": 3.3249,
+      "step": 31450
+    },
+    {
+      "epoch": 9.170345349717547,
+      "grad_norm": 0.3548264503479004,
+      "learning_rate": 0.0003253731778425656,
+      "loss": 3.3173,
+      "step": 31500
+    },
+    {
+      "epoch": 9.184904781317337,
+      "grad_norm": 0.3574599325656891,
+      "learning_rate": 0.000324935860058309,
+      "loss": 3.3277,
+      "step": 31550
+    },
+    {
+      "epoch": 9.199464212917128,
+      "grad_norm": 0.3559187948703766,
+      "learning_rate": 0.0003244985422740524,
+      "loss": 3.3302,
+      "step": 31600
+    },
+    {
+      "epoch": 9.214023644516917,
+      "grad_norm": 0.3626471757888794,
+      "learning_rate": 0.0003240612244897959,
+      "loss": 3.3249,
+      "step": 31650
+    },
+    {
+      "epoch": 9.228583076116708,
+      "grad_norm": 0.34642550349235535,
+      "learning_rate": 0.0003236239067055393,
+      "loss": 3.3288,
+      "step": 31700
+    },
+    {
+      "epoch": 9.243142507716499,
+      "grad_norm": 0.3562052249908447,
+      "learning_rate": 0.00032318658892128277,
+      "loss": 3.3381,
+      "step": 31750
+    },
+    {
+      "epoch": 9.25770193931629,
+      "grad_norm": 0.35299643874168396,
+      "learning_rate": 0.00032274927113702624,
+      "loss": 3.3398,
+      "step": 31800
+    },
+    {
+      "epoch": 9.272261370916079,
+      "grad_norm": 0.3579034209251404,
+      "learning_rate": 0.00032231195335276965,
+      "loss": 3.3376,
+      "step": 31850
+    },
+    {
+      "epoch": 9.28682080251587,
+      "grad_norm": 0.3582768738269806,
+      "learning_rate": 0.00032187463556851306,
+      "loss": 3.3417,
+      "step": 31900
+    },
+    {
+      "epoch": 9.30138023411566,
+      "grad_norm": 0.3462630808353424,
+      "learning_rate": 0.0003214373177842565,
+      "loss": 3.3388,
+      "step": 31950
+    },
+    {
+      "epoch": 9.315939665715451,
+      "grad_norm": 0.35994312167167664,
+      "learning_rate": 0.000321,
+      "loss": 3.3417,
+      "step": 32000
+    },
+    {
+      "epoch": 9.315939665715451,
+      "eval_accuracy": 0.3688413280713799,
+      "eval_loss": 3.5689337253570557,
+      "eval_runtime": 181.1092,
+      "eval_samples_per_second": 91.895,
+      "eval_steps_per_second": 5.748,
+      "step": 32000
+    },
+    {
+      "epoch": 9.33049909731524,
+      "grad_norm": 0.34968388080596924,
+      "learning_rate": 0.0003205626822157434,
+      "loss": 3.3376,
+      "step": 32050
+    },
+    {
+      "epoch": 9.34505852891503,
+      "grad_norm": 0.35291755199432373,
+      "learning_rate": 0.0003201253644314868,
+      "loss": 3.3412,
+      "step": 32100
+    },
+    {
+      "epoch": 9.359617960514822,
+      "grad_norm": 0.3643549978733063,
+      "learning_rate": 0.0003196880466472303,
+      "loss": 3.3411,
+      "step": 32150
+    },
+    {
+      "epoch": 9.374177392114612,
+      "grad_norm": 0.3537770211696625,
+      "learning_rate": 0.00031925072886297375,
+      "loss": 3.3642,
+      "step": 32200
+    },
+    {
+      "epoch": 9.388736823714403,
+      "grad_norm": 0.3553234338760376,
+      "learning_rate": 0.00031881341107871717,
+      "loss": 3.353,
+      "step": 32250
+    },
+    {
+      "epoch": 9.403296255314192,
+      "grad_norm": 0.35173216462135315,
+      "learning_rate": 0.00031837609329446063,
+      "loss": 3.3433,
+      "step": 32300
+    },
+    {
+      "epoch": 9.417855686913983,
+      "grad_norm": 0.3561984598636627,
+      "learning_rate": 0.00031793877551020405,
+      "loss": 3.3459,
+      "step": 32350
+    },
+    {
+      "epoch": 9.432415118513774,
+      "grad_norm": 0.3734908699989319,
+      "learning_rate": 0.00031750145772594746,
+      "loss": 3.3495,
+      "step": 32400
+    },
+    {
+      "epoch": 9.446974550113563,
+      "grad_norm": 0.3848966658115387,
+      "learning_rate": 0.000317064139941691,
+      "loss": 3.3486,
+      "step": 32450
+    },
+    {
+      "epoch": 9.461533981713353,
+      "grad_norm": 0.36939284205436707,
+      "learning_rate": 0.0003166268221574344,
+      "loss": 3.3527,
+      "step": 32500
+    },
+    {
+      "epoch": 9.476093413313144,
+      "grad_norm": 0.3429546654224396,
+      "learning_rate": 0.0003161895043731778,
+      "loss": 3.3402,
+      "step": 32550
+    },
+    {
+      "epoch": 9.490652844912935,
+      "grad_norm": 0.34233972430229187,
+      "learning_rate": 0.0003157521865889212,
+      "loss": 3.3508,
+      "step": 32600
+    },
+    {
+      "epoch": 9.505212276512726,
+      "grad_norm": 0.3572950065135956,
+      "learning_rate": 0.00031531486880466474,
+      "loss": 3.3599,
+      "step": 32650
+    },
+    {
+      "epoch": 9.519771708112515,
+      "grad_norm": 0.34846094250679016,
+      "learning_rate": 0.00031487755102040815,
+      "loss": 3.3585,
+      "step": 32700
+    },
+    {
+      "epoch": 9.534331139712306,
+      "grad_norm": 0.3666765093803406,
+      "learning_rate": 0.00031444023323615156,
+      "loss": 3.3535,
+      "step": 32750
+    },
+    {
+      "epoch": 9.548890571312096,
+      "grad_norm": 0.3483474850654602,
+      "learning_rate": 0.000314002915451895,
+      "loss": 3.3492,
+      "step": 32800
+    },
+    {
+      "epoch": 9.563450002911885,
+      "grad_norm": 0.3478499948978424,
+      "learning_rate": 0.00031356559766763844,
+      "loss": 3.3664,
+      "step": 32850
+    },
+    {
+      "epoch": 9.578009434511676,
+      "grad_norm": 0.3615437150001526,
+      "learning_rate": 0.0003131282798833819,
+      "loss": 3.3739,
+      "step": 32900
+    },
+    {
+      "epoch": 9.592568866111467,
+      "grad_norm": 0.35250964760780334,
+      "learning_rate": 0.0003126909620991253,
+      "loss": 3.3745,
+      "step": 32950
+    },
+    {
+      "epoch": 9.607128297711258,
+      "grad_norm": 0.35164180397987366,
+      "learning_rate": 0.0003122536443148688,
+      "loss": 3.3562,
+      "step": 33000
+    },
+    {
+      "epoch": 9.607128297711258,
+      "eval_accuracy": 0.3692384085597243,
+      "eval_loss": 3.5602471828460693,
+      "eval_runtime": 180.7731,
+      "eval_samples_per_second": 92.066,
+      "eval_steps_per_second": 5.759,
+      "step": 33000
+    },
+    {
+      "epoch": 9.621687729311049,
+      "grad_norm": 0.37672215700149536,
+      "learning_rate": 0.0003118163265306122,
+      "loss": 3.3735,
+      "step": 33050
+    },
+    {
+      "epoch": 9.636247160910838,
+      "grad_norm": 0.36633849143981934,
+      "learning_rate": 0.00031137900874635566,
+      "loss": 3.3498,
+      "step": 33100
+    },
+    {
+      "epoch": 9.650806592510628,
+      "grad_norm": 0.3514011800289154,
+      "learning_rate": 0.00031094169096209913,
+      "loss": 3.3604,
+      "step": 33150
+    },
+    {
+      "epoch": 9.66536602411042,
+      "grad_norm": 0.35586225986480713,
+      "learning_rate": 0.00031050437317784254,
+      "loss": 3.3574,
+      "step": 33200
+    },
+    {
+      "epoch": 9.67992545571021,
+      "grad_norm": 0.33317190408706665,
+      "learning_rate": 0.00031006705539358596,
+      "loss": 3.3546,
+      "step": 33250
+    },
+    {
+      "epoch": 9.694484887309999,
+      "grad_norm": 0.35271352529525757,
+      "learning_rate": 0.00030962973760932937,
+      "loss": 3.3632,
+      "step": 33300
+    },
+    {
+      "epoch": 9.70904431890979,
+      "grad_norm": 0.3521358370780945,
+      "learning_rate": 0.0003091924198250729,
+      "loss": 3.3584,
+      "step": 33350
+    },
+    {
+      "epoch": 9.72360375050958,
+      "grad_norm": 0.3574683666229248,
+      "learning_rate": 0.0003087551020408163,
+      "loss": 3.3581,
+      "step": 33400
+    },
+    {
+      "epoch": 9.738163182109371,
+      "grad_norm": 0.3643791377544403,
+      "learning_rate": 0.0003083177842565597,
+      "loss": 3.3691,
+      "step": 33450
+    },
+    {
+      "epoch": 9.75272261370916,
+      "grad_norm": 0.35385361313819885,
+      "learning_rate": 0.0003078804664723032,
+      "loss": 3.3547,
+      "step": 33500
+    },
+    {
+      "epoch": 9.767282045308951,
+      "grad_norm": 0.35955286026000977,
+      "learning_rate": 0.00030744314868804665,
+      "loss": 3.3496,
+      "step": 33550
+    },
+    {
+      "epoch": 9.781841476908742,
+      "grad_norm": 0.3493342697620392,
+      "learning_rate": 0.00030700583090379006,
+      "loss": 3.3629,
+      "step": 33600
+    },
+    {
+      "epoch": 9.796400908508533,
+      "grad_norm": 0.3883078396320343,
+      "learning_rate": 0.00030656851311953347,
+      "loss": 3.3643,
+      "step": 33650
+    },
+    {
+      "epoch": 9.810960340108322,
+      "grad_norm": 0.34926533699035645,
+      "learning_rate": 0.00030613119533527694,
+      "loss": 3.3662,
+      "step": 33700
+    },
+    {
+      "epoch": 9.825519771708112,
+      "grad_norm": 0.37770354747772217,
+      "learning_rate": 0.00030569387755102035,
+      "loss": 3.3813,
+      "step": 33750
+    },
+    {
+      "epoch": 9.840079203307903,
+      "grad_norm": 0.3666662275791168,
+      "learning_rate": 0.0003052565597667638,
+      "loss": 3.3669,
+      "step": 33800
+    },
+    {
+      "epoch": 9.854638634907694,
+      "grad_norm": 0.3690825402736664,
+      "learning_rate": 0.0003048192419825073,
+      "loss": 3.3718,
+      "step": 33850
+    },
+    {
+      "epoch": 9.869198066507483,
+      "grad_norm": 0.3668816387653351,
+      "learning_rate": 0.0003043819241982507,
+      "loss": 3.3572,
+      "step": 33900
+    },
+    {
+      "epoch": 9.883757498107274,
+      "grad_norm": 0.35026848316192627,
+      "learning_rate": 0.0003039446064139941,
+      "loss": 3.3714,
+      "step": 33950
+    },
+    {
+      "epoch": 9.898316929707065,
+      "grad_norm": 0.36591610312461853,
+      "learning_rate": 0.00030350728862973763,
+      "loss": 3.3759,
+      "step": 34000
+    },
+    {
+      "epoch": 9.898316929707065,
+      "eval_accuracy": 0.36981339333556196,
+      "eval_loss": 3.553095817565918,
+      "eval_runtime": 180.8581,
+      "eval_samples_per_second": 92.022,
+      "eval_steps_per_second": 5.756,
+      "step": 34000
+    },
+    {
+      "epoch": 9.912876361306855,
+      "grad_norm": 0.3765810430049896,
+      "learning_rate": 0.00030306997084548104,
+      "loss": 3.363,
+      "step": 34050
+    },
+    {
+      "epoch": 9.927435792906644,
+      "grad_norm": 0.3594549000263214,
+      "learning_rate": 0.00030263265306122445,
+      "loss": 3.3671,
+      "step": 34100
+    },
+    {
+      "epoch": 9.941995224506435,
+      "grad_norm": 0.35946381092071533,
+      "learning_rate": 0.00030219533527696787,
+      "loss": 3.3735,
+      "step": 34150
+    },
+    {
+      "epoch": 9.956554656106226,
+      "grad_norm": 0.37179645895957947,
+      "learning_rate": 0.00030175801749271133,
+      "loss": 3.3874,
+      "step": 34200
+    },
+    {
+      "epoch": 9.971114087706017,
+      "grad_norm": 0.36117124557495117,
+      "learning_rate": 0.0003013206997084548,
+      "loss": 3.3806,
+      "step": 34250
+    },
+    {
+      "epoch": 9.985673519305806,
+      "grad_norm": 0.34759020805358887,
+      "learning_rate": 0.0003008833819241982,
+      "loss": 3.3681,
+      "step": 34300
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": null,
+      "learning_rate": 0.0003004460641399417,
+      "loss": 3.3643,
+      "step": 34350
+    },
+    {
+      "epoch": 10.01455943159979,
+      "grad_norm": 0.35527414083480835,
+      "learning_rate": 0.0003000087463556851,
+      "loss": 3.2614,
+      "step": 34400
+    },
+    {
+      "epoch": 10.029118863199582,
+      "grad_norm": 0.3797459304332733,
+      "learning_rate": 0.00029957142857142856,
+      "loss": 3.269,
+      "step": 34450
+    },
+    {
+      "epoch": 10.04367829479937,
+      "grad_norm": 0.36752596497535706,
+      "learning_rate": 0.000299134110787172,
+      "loss": 3.2838,
+      "step": 34500
+    },
+    {
+      "epoch": 10.058237726399161,
+      "grad_norm": 0.34516459703445435,
+      "learning_rate": 0.00029869679300291544,
+      "loss": 3.272,
+      "step": 34550
+    },
+    {
+      "epoch": 10.072797157998952,
+      "grad_norm": 0.3728445768356323,
+      "learning_rate": 0.00029825947521865885,
+      "loss": 3.2696,
+      "step": 34600
+    },
+    {
+      "epoch": 10.087356589598743,
+      "grad_norm": 0.3747389018535614,
+      "learning_rate": 0.0002978221574344023,
+      "loss": 3.2888,
+      "step": 34650
+    },
+    {
+      "epoch": 10.101916021198532,
+      "grad_norm": 0.34447789192199707,
+      "learning_rate": 0.00029738483965014573,
+      "loss": 3.2916,
+      "step": 34700
+    },
+    {
+      "epoch": 10.116475452798323,
+      "grad_norm": 0.35870856046676636,
+      "learning_rate": 0.0002969475218658892,
+      "loss": 3.2913,
+      "step": 34750
+    },
+    {
+      "epoch": 10.131034884398114,
+      "grad_norm": 0.35672426223754883,
+      "learning_rate": 0.0002965102040816326,
+      "loss": 3.2963,
+      "step": 34800
+    },
+    {
+      "epoch": 10.145594315997904,
+      "grad_norm": 0.36722877621650696,
+      "learning_rate": 0.0002960728862973761,
+      "loss": 3.2886,
+      "step": 34850
+    },
+    {
+      "epoch": 10.160153747597693,
+      "grad_norm": 0.3597167432308197,
+      "learning_rate": 0.0002956355685131195,
+      "loss": 3.3118,
+      "step": 34900
+    },
+    {
+      "epoch": 10.174713179197484,
+      "grad_norm": 0.3561251759529114,
+      "learning_rate": 0.00029519825072886295,
+      "loss": 3.2997,
+      "step": 34950
+    },
+    {
+      "epoch": 10.189272610797275,
+      "grad_norm": 0.37824273109436035,
+      "learning_rate": 0.00029476093294460637,
+      "loss": 3.3017,
+      "step": 35000
+    },
+    {
+      "epoch": 10.189272610797275,
+      "eval_accuracy": 0.3693485845791435,
+      "eval_loss": 3.5645644664764404,
+      "eval_runtime": 180.1256,
+      "eval_samples_per_second": 92.397,
+      "eval_steps_per_second": 5.779,
+      "step": 35000
+    },
+    {
+      "epoch": 10.203832042397066,
+      "grad_norm": 0.37217044830322266,
+      "learning_rate": 0.00029432361516034983,
+      "loss": 3.3035,
+      "step": 35050
+    },
+    {
+      "epoch": 10.218391473996855,
+      "grad_norm": 0.3471571207046509,
+      "learning_rate": 0.0002938862973760933,
+      "loss": 3.3116,
+      "step": 35100
+    },
+    {
+      "epoch": 10.232950905596645,
+      "grad_norm": 0.3539142310619354,
+      "learning_rate": 0.0002934489795918367,
+      "loss": 3.296,
+      "step": 35150
+    },
+    {
+      "epoch": 10.247510337196436,
+      "grad_norm": 0.36773473024368286,
+      "learning_rate": 0.0002930116618075802,
+      "loss": 3.3028,
+      "step": 35200
+    },
+    {
+      "epoch": 10.262069768796227,
+      "grad_norm": 0.3689476549625397,
+      "learning_rate": 0.0002925743440233236,
+      "loss": 3.3111,
+      "step": 35250
+    },
+    {
+      "epoch": 10.276629200396016,
+      "grad_norm": 0.3640798032283783,
+      "learning_rate": 0.00029213702623906706,
+      "loss": 3.3157,
+      "step": 35300
+    },
+    {
+      "epoch": 10.291188631995807,
+      "grad_norm": 0.3602818250656128,
+      "learning_rate": 0.00029169970845481047,
+      "loss": 3.3138,
+      "step": 35350
+    },
+    {
+      "epoch": 10.305748063595598,
+      "grad_norm": 0.38390350341796875,
+      "learning_rate": 0.00029126239067055394,
+      "loss": 3.3077,
+      "step": 35400
+    },
+    {
+      "epoch": 10.320307495195388,
+      "grad_norm": 0.36689597368240356,
+      "learning_rate": 0.00029082507288629735,
+      "loss": 3.309,
+      "step": 35450
+    },
+    {
+      "epoch": 10.334866926795177,
+      "grad_norm": 0.3611031770706177,
+      "learning_rate": 0.00029038775510204076,
+      "loss": 3.3119,
+      "step": 35500
+    },
+    {
+      "epoch": 10.349426358394968,
+      "grad_norm": 0.36774659156799316,
+      "learning_rate": 0.0002899504373177842,
+      "loss": 3.3189,
+      "step": 35550
+    },
+    {
+      "epoch": 10.363985789994759,
+      "grad_norm": 0.36395514011383057,
+      "learning_rate": 0.00028951311953352764,
+      "loss": 3.3109,
+      "step": 35600
+    },
+    {
+      "epoch": 10.37854522159455,
+      "grad_norm": 0.362166166305542,
+      "learning_rate": 0.0002890758017492711,
+      "loss": 3.3192,
+      "step": 35650
+    },
+    {
+      "epoch": 10.393104653194339,
+      "grad_norm": 0.3618522882461548,
+      "learning_rate": 0.0002886384839650145,
+      "loss": 3.3183,
+      "step": 35700
+    },
+    {
+      "epoch": 10.40766408479413,
+      "grad_norm": 0.3681625723838806,
+      "learning_rate": 0.000288201166180758,
+      "loss": 3.318,
+      "step": 35750
+    },
+    {
+      "epoch": 10.42222351639392,
+      "grad_norm": 0.3899301588535309,
+      "learning_rate": 0.00028776384839650145,
+      "loss": 3.3182,
+      "step": 35800
+    },
+    {
+      "epoch": 10.436782947993711,
+      "grad_norm": 0.35318905115127563,
+      "learning_rate": 0.00028732653061224486,
+      "loss": 3.3284,
+      "step": 35850
+    },
+    {
+      "epoch": 10.4513423795935,
+      "grad_norm": 0.38061952590942383,
+      "learning_rate": 0.00028688921282798833,
+      "loss": 3.3173,
+      "step": 35900
+    },
+    {
+      "epoch": 10.46590181119329,
+      "grad_norm": 0.3645211160182953,
+      "learning_rate": 0.00028645189504373174,
+      "loss": 3.3272,
+      "step": 35950
+    },
+    {
+      "epoch": 10.480461242793082,
+      "grad_norm": 0.36433538794517517,
+      "learning_rate": 0.0002860145772594752,
+      "loss": 3.3164,
+      "step": 36000
+    },
+    {
+      "epoch": 10.480461242793082,
+      "eval_accuracy": 0.37010747041621017,
+      "eval_loss": 3.556314706802368,
+      "eval_runtime": 180.1676,
+      "eval_samples_per_second": 92.375,
+      "eval_steps_per_second": 5.778,
+      "step": 36000
+    },
+    {
+      "epoch": 10.495020674392872,
+      "grad_norm": 0.36834511160850525,
+      "learning_rate": 0.0002855772594752186,
+      "loss": 3.3291,
+      "step": 36050
+    },
+    {
+      "epoch": 10.509580105992661,
+      "grad_norm": 0.3711186945438385,
+      "learning_rate": 0.0002851399416909621,
+      "loss": 3.3161,
+      "step": 36100
+    },
+    {
+      "epoch": 10.524139537592452,
+      "grad_norm": 0.356585294008255,
+      "learning_rate": 0.0002847026239067055,
+      "loss": 3.3329,
+      "step": 36150
+    },
+    {
+      "epoch": 10.538698969192243,
+      "grad_norm": 0.36765870451927185,
+      "learning_rate": 0.00028426530612244897,
+      "loss": 3.3352,
+      "step": 36200
+    },
+    {
+      "epoch": 10.553258400792034,
+      "grad_norm": 0.3481246531009674,
+      "learning_rate": 0.0002838279883381924,
+      "loss": 3.3271,
+      "step": 36250
+    },
+    {
+      "epoch": 10.567817832391823,
+      "grad_norm": 0.35420429706573486,
+      "learning_rate": 0.00028339067055393585,
+      "loss": 3.3268,
+      "step": 36300
+    },
+    {
+      "epoch": 10.582377263991614,
+      "grad_norm": 0.3609519302845001,
+      "learning_rate": 0.00028295335276967926,
+      "loss": 3.3295,
+      "step": 36350
+    },
+    {
+      "epoch": 10.596936695591404,
+      "grad_norm": 0.3677191138267517,
+      "learning_rate": 0.0002825160349854227,
+      "loss": 3.3333,
+      "step": 36400
+    },
+    {
+      "epoch": 10.611496127191195,
+      "grad_norm": 0.37628525495529175,
+      "learning_rate": 0.00028207871720116614,
+      "loss": 3.3399,
+      "step": 36450
+    },
+    {
+      "epoch": 10.626055558790984,
+      "grad_norm": 0.3637225925922394,
+      "learning_rate": 0.0002816413994169096,
+      "loss": 3.3349,
+      "step": 36500
+    },
+    {
+      "epoch": 10.640614990390775,
+      "grad_norm": 0.3519335091114044,
+      "learning_rate": 0.00028120408163265307,
+      "loss": 3.3346,
+      "step": 36550
+    },
+    {
+      "epoch": 10.655174421990566,
+      "grad_norm": 0.348203182220459,
+      "learning_rate": 0.0002807667638483965,
+      "loss": 3.3289,
+      "step": 36600
+    },
+    {
+      "epoch": 10.669733853590357,
+      "grad_norm": 0.36233091354370117,
+      "learning_rate": 0.00028032944606413995,
+      "loss": 3.3264,
+      "step": 36650
+    },
+    {
+      "epoch": 10.684293285190146,
+      "grad_norm": 0.3718380630016327,
+      "learning_rate": 0.00027989212827988336,
+      "loss": 3.3317,
+      "step": 36700
+    },
+    {
+      "epoch": 10.698852716789936,
+      "grad_norm": 0.35991501808166504,
+      "learning_rate": 0.00027945481049562683,
+      "loss": 3.3365,
+      "step": 36750
+    },
+    {
+      "epoch": 10.713412148389727,
+      "grad_norm": 0.37417152523994446,
+      "learning_rate": 0.00027901749271137024,
+      "loss": 3.3266,
+      "step": 36800
+    },
+    {
+      "epoch": 10.727971579989518,
+      "grad_norm": 0.3618806004524231,
+      "learning_rate": 0.00027858017492711365,
+      "loss": 3.3338,
+      "step": 36850
+    },
+    {
+      "epoch": 10.742531011589307,
+      "grad_norm": 0.3808761239051819,
+      "learning_rate": 0.0002781428571428571,
+      "loss": 3.3322,
+      "step": 36900
+    },
+    {
+      "epoch": 10.757090443189098,
+      "grad_norm": 0.35829290747642517,
+      "learning_rate": 0.00027770553935860053,
+      "loss": 3.3405,
+      "step": 36950
+    },
+    {
+      "epoch": 10.771649874788888,
+      "grad_norm": 0.35556626319885254,
+      "learning_rate": 0.000277268221574344,
+      "loss": 3.3349,
+      "step": 37000
+    },
+    {
+      "epoch": 10.771649874788888,
+      "eval_accuracy": 0.3708527165326231,
+      "eval_loss": 3.548063039779663,
+      "eval_runtime": 180.2516,
+      "eval_samples_per_second": 92.332,
+      "eval_steps_per_second": 5.775,
+      "step": 37000
+    },
+    {
+      "epoch": 10.78620930638868,
+      "grad_norm": 0.36781635880470276,
+      "learning_rate": 0.0002768309037900874,
+      "loss": 3.3393,
+      "step": 37050
+    },
+    {
+      "epoch": 10.800768737988468,
+      "grad_norm": 0.3739968538284302,
+      "learning_rate": 0.0002763935860058309,
+      "loss": 3.3362,
+      "step": 37100
+    },
+    {
+      "epoch": 10.815328169588259,
+      "grad_norm": 0.37725409865379333,
+      "learning_rate": 0.0002759562682215743,
+      "loss": 3.343,
+      "step": 37150
+    },
+    {
+      "epoch": 10.82988760118805,
+      "grad_norm": 0.3467895984649658,
+      "learning_rate": 0.00027551895043731776,
+      "loss": 3.3424,
+      "step": 37200
+    },
+    {
+      "epoch": 10.84444703278784,
+      "grad_norm": 0.3589009940624237,
+      "learning_rate": 0.0002750816326530612,
+      "loss": 3.3149,
+      "step": 37250
+    },
+    {
+      "epoch": 10.85900646438763,
+      "grad_norm": 0.36413517594337463,
+      "learning_rate": 0.00027464431486880464,
+      "loss": 3.3448,
+      "step": 37300
+    },
+    {
+      "epoch": 10.87356589598742,
+      "grad_norm": 0.3594954311847687,
+      "learning_rate": 0.0002742069970845481,
+      "loss": 3.3452,
+      "step": 37350
+    },
+    {
+      "epoch": 10.888125327587211,
+      "grad_norm": 0.36977705359458923,
+      "learning_rate": 0.0002737696793002915,
+      "loss": 3.3387,
+      "step": 37400
+    },
+    {
+      "epoch": 10.902684759187002,
+      "grad_norm": 0.3728332817554474,
+      "learning_rate": 0.000273332361516035,
+      "loss": 3.3554,
+      "step": 37450
+    },
+    {
+      "epoch": 10.917244190786791,
+      "grad_norm": 0.3603312075138092,
+      "learning_rate": 0.0002728950437317784,
+      "loss": 3.3495,
+      "step": 37500
+    },
+    {
+      "epoch": 10.931803622386582,
+      "grad_norm": 0.37357112765312195,
+      "learning_rate": 0.00027245772594752186,
+      "loss": 3.3509,
+      "step": 37550
+    },
+    {
+      "epoch": 10.946363053986373,
+      "grad_norm": 0.3870396316051483,
+      "learning_rate": 0.00027202040816326527,
+      "loss": 3.3451,
+      "step": 37600
+    },
+    {
+      "epoch": 10.960922485586163,
+      "grad_norm": 0.36924847960472107,
+      "learning_rate": 0.00027158309037900874,
+      "loss": 3.3482,
+      "step": 37650
+    },
+    {
+      "epoch": 10.975481917185952,
+      "grad_norm": 0.3659966289997101,
+      "learning_rate": 0.00027114577259475215,
+      "loss": 3.3429,
+      "step": 37700
+    },
+    {
+      "epoch": 10.990041348785743,
+      "grad_norm": 0.3750581741333008,
+      "learning_rate": 0.00027070845481049556,
+      "loss": 3.3467,
+      "step": 37750
+    },
+    {
+      "epoch": 11.004367829479937,
+      "grad_norm": 0.3540584444999695,
+      "learning_rate": 0.00027027113702623903,
+      "loss": 3.3111,
+      "step": 37800
+    },
+    {
+      "epoch": 11.018927261079728,
+      "grad_norm": 0.36422842741012573,
+      "learning_rate": 0.0002698338192419825,
+      "loss": 3.2475,
+      "step": 37850
+    },
+    {
+      "epoch": 11.033486692679517,
+      "grad_norm": 0.36595383286476135,
+      "learning_rate": 0.0002693965014577259,
+      "loss": 3.2508,
+      "step": 37900
+    },
+    {
+      "epoch": 11.048046124279308,
+      "grad_norm": 0.3714156448841095,
+      "learning_rate": 0.0002689591836734694,
+      "loss": 3.2548,
+      "step": 37950
+    },
+    {
+      "epoch": 11.062605555879099,
+      "grad_norm": 0.38618841767311096,
+      "learning_rate": 0.00026852186588921284,
+      "loss": 3.2621,
+      "step": 38000
+    },
+    {
+      "epoch": 11.062605555879099,
+      "eval_accuracy": 0.370781343166788,
+      "eval_loss": 3.5563323497772217,
+      "eval_runtime": 179.9746,
+      "eval_samples_per_second": 92.474,
+      "eval_steps_per_second": 5.784,
+      "step": 38000
+    },
+    {
+      "epoch": 11.07716498747889,
+      "grad_norm": 0.3659396469593048,
+      "learning_rate": 0.00026808454810495625,
+      "loss": 3.2429,
+      "step": 38050
+    },
+    {
+      "epoch": 11.091724419078679,
+      "grad_norm": 0.3699627220630646,
+      "learning_rate": 0.0002676472303206997,
+      "loss": 3.2574,
+      "step": 38100
+    },
+    {
+      "epoch": 11.10628385067847,
+      "grad_norm": 0.371509313583374,
+      "learning_rate": 0.00026720991253644313,
+      "loss": 3.2581,
+      "step": 38150
+    },
+    {
+      "epoch": 11.12084328227826,
+      "grad_norm": 0.3545081317424774,
+      "learning_rate": 0.00026677259475218655,
+      "loss": 3.2631,
+      "step": 38200
+    },
+    {
+      "epoch": 11.135402713878051,
+      "grad_norm": 0.36414968967437744,
+      "learning_rate": 0.00026633527696793,
+      "loss": 3.2715,
+      "step": 38250
+    },
+    {
+      "epoch": 11.14996214547784,
+      "grad_norm": 0.36221858859062195,
+      "learning_rate": 0.0002658979591836734,
+      "loss": 3.2702,
+      "step": 38300
+    },
+    {
+      "epoch": 11.16452157707763,
+      "grad_norm": 0.35454094409942627,
+      "learning_rate": 0.0002654606413994169,
+      "loss": 3.2636,
+      "step": 38350
+    },
+    {
+      "epoch": 11.179081008677421,
+      "grad_norm": 0.38314637541770935,
+      "learning_rate": 0.0002650233236151603,
+      "loss": 3.2647,
+      "step": 38400
+    },
+    {
+      "epoch": 11.193640440277212,
+      "grad_norm": 0.36567234992980957,
+      "learning_rate": 0.00026458600583090377,
+      "loss": 3.2764,
+      "step": 38450
+    },
+    {
+      "epoch": 11.208199871877001,
+      "grad_norm": 0.36688846349716187,
+      "learning_rate": 0.0002641486880466472,
+      "loss": 3.2722,
+      "step": 38500
+    },
+    {
+      "epoch": 11.222759303476792,
+      "grad_norm": 0.38438311219215393,
+      "learning_rate": 0.00026371137026239065,
+      "loss": 3.2675,
+      "step": 38550
+    },
+    {
+      "epoch": 11.237318735076583,
+      "grad_norm": 0.3896602392196655,
+      "learning_rate": 0.0002632740524781341,
+      "loss": 3.2684,
+      "step": 38600
+    },
+    {
+      "epoch": 11.251878166676374,
+      "grad_norm": 0.3787361681461334,
+      "learning_rate": 0.00026283673469387753,
+      "loss": 3.2774,
+      "step": 38650
+    },
+    {
+      "epoch": 11.266437598276163,
+      "grad_norm": 0.36523544788360596,
+      "learning_rate": 0.000262399416909621,
+      "loss": 3.2731,
+      "step": 38700
+    },
+    {
+      "epoch": 11.280997029875953,
+      "grad_norm": 0.38595035672187805,
+      "learning_rate": 0.0002619620991253644,
+      "loss": 3.2802,
+      "step": 38750
+    },
+    {
+      "epoch": 11.295556461475744,
+      "grad_norm": 0.3597980737686157,
+      "learning_rate": 0.0002615247813411079,
+      "loss": 3.2836,
+      "step": 38800
+    },
+    {
+      "epoch": 11.310115893075535,
+      "grad_norm": 0.375411719083786,
+      "learning_rate": 0.0002610874635568513,
+      "loss": 3.2809,
+      "step": 38850
+    },
+    {
+      "epoch": 11.324675324675324,
+      "grad_norm": 0.3683791756629944,
+      "learning_rate": 0.00026065014577259475,
+      "loss": 3.2832,
+      "step": 38900
+    },
+    {
+      "epoch": 11.339234756275115,
+      "grad_norm": 0.36232179403305054,
+      "learning_rate": 0.00026021282798833817,
+      "loss": 3.279,
+      "step": 38950
+    },
+    {
+      "epoch": 11.353794187874906,
+      "grad_norm": 0.3584194779396057,
+      "learning_rate": 0.0002597755102040816,
+      "loss": 3.2832,
+      "step": 39000
+    },
+    {
+      "epoch": 11.353794187874906,
+      "eval_accuracy": 0.37091233151858416,
+      "eval_loss": 3.553581953048706,
+      "eval_runtime": 179.822,
+      "eval_samples_per_second": 92.553,
+      "eval_steps_per_second": 5.789,
+      "step": 39000
+    },
+    {
+      "epoch": 11.368353619474696,
+      "grad_norm": 0.36148691177368164,
+      "learning_rate": 0.00025933819241982504,
+      "loss": 3.2926,
+      "step": 39050
+    },
+    {
+      "epoch": 11.382913051074485,
+      "grad_norm": 0.36825209856033325,
+      "learning_rate": 0.00025890087463556846,
+      "loss": 3.2974,
+      "step": 39100
+    },
+    {
+      "epoch": 11.397472482674276,
+      "grad_norm": 0.3690287470817566,
+      "learning_rate": 0.0002584635568513119,
+      "loss": 3.285,
+      "step": 39150
+    },
+    {
+      "epoch": 11.412031914274067,
+      "grad_norm": 0.37193694710731506,
+      "learning_rate": 0.00025802623906705534,
+      "loss": 3.3057,
+      "step": 39200
+    },
+    {
+      "epoch": 11.426591345873858,
+      "grad_norm": 0.3798997700214386,
+      "learning_rate": 0.0002575889212827988,
+      "loss": 3.2901,
+      "step": 39250
+    },
+    {
+      "epoch": 11.441150777473647,
+      "grad_norm": 0.3867810368537903,
+      "learning_rate": 0.00025715160349854227,
+      "loss": 3.2994,
+      "step": 39300
+    },
+    {
+      "epoch": 11.455710209073438,
+      "grad_norm": 0.3750901520252228,
+      "learning_rate": 0.0002567142857142857,
+      "loss": 3.2932,
+      "step": 39350
+    },
+    {
+      "epoch": 11.470269640673228,
+      "grad_norm": 0.35880762338638306,
+      "learning_rate": 0.00025627696793002915,
+      "loss": 3.2873,
+      "step": 39400
+    },
+    {
+      "epoch": 11.484829072273019,
+      "grad_norm": 0.3917964994907379,
+      "learning_rate": 0.00025583965014577256,
+      "loss": 3.2952,
+      "step": 39450
+    },
+    {
+      "epoch": 11.499388503872808,
+      "grad_norm": 0.3772904574871063,
+      "learning_rate": 0.000255402332361516,
+      "loss": 3.3044,
+      "step": 39500
+    },
+    {
+      "epoch": 11.513947935472599,
+      "grad_norm": 0.3691461980342865,
+      "learning_rate": 0.00025496501457725944,
+      "loss": 3.3156,
+      "step": 39550
+    },
+    {
+      "epoch": 11.52850736707239,
+      "grad_norm": 0.36424410343170166,
+      "learning_rate": 0.0002545276967930029,
+      "loss": 3.3019,
+      "step": 39600
+    },
+    {
+      "epoch": 11.54306679867218,
+      "grad_norm": 0.3689974844455719,
+      "learning_rate": 0.0002540903790087463,
+      "loss": 3.3016,
+      "step": 39650
+    },
+    {
+      "epoch": 11.55762623027197,
+      "grad_norm": 0.38458317518234253,
+      "learning_rate": 0.0002536530612244898,
+      "loss": 3.3063,
+      "step": 39700
+    },
+    {
+      "epoch": 11.57218566187176,
+      "grad_norm": 0.3871372640132904,
+      "learning_rate": 0.0002532157434402332,
+      "loss": 3.3034,
+      "step": 39750
+    },
+    {
+      "epoch": 11.586745093471551,
+      "grad_norm": 0.3936833143234253,
+      "learning_rate": 0.00025277842565597666,
+      "loss": 3.3101,
+      "step": 39800
+    },
+    {
+      "epoch": 11.601304525071342,
+      "grad_norm": 0.3917473256587982,
+      "learning_rate": 0.0002523411078717201,
+      "loss": 3.3225,
+      "step": 39850
+    },
+    {
+      "epoch": 11.61586395667113,
+      "grad_norm": 0.3640928864479065,
+      "learning_rate": 0.00025190379008746354,
+      "loss": 3.304,
+      "step": 39900
+    },
+    {
+      "epoch": 11.630423388270922,
+      "grad_norm": 0.4092429578304291,
+      "learning_rate": 0.00025146647230320696,
+      "loss": 3.3123,
+      "step": 39950
+    },
+    {
+      "epoch": 11.644982819870712,
+      "grad_norm": 0.3751949071884155,
+      "learning_rate": 0.0002510291545189504,
+      "loss": 3.3111,
+      "step": 40000
+    },
+    {
+      "epoch": 11.644982819870712,
+      "eval_accuracy": 0.371191240289195,
+      "eval_loss": 3.5466434955596924,
+      "eval_runtime": 179.6345,
+      "eval_samples_per_second": 92.649,
+      "eval_steps_per_second": 5.795,
+      "step": 40000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 68700,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.35916062261248e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}