diff --git "a/cost_to_hit_frequency_1001/checkpoint-50000/trainer_state.json" "b/cost_to_hit_frequency_1001/checkpoint-50000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/cost_to_hit_frequency_1001/checkpoint-50000/trainer_state.json"
@@ -0,0 +1,7493 @@
+{
+  "best_global_step": 48000,
+  "best_metric": 3.5272507667541504,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_hit_frequency_1001/checkpoint-30000",
+  "epoch": 14.564553717082266,
+  "eval_steps": 1000,
+  "global_step": 50000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01456536937776742,
+      "grad_norm": 1.0411887168884277,
+      "learning_rate": 0.000294,
+      "loss": 8.4313,
+      "step": 50
+    },
+    {
+      "epoch": 0.02913073875553484,
+      "grad_norm": 0.9934611916542053,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7347,
+      "step": 100
+    },
+    {
+      "epoch": 0.04369610813330226,
+      "grad_norm": 0.6710580587387085,
+      "learning_rate": 0.0005995711785297549,
+      "loss": 6.3491,
+      "step": 150
+    },
+    {
+      "epoch": 0.05826147751106968,
+      "grad_norm": 0.46758192777633667,
+      "learning_rate": 0.0005991336056009335,
+      "loss": 6.1272,
+      "step": 200
+    },
+    {
+      "epoch": 0.0728268468888371,
+      "grad_norm": 0.5224136710166931,
+      "learning_rate": 0.000598696032672112,
+      "loss": 5.9887,
+      "step": 250
+    },
+    {
+      "epoch": 0.08739221626660452,
+      "grad_norm": 0.5374292135238647,
+      "learning_rate": 0.0005982584597432905,
+      "loss": 5.8537,
+      "step": 300
+    },
+    {
+      "epoch": 0.10195758564437195,
+      "grad_norm": 0.42533859610557556,
+      "learning_rate": 0.0005978208868144691,
+      "loss": 5.7305,
+      "step": 350
+    },
+    {
+      "epoch": 0.11652295502213936,
+      "grad_norm": 0.5247730016708374,
+      "learning_rate": 0.0005973833138856476,
+      "loss": 5.6139,
+      "step": 400
+    },
+    {
+      "epoch": 0.13108832439990678,
+      "grad_norm": 0.5447224974632263,
+      "learning_rate": 0.000596945740956826,
+      "loss": 5.4885,
+      "step": 450
+    },
+    {
+      "epoch": 0.1456536937776742,
+      "grad_norm": 0.5301450490951538,
+      "learning_rate": 0.0005965081680280046,
+      "loss": 5.4119,
+      "step": 500
+    },
+    {
+      "epoch": 0.16021906315544163,
+      "grad_norm": 0.5106812119483948,
+      "learning_rate": 0.0005960705950991831,
+      "loss": 5.3254,
+      "step": 550
+    },
+    {
+      "epoch": 0.17478443253320905,
+      "grad_norm": 0.42297908663749695,
+      "learning_rate": 0.0005956330221703616,
+      "loss": 5.2446,
+      "step": 600
+    },
+    {
+      "epoch": 0.18934980191097647,
+      "grad_norm": 0.47052502632141113,
+      "learning_rate": 0.0005951954492415402,
+      "loss": 5.1939,
+      "step": 650
+    },
+    {
+      "epoch": 0.2039151712887439,
+      "grad_norm": 0.4460456371307373,
+      "learning_rate": 0.0005947578763127188,
+      "loss": 5.1229,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184805406665113,
+      "grad_norm": 0.46692177653312683,
+      "learning_rate": 0.0005943203033838973,
+      "loss": 5.0837,
+      "step": 750
+    },
+    {
+      "epoch": 0.23304591004427871,
+      "grad_norm": 0.4475383758544922,
+      "learning_rate": 0.0005938827304550758,
+      "loss": 5.0189,
+      "step": 800
+    },
+    {
+      "epoch": 0.24761127942204614,
+      "grad_norm": 0.4715788960456848,
+      "learning_rate": 0.0005934451575262544,
+      "loss": 4.9712,
+      "step": 850
+    },
+    {
+      "epoch": 0.26217664879981356,
+      "grad_norm": 0.5530597567558289,
+      "learning_rate": 0.0005930075845974328,
+      "loss": 4.9247,
+      "step": 900
+    },
+    {
+      "epoch": 0.276742018177581,
+      "grad_norm": 0.5195161700248718,
+      "learning_rate": 0.0005925700116686113,
+      "loss": 4.8776,
+      "step": 950
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "grad_norm": 0.46087169647216797,
+      "learning_rate": 0.0005921324387397899,
+      "loss": 4.8133,
+      "step": 1000
+    },
+    {
+      "epoch": 0.2913073875553484,
+      "eval_accuracy": 0.2545722064591014,
+      "eval_loss": 4.754235744476318,
+      "eval_runtime": 180.1197,
+      "eval_samples_per_second": 92.405,
+      "eval_steps_per_second": 5.779,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30587275693311583,
+      "grad_norm": 0.44081172347068787,
+      "learning_rate": 0.0005916948658109684,
+      "loss": 4.7792,
+      "step": 1050
+    },
+    {
+      "epoch": 0.32043812631088325,
+      "grad_norm": 0.48162946105003357,
+      "learning_rate": 0.000591257292882147,
+      "loss": 4.7359,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3350034956886507,
+      "grad_norm": 0.4235544204711914,
+      "learning_rate": 0.0005908197199533255,
+      "loss": 4.6992,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3495688650664181,
+      "grad_norm": 0.4740869402885437,
+      "learning_rate": 0.0005903821470245041,
+      "loss": 4.6654,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3641342344441855,
+      "grad_norm": 0.4276205897331238,
+      "learning_rate": 0.0005899445740956826,
+      "loss": 4.6244,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37869960382195295,
+      "grad_norm": 0.40895992517471313,
+      "learning_rate": 0.0005895070011668611,
+      "loss": 4.6068,
+      "step": 1300
+    },
+    {
+      "epoch": 0.39326497319972037,
+      "grad_norm": 0.4188133478164673,
+      "learning_rate": 0.0005890694282380397,
+      "loss": 4.5584,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4078303425774878,
+      "grad_norm": 0.4317689538002014,
+      "learning_rate": 0.0005886318553092181,
+      "loss": 4.5422,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42239571195525516,
+      "grad_norm": 0.40392470359802246,
+      "learning_rate": 0.0005881942823803966,
+      "loss": 4.5333,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4369610813330226,
+      "grad_norm": 0.4244018793106079,
+      "learning_rate": 0.0005877567094515752,
+      "loss": 4.4986,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45152645071079,
+      "grad_norm": 0.44470831751823425,
+      "learning_rate": 0.0005873191365227537,
+      "loss": 4.4928,
+      "step": 1550
+    },
+    {
+      "epoch": 0.46609182008855743,
+      "grad_norm": 0.4386588931083679,
+      "learning_rate": 0.0005868815635939323,
+      "loss": 4.4553,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48065718946632485,
+      "grad_norm": 0.42980971932411194,
+      "learning_rate": 0.0005864439906651108,
+      "loss": 4.4385,
+      "step": 1650
+    },
+    {
+      "epoch": 0.4952225588440923,
+      "grad_norm": 0.3935016691684723,
+      "learning_rate": 0.0005860064177362894,
+      "loss": 4.4327,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5097879282218597,
+      "grad_norm": 0.4373241662979126,
+      "learning_rate": 0.0005855688448074679,
+      "loss": 4.4099,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5243532975996271,
+      "grad_norm": 0.4172551929950714,
+      "learning_rate": 0.0005851312718786464,
+      "loss": 4.3901,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5389186669773945,
+      "grad_norm": 0.40378788113594055,
+      "learning_rate": 0.0005846936989498249,
+      "loss": 4.383,
+      "step": 1850
+    },
+    {
+      "epoch": 0.553484036355162,
+      "grad_norm": 0.38236093521118164,
+      "learning_rate": 0.0005842561260210034,
+      "loss": 4.3598,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5680494057329294,
+      "grad_norm": 0.381078839302063,
+      "learning_rate": 0.000583818553092182,
+      "loss": 4.3638,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "grad_norm": 0.4327857196331024,
+      "learning_rate": 0.0005833809801633605,
+      "loss": 4.3432,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5826147751106968,
+      "eval_accuracy": 0.29888864119390235,
+      "eval_loss": 4.287432670593262,
+      "eval_runtime": 180.4505,
+      "eval_samples_per_second": 92.236,
+      "eval_steps_per_second": 5.769,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5971801444884642,
+      "grad_norm": 0.4143087863922119,
+      "learning_rate": 0.000582943407234539,
+      "loss": 4.329,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6117455138662317,
+      "grad_norm": 0.3753448724746704,
+      "learning_rate": 0.0005825058343057176,
+      "loss": 4.3033,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6263108832439991,
+      "grad_norm": 0.40621188282966614,
+      "learning_rate": 0.0005820682613768961,
+      "loss": 4.3041,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6408762526217665,
+      "grad_norm": 0.40833911299705505,
+      "learning_rate": 0.0005816306884480747,
+      "loss": 4.2834,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6554416219995339,
+      "grad_norm": 0.4088577628135681,
+      "learning_rate": 0.0005811931155192532,
+      "loss": 4.2804,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6700069913773014,
+      "grad_norm": 0.3746855556964874,
+      "learning_rate": 0.0005807555425904316,
+      "loss": 4.2762,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6845723607550688,
+      "grad_norm": 0.3618931770324707,
+      "learning_rate": 0.0005803179696616102,
+      "loss": 4.2598,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6991377301328362,
+      "grad_norm": 0.3690814971923828,
+      "learning_rate": 0.0005798803967327887,
+      "loss": 4.2413,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7137030995106036,
+      "grad_norm": 0.40264639258384705,
+      "learning_rate": 0.0005794428238039673,
+      "loss": 4.2375,
+      "step": 2450
+    },
+    {
+      "epoch": 0.728268468888371,
+      "grad_norm": 0.4249323606491089,
+      "learning_rate": 0.0005790052508751458,
+      "loss": 4.233,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7428338382661385,
+      "grad_norm": 0.39969372749328613,
+      "learning_rate": 0.0005785676779463243,
+      "loss": 4.2202,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7573992076439059,
+      "grad_norm": 0.3819160759449005,
+      "learning_rate": 0.0005781301050175029,
+      "loss": 4.2199,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7719645770216733,
+      "grad_norm": 0.361541211605072,
+      "learning_rate": 0.0005776925320886814,
+      "loss": 4.204,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7865299463994407,
+      "grad_norm": 0.3613761365413666,
+      "learning_rate": 0.00057725495915986,
+      "loss": 4.1961,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8010953157772082,
+      "grad_norm": 0.4024335741996765,
+      "learning_rate": 0.0005768173862310384,
+      "loss": 4.1899,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8156606851549756,
+      "grad_norm": 0.34226447343826294,
+      "learning_rate": 0.0005763798133022169,
+      "loss": 4.1648,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8302260545327429,
+      "grad_norm": 0.3609713315963745,
+      "learning_rate": 0.0005759422403733955,
+      "loss": 4.1756,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8447914239105103,
+      "grad_norm": 0.37800899147987366,
+      "learning_rate": 0.000575504667444574,
+      "loss": 4.1618,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8593567932882777,
+      "grad_norm": 0.35399892926216125,
+      "learning_rate": 0.0005750670945157526,
+      "loss": 4.151,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "grad_norm": 0.35685768723487854,
+      "learning_rate": 0.0005746295215869311,
+      "loss": 4.1316,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8739221626660452,
+      "eval_accuracy": 0.3165242106956263,
+      "eval_loss": 4.094448566436768,
+      "eval_runtime": 180.2904,
+      "eval_samples_per_second": 92.318,
+      "eval_steps_per_second": 5.774,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8884875320438126,
+      "grad_norm": 0.3725755214691162,
+      "learning_rate": 0.0005741919486581096,
+      "loss": 4.1422,
+      "step": 3050
+    },
+    {
+      "epoch": 0.90305290142158,
+      "grad_norm": 0.3675600290298462,
+      "learning_rate": 0.0005737543757292882,
+      "loss": 4.1361,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9176182707993474,
+      "grad_norm": 0.3458426892757416,
+      "learning_rate": 0.0005733168028004667,
+      "loss": 4.1292,
+      "step": 3150
+    },
+    {
+      "epoch": 0.9321836401771149,
+      "grad_norm": 0.35508471727371216,
+      "learning_rate": 0.0005728792298716453,
+      "loss": 4.1173,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9467490095548823,
+      "grad_norm": 0.3473420739173889,
+      "learning_rate": 0.0005724416569428237,
+      "loss": 4.1051,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9613143789326497,
+      "grad_norm": 0.38041558861732483,
+      "learning_rate": 0.0005720040840140023,
+      "loss": 4.1141,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9758797483104171,
+      "grad_norm": 0.36010000109672546,
+      "learning_rate": 0.0005715665110851808,
+      "loss": 4.1036,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9904451176881846,
+      "grad_norm": 0.35879573225975037,
+      "learning_rate": 0.0005711289381563593,
+      "loss": 4.099,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0049522255884409,
+      "grad_norm": 0.36708125472068787,
+      "learning_rate": 0.0005706913652275379,
+      "loss": 4.0613,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0195175949662083,
+      "grad_norm": 0.36130914092063904,
+      "learning_rate": 0.0005702537922987164,
+      "loss": 4.023,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0340829643439757,
+      "grad_norm": 0.3636951446533203,
+      "learning_rate": 0.0005698162193698949,
+      "loss": 4.0197,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0486483337217432,
+      "grad_norm": 0.3318287432193756,
+      "learning_rate": 0.0005693786464410735,
+      "loss": 4.009,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0632137030995106,
+      "grad_norm": 0.3470671474933624,
+      "learning_rate": 0.000568941073512252,
+      "loss": 3.9984,
+      "step": 3650
+    },
+    {
+      "epoch": 1.077779072477278,
+      "grad_norm": 0.3891682028770447,
+      "learning_rate": 0.0005685035005834305,
+      "loss": 4.0067,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0923444418550454,
+      "grad_norm": 0.3590176999568939,
+      "learning_rate": 0.000568065927654609,
+      "loss": 4.011,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1069098112328128,
+      "grad_norm": 0.3605038821697235,
+      "learning_rate": 0.0005676283547257876,
+      "loss": 4.0014,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1214751806105803,
+      "grad_norm": 0.35803160071372986,
+      "learning_rate": 0.0005671907817969661,
+      "loss": 3.9972,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1360405499883477,
+      "grad_norm": 0.3601053059101105,
+      "learning_rate": 0.0005667532088681446,
+      "loss": 3.9894,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1506059193661151,
+      "grad_norm": 0.3761005699634552,
+      "learning_rate": 0.0005663156359393232,
+      "loss": 3.9892,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "grad_norm": 0.34091663360595703,
+      "learning_rate": 0.0005658780630105017,
+      "loss": 3.9856,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1651712887438825,
+      "eval_accuracy": 0.3262635618883952,
+      "eval_loss": 3.9839839935302734,
+      "eval_runtime": 180.3348,
+      "eval_samples_per_second": 92.295,
+      "eval_steps_per_second": 5.773,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17973665812165,
+      "grad_norm": 0.3509597182273865,
+      "learning_rate": 0.0005654404900816802,
+      "loss": 3.9952,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1943020274994174,
+      "grad_norm": 0.35156598687171936,
+      "learning_rate": 0.0005650029171528588,
+      "loss": 3.9769,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2088673968771848,
+      "grad_norm": 0.34221357107162476,
+      "learning_rate": 0.0005645653442240373,
+      "loss": 3.9839,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2234327662549522,
+      "grad_norm": 0.3706187307834625,
+      "learning_rate": 0.0005641277712952158,
+      "loss": 3.9845,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2379981356327197,
+      "grad_norm": 0.3384045660495758,
+      "learning_rate": 0.0005636901983663943,
+      "loss": 3.9635,
+      "step": 4250
+    },
+    {
+      "epoch": 1.252563505010487,
+      "grad_norm": 0.36682382225990295,
+      "learning_rate": 0.0005632526254375729,
+      "loss": 3.9741,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2671288743882545,
+      "grad_norm": 0.3488970398902893,
+      "learning_rate": 0.0005628150525087514,
+      "loss": 3.9682,
+      "step": 4350
+    },
+    {
+      "epoch": 1.281694243766022,
+      "grad_norm": 0.3281860053539276,
+      "learning_rate": 0.0005623774795799299,
+      "loss": 3.9527,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2962596131437893,
+      "grad_norm": 0.3465306758880615,
+      "learning_rate": 0.0005619399066511085,
+      "loss": 3.9529,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3108249825215568,
+      "grad_norm": 0.3299737870693207,
+      "learning_rate": 0.000561502333722287,
+      "loss": 3.9559,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3253903518993242,
+      "grad_norm": 0.3362690508365631,
+      "learning_rate": 0.0005610647607934655,
+      "loss": 3.9554,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3399557212770916,
+      "grad_norm": 0.34816160798072815,
+      "learning_rate": 0.000560627187864644,
+      "loss": 3.951,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354521090654859,
+      "grad_norm": 0.3506231904029846,
+      "learning_rate": 0.0005601896149358226,
+      "loss": 3.9438,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3690864600326265,
+      "grad_norm": 0.34532177448272705,
+      "learning_rate": 0.0005597520420070011,
+      "loss": 3.9489,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3836518294103939,
+      "grad_norm": 0.3467022180557251,
+      "learning_rate": 0.0005593144690781796,
+      "loss": 3.9438,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3982171987881613,
+      "grad_norm": 0.3695443272590637,
+      "learning_rate": 0.0005588768961493582,
+      "loss": 3.9331,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4127825681659287,
+      "grad_norm": 0.33018758893013,
+      "learning_rate": 0.0005584393232205367,
+      "loss": 3.9295,
+      "step": 4850
+    },
+    {
+      "epoch": 1.4273479375436962,
+      "grad_norm": 0.3564456105232239,
+      "learning_rate": 0.0005580017502917152,
+      "loss": 3.9395,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4419133069214636,
+      "grad_norm": 0.3350387215614319,
+      "learning_rate": 0.0005575641773628938,
+      "loss": 3.9266,
+      "step": 4950
+    },
+    {
+      "epoch": 1.456478676299231,
+      "grad_norm": 0.33804184198379517,
+      "learning_rate": 0.0005571266044340723,
+      "loss": 3.9435,
+      "step": 5000
+    },
+    {
+      "epoch": 1.456478676299231,
+      "eval_accuracy": 0.33277813599489436,
+      "eval_loss": 3.9095029830932617,
+      "eval_runtime": 180.2432,
+      "eval_samples_per_second": 92.342,
+      "eval_steps_per_second": 5.776,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4710440456769984,
+      "grad_norm": 0.33603259921073914,
+      "learning_rate": 0.0005566890315052507,
+      "loss": 3.9152,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4856094150547658,
+      "grad_norm": 0.3263108730316162,
+      "learning_rate": 0.0005562514585764293,
+      "loss": 3.9295,
+      "step": 5100
+    },
+    {
+      "epoch": 1.500174784432533,
+      "grad_norm": 0.3421551287174225,
+      "learning_rate": 0.0005558138856476079,
+      "loss": 3.9092,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5147401538103007,
+      "grad_norm": 0.3283444344997406,
+      "learning_rate": 0.0005553763127187864,
+      "loss": 3.9035,
+      "step": 5200
+    },
+    {
+      "epoch": 1.529305523188068,
+      "grad_norm": 0.34648576378822327,
+      "learning_rate": 0.0005549387397899649,
+      "loss": 3.9155,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5438708925658355,
+      "grad_norm": 0.3276433050632477,
+      "learning_rate": 0.0005545011668611435,
+      "loss": 3.9183,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5584362619436027,
+      "grad_norm": 0.333248108625412,
+      "learning_rate": 0.000554063593932322,
+      "loss": 3.9177,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5730016313213704,
+      "grad_norm": 0.337734580039978,
+      "learning_rate": 0.0005536260210035005,
+      "loss": 3.9095,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5875670006991376,
+      "grad_norm": 0.3482300043106079,
+      "learning_rate": 0.0005531884480746791,
+      "loss": 3.8858,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6021323700769052,
+      "grad_norm": 0.3184448480606079,
+      "learning_rate": 0.0005527508751458577,
+      "loss": 3.8971,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6166977394546724,
+      "grad_norm": 0.3283264935016632,
+      "learning_rate": 0.0005523133022170361,
+      "loss": 3.877,
+      "step": 5550
+    },
+    {
+      "epoch": 1.63126310883244,
+      "grad_norm": 0.34048277139663696,
+      "learning_rate": 0.0005518757292882146,
+      "loss": 3.8963,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6458284782102073,
+      "grad_norm": 0.32839393615722656,
+      "learning_rate": 0.0005514381563593932,
+      "loss": 3.8916,
+      "step": 5650
+    },
+    {
+      "epoch": 1.660393847587975,
+      "grad_norm": 0.33673906326293945,
+      "learning_rate": 0.0005510005834305717,
+      "loss": 3.8848,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6749592169657421,
+      "grad_norm": 0.34184661507606506,
+      "learning_rate": 0.0005505630105017502,
+      "loss": 3.8913,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6895245863435098,
+      "grad_norm": 0.3328911364078522,
+      "learning_rate": 0.0005501254375729288,
+      "loss": 3.8844,
+      "step": 5800
+    },
+    {
+      "epoch": 1.704089955721277,
+      "grad_norm": 0.34918013215065,
+      "learning_rate": 0.0005496878646441073,
+      "loss": 3.8826,
+      "step": 5850
+    },
+    {
+      "epoch": 1.7186553250990446,
+      "grad_norm": 0.3278209865093231,
+      "learning_rate": 0.0005492502917152858,
+      "loss": 3.8737,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7332206944768118,
+      "grad_norm": 0.32764044404029846,
+      "learning_rate": 0.0005488127187864644,
+      "loss": 3.8653,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "grad_norm": 0.320593923330307,
+      "learning_rate": 0.000548375145857643,
+      "loss": 3.8665,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7477860638545795,
+      "eval_accuracy": 0.33797937798145206,
+      "eval_loss": 3.8514480590820312,
+      "eval_runtime": 180.396,
+      "eval_samples_per_second": 92.264,
+      "eval_steps_per_second": 5.771,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7623514332323467,
+      "grad_norm": 0.32193249464035034,
+      "learning_rate": 0.0005479375729288214,
+      "loss": 3.863,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7769168026101143,
+      "grad_norm": 0.35405752062797546,
+      "learning_rate": 0.0005474999999999999,
+      "loss": 3.8648,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7914821719878815,
+      "grad_norm": 0.3290136754512787,
+      "learning_rate": 0.0005470624270711785,
+      "loss": 3.8579,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8060475413656492,
+      "grad_norm": 0.3399069607257843,
+      "learning_rate": 0.000546624854142357,
+      "loss": 3.8711,
+      "step": 6200
+    },
+    {
+      "epoch": 1.8206129107434164,
+      "grad_norm": 0.333492249250412,
+      "learning_rate": 0.0005461872812135355,
+      "loss": 3.8686,
+      "step": 6250
+    },
+    {
+      "epoch": 1.835178280121184,
+      "grad_norm": 0.3360602557659149,
+      "learning_rate": 0.0005457497082847141,
+      "loss": 3.8639,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8497436494989512,
+      "grad_norm": 0.349657267332077,
+      "learning_rate": 0.0005453121353558927,
+      "loss": 3.8582,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8643090188767188,
+      "grad_norm": 0.31816044449806213,
+      "learning_rate": 0.0005448745624270712,
+      "loss": 3.8605,
+      "step": 6400
+    },
+    {
+      "epoch": 1.878874388254486,
+      "grad_norm": 0.3400065004825592,
+      "learning_rate": 0.0005444369894982496,
+      "loss": 3.8617,
+      "step": 6450
+    },
+    {
+      "epoch": 1.8934397576322537,
+      "grad_norm": 0.3279556632041931,
+      "learning_rate": 0.0005439994165694282,
+      "loss": 3.8528,
+      "step": 6500
+    },
+    {
+      "epoch": 1.908005127010021,
+      "grad_norm": 0.33743831515312195,
+      "learning_rate": 0.0005435618436406067,
+      "loss": 3.8504,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9225704963877885,
+      "grad_norm": 0.3401290476322174,
+      "learning_rate": 0.0005431242707117852,
+      "loss": 3.8496,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9371358657655557,
+      "grad_norm": 0.3282126486301422,
+      "learning_rate": 0.0005426866977829638,
+      "loss": 3.8469,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9517012351433234,
+      "grad_norm": 0.3605695068836212,
+      "learning_rate": 0.0005422491248541423,
+      "loss": 3.858,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9662666045210906,
+      "grad_norm": 0.32521483302116394,
+      "learning_rate": 0.0005418115519253208,
+      "loss": 3.852,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9808319738988582,
+      "grad_norm": 0.33284640312194824,
+      "learning_rate": 0.0005413739789964994,
+      "loss": 3.8486,
+      "step": 6800
+    },
+    {
+      "epoch": 1.9953973432766254,
+      "grad_norm": 0.3308689296245575,
+      "learning_rate": 0.000540936406067678,
+      "loss": 3.8482,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0099044511768818,
+      "grad_norm": 0.33800598978996277,
+      "learning_rate": 0.0005404988331388564,
+      "loss": 3.77,
+      "step": 6900
+    },
+    {
+      "epoch": 2.0244698205546494,
+      "grad_norm": 0.3277951180934906,
+      "learning_rate": 0.0005400612602100349,
+      "loss": 3.7368,
+      "step": 6950
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "grad_norm": 0.3203679919242859,
+      "learning_rate": 0.0005396236872812135,
+      "loss": 3.7491,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0390351899324166,
+      "eval_accuracy": 0.34216722609676753,
+      "eval_loss": 3.8109800815582275,
+      "eval_runtime": 180.255,
+      "eval_samples_per_second": 92.336,
+      "eval_steps_per_second": 5.775,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0536005593101843,
+      "grad_norm": 0.35276371240615845,
+      "learning_rate": 0.000539186114352392,
+      "loss": 3.7337,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0681659286879515,
+      "grad_norm": 0.35237714648246765,
+      "learning_rate": 0.0005387485414235705,
+      "loss": 3.7357,
+      "step": 7100
+    },
+    {
+      "epoch": 2.082731298065719,
+      "grad_norm": 0.3209347426891327,
+      "learning_rate": 0.0005383109684947491,
+      "loss": 3.7583,
+      "step": 7150
+    },
+    {
+      "epoch": 2.0972966674434863,
+      "grad_norm": 0.32085931301116943,
+      "learning_rate": 0.0005378733955659276,
+      "loss": 3.7539,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111862036821254,
+      "grad_norm": 0.31919988989830017,
+      "learning_rate": 0.0005374358226371061,
+      "loss": 3.7393,
+      "step": 7250
+    },
+    {
+      "epoch": 2.126427406199021,
+      "grad_norm": 0.3325698673725128,
+      "learning_rate": 0.0005369982497082847,
+      "loss": 3.751,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140992775576789,
+      "grad_norm": 0.32088345289230347,
+      "learning_rate": 0.0005365606767794633,
+      "loss": 3.7441,
+      "step": 7350
+    },
+    {
+      "epoch": 2.155558144954556,
+      "grad_norm": 0.31885406374931335,
+      "learning_rate": 0.0005361231038506417,
+      "loss": 3.7468,
+      "step": 7400
+    },
+    {
+      "epoch": 2.1701235143323236,
+      "grad_norm": 0.32321396470069885,
+      "learning_rate": 0.0005356855309218202,
+      "loss": 3.7511,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184688883710091,
+      "grad_norm": 0.339028924703598,
+      "learning_rate": 0.0005352479579929988,
+      "loss": 3.7568,
+      "step": 7500
+    },
+    {
+      "epoch": 2.1992542530878585,
+      "grad_norm": 0.3378174901008606,
+      "learning_rate": 0.0005348103850641773,
+      "loss": 3.7351,
+      "step": 7550
+    },
+    {
+      "epoch": 2.2138196224656257,
+      "grad_norm": 0.32842838764190674,
+      "learning_rate": 0.0005343728121353558,
+      "loss": 3.7614,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2283849918433933,
+      "grad_norm": 0.3337772488594055,
+      "learning_rate": 0.0005339352392065344,
+      "loss": 3.7502,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2429503612211605,
+      "grad_norm": 0.31574419140815735,
+      "learning_rate": 0.000533497666277713,
+      "loss": 3.7559,
+      "step": 7700
+    },
+    {
+      "epoch": 2.257515730598928,
+      "grad_norm": 0.3204760253429413,
+      "learning_rate": 0.0005330600933488915,
+      "loss": 3.7382,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2720810999766954,
+      "grad_norm": 0.33120566606521606,
+      "learning_rate": 0.00053262252042007,
+      "loss": 3.7488,
+      "step": 7800
+    },
+    {
+      "epoch": 2.286646469354463,
+      "grad_norm": 0.3328082263469696,
+      "learning_rate": 0.0005321849474912485,
+      "loss": 3.7518,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3012118387322302,
+      "grad_norm": 0.3446897268295288,
+      "learning_rate": 0.000531747374562427,
+      "loss": 3.7421,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3157772081099974,
+      "grad_norm": 0.3277474641799927,
+      "learning_rate": 0.0005313098016336055,
+      "loss": 3.7376,
+      "step": 7950
+    },
+    {
+      "epoch": 2.330342577487765,
+      "grad_norm": 0.32570740580558777,
+      "learning_rate": 0.0005308722287047841,
+      "loss": 3.7416,
+      "step": 8000
+    },
+    {
+      "epoch": 2.330342577487765,
+      "eval_accuracy": 0.3451675491976329,
+      "eval_loss": 3.7787580490112305,
+      "eval_runtime": 180.355,
+      "eval_samples_per_second": 92.285,
+      "eval_steps_per_second": 5.772,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3449079468655327,
+      "grad_norm": 0.3236296474933624,
+      "learning_rate": 0.0005304346557759626,
+      "loss": 3.737,
+      "step": 8050
+    },
+    {
+      "epoch": 2.3594733162433,
+      "grad_norm": 0.3300062417984009,
+      "learning_rate": 0.0005299970828471411,
+      "loss": 3.74,
+      "step": 8100
+    },
+    {
+      "epoch": 2.374038685621067,
+      "grad_norm": 0.3218088746070862,
+      "learning_rate": 0.0005295595099183197,
+      "loss": 3.7618,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3886040549988348,
+      "grad_norm": 0.32456105947494507,
+      "learning_rate": 0.0005291219369894983,
+      "loss": 3.7461,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4031694243766024,
+      "grad_norm": 0.3256712257862091,
+      "learning_rate": 0.0005286843640606768,
+      "loss": 3.7343,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4177347937543696,
+      "grad_norm": 0.3265218734741211,
+      "learning_rate": 0.0005282467911318552,
+      "loss": 3.7396,
+      "step": 8300
+    },
+    {
+      "epoch": 2.432300163132137,
+      "grad_norm": 0.3039201498031616,
+      "learning_rate": 0.0005278092182030338,
+      "loss": 3.7405,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4468655325099045,
+      "grad_norm": 0.3367139995098114,
+      "learning_rate": 0.0005273716452742123,
+      "loss": 3.7426,
+      "step": 8400
+    },
+    {
+      "epoch": 2.461430901887672,
+      "grad_norm": 0.314224511384964,
+      "learning_rate": 0.0005269340723453908,
+      "loss": 3.7412,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4759962712654393,
+      "grad_norm": 0.3330950140953064,
+      "learning_rate": 0.0005264964994165694,
+      "loss": 3.7463,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4905616406432065,
+      "grad_norm": 0.3340461552143097,
+      "learning_rate": 0.000526058926487748,
+      "loss": 3.7469,
+      "step": 8550
+    },
+    {
+      "epoch": 2.505127010020974,
+      "grad_norm": 0.3362635672092438,
+      "learning_rate": 0.0005256213535589265,
+      "loss": 3.7338,
+      "step": 8600
+    },
+    {
+      "epoch": 2.519692379398742,
+      "grad_norm": 0.3297460377216339,
+      "learning_rate": 0.000525183780630105,
+      "loss": 3.7498,
+      "step": 8650
+    },
+    {
+      "epoch": 2.534257748776509,
+      "grad_norm": 0.3183857500553131,
+      "learning_rate": 0.0005247462077012836,
+      "loss": 3.7391,
+      "step": 8700
+    },
+    {
+      "epoch": 2.548823118154276,
+      "grad_norm": 0.33508941531181335,
+      "learning_rate": 0.000524308634772462,
+      "loss": 3.7348,
+      "step": 8750
+    },
+    {
+      "epoch": 2.563388487532044,
+      "grad_norm": 0.3083733022212982,
+      "learning_rate": 0.0005238710618436405,
+      "loss": 3.7285,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5779538569098115,
+      "grad_norm": 0.31876590847969055,
+      "learning_rate": 0.0005234334889148191,
+      "loss": 3.7443,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5925192262875787,
+      "grad_norm": 0.3193049430847168,
+      "learning_rate": 0.0005229959159859976,
+      "loss": 3.7422,
+      "step": 8900
+    },
+    {
+      "epoch": 2.607084595665346,
+      "grad_norm": 0.32590124011039734,
+      "learning_rate": 0.0005225583430571761,
+      "loss": 3.7424,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "grad_norm": 0.3363872170448303,
+      "learning_rate": 0.0005221207701283547,
+      "loss": 3.728,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6216499650431135,
+      "eval_accuracy": 0.3480742812181514,
+      "eval_loss": 3.7482504844665527,
+      "eval_runtime": 180.2179,
+      "eval_samples_per_second": 92.355,
+      "eval_steps_per_second": 5.776,
+      "step": 9000
+    },
+    {
+      "epoch": 2.636215334420881,
+      "grad_norm": 0.3201189935207367,
+      "learning_rate": 0.0005216831971995333,
+      "loss": 3.7477,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6507807037986484,
+      "grad_norm": 0.33287513256073,
+      "learning_rate": 0.0005212456242707118,
+      "loss": 3.7203,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6653460731764156,
+      "grad_norm": 0.3236483335494995,
+      "learning_rate": 0.0005208080513418903,
+      "loss": 3.7265,
+      "step": 9150
+    },
+    {
+      "epoch": 2.6799114425541832,
+      "grad_norm": 0.3180456757545471,
+      "learning_rate": 0.0005203704784130689,
+      "loss": 3.7303,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6944768119319504,
+      "grad_norm": 0.3273324966430664,
+      "learning_rate": 0.0005199329054842473,
+      "loss": 3.7266,
+      "step": 9250
+    },
+    {
+      "epoch": 2.709042181309718,
+      "grad_norm": 0.3243292272090912,
+      "learning_rate": 0.0005194953325554258,
+      "loss": 3.7301,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7236075506874853,
+      "grad_norm": 0.32646605372428894,
+      "learning_rate": 0.0005190577596266044,
+      "loss": 3.7284,
+      "step": 9350
+    },
+    {
+      "epoch": 2.738172920065253,
+      "grad_norm": 0.3168424665927887,
+      "learning_rate": 0.0005186201866977829,
+      "loss": 3.7384,
+      "step": 9400
+    },
+    {
+      "epoch": 2.75273828944302,
+      "grad_norm": 0.3341065049171448,
+      "learning_rate": 0.0005181826137689614,
+      "loss": 3.7279,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7673036588207878,
+      "grad_norm": 0.3197799623012543,
+      "learning_rate": 0.00051774504084014,
+      "loss": 3.7302,
+      "step": 9500
+    },
+    {
+      "epoch": 2.781869028198555,
+      "grad_norm": 0.31474462151527405,
+      "learning_rate": 0.0005173074679113186,
+      "loss": 3.735,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7964343975763226,
+      "grad_norm": 0.3133241832256317,
+      "learning_rate": 0.0005168698949824971,
+      "loss": 3.7139,
+      "step": 9600
+    },
+    {
+      "epoch": 2.81099976695409,
+      "grad_norm": 0.31363457441329956,
+      "learning_rate": 0.0005164323220536755,
+      "loss": 3.7076,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8255651363318575,
+      "grad_norm": 0.32894420623779297,
+      "learning_rate": 0.0005159947491248541,
+      "loss": 3.717,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8401305057096247,
+      "grad_norm": 0.33178263902664185,
+      "learning_rate": 0.0005155571761960326,
+      "loss": 3.7305,
+      "step": 9750
+    },
+    {
+      "epoch": 2.8546958750873923,
+      "grad_norm": 0.31269919872283936,
+      "learning_rate": 0.0005151196032672111,
+      "loss": 3.7172,
+      "step": 9800
+    },
+    {
+      "epoch": 2.8692612444651595,
+      "grad_norm": 0.32776308059692383,
+      "learning_rate": 0.0005146820303383897,
+      "loss": 3.7171,
+      "step": 9850
+    },
+    {
+      "epoch": 2.883826613842927,
+      "grad_norm": 0.3176999092102051,
+      "learning_rate": 0.0005142444574095682,
+      "loss": 3.6958,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8983919832206944,
+      "grad_norm": 0.3453384339809418,
+      "learning_rate": 0.0005138068844807468,
+      "loss": 3.7037,
+      "step": 9950
+    },
+    {
+      "epoch": 2.912957352598462,
+      "grad_norm": 0.31886717677116394,
+      "learning_rate": 0.0005133693115519253,
+      "loss": 3.6958,
+      "step": 10000
+    },
+    {
+      "epoch": 2.912957352598462,
+      "eval_accuracy": 0.3504000665954622,
+      "eval_loss": 3.722299575805664,
+      "eval_runtime": 180.4701,
+      "eval_samples_per_second": 92.226,
+      "eval_steps_per_second": 5.768,
+      "step": 10000
+    },
+    {
+      "epoch": 2.927522721976229,
+      "grad_norm": 0.3301357626914978,
+      "learning_rate": 0.0005129317386231039,
+      "loss": 3.7174,
+      "step": 10050
+    },
+    {
+      "epoch": 2.942088091353997,
+      "grad_norm": 0.31266558170318604,
+      "learning_rate": 0.0005124941656942824,
+      "loss": 3.7087,
+      "step": 10100
+    },
+    {
+      "epoch": 2.956653460731764,
+      "grad_norm": 0.2986539602279663,
+      "learning_rate": 0.0005120565927654608,
+      "loss": 3.7032,
+      "step": 10150
+    },
+    {
+      "epoch": 2.9712188301095317,
+      "grad_norm": 0.3215900659561157,
+      "learning_rate": 0.0005116190198366394,
+      "loss": 3.7156,
+      "step": 10200
+    },
+    {
+      "epoch": 2.985784199487299,
+      "grad_norm": 0.34506484866142273,
+      "learning_rate": 0.0005111814469078179,
+      "loss": 3.7182,
+      "step": 10250
+    },
+    {
+      "epoch": 3.0002913073875552,
+      "grad_norm": 0.3209165036678314,
+      "learning_rate": 0.0005107438739789964,
+      "loss": 3.6979,
+      "step": 10300
+    },
+    {
+      "epoch": 3.014856676765323,
+      "grad_norm": 0.3145550489425659,
+      "learning_rate": 0.000510306301050175,
+      "loss": 3.5923,
+      "step": 10350
+    },
+    {
+      "epoch": 3.02942204614309,
+      "grad_norm": 0.33601146936416626,
+      "learning_rate": 0.0005098687281213535,
+      "loss": 3.6103,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0439874155208577,
+      "grad_norm": 0.31010255217552185,
+      "learning_rate": 0.0005094311551925321,
+      "loss": 3.6064,
+      "step": 10450
+    },
+    {
+      "epoch": 3.058552784898625,
+      "grad_norm": 0.3235945701599121,
+      "learning_rate": 0.0005089935822637106,
+      "loss": 3.5974,
+      "step": 10500
+    },
+    {
+      "epoch": 3.0731181542763926,
+      "grad_norm": 0.32663047313690186,
+      "learning_rate": 0.0005085560093348892,
+      "loss": 3.6164,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0876835236541598,
+      "grad_norm": 0.32186612486839294,
+      "learning_rate": 0.0005081184364060676,
+      "loss": 3.605,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1022488930319274,
+      "grad_norm": 0.3103710114955902,
+      "learning_rate": 0.0005076808634772461,
+      "loss": 3.622,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1168142624096946,
+      "grad_norm": 0.32506147027015686,
+      "learning_rate": 0.0005072432905484247,
+      "loss": 3.6183,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1313796317874623,
+      "grad_norm": 0.354626327753067,
+      "learning_rate": 0.0005068057176196032,
+      "loss": 3.6236,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1459450011652295,
+      "grad_norm": 0.31761565804481506,
+      "learning_rate": 0.0005063681446907818,
+      "loss": 3.6218,
+      "step": 10800
+    },
+    {
+      "epoch": 3.160510370542997,
+      "grad_norm": 0.3158835172653198,
+      "learning_rate": 0.0005059305717619603,
+      "loss": 3.6275,
+      "step": 10850
+    },
+    {
+      "epoch": 3.1750757399207643,
+      "grad_norm": 0.3345862925052643,
+      "learning_rate": 0.0005054929988331388,
+      "loss": 3.6209,
+      "step": 10900
+    },
+    {
+      "epoch": 3.189641109298532,
+      "grad_norm": 0.33414244651794434,
+      "learning_rate": 0.0005050554259043174,
+      "loss": 3.6138,
+      "step": 10950
+    },
+    {
+      "epoch": 3.204206478676299,
+      "grad_norm": 0.321621835231781,
+      "learning_rate": 0.0005046178529754959,
+      "loss": 3.6306,
+      "step": 11000
+    },
+    {
+      "epoch": 3.204206478676299,
+      "eval_accuracy": 0.3523780599932934,
+      "eval_loss": 3.7092323303222656,
+      "eval_runtime": 181.978,
+      "eval_samples_per_second": 91.462,
+      "eval_steps_per_second": 5.72,
+      "step": 11000
+    },
+    {
+      "epoch": 3.218771848054067,
+      "grad_norm": 0.3331759572029114,
+      "learning_rate": 0.0005041802800466744,
+      "loss": 3.6116,
+      "step": 11050
+    },
+    {
+      "epoch": 3.233337217431834,
+      "grad_norm": 0.33480656147003174,
+      "learning_rate": 0.0005037427071178529,
+      "loss": 3.6186,
+      "step": 11100
+    },
+    {
+      "epoch": 3.2479025868096016,
+      "grad_norm": 0.32737287878990173,
+      "learning_rate": 0.0005033051341890314,
+      "loss": 3.6176,
+      "step": 11150
+    },
+    {
+      "epoch": 3.262467956187369,
+      "grad_norm": 0.33219143748283386,
+      "learning_rate": 0.00050286756126021,
+      "loss": 3.6299,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2770333255651365,
+      "grad_norm": 0.3134367763996124,
+      "learning_rate": 0.0005024299883313885,
+      "loss": 3.6269,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2915986949429037,
+      "grad_norm": 0.3368885815143585,
+      "learning_rate": 0.0005019924154025671,
+      "loss": 3.6383,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3061640643206713,
+      "grad_norm": 0.30437996983528137,
+      "learning_rate": 0.0005015548424737456,
+      "loss": 3.6245,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3207294336984385,
+      "grad_norm": 0.33528828620910645,
+      "learning_rate": 0.0005011172695449241,
+      "loss": 3.6251,
+      "step": 11400
+    },
+    {
+      "epoch": 3.335294803076206,
+      "grad_norm": 0.33781698346138,
+      "learning_rate": 0.0005006796966161027,
+      "loss": 3.63,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3498601724539734,
+      "grad_norm": 0.329375296831131,
+      "learning_rate": 0.0005002421236872811,
+      "loss": 3.6387,
+      "step": 11500
+    },
+    {
+      "epoch": 3.364425541831741,
+      "grad_norm": 0.31199130415916443,
+      "learning_rate": 0.0004998045507584597,
+      "loss": 3.6199,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3789909112095082,
+      "grad_norm": 0.31993409991264343,
+      "learning_rate": 0.0004993669778296382,
+      "loss": 3.6383,
+      "step": 11600
+    },
+    {
+      "epoch": 3.393556280587276,
+      "grad_norm": 0.33537372946739197,
+      "learning_rate": 0.0004989294049008167,
+      "loss": 3.6409,
+      "step": 11650
+    },
+    {
+      "epoch": 3.408121649965043,
+      "grad_norm": 0.3288818299770355,
+      "learning_rate": 0.0004984918319719953,
+      "loss": 3.6544,
+      "step": 11700
+    },
+    {
+      "epoch": 3.4226870193428107,
+      "grad_norm": 0.3143393099308014,
+      "learning_rate": 0.0004980542590431738,
+      "loss": 3.632,
+      "step": 11750
+    },
+    {
+      "epoch": 3.437252388720578,
+      "grad_norm": 0.3316044211387634,
+      "learning_rate": 0.0004976166861143524,
+      "loss": 3.6256,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4518177580983456,
+      "grad_norm": 0.3158373534679413,
+      "learning_rate": 0.0004971791131855309,
+      "loss": 3.6283,
+      "step": 11850
+    },
+    {
+      "epoch": 3.4663831274761128,
+      "grad_norm": 0.3310090899467468,
+      "learning_rate": 0.0004967415402567094,
+      "loss": 3.6383,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4809484968538804,
+      "grad_norm": 0.3304344415664673,
+      "learning_rate": 0.000496303967327888,
+      "loss": 3.6364,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "grad_norm": 0.3196583390235901,
+      "learning_rate": 0.0004958663943990664,
+      "loss": 3.6245,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4955138662316476,
+      "eval_accuracy": 0.3539525300396798,
+      "eval_loss": 3.6910691261291504,
+      "eval_runtime": 180.2749,
+      "eval_samples_per_second": 92.326,
+      "eval_steps_per_second": 5.775,
+      "step": 12000
+    },
+    {
+      "epoch": 3.510079235609415,
+      "grad_norm": 0.334721177816391,
+      "learning_rate": 0.000495428821470245,
+      "loss": 3.6405,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5246446049871825,
+      "grad_norm": 0.30898579955101013,
+      "learning_rate": 0.0004949912485414235,
+      "loss": 3.633,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53920997436495,
+      "grad_norm": 0.3296958804130554,
+      "learning_rate": 0.0004945536756126021,
+      "loss": 3.6336,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5537753437427173,
+      "grad_norm": 0.3156227469444275,
+      "learning_rate": 0.0004941161026837806,
+      "loss": 3.6264,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5683407131204845,
+      "grad_norm": 0.32900500297546387,
+      "learning_rate": 0.0004936785297549591,
+      "loss": 3.6286,
+      "step": 12250
+    },
+    {
+      "epoch": 3.582906082498252,
+      "grad_norm": 0.33001989126205444,
+      "learning_rate": 0.0004932409568261377,
+      "loss": 3.6485,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59747145187602,
+      "grad_norm": 0.32858744263648987,
+      "learning_rate": 0.0004928033838973162,
+      "loss": 3.6323,
+      "step": 12350
+    },
+    {
+      "epoch": 3.612036821253787,
+      "grad_norm": 0.35113999247550964,
+      "learning_rate": 0.0004923658109684946,
+      "loss": 3.647,
+      "step": 12400
+    },
+    {
+      "epoch": 3.626602190631554,
+      "grad_norm": 0.3282478153705597,
+      "learning_rate": 0.0004919282380396732,
+      "loss": 3.6335,
+      "step": 12450
+    },
+    {
+      "epoch": 3.641167560009322,
+      "grad_norm": 0.31611868739128113,
+      "learning_rate": 0.0004914906651108517,
+      "loss": 3.631,
+      "step": 12500
+    },
+    {
+      "epoch": 3.6557329293870895,
+      "grad_norm": 0.33487775921821594,
+      "learning_rate": 0.0004910530921820303,
+      "loss": 3.6274,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6702982987648567,
+      "grad_norm": 0.33004793524742126,
+      "learning_rate": 0.0004906155192532088,
+      "loss": 3.618,
+      "step": 12600
+    },
+    {
+      "epoch": 3.684863668142624,
+      "grad_norm": 0.30851587653160095,
+      "learning_rate": 0.0004901779463243874,
+      "loss": 3.6229,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6994290375203915,
+      "grad_norm": 0.325185090303421,
+      "learning_rate": 0.0004897403733955659,
+      "loss": 3.6289,
+      "step": 12700
+    },
+    {
+      "epoch": 3.713994406898159,
+      "grad_norm": 0.3187962770462036,
+      "learning_rate": 0.0004893028004667444,
+      "loss": 3.6355,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7285597762759264,
+      "grad_norm": 0.32004639506340027,
+      "learning_rate": 0.000488865227537923,
+      "loss": 3.6424,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7431251456536936,
+      "grad_norm": 0.331478476524353,
+      "learning_rate": 0.0004884276546091015,
+      "loss": 3.624,
+      "step": 12850
+    },
+    {
+      "epoch": 3.7576905150314612,
+      "grad_norm": 0.31720319390296936,
+      "learning_rate": 0.00048799008168028,
+      "loss": 3.6329,
+      "step": 12900
+    },
+    {
+      "epoch": 3.772255884409229,
+      "grad_norm": 0.32388386130332947,
+      "learning_rate": 0.00048755250875145853,
+      "loss": 3.6237,
+      "step": 12950
+    },
+    {
+      "epoch": 3.786821253786996,
+      "grad_norm": 0.326471209526062,
+      "learning_rate": 0.0004871149358226371,
+      "loss": 3.6365,
+      "step": 13000
+    },
+    {
+      "epoch": 3.786821253786996,
+      "eval_accuracy": 0.35590465655600817,
+      "eval_loss": 3.6707494258880615,
+      "eval_runtime": 180.189,
+      "eval_samples_per_second": 92.37,
+      "eval_steps_per_second": 5.777,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8013866231647633,
+      "grad_norm": 0.3287231922149658,
+      "learning_rate": 0.0004866773628938156,
+      "loss": 3.6351,
+      "step": 13050
+    },
+    {
+      "epoch": 3.815951992542531,
+      "grad_norm": 0.3224816620349884,
+      "learning_rate": 0.0004862397899649941,
+      "loss": 3.631,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8305173619202986,
+      "grad_norm": 0.34565699100494385,
+      "learning_rate": 0.00048580221703617264,
+      "loss": 3.6365,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8450827312980658,
+      "grad_norm": 0.31353557109832764,
+      "learning_rate": 0.00048536464410735123,
+      "loss": 3.6346,
+      "step": 13200
+    },
+    {
+      "epoch": 3.859648100675833,
+      "grad_norm": 0.31035754084587097,
+      "learning_rate": 0.00048492707117852966,
+      "loss": 3.6353,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8742134700536006,
+      "grad_norm": 0.3304181694984436,
+      "learning_rate": 0.00048448949824970826,
+      "loss": 3.631,
+      "step": 13300
+    },
+    {
+      "epoch": 3.888778839431368,
+      "grad_norm": 0.3305014669895172,
+      "learning_rate": 0.0004840519253208868,
+      "loss": 3.6234,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9033442088091355,
+      "grad_norm": 0.33002111315727234,
+      "learning_rate": 0.0004836143523920653,
+      "loss": 3.6389,
+      "step": 13400
+    },
+    {
+      "epoch": 3.9179095781869027,
+      "grad_norm": 0.3106802701950073,
+      "learning_rate": 0.0004831767794632438,
+      "loss": 3.6413,
+      "step": 13450
+    },
+    {
+      "epoch": 3.9324749475646703,
+      "grad_norm": 0.32683488726615906,
+      "learning_rate": 0.00048273920653442236,
+      "loss": 3.6114,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9470403169424375,
+      "grad_norm": 0.3140070140361786,
+      "learning_rate": 0.0004823016336056009,
+      "loss": 3.6246,
+      "step": 13550
+    },
+    {
+      "epoch": 3.961605686320205,
+      "grad_norm": 0.3176632523536682,
+      "learning_rate": 0.0004818640606767794,
+      "loss": 3.6215,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9761710556979724,
+      "grad_norm": 0.33348730206489563,
+      "learning_rate": 0.00048142648774795793,
+      "loss": 3.6198,
+      "step": 13650
+    },
+    {
+      "epoch": 3.99073642507574,
+      "grad_norm": 0.3215520679950714,
+      "learning_rate": 0.0004809889148191365,
+      "loss": 3.6326,
+      "step": 13700
+    },
+    {
+      "epoch": 4.005243532975996,
+      "grad_norm": 0.3232531249523163,
+      "learning_rate": 0.000480551341890315,
+      "loss": 3.5772,
+      "step": 13750
+    },
+    {
+      "epoch": 4.0198089023537635,
+      "grad_norm": 0.3443015515804291,
+      "learning_rate": 0.00048011376896149355,
+      "loss": 3.5136,
+      "step": 13800
+    },
+    {
+      "epoch": 4.034374271731531,
+      "grad_norm": 0.3226703405380249,
+      "learning_rate": 0.0004796761960326721,
+      "loss": 3.5234,
+      "step": 13850
+    },
+    {
+      "epoch": 4.048939641109299,
+      "grad_norm": 0.32913267612457275,
+      "learning_rate": 0.0004792386231038506,
+      "loss": 3.5215,
+      "step": 13900
+    },
+    {
+      "epoch": 4.063505010487066,
+      "grad_norm": 0.3350991904735565,
+      "learning_rate": 0.0004788010501750291,
+      "loss": 3.5211,
+      "step": 13950
+    },
+    {
+      "epoch": 4.078070379864833,
+      "grad_norm": 0.3143565058708191,
+      "learning_rate": 0.00047836347724620766,
+      "loss": 3.5138,
+      "step": 14000
+    },
+    {
+      "epoch": 4.078070379864833,
+      "eval_accuracy": 0.3570977076769612,
+      "eval_loss": 3.6638998985290527,
+      "eval_runtime": 180.2734,
+      "eval_samples_per_second": 92.326,
+      "eval_steps_per_second": 5.775,
+      "step": 14000
+    },
+    {
+      "epoch": 4.092635749242601,
+      "grad_norm": 0.318641722202301,
+      "learning_rate": 0.0004779259043173862,
+      "loss": 3.5201,
+      "step": 14050
+    },
+    {
+      "epoch": 4.1072011186203685,
+      "grad_norm": 0.31839898228645325,
+      "learning_rate": 0.0004774883313885647,
+      "loss": 3.5223,
+      "step": 14100
+    },
+    {
+      "epoch": 4.121766487998135,
+      "grad_norm": 0.3353429436683655,
+      "learning_rate": 0.0004770507584597433,
+      "loss": 3.5314,
+      "step": 14150
+    },
+    {
+      "epoch": 4.136331857375903,
+      "grad_norm": 0.3297600746154785,
+      "learning_rate": 0.0004766131855309218,
+      "loss": 3.5386,
+      "step": 14200
+    },
+    {
+      "epoch": 4.150897226753671,
+      "grad_norm": 0.35828185081481934,
+      "learning_rate": 0.0004761756126021003,
+      "loss": 3.5442,
+      "step": 14250
+    },
+    {
+      "epoch": 4.165462596131438,
+      "grad_norm": 0.32543322443962097,
+      "learning_rate": 0.00047573803967327884,
+      "loss": 3.5498,
+      "step": 14300
+    },
+    {
+      "epoch": 4.180027965509205,
+      "grad_norm": 0.33324652910232544,
+      "learning_rate": 0.0004753004667444574,
+      "loss": 3.5393,
+      "step": 14350
+    },
+    {
+      "epoch": 4.194593334886973,
+      "grad_norm": 0.3401516079902649,
+      "learning_rate": 0.00047486289381563587,
+      "loss": 3.5485,
+      "step": 14400
+    },
+    {
+      "epoch": 4.20915870426474,
+      "grad_norm": 0.34022200107574463,
+      "learning_rate": 0.0004744253208868144,
+      "loss": 3.5287,
+      "step": 14450
+    },
+    {
+      "epoch": 4.223724073642508,
+      "grad_norm": 0.3375685214996338,
+      "learning_rate": 0.00047398774795799295,
+      "loss": 3.5567,
+      "step": 14500
+    },
+    {
+      "epoch": 4.238289443020275,
+      "grad_norm": 0.32578080892562866,
+      "learning_rate": 0.00047355017502917154,
+      "loss": 3.5511,
+      "step": 14550
+    },
+    {
+      "epoch": 4.252854812398042,
+      "grad_norm": 0.3124660551548004,
+      "learning_rate": 0.00047311260210035,
+      "loss": 3.5519,
+      "step": 14600
+    },
+    {
+      "epoch": 4.26742018177581,
+      "grad_norm": 0.317643940448761,
+      "learning_rate": 0.00047267502917152857,
+      "loss": 3.5485,
+      "step": 14650
+    },
+    {
+      "epoch": 4.281985551153578,
+      "grad_norm": 0.3317655026912689,
+      "learning_rate": 0.0004722374562427071,
+      "loss": 3.5541,
+      "step": 14700
+    },
+    {
+      "epoch": 4.296550920531344,
+      "grad_norm": 0.32578787207603455,
+      "learning_rate": 0.0004717998833138856,
+      "loss": 3.5354,
+      "step": 14750
+    },
+    {
+      "epoch": 4.311116289909112,
+      "grad_norm": 0.32401853799819946,
+      "learning_rate": 0.00047136231038506413,
+      "loss": 3.5608,
+      "step": 14800
+    },
+    {
+      "epoch": 4.32568165928688,
+      "grad_norm": 0.33071812987327576,
+      "learning_rate": 0.00047092473745624267,
+      "loss": 3.5453,
+      "step": 14850
+    },
+    {
+      "epoch": 4.340247028664647,
+      "grad_norm": 0.3195439577102661,
+      "learning_rate": 0.00047048716452742116,
+      "loss": 3.5509,
+      "step": 14900
+    },
+    {
+      "epoch": 4.354812398042414,
+      "grad_norm": 0.32133200764656067,
+      "learning_rate": 0.0004700495915985997,
+      "loss": 3.5631,
+      "step": 14950
+    },
+    {
+      "epoch": 4.369377767420182,
+      "grad_norm": 0.345612108707428,
+      "learning_rate": 0.0004696120186697783,
+      "loss": 3.5632,
+      "step": 15000
+    },
+    {
+      "epoch": 4.369377767420182,
+      "eval_accuracy": 0.3582166854554288,
+      "eval_loss": 3.653571128845215,
+      "eval_runtime": 180.4957,
+      "eval_samples_per_second": 92.213,
+      "eval_steps_per_second": 5.767,
+      "step": 15000
+    },
+    {
+      "epoch": 4.383943136797949,
+      "grad_norm": 0.33550721406936646,
+      "learning_rate": 0.00046917444574095683,
+      "loss": 3.5562,
+      "step": 15050
+    },
+    {
+      "epoch": 4.398508506175717,
+      "grad_norm": 0.32593655586242676,
+      "learning_rate": 0.0004687368728121353,
+      "loss": 3.5542,
+      "step": 15100
+    },
+    {
+      "epoch": 4.413073875553484,
+      "grad_norm": 0.32876867055892944,
+      "learning_rate": 0.00046829929988331386,
+      "loss": 3.5537,
+      "step": 15150
+    },
+    {
+      "epoch": 4.427639244931251,
+      "grad_norm": 0.31340348720550537,
+      "learning_rate": 0.0004678617269544924,
+      "loss": 3.5547,
+      "step": 15200
+    },
+    {
+      "epoch": 4.442204614309019,
+      "grad_norm": 0.325003981590271,
+      "learning_rate": 0.0004674241540256709,
+      "loss": 3.5638,
+      "step": 15250
+    },
+    {
+      "epoch": 4.456769983686787,
+      "grad_norm": 0.31941288709640503,
+      "learning_rate": 0.0004669865810968494,
+      "loss": 3.5625,
+      "step": 15300
+    },
+    {
+      "epoch": 4.471335353064553,
+      "grad_norm": 0.32604023814201355,
+      "learning_rate": 0.00046654900816802796,
+      "loss": 3.5542,
+      "step": 15350
+    },
+    {
+      "epoch": 4.485900722442321,
+      "grad_norm": 0.3184167444705963,
+      "learning_rate": 0.00046611143523920645,
+      "loss": 3.5597,
+      "step": 15400
+    },
+    {
+      "epoch": 4.500466091820089,
+      "grad_norm": 0.32676759362220764,
+      "learning_rate": 0.00046567386231038504,
+      "loss": 3.5518,
+      "step": 15450
+    },
+    {
+      "epoch": 4.515031461197856,
+      "grad_norm": 0.3253229260444641,
+      "learning_rate": 0.0004652362893815636,
+      "loss": 3.5636,
+      "step": 15500
+    },
+    {
+      "epoch": 4.529596830575623,
+      "grad_norm": 0.33474475145339966,
+      "learning_rate": 0.0004647987164527421,
+      "loss": 3.5638,
+      "step": 15550
+    },
+    {
+      "epoch": 4.544162199953391,
+      "grad_norm": 0.34634941816329956,
+      "learning_rate": 0.0004643611435239206,
+      "loss": 3.5473,
+      "step": 15600
+    },
+    {
+      "epoch": 4.558727569331158,
+      "grad_norm": 0.33891260623931885,
+      "learning_rate": 0.00046392357059509915,
+      "loss": 3.5675,
+      "step": 15650
+    },
+    {
+      "epoch": 4.573292938708926,
+      "grad_norm": 0.32942262291908264,
+      "learning_rate": 0.0004634859976662777,
+      "loss": 3.5603,
+      "step": 15700
+    },
+    {
+      "epoch": 4.587858308086693,
+      "grad_norm": 0.3374430239200592,
+      "learning_rate": 0.0004630484247374562,
+      "loss": 3.5538,
+      "step": 15750
+    },
+    {
+      "epoch": 4.6024236774644605,
+      "grad_norm": 0.3401276767253876,
+      "learning_rate": 0.0004626108518086347,
+      "loss": 3.5644,
+      "step": 15800
+    },
+    {
+      "epoch": 4.616989046842228,
+      "grad_norm": 0.3286304473876953,
+      "learning_rate": 0.0004621732788798133,
+      "loss": 3.5653,
+      "step": 15850
+    },
+    {
+      "epoch": 4.631554416219995,
+      "grad_norm": 0.31420665979385376,
+      "learning_rate": 0.00046173570595099174,
+      "loss": 3.556,
+      "step": 15900
+    },
+    {
+      "epoch": 4.6461197855977625,
+      "grad_norm": 0.3286356031894684,
+      "learning_rate": 0.00046129813302217033,
+      "loss": 3.552,
+      "step": 15950
+    },
+    {
+      "epoch": 4.66068515497553,
+      "grad_norm": 0.33006393909454346,
+      "learning_rate": 0.00046086056009334887,
+      "loss": 3.5684,
+      "step": 16000
+    },
+    {
+      "epoch": 4.66068515497553,
+      "eval_accuracy": 0.3596702866191563,
+      "eval_loss": 3.640338897705078,
+      "eval_runtime": 180.2507,
+      "eval_samples_per_second": 92.338,
+      "eval_steps_per_second": 5.775,
+      "step": 16000
+    },
+    {
+      "epoch": 4.675250524353298,
+      "grad_norm": 0.3313292860984802,
+      "learning_rate": 0.0004604229871645274,
+      "loss": 3.5606,
+      "step": 16050
+    },
+    {
+      "epoch": 4.689815893731065,
+      "grad_norm": 0.31922703981399536,
+      "learning_rate": 0.0004599854142357059,
+      "loss": 3.5739,
+      "step": 16100
+    },
+    {
+      "epoch": 4.704381263108832,
+      "grad_norm": 0.3161007761955261,
+      "learning_rate": 0.00045954784130688444,
+      "loss": 3.5688,
+      "step": 16150
+    },
+    {
+      "epoch": 4.7189466324866,
+      "grad_norm": 0.33094581961631775,
+      "learning_rate": 0.000459110268378063,
+      "loss": 3.564,
+      "step": 16200
+    },
+    {
+      "epoch": 4.7335120018643675,
+      "grad_norm": 0.3282545804977417,
+      "learning_rate": 0.00045867269544924146,
+      "loss": 3.5759,
+      "step": 16250
+    },
+    {
+      "epoch": 4.748077371242134,
+      "grad_norm": 0.32690319418907166,
+      "learning_rate": 0.00045823512252042,
+      "loss": 3.5601,
+      "step": 16300
+    },
+    {
+      "epoch": 4.762642740619902,
+      "grad_norm": 0.3375246524810791,
+      "learning_rate": 0.0004577975495915986,
+      "loss": 3.5569,
+      "step": 16350
+    },
+    {
+      "epoch": 4.7772081099976695,
+      "grad_norm": 0.3194766044616699,
+      "learning_rate": 0.0004573599766627771,
+      "loss": 3.5536,
+      "step": 16400
+    },
+    {
+      "epoch": 4.791773479375437,
+      "grad_norm": 0.31809139251708984,
+      "learning_rate": 0.0004569224037339556,
+      "loss": 3.5626,
+      "step": 16450
+    },
+    {
+      "epoch": 4.806338848753205,
+      "grad_norm": 0.3298538327217102,
+      "learning_rate": 0.00045648483080513416,
+      "loss": 3.5597,
+      "step": 16500
+    },
+    {
+      "epoch": 4.820904218130972,
+      "grad_norm": 0.343118816614151,
+      "learning_rate": 0.0004560472578763127,
+      "loss": 3.563,
+      "step": 16550
+    },
+    {
+      "epoch": 4.835469587508739,
+      "grad_norm": 0.32174625992774963,
+      "learning_rate": 0.0004556096849474912,
+      "loss": 3.5602,
+      "step": 16600
+    },
+    {
+      "epoch": 4.850034956886507,
+      "grad_norm": 0.3458464741706848,
+      "learning_rate": 0.00045517211201866973,
+      "loss": 3.5485,
+      "step": 16650
+    },
+    {
+      "epoch": 4.864600326264274,
+      "grad_norm": 0.3370623290538788,
+      "learning_rate": 0.00045473453908984827,
+      "loss": 3.5624,
+      "step": 16700
+    },
+    {
+      "epoch": 4.879165695642041,
+      "grad_norm": 0.33553197979927063,
+      "learning_rate": 0.00045429696616102675,
+      "loss": 3.5675,
+      "step": 16750
+    },
+    {
+      "epoch": 4.893731065019809,
+      "grad_norm": 0.3206152617931366,
+      "learning_rate": 0.00045385939323220535,
+      "loss": 3.5618,
+      "step": 16800
+    },
+    {
+      "epoch": 4.908296434397577,
+      "grad_norm": 0.3171241581439972,
+      "learning_rate": 0.0004534218203033839,
+      "loss": 3.5692,
+      "step": 16850
+    },
+    {
+      "epoch": 4.922861803775344,
+      "grad_norm": 0.3172144889831543,
+      "learning_rate": 0.0004529842473745624,
+      "loss": 3.5552,
+      "step": 16900
+    },
+    {
+      "epoch": 4.937427173153111,
+      "grad_norm": 0.32273098826408386,
+      "learning_rate": 0.0004525466744457409,
+      "loss": 3.5712,
+      "step": 16950
+    },
+    {
+      "epoch": 4.951992542530879,
+      "grad_norm": 0.3339548707008362,
+      "learning_rate": 0.00045210910151691945,
+      "loss": 3.5646,
+      "step": 17000
+    },
+    {
+      "epoch": 4.951992542530879,
+      "eval_accuracy": 0.36071871835716146,
+      "eval_loss": 3.625553846359253,
+      "eval_runtime": 180.4809,
+      "eval_samples_per_second": 92.22,
+      "eval_steps_per_second": 5.768,
+      "step": 17000
+    },
+    {
+      "epoch": 4.966557911908646,
+      "grad_norm": 0.3269366919994354,
+      "learning_rate": 0.000451671528588098,
+      "loss": 3.5542,
+      "step": 17050
+    },
+    {
+      "epoch": 4.981123281286413,
+      "grad_norm": 0.31721675395965576,
+      "learning_rate": 0.0004512339556592765,
+      "loss": 3.5729,
+      "step": 17100
+    },
+    {
+      "epoch": 4.995688650664181,
+      "grad_norm": 0.3314802050590515,
+      "learning_rate": 0.000450796382730455,
+      "loss": 3.5643,
+      "step": 17150
+    },
+    {
+      "epoch": 5.010195758564437,
+      "grad_norm": 0.34938499331474304,
+      "learning_rate": 0.0004503588098016336,
+      "loss": 3.4822,
+      "step": 17200
+    },
+    {
+      "epoch": 5.024761127942204,
+      "grad_norm": 0.3565429449081421,
+      "learning_rate": 0.0004499212368728121,
+      "loss": 3.4413,
+      "step": 17250
+    },
+    {
+      "epoch": 5.039326497319972,
+      "grad_norm": 0.34626901149749756,
+      "learning_rate": 0.00044948366394399064,
+      "loss": 3.4539,
+      "step": 17300
+    },
+    {
+      "epoch": 5.0538918666977395,
+      "grad_norm": 0.336347758769989,
+      "learning_rate": 0.0004490460910151692,
+      "loss": 3.4579,
+      "step": 17350
+    },
+    {
+      "epoch": 5.068457236075507,
+      "grad_norm": 0.3387928605079651,
+      "learning_rate": 0.00044860851808634767,
+      "loss": 3.4767,
+      "step": 17400
+    },
+    {
+      "epoch": 5.083022605453274,
+      "grad_norm": 0.3393719494342804,
+      "learning_rate": 0.0004481709451575262,
+      "loss": 3.4596,
+      "step": 17450
+    },
+    {
+      "epoch": 5.0975879748310415,
+      "grad_norm": 0.3251345157623291,
+      "learning_rate": 0.00044773337222870475,
+      "loss": 3.4748,
+      "step": 17500
+    },
+    {
+      "epoch": 5.112153344208809,
+      "grad_norm": 0.32468897104263306,
+      "learning_rate": 0.0004472957992998833,
+      "loss": 3.4805,
+      "step": 17550
+    },
+    {
+      "epoch": 5.126718713586577,
+      "grad_norm": 0.3337823450565338,
+      "learning_rate": 0.00044685822637106177,
+      "loss": 3.4754,
+      "step": 17600
+    },
+    {
+      "epoch": 5.141284082964344,
+      "grad_norm": 0.3582659959793091,
+      "learning_rate": 0.00044642065344224037,
+      "loss": 3.4752,
+      "step": 17650
+    },
+    {
+      "epoch": 5.155849452342111,
+      "grad_norm": 0.3382004499435425,
+      "learning_rate": 0.0004459830805134189,
+      "loss": 3.4633,
+      "step": 17700
+    },
+    {
+      "epoch": 5.170414821719879,
+      "grad_norm": 0.33493444323539734,
+      "learning_rate": 0.0004455455075845974,
+      "loss": 3.4896,
+      "step": 17750
+    },
+    {
+      "epoch": 5.1849801910976465,
+      "grad_norm": 0.33412331342697144,
+      "learning_rate": 0.00044510793465577593,
+      "loss": 3.4854,
+      "step": 17800
+    },
+    {
+      "epoch": 5.199545560475413,
+      "grad_norm": 0.3649858832359314,
+      "learning_rate": 0.00044467036172695447,
+      "loss": 3.4862,
+      "step": 17850
+    },
+    {
+      "epoch": 5.214110929853181,
+      "grad_norm": 0.3273285925388336,
+      "learning_rate": 0.00044423278879813296,
+      "loss": 3.4898,
+      "step": 17900
+    },
+    {
+      "epoch": 5.228676299230949,
+      "grad_norm": 0.36678996682167053,
+      "learning_rate": 0.0004437952158693115,
+      "loss": 3.4798,
+      "step": 17950
+    },
+    {
+      "epoch": 5.243241668608716,
+      "grad_norm": 0.33765271306037903,
+      "learning_rate": 0.00044335764294049004,
+      "loss": 3.4892,
+      "step": 18000
+    },
+    {
+      "epoch": 5.243241668608716,
+      "eval_accuracy": 0.3612939037403981,
+      "eval_loss": 3.6294894218444824,
+      "eval_runtime": 180.4902,
+      "eval_samples_per_second": 92.216,
+      "eval_steps_per_second": 5.768,
+      "step": 18000
+    },
+    {
+      "epoch": 5.257807037986483,
+      "grad_norm": 0.35786503553390503,
+      "learning_rate": 0.00044292007001166863,
+      "loss": 3.4823,
+      "step": 18050
+    },
+    {
+      "epoch": 5.272372407364251,
+      "grad_norm": 0.3416072130203247,
+      "learning_rate": 0.00044248249708284706,
+      "loss": 3.4879,
+      "step": 18100
+    },
+    {
+      "epoch": 5.286937776742018,
+      "grad_norm": 0.34881216287612915,
+      "learning_rate": 0.00044204492415402566,
+      "loss": 3.4922,
+      "step": 18150
+    },
+    {
+      "epoch": 5.301503146119786,
+      "grad_norm": 0.34528306126594543,
+      "learning_rate": 0.0004416073512252042,
+      "loss": 3.4949,
+      "step": 18200
+    },
+    {
+      "epoch": 5.316068515497553,
+      "grad_norm": 0.328722208738327,
+      "learning_rate": 0.0004411697782963827,
+      "loss": 3.4898,
+      "step": 18250
+    },
+    {
+      "epoch": 5.33063388487532,
+      "grad_norm": 0.3258577585220337,
+      "learning_rate": 0.0004407322053675612,
+      "loss": 3.4844,
+      "step": 18300
+    },
+    {
+      "epoch": 5.345199254253088,
+      "grad_norm": 0.3687138855457306,
+      "learning_rate": 0.00044029463243873976,
+      "loss": 3.4868,
+      "step": 18350
+    },
+    {
+      "epoch": 5.359764623630856,
+      "grad_norm": 0.3356603682041168,
+      "learning_rate": 0.00043985705950991825,
+      "loss": 3.4854,
+      "step": 18400
+    },
+    {
+      "epoch": 5.374329993008622,
+      "grad_norm": 0.3445993661880493,
+      "learning_rate": 0.0004394194865810968,
+      "loss": 3.4981,
+      "step": 18450
+    },
+    {
+      "epoch": 5.38889536238639,
+      "grad_norm": 0.328427791595459,
+      "learning_rate": 0.00043898191365227533,
+      "loss": 3.4887,
+      "step": 18500
+    },
+    {
+      "epoch": 5.403460731764158,
+      "grad_norm": 0.3391731083393097,
+      "learning_rate": 0.0004385443407234539,
+      "loss": 3.5023,
+      "step": 18550
+    },
+    {
+      "epoch": 5.418026101141925,
+      "grad_norm": 0.3405122458934784,
+      "learning_rate": 0.0004381067677946324,
+      "loss": 3.5082,
+      "step": 18600
+    },
+    {
+      "epoch": 5.432591470519692,
+      "grad_norm": 0.32964596152305603,
+      "learning_rate": 0.00043766919486581095,
+      "loss": 3.5064,
+      "step": 18650
+    },
+    {
+      "epoch": 5.44715683989746,
+      "grad_norm": 0.32743725180625916,
+      "learning_rate": 0.0004372316219369895,
+      "loss": 3.5069,
+      "step": 18700
+    },
+    {
+      "epoch": 5.461722209275227,
+      "grad_norm": 0.33889785408973694,
+      "learning_rate": 0.00043679404900816797,
+      "loss": 3.4917,
+      "step": 18750
+    },
+    {
+      "epoch": 5.476287578652995,
+      "grad_norm": 0.3374757468700409,
+      "learning_rate": 0.0004363564760793465,
+      "loss": 3.5129,
+      "step": 18800
+    },
+    {
+      "epoch": 5.490852948030762,
+      "grad_norm": 0.32586970925331116,
+      "learning_rate": 0.00043591890315052505,
+      "loss": 3.4983,
+      "step": 18850
+    },
+    {
+      "epoch": 5.505418317408529,
+      "grad_norm": 0.3159201443195343,
+      "learning_rate": 0.00043548133022170354,
+      "loss": 3.5049,
+      "step": 18900
+    },
+    {
+      "epoch": 5.519983686786297,
+      "grad_norm": 0.3207235634326935,
+      "learning_rate": 0.0004350437572928821,
+      "loss": 3.5027,
+      "step": 18950
+    },
+    {
+      "epoch": 5.534549056164065,
+      "grad_norm": 0.32409095764160156,
+      "learning_rate": 0.00043460618436406067,
+      "loss": 3.5037,
+      "step": 19000
+    },
+    {
+      "epoch": 5.534549056164065,
+      "eval_accuracy": 0.36227261247507964,
+      "eval_loss": 3.617015838623047,
+      "eval_runtime": 180.3747,
+      "eval_samples_per_second": 92.275,
+      "eval_steps_per_second": 5.771,
+      "step": 19000
+    },
+    {
+      "epoch": 5.549114425541831,
+      "grad_norm": 0.33343568444252014,
+      "learning_rate": 0.0004341686114352392,
+      "loss": 3.5044,
+      "step": 19050
+    },
+    {
+      "epoch": 5.563679794919599,
+      "grad_norm": 0.3471834063529968,
+      "learning_rate": 0.0004337310385064177,
+      "loss": 3.4995,
+      "step": 19100
+    },
+    {
+      "epoch": 5.578245164297367,
+      "grad_norm": 0.32965055108070374,
+      "learning_rate": 0.00043329346557759624,
+      "loss": 3.5091,
+      "step": 19150
+    },
+    {
+      "epoch": 5.592810533675134,
+      "grad_norm": 0.32729023694992065,
+      "learning_rate": 0.0004328558926487748,
+      "loss": 3.4987,
+      "step": 19200
+    },
+    {
+      "epoch": 5.607375903052901,
+      "grad_norm": 0.32407552003860474,
+      "learning_rate": 0.00043241831971995326,
+      "loss": 3.5105,
+      "step": 19250
+    },
+    {
+      "epoch": 5.621941272430669,
+      "grad_norm": 0.3459337055683136,
+      "learning_rate": 0.0004319807467911318,
+      "loss": 3.5139,
+      "step": 19300
+    },
+    {
+      "epoch": 5.636506641808436,
+      "grad_norm": 0.34581705927848816,
+      "learning_rate": 0.00043154317386231034,
+      "loss": 3.5169,
+      "step": 19350
+    },
+    {
+      "epoch": 5.651072011186204,
+      "grad_norm": 0.323258638381958,
+      "learning_rate": 0.00043110560093348883,
+      "loss": 3.5006,
+      "step": 19400
+    },
+    {
+      "epoch": 5.665637380563971,
+      "grad_norm": 0.3501630127429962,
+      "learning_rate": 0.0004306680280046674,
+      "loss": 3.5059,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6802027499417385,
+      "grad_norm": 0.3383364975452423,
+      "learning_rate": 0.00043023045507584596,
+      "loss": 3.5082,
+      "step": 19500
+    },
+    {
+      "epoch": 5.694768119319506,
+      "grad_norm": 0.3391266465187073,
+      "learning_rate": 0.0004297928821470245,
+      "loss": 3.5073,
+      "step": 19550
+    },
+    {
+      "epoch": 5.709333488697274,
+      "grad_norm": 0.33838364481925964,
+      "learning_rate": 0.000429355309218203,
+      "loss": 3.5057,
+      "step": 19600
+    },
+    {
+      "epoch": 5.7238988580750405,
+      "grad_norm": 0.3325950801372528,
+      "learning_rate": 0.00042891773628938153,
+      "loss": 3.518,
+      "step": 19650
+    },
+    {
+      "epoch": 5.738464227452808,
+      "grad_norm": 0.3349588215351105,
+      "learning_rate": 0.00042848016336056007,
+      "loss": 3.5155,
+      "step": 19700
+    },
+    {
+      "epoch": 5.753029596830576,
+      "grad_norm": 0.33944258093833923,
+      "learning_rate": 0.00042804259043173855,
+      "loss": 3.5028,
+      "step": 19750
+    },
+    {
+      "epoch": 5.7675949662083426,
+      "grad_norm": 0.3170711398124695,
+      "learning_rate": 0.0004276050175029171,
+      "loss": 3.5181,
+      "step": 19800
+    },
+    {
+      "epoch": 5.78216033558611,
+      "grad_norm": 0.3340502083301544,
+      "learning_rate": 0.0004271674445740957,
+      "loss": 3.4968,
+      "step": 19850
+    },
+    {
+      "epoch": 5.796725704963878,
+      "grad_norm": 0.34251272678375244,
+      "learning_rate": 0.0004267298716452741,
+      "loss": 3.5079,
+      "step": 19900
+    },
+    {
+      "epoch": 5.8112910743416455,
+      "grad_norm": 0.3394465446472168,
+      "learning_rate": 0.0004262922987164527,
+      "loss": 3.5056,
+      "step": 19950
+    },
+    {
+      "epoch": 5.825856443719413,
+      "grad_norm": 0.32446908950805664,
+      "learning_rate": 0.00042585472578763125,
+      "loss": 3.5029,
+      "step": 20000
+    },
+    {
+      "epoch": 5.825856443719413,
+      "eval_accuracy": 0.36320252686510796,
+      "eval_loss": 3.60426664352417,
+      "eval_runtime": 180.5652,
+      "eval_samples_per_second": 92.177,
+      "eval_steps_per_second": 5.765,
+      "step": 20000
+    },
+    {
+      "epoch": 5.84042181309718,
+      "grad_norm": 0.3498161733150482,
+      "learning_rate": 0.0004254171528588098,
+      "loss": 3.5086,
+      "step": 20050
+    },
+    {
+      "epoch": 5.8549871824749475,
+      "grad_norm": 0.33967551589012146,
+      "learning_rate": 0.0004249795799299883,
+      "loss": 3.5126,
+      "step": 20100
+    },
+    {
+      "epoch": 5.869552551852715,
+      "grad_norm": 0.3366953730583191,
+      "learning_rate": 0.0004245420070011668,
+      "loss": 3.5301,
+      "step": 20150
+    },
+    {
+      "epoch": 5.884117921230482,
+      "grad_norm": 0.33286792039871216,
+      "learning_rate": 0.00042410443407234536,
+      "loss": 3.511,
+      "step": 20200
+    },
+    {
+      "epoch": 5.89868329060825,
+      "grad_norm": 0.34662094712257385,
+      "learning_rate": 0.00042366686114352385,
+      "loss": 3.512,
+      "step": 20250
+    },
+    {
+      "epoch": 5.913248659986017,
+      "grad_norm": 0.3202279508113861,
+      "learning_rate": 0.0004232292882147024,
+      "loss": 3.5006,
+      "step": 20300
+    },
+    {
+      "epoch": 5.927814029363785,
+      "grad_norm": 0.34777122735977173,
+      "learning_rate": 0.000422791715285881,
+      "loss": 3.521,
+      "step": 20350
+    },
+    {
+      "epoch": 5.9423793987415525,
+      "grad_norm": 0.34444618225097656,
+      "learning_rate": 0.00042235414235705947,
+      "loss": 3.5126,
+      "step": 20400
+    },
+    {
+      "epoch": 5.956944768119319,
+      "grad_norm": 0.3303092122077942,
+      "learning_rate": 0.000421916569428238,
+      "loss": 3.516,
+      "step": 20450
+    },
+    {
+      "epoch": 5.971510137497087,
+      "grad_norm": 0.34319791197776794,
+      "learning_rate": 0.00042147899649941654,
+      "loss": 3.5029,
+      "step": 20500
+    },
+    {
+      "epoch": 5.986075506874855,
+      "grad_norm": 0.33462879061698914,
+      "learning_rate": 0.0004210414235705951,
+      "loss": 3.509,
+      "step": 20550
+    },
+    {
+      "epoch": 6.0005826147751105,
+      "grad_norm": 0.332768976688385,
+      "learning_rate": 0.00042060385064177357,
+      "loss": 3.5038,
+      "step": 20600
+    },
+    {
+      "epoch": 6.015147984152878,
+      "grad_norm": 0.32959234714508057,
+      "learning_rate": 0.0004201662777129521,
+      "loss": 3.3948,
+      "step": 20650
+    },
+    {
+      "epoch": 6.029713353530646,
+      "grad_norm": 0.3324235677719116,
+      "learning_rate": 0.00041972870478413065,
+      "loss": 3.4112,
+      "step": 20700
+    },
+    {
+      "epoch": 6.044278722908413,
+      "grad_norm": 0.3403053879737854,
+      "learning_rate": 0.00041929113185530914,
+      "loss": 3.4081,
+      "step": 20750
+    },
+    {
+      "epoch": 6.05884409228618,
+      "grad_norm": 0.3473146855831146,
+      "learning_rate": 0.00041885355892648773,
+      "loss": 3.4049,
+      "step": 20800
+    },
+    {
+      "epoch": 6.073409461663948,
+      "grad_norm": 0.34440669417381287,
+      "learning_rate": 0.00041841598599766627,
+      "loss": 3.4084,
+      "step": 20850
+    },
+    {
+      "epoch": 6.087974831041715,
+      "grad_norm": 0.3304244875907898,
+      "learning_rate": 0.00041797841306884476,
+      "loss": 3.4264,
+      "step": 20900
+    },
+    {
+      "epoch": 6.102540200419483,
+      "grad_norm": 0.33876416087150574,
+      "learning_rate": 0.0004175408401400233,
+      "loss": 3.415,
+      "step": 20950
+    },
+    {
+      "epoch": 6.11710556979725,
+      "grad_norm": 0.3384806215763092,
+      "learning_rate": 0.00041710326721120184,
+      "loss": 3.4399,
+      "step": 21000
+    },
+    {
+      "epoch": 6.11710556979725,
+      "eval_accuracy": 0.3637827680479111,
+      "eval_loss": 3.6081302165985107,
+      "eval_runtime": 180.2744,
+      "eval_samples_per_second": 92.326,
+      "eval_steps_per_second": 5.775,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1316709391750175,
+      "grad_norm": 0.363090455532074,
+      "learning_rate": 0.0004166656942823804,
+      "loss": 3.4368,
+      "step": 21050
+    },
+    {
+      "epoch": 6.146236308552785,
+      "grad_norm": 0.3336644470691681,
+      "learning_rate": 0.00041622812135355886,
+      "loss": 3.4255,
+      "step": 21100
+    },
+    {
+      "epoch": 6.160801677930552,
+      "grad_norm": 0.3573184311389923,
+      "learning_rate": 0.0004157905484247374,
+      "loss": 3.43,
+      "step": 21150
+    },
+    {
+      "epoch": 6.1753670473083195,
+      "grad_norm": 0.3469174802303314,
+      "learning_rate": 0.000415352975495916,
+      "loss": 3.4248,
+      "step": 21200
+    },
+    {
+      "epoch": 6.189932416686087,
+      "grad_norm": 0.33994483947753906,
+      "learning_rate": 0.0004149154025670945,
+      "loss": 3.4331,
+      "step": 21250
+    },
+    {
+      "epoch": 6.204497786063855,
+      "grad_norm": 0.34334084391593933,
+      "learning_rate": 0.000414477829638273,
+      "loss": 3.4336,
+      "step": 21300
+    },
+    {
+      "epoch": 6.219063155441622,
+      "grad_norm": 0.3307756185531616,
+      "learning_rate": 0.00041404025670945156,
+      "loss": 3.4457,
+      "step": 21350
+    },
+    {
+      "epoch": 6.233628524819389,
+      "grad_norm": 0.3440045118331909,
+      "learning_rate": 0.00041360268378063005,
+      "loss": 3.4505,
+      "step": 21400
+    },
+    {
+      "epoch": 6.248193894197157,
+      "grad_norm": 0.32408636808395386,
+      "learning_rate": 0.0004131651108518086,
+      "loss": 3.4495,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2627592635749245,
+      "grad_norm": 0.3418697714805603,
+      "learning_rate": 0.0004127275379229871,
+      "loss": 3.4413,
+      "step": 21500
+    },
+    {
+      "epoch": 6.277324632952691,
+      "grad_norm": 0.3394606113433838,
+      "learning_rate": 0.00041228996499416567,
+      "loss": 3.4419,
+      "step": 21550
+    },
+    {
+      "epoch": 6.291890002330459,
+      "grad_norm": 0.3462677299976349,
+      "learning_rate": 0.00041185239206534415,
+      "loss": 3.4454,
+      "step": 21600
+    },
+    {
+      "epoch": 6.306455371708227,
+      "grad_norm": 0.33543628454208374,
+      "learning_rate": 0.00041141481913652275,
+      "loss": 3.4359,
+      "step": 21650
+    },
+    {
+      "epoch": 6.321020741085994,
+      "grad_norm": 0.3553283214569092,
+      "learning_rate": 0.0004109772462077013,
+      "loss": 3.4364,
+      "step": 21700
+    },
+    {
+      "epoch": 6.335586110463761,
+      "grad_norm": 0.3360411822795868,
+      "learning_rate": 0.00041053967327887977,
+      "loss": 3.4451,
+      "step": 21750
+    },
+    {
+      "epoch": 6.350151479841529,
+      "grad_norm": 0.33588552474975586,
+      "learning_rate": 0.0004101021003500583,
+      "loss": 3.439,
+      "step": 21800
+    },
+    {
+      "epoch": 6.364716849219296,
+      "grad_norm": 0.3321113884449005,
+      "learning_rate": 0.00040966452742123685,
+      "loss": 3.4385,
+      "step": 21850
+    },
+    {
+      "epoch": 6.379282218597064,
+      "grad_norm": 0.3304464817047119,
+      "learning_rate": 0.00040922695449241534,
+      "loss": 3.4573,
+      "step": 21900
+    },
+    {
+      "epoch": 6.393847587974831,
+      "grad_norm": 0.3388485014438629,
+      "learning_rate": 0.0004087893815635939,
+      "loss": 3.4549,
+      "step": 21950
+    },
+    {
+      "epoch": 6.408412957352598,
+      "grad_norm": 0.36697396636009216,
+      "learning_rate": 0.0004083518086347724,
+      "loss": 3.4438,
+      "step": 22000
+    },
+    {
+      "epoch": 6.408412957352598,
+      "eval_accuracy": 0.3643471363716102,
+      "eval_loss": 3.600018262863159,
+      "eval_runtime": 180.2847,
+      "eval_samples_per_second": 92.321,
+      "eval_steps_per_second": 5.774,
+      "step": 22000
+    },
+    {
+      "epoch": 6.422978326730366,
+      "grad_norm": 0.3477044999599457,
+      "learning_rate": 0.000407914235705951,
+      "loss": 3.4598,
+      "step": 22050
+    },
+    {
+      "epoch": 6.437543696108134,
+      "grad_norm": 0.32996484637260437,
+      "learning_rate": 0.00040747666277712944,
+      "loss": 3.4483,
+      "step": 22100
+    },
+    {
+      "epoch": 6.4521090654859,
+      "grad_norm": 0.33145061135292053,
+      "learning_rate": 0.00040703908984830804,
+      "loss": 3.4543,
+      "step": 22150
+    },
+    {
+      "epoch": 6.466674434863668,
+      "grad_norm": 0.33102595806121826,
+      "learning_rate": 0.0004066015169194866,
+      "loss": 3.437,
+      "step": 22200
+    },
+    {
+      "epoch": 6.481239804241436,
+      "grad_norm": 0.34182071685791016,
+      "learning_rate": 0.00040616394399066506,
+      "loss": 3.4591,
+      "step": 22250
+    },
+    {
+      "epoch": 6.495805173619203,
+      "grad_norm": 0.35360977053642273,
+      "learning_rate": 0.0004057263710618436,
+      "loss": 3.4661,
+      "step": 22300
+    },
+    {
+      "epoch": 6.51037054299697,
+      "grad_norm": 0.34044864773750305,
+      "learning_rate": 0.00040528879813302214,
+      "loss": 3.4658,
+      "step": 22350
+    },
+    {
+      "epoch": 6.524935912374738,
+      "grad_norm": 0.3496011793613434,
+      "learning_rate": 0.00040485122520420063,
+      "loss": 3.455,
+      "step": 22400
+    },
+    {
+      "epoch": 6.539501281752505,
+      "grad_norm": 0.31914111971855164,
+      "learning_rate": 0.00040441365227537917,
+      "loss": 3.4606,
+      "step": 22450
+    },
+    {
+      "epoch": 6.554066651130273,
+      "grad_norm": 0.32800233364105225,
+      "learning_rate": 0.0004039760793465577,
+      "loss": 3.4562,
+      "step": 22500
+    },
+    {
+      "epoch": 6.56863202050804,
+      "grad_norm": 0.33165040612220764,
+      "learning_rate": 0.0004035385064177363,
+      "loss": 3.455,
+      "step": 22550
+    },
+    {
+      "epoch": 6.583197389885807,
+      "grad_norm": 0.3741567134857178,
+      "learning_rate": 0.0004031009334889148,
+      "loss": 3.4562,
+      "step": 22600
+    },
+    {
+      "epoch": 6.597762759263575,
+      "grad_norm": 0.35394638776779175,
+      "learning_rate": 0.00040266336056009333,
+      "loss": 3.4607,
+      "step": 22650
+    },
+    {
+      "epoch": 6.612328128641343,
+      "grad_norm": 0.3237501084804535,
+      "learning_rate": 0.00040222578763127187,
+      "loss": 3.4658,
+      "step": 22700
+    },
+    {
+      "epoch": 6.626893498019109,
+      "grad_norm": 0.34644386172294617,
+      "learning_rate": 0.00040178821470245035,
+      "loss": 3.4642,
+      "step": 22750
+    },
+    {
+      "epoch": 6.641458867396877,
+      "grad_norm": 0.34503695368766785,
+      "learning_rate": 0.0004013506417736289,
+      "loss": 3.4739,
+      "step": 22800
+    },
+    {
+      "epoch": 6.656024236774645,
+      "grad_norm": 0.3343126177787781,
+      "learning_rate": 0.00040091306884480743,
+      "loss": 3.4505,
+      "step": 22850
+    },
+    {
+      "epoch": 6.670589606152412,
+      "grad_norm": 0.33412104845046997,
+      "learning_rate": 0.0004004754959159859,
+      "loss": 3.4603,
+      "step": 22900
+    },
+    {
+      "epoch": 6.685154975530179,
+      "grad_norm": 0.32703226804733276,
+      "learning_rate": 0.00040003792298716446,
+      "loss": 3.4612,
+      "step": 22950
+    },
+    {
+      "epoch": 6.699720344907947,
+      "grad_norm": 0.32835039496421814,
+      "learning_rate": 0.00039960035005834305,
+      "loss": 3.4622,
+      "step": 23000
+    },
+    {
+      "epoch": 6.699720344907947,
+      "eval_accuracy": 0.36494254495311274,
+      "eval_loss": 3.5912156105041504,
+      "eval_runtime": 180.4184,
+      "eval_samples_per_second": 92.252,
+      "eval_steps_per_second": 5.77,
+      "step": 23000
+    },
+    {
+      "epoch": 6.714285714285714,
+      "grad_norm": 0.3277016878128052,
+      "learning_rate": 0.0003991627771295216,
+      "loss": 3.4609,
+      "step": 23050
+    },
+    {
+      "epoch": 6.728851083663482,
+      "grad_norm": 0.3436872363090515,
+      "learning_rate": 0.0003987252042007001,
+      "loss": 3.4605,
+      "step": 23100
+    },
+    {
+      "epoch": 6.743416453041249,
+      "grad_norm": 0.32483038306236267,
+      "learning_rate": 0.0003982876312718786,
+      "loss": 3.468,
+      "step": 23150
+    },
+    {
+      "epoch": 6.7579818224190165,
+      "grad_norm": 0.3559059500694275,
+      "learning_rate": 0.00039785005834305716,
+      "loss": 3.4694,
+      "step": 23200
+    },
+    {
+      "epoch": 6.772547191796784,
+      "grad_norm": 0.34260398149490356,
+      "learning_rate": 0.00039741248541423564,
+      "loss": 3.4727,
+      "step": 23250
+    },
+    {
+      "epoch": 6.787112561174552,
+      "grad_norm": 0.32523587346076965,
+      "learning_rate": 0.0003969749124854142,
+      "loss": 3.4571,
+      "step": 23300
+    },
+    {
+      "epoch": 6.8016779305523185,
+      "grad_norm": 0.3347657322883606,
+      "learning_rate": 0.0003965373395565927,
+      "loss": 3.4717,
+      "step": 23350
+    },
+    {
+      "epoch": 6.816243299930086,
+      "grad_norm": 0.33626583218574524,
+      "learning_rate": 0.0003960997666277712,
+      "loss": 3.4646,
+      "step": 23400
+    },
+    {
+      "epoch": 6.830808669307854,
+      "grad_norm": 0.36179831624031067,
+      "learning_rate": 0.0003956621936989498,
+      "loss": 3.4717,
+      "step": 23450
+    },
+    {
+      "epoch": 6.845374038685621,
+      "grad_norm": 0.34891805052757263,
+      "learning_rate": 0.00039522462077012834,
+      "loss": 3.4699,
+      "step": 23500
+    },
+    {
+      "epoch": 6.859939408063388,
+      "grad_norm": 0.37656670808792114,
+      "learning_rate": 0.0003947870478413069,
+      "loss": 3.4674,
+      "step": 23550
+    },
+    {
+      "epoch": 6.874504777441156,
+      "grad_norm": 0.3371601402759552,
+      "learning_rate": 0.00039434947491248537,
+      "loss": 3.4684,
+      "step": 23600
+    },
+    {
+      "epoch": 6.8890701468189235,
+      "grad_norm": 0.3327315151691437,
+      "learning_rate": 0.0003939119019836639,
+      "loss": 3.4778,
+      "step": 23650
+    },
+    {
+      "epoch": 6.903635516196691,
+      "grad_norm": 0.33458471298217773,
+      "learning_rate": 0.00039347432905484245,
+      "loss": 3.4688,
+      "step": 23700
+    },
+    {
+      "epoch": 6.918200885574458,
+      "grad_norm": 0.3311387896537781,
+      "learning_rate": 0.00039303675612602094,
+      "loss": 3.4707,
+      "step": 23750
+    },
+    {
+      "epoch": 6.9327662549522255,
+      "grad_norm": 0.33576178550720215,
+      "learning_rate": 0.0003925991831971995,
+      "loss": 3.4649,
+      "step": 23800
+    },
+    {
+      "epoch": 6.947331624329993,
+      "grad_norm": 0.316501259803772,
+      "learning_rate": 0.00039216161026837807,
+      "loss": 3.4702,
+      "step": 23850
+    },
+    {
+      "epoch": 6.961896993707761,
+      "grad_norm": 0.3234950006008148,
+      "learning_rate": 0.00039172403733955656,
+      "loss": 3.4551,
+      "step": 23900
+    },
+    {
+      "epoch": 6.976462363085528,
+      "grad_norm": 0.34549012780189514,
+      "learning_rate": 0.0003912864644107351,
+      "loss": 3.4708,
+      "step": 23950
+    },
+    {
+      "epoch": 6.991027732463295,
+      "grad_norm": 0.33629485964775085,
+      "learning_rate": 0.00039084889148191364,
+      "loss": 3.4761,
+      "step": 24000
+    },
+    {
+      "epoch": 6.991027732463295,
+      "eval_accuracy": 0.3658422421224764,
+      "eval_loss": 3.5796563625335693,
+      "eval_runtime": 180.4423,
+      "eval_samples_per_second": 92.24,
+      "eval_steps_per_second": 5.769,
+      "step": 24000
+    },
+    {
+      "epoch": 7.005534840363552,
+      "grad_norm": 0.3567199409008026,
+      "learning_rate": 0.0003904113185530922,
+      "loss": 3.4224,
+      "step": 24050
+    },
+    {
+      "epoch": 7.020100209741319,
+      "grad_norm": 0.33759805560112,
+      "learning_rate": 0.00038997374562427066,
+      "loss": 3.3554,
+      "step": 24100
+    },
+    {
+      "epoch": 7.034665579119086,
+      "grad_norm": 0.3463039696216583,
+      "learning_rate": 0.0003895361726954492,
+      "loss": 3.3629,
+      "step": 24150
+    },
+    {
+      "epoch": 7.049230948496854,
+      "grad_norm": 0.34043920040130615,
+      "learning_rate": 0.00038909859976662774,
+      "loss": 3.3713,
+      "step": 24200
+    },
+    {
+      "epoch": 7.063796317874622,
+      "grad_norm": 0.3372809886932373,
+      "learning_rate": 0.0003886610268378062,
+      "loss": 3.3729,
+      "step": 24250
+    },
+    {
+      "epoch": 7.0783616872523885,
+      "grad_norm": 0.3626004159450531,
+      "learning_rate": 0.0003882234539089848,
+      "loss": 3.3779,
+      "step": 24300
+    },
+    {
+      "epoch": 7.092927056630156,
+      "grad_norm": 0.3814680278301239,
+      "learning_rate": 0.00038778588098016336,
+      "loss": 3.3831,
+      "step": 24350
+    },
+    {
+      "epoch": 7.107492426007924,
+      "grad_norm": 0.3421391248703003,
+      "learning_rate": 0.00038734830805134185,
+      "loss": 3.3799,
+      "step": 24400
+    },
+    {
+      "epoch": 7.122057795385691,
+      "grad_norm": 0.34770506620407104,
+      "learning_rate": 0.0003869107351225204,
+      "loss": 3.3751,
+      "step": 24450
+    },
+    {
+      "epoch": 7.136623164763458,
+      "grad_norm": 0.348093181848526,
+      "learning_rate": 0.0003864731621936989,
+      "loss": 3.3744,
+      "step": 24500
+    },
+    {
+      "epoch": 7.151188534141226,
+      "grad_norm": 0.34899893403053284,
+      "learning_rate": 0.00038603558926487747,
+      "loss": 3.378,
+      "step": 24550
+    },
+    {
+      "epoch": 7.165753903518993,
+      "grad_norm": 0.32636308670043945,
+      "learning_rate": 0.00038559801633605595,
+      "loss": 3.3811,
+      "step": 24600
+    },
+    {
+      "epoch": 7.180319272896761,
+      "grad_norm": 0.32693901658058167,
+      "learning_rate": 0.0003851604434072345,
+      "loss": 3.3978,
+      "step": 24650
+    },
+    {
+      "epoch": 7.194884642274528,
+      "grad_norm": 0.35337990522384644,
+      "learning_rate": 0.0003847228704784131,
+      "loss": 3.4016,
+      "step": 24700
+    },
+    {
+      "epoch": 7.2094500116522955,
+      "grad_norm": 0.33998918533325195,
+      "learning_rate": 0.0003842852975495915,
+      "loss": 3.3913,
+      "step": 24750
+    },
+    {
+      "epoch": 7.224015381030063,
+      "grad_norm": 0.34085580706596375,
+      "learning_rate": 0.0003838477246207701,
+      "loss": 3.3876,
+      "step": 24800
+    },
+    {
+      "epoch": 7.238580750407831,
+      "grad_norm": 0.34505322575569153,
+      "learning_rate": 0.00038341015169194865,
+      "loss": 3.4013,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2531461197855975,
+      "grad_norm": 0.35665056109428406,
+      "learning_rate": 0.00038297257876312714,
+      "loss": 3.3945,
+      "step": 24900
+    },
+    {
+      "epoch": 7.267711489163365,
+      "grad_norm": 0.33130306005477905,
+      "learning_rate": 0.0003825350058343057,
+      "loss": 3.3973,
+      "step": 24950
+    },
+    {
+      "epoch": 7.282276858541133,
+      "grad_norm": 0.33717137575149536,
+      "learning_rate": 0.0003820974329054842,
+      "loss": 3.4035,
+      "step": 25000
+    },
+    {
+      "epoch": 7.282276858541133,
+      "eval_accuracy": 0.36597134137652254,
+      "eval_loss": 3.5882999897003174,
+      "eval_runtime": 180.2331,
+      "eval_samples_per_second": 92.347,
+      "eval_steps_per_second": 5.776,
+      "step": 25000
+    },
+    {
+      "epoch": 7.2968422279189,
+      "grad_norm": 0.343801349401474,
+      "learning_rate": 0.00038165985997666276,
+      "loss": 3.3994,
+      "step": 25050
+    },
+    {
+      "epoch": 7.311407597296667,
+      "grad_norm": 0.34225597977638245,
+      "learning_rate": 0.00038122228704784124,
+      "loss": 3.4,
+      "step": 25100
+    },
+    {
+      "epoch": 7.325972966674435,
+      "grad_norm": 0.3473186492919922,
+      "learning_rate": 0.0003807847141190198,
+      "loss": 3.397,
+      "step": 25150
+    },
+    {
+      "epoch": 7.3405383360522025,
+      "grad_norm": 0.3287709653377533,
+      "learning_rate": 0.0003803471411901984,
+      "loss": 3.4058,
+      "step": 25200
+    },
+    {
+      "epoch": 7.35510370542997,
+      "grad_norm": 0.351204514503479,
+      "learning_rate": 0.00037990956826137686,
+      "loss": 3.4163,
+      "step": 25250
+    },
+    {
+      "epoch": 7.369669074807737,
+      "grad_norm": 0.35390347242355347,
+      "learning_rate": 0.0003794719953325554,
+      "loss": 3.4228,
+      "step": 25300
+    },
+    {
+      "epoch": 7.384234444185505,
+      "grad_norm": 0.3401016891002655,
+      "learning_rate": 0.00037903442240373394,
+      "loss": 3.406,
+      "step": 25350
+    },
+    {
+      "epoch": 7.398799813563272,
+      "grad_norm": 0.35391902923583984,
+      "learning_rate": 0.00037859684947491243,
+      "loss": 3.4097,
+      "step": 25400
+    },
+    {
+      "epoch": 7.413365182941039,
+      "grad_norm": 0.342098206281662,
+      "learning_rate": 0.00037815927654609097,
+      "loss": 3.4266,
+      "step": 25450
+    },
+    {
+      "epoch": 7.427930552318807,
+      "grad_norm": 0.34706324338912964,
+      "learning_rate": 0.0003777217036172695,
+      "loss": 3.4083,
+      "step": 25500
+    },
+    {
+      "epoch": 7.442495921696574,
+      "grad_norm": 0.35251185297966003,
+      "learning_rate": 0.00037728413068844805,
+      "loss": 3.4125,
+      "step": 25550
+    },
+    {
+      "epoch": 7.457061291074342,
+      "grad_norm": 0.3509294390678406,
+      "learning_rate": 0.00037684655775962653,
+      "loss": 3.4203,
+      "step": 25600
+    },
+    {
+      "epoch": 7.471626660452109,
+      "grad_norm": 0.3560699224472046,
+      "learning_rate": 0.00037640898483080513,
+      "loss": 3.4272,
+      "step": 25650
+    },
+    {
+      "epoch": 7.486192029829876,
+      "grad_norm": 0.34861257672309875,
+      "learning_rate": 0.00037597141190198367,
+      "loss": 3.4182,
+      "step": 25700
+    },
+    {
+      "epoch": 7.500757399207644,
+      "grad_norm": 0.33859142661094666,
+      "learning_rate": 0.00037553383897316215,
+      "loss": 3.4299,
+      "step": 25750
+    },
+    {
+      "epoch": 7.515322768585412,
+      "grad_norm": 0.35380759835243225,
+      "learning_rate": 0.0003750962660443407,
+      "loss": 3.4201,
+      "step": 25800
+    },
+    {
+      "epoch": 7.529888137963178,
+      "grad_norm": 0.34941068291664124,
+      "learning_rate": 0.00037465869311551923,
+      "loss": 3.4167,
+      "step": 25850
+    },
+    {
+      "epoch": 7.544453507340946,
+      "grad_norm": 0.35646477341651917,
+      "learning_rate": 0.0003742211201866977,
+      "loss": 3.4306,
+      "step": 25900
+    },
+    {
+      "epoch": 7.559018876718714,
+      "grad_norm": 0.35378143191337585,
+      "learning_rate": 0.00037378354725787626,
+      "loss": 3.4086,
+      "step": 25950
+    },
+    {
+      "epoch": 7.573584246096481,
+      "grad_norm": 0.3527311384677887,
+      "learning_rate": 0.0003733459743290548,
+      "loss": 3.4207,
+      "step": 26000
+    },
+    {
+      "epoch": 7.573584246096481,
+      "eval_accuracy": 0.36644117800600207,
+      "eval_loss": 3.5814883708953857,
+      "eval_runtime": 180.4499,
+      "eval_samples_per_second": 92.236,
+      "eval_steps_per_second": 5.769,
+      "step": 26000
+    },
+    {
+      "epoch": 7.588149615474248,
+      "grad_norm": 0.3543234169483185,
+      "learning_rate": 0.0003729084014002334,
+      "loss": 3.4278,
+      "step": 26050
+    },
+    {
+      "epoch": 7.602714984852016,
+      "grad_norm": 0.35281285643577576,
+      "learning_rate": 0.0003724708284714119,
+      "loss": 3.421,
+      "step": 26100
+    },
+    {
+      "epoch": 7.617280354229783,
+      "grad_norm": 0.3394710123538971,
+      "learning_rate": 0.0003720332555425904,
+      "loss": 3.4238,
+      "step": 26150
+    },
+    {
+      "epoch": 7.631845723607551,
+      "grad_norm": 0.34304413199424744,
+      "learning_rate": 0.00037159568261376896,
+      "loss": 3.4145,
+      "step": 26200
+    },
+    {
+      "epoch": 7.646411092985318,
+      "grad_norm": 0.3429325520992279,
+      "learning_rate": 0.00037115810968494744,
+      "loss": 3.4268,
+      "step": 26250
+    },
+    {
+      "epoch": 7.660976462363085,
+      "grad_norm": 0.3383738100528717,
+      "learning_rate": 0.000370720536756126,
+      "loss": 3.4228,
+      "step": 26300
+    },
+    {
+      "epoch": 7.675541831740853,
+      "grad_norm": 0.3476937413215637,
+      "learning_rate": 0.0003702829638273045,
+      "loss": 3.4207,
+      "step": 26350
+    },
+    {
+      "epoch": 7.690107201118621,
+      "grad_norm": 0.342341810464859,
+      "learning_rate": 0.000369845390898483,
+      "loss": 3.4207,
+      "step": 26400
+    },
+    {
+      "epoch": 7.704672570496387,
+      "grad_norm": 0.35490912199020386,
+      "learning_rate": 0.00036940781796966155,
+      "loss": 3.4281,
+      "step": 26450
+    },
+    {
+      "epoch": 7.719237939874155,
+      "grad_norm": 0.34896060824394226,
+      "learning_rate": 0.00036897024504084014,
+      "loss": 3.4361,
+      "step": 26500
+    },
+    {
+      "epoch": 7.733803309251923,
+      "grad_norm": 0.3462172746658325,
+      "learning_rate": 0.0003685326721120187,
+      "loss": 3.425,
+      "step": 26550
+    },
+    {
+      "epoch": 7.74836867862969,
+      "grad_norm": 0.35829275846481323,
+      "learning_rate": 0.00036809509918319717,
+      "loss": 3.4346,
+      "step": 26600
+    },
+    {
+      "epoch": 7.762934048007457,
+      "grad_norm": 0.3367747664451599,
+      "learning_rate": 0.0003676575262543757,
+      "loss": 3.4247,
+      "step": 26650
+    },
+    {
+      "epoch": 7.777499417385225,
+      "grad_norm": 0.33087530732154846,
+      "learning_rate": 0.00036721995332555425,
+      "loss": 3.432,
+      "step": 26700
+    },
+    {
+      "epoch": 7.792064786762992,
+      "grad_norm": 0.3543736934661865,
+      "learning_rate": 0.00036678238039673274,
+      "loss": 3.4379,
+      "step": 26750
+    },
+    {
+      "epoch": 7.80663015614076,
+      "grad_norm": 0.3304196894168854,
+      "learning_rate": 0.0003663448074679113,
+      "loss": 3.4238,
+      "step": 26800
+    },
+    {
+      "epoch": 7.821195525518527,
+      "grad_norm": 0.35223904252052307,
+      "learning_rate": 0.0003659072345390898,
+      "loss": 3.426,
+      "step": 26850
+    },
+    {
+      "epoch": 7.8357608948962945,
+      "grad_norm": 0.34050217270851135,
+      "learning_rate": 0.0003654696616102683,
+      "loss": 3.4172,
+      "step": 26900
+    },
+    {
+      "epoch": 7.850326264274062,
+      "grad_norm": 0.3450503349304199,
+      "learning_rate": 0.00036503208868144684,
+      "loss": 3.4337,
+      "step": 26950
+    },
+    {
+      "epoch": 7.86489163365183,
+      "grad_norm": 0.3508300483226776,
+      "learning_rate": 0.00036459451575262543,
+      "loss": 3.4351,
+      "step": 27000
+    },
+    {
+      "epoch": 7.86489163365183,
+      "eval_accuracy": 0.36722659058981666,
+      "eval_loss": 3.571290969848633,
+      "eval_runtime": 180.2666,
+      "eval_samples_per_second": 92.33,
+      "eval_steps_per_second": 5.775,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8794570030295965,
+      "grad_norm": 0.35574260354042053,
+      "learning_rate": 0.000364156942823804,
+      "loss": 3.4341,
+      "step": 27050
+    },
+    {
+      "epoch": 7.894022372407364,
+      "grad_norm": 0.34330523014068604,
+      "learning_rate": 0.00036371936989498246,
+      "loss": 3.4396,
+      "step": 27100
+    },
+    {
+      "epoch": 7.908587741785132,
+      "grad_norm": 0.35326018929481506,
+      "learning_rate": 0.000363281796966161,
+      "loss": 3.4192,
+      "step": 27150
+    },
+    {
+      "epoch": 7.923153111162899,
+      "grad_norm": 0.356656938791275,
+      "learning_rate": 0.00036284422403733954,
+      "loss": 3.4359,
+      "step": 27200
+    },
+    {
+      "epoch": 7.937718480540666,
+      "grad_norm": 0.32995936274528503,
+      "learning_rate": 0.000362406651108518,
+      "loss": 3.4363,
+      "step": 27250
+    },
+    {
+      "epoch": 7.952283849918434,
+      "grad_norm": 0.3421317934989929,
+      "learning_rate": 0.00036196907817969657,
+      "loss": 3.4263,
+      "step": 27300
+    },
+    {
+      "epoch": 7.9668492192962015,
+      "grad_norm": 0.33741775155067444,
+      "learning_rate": 0.0003615315052508751,
+      "loss": 3.4332,
+      "step": 27350
+    },
+    {
+      "epoch": 7.981414588673969,
+      "grad_norm": 0.34820324182510376,
+      "learning_rate": 0.0003610939323220536,
+      "loss": 3.4311,
+      "step": 27400
+    },
+    {
+      "epoch": 7.995979958051736,
+      "grad_norm": 0.36536943912506104,
+      "learning_rate": 0.0003606563593932322,
+      "loss": 3.4287,
+      "step": 27450
+    },
+    {
+      "epoch": 8.010487065951992,
+      "grad_norm": 0.3467245399951935,
+      "learning_rate": 0.0003602187864644107,
+      "loss": 3.3578,
+      "step": 27500
+    },
+    {
+      "epoch": 8.02505243532976,
+      "grad_norm": 0.36606982350349426,
+      "learning_rate": 0.00035978121353558927,
+      "loss": 3.3254,
+      "step": 27550
+    },
+    {
+      "epoch": 8.039617804707527,
+      "grad_norm": 0.370090126991272,
+      "learning_rate": 0.00035934364060676775,
+      "loss": 3.3324,
+      "step": 27600
+    },
+    {
+      "epoch": 8.054183174085296,
+      "grad_norm": 0.3692021667957306,
+      "learning_rate": 0.0003589060676779463,
+      "loss": 3.3385,
+      "step": 27650
+    },
+    {
+      "epoch": 8.068748543463062,
+      "grad_norm": 0.35137107968330383,
+      "learning_rate": 0.00035846849474912483,
+      "loss": 3.3448,
+      "step": 27700
+    },
+    {
+      "epoch": 8.08331391284083,
+      "grad_norm": 0.3459080755710602,
+      "learning_rate": 0.0003580309218203033,
+      "loss": 3.3318,
+      "step": 27750
+    },
+    {
+      "epoch": 8.097879282218598,
+      "grad_norm": 0.35793742537498474,
+      "learning_rate": 0.00035759334889148186,
+      "loss": 3.3345,
+      "step": 27800
+    },
+    {
+      "epoch": 8.112444651596364,
+      "grad_norm": 0.35751616954803467,
+      "learning_rate": 0.00035715577596266045,
+      "loss": 3.3613,
+      "step": 27850
+    },
+    {
+      "epoch": 8.127010020974131,
+      "grad_norm": 0.3466125428676605,
+      "learning_rate": 0.00035671820303383894,
+      "loss": 3.3478,
+      "step": 27900
+    },
+    {
+      "epoch": 8.1415753903519,
+      "grad_norm": 0.3528430759906769,
+      "learning_rate": 0.0003562806301050175,
+      "loss": 3.3564,
+      "step": 27950
+    },
+    {
+      "epoch": 8.156140759729666,
+      "grad_norm": 0.36010900139808655,
+      "learning_rate": 0.000355843057176196,
+      "loss": 3.3456,
+      "step": 28000
+    },
+    {
+      "epoch": 8.156140759729666,
+      "eval_accuracy": 0.36759131361900715,
+      "eval_loss": 3.580268144607544,
+      "eval_runtime": 180.2651,
+      "eval_samples_per_second": 92.331,
+      "eval_steps_per_second": 5.775,
+      "step": 28000
+    },
+    {
+      "epoch": 8.170706129107435,
+      "grad_norm": 0.3616182804107666,
+      "learning_rate": 0.00035540548424737456,
+      "loss": 3.3554,
+      "step": 28050
+    },
+    {
+      "epoch": 8.185271498485202,
+      "grad_norm": 0.3429206311702728,
+      "learning_rate": 0.00035496791131855304,
+      "loss": 3.3689,
+      "step": 28100
+    },
+    {
+      "epoch": 8.199836867862969,
+      "grad_norm": 0.3601152300834656,
+      "learning_rate": 0.0003545303383897316,
+      "loss": 3.3647,
+      "step": 28150
+    },
+    {
+      "epoch": 8.214402237240737,
+      "grad_norm": 0.346986323595047,
+      "learning_rate": 0.0003540927654609101,
+      "loss": 3.3639,
+      "step": 28200
+    },
+    {
+      "epoch": 8.228967606618504,
+      "grad_norm": 0.3525499105453491,
+      "learning_rate": 0.0003536551925320886,
+      "loss": 3.3606,
+      "step": 28250
+    },
+    {
+      "epoch": 8.24353297599627,
+      "grad_norm": 0.3487248420715332,
+      "learning_rate": 0.0003532176196032672,
+      "loss": 3.3582,
+      "step": 28300
+    },
+    {
+      "epoch": 8.258098345374039,
+      "grad_norm": 0.3517068028450012,
+      "learning_rate": 0.00035278004667444574,
+      "loss": 3.3674,
+      "step": 28350
+    },
+    {
+      "epoch": 8.272663714751806,
+      "grad_norm": 0.3672351837158203,
+      "learning_rate": 0.00035234247374562423,
+      "loss": 3.3631,
+      "step": 28400
+    },
+    {
+      "epoch": 8.287229084129574,
+      "grad_norm": 0.3655698001384735,
+      "learning_rate": 0.00035190490081680277,
+      "loss": 3.3605,
+      "step": 28450
+    },
+    {
+      "epoch": 8.301794453507341,
+      "grad_norm": 0.3492221534252167,
+      "learning_rate": 0.0003514673278879813,
+      "loss": 3.3721,
+      "step": 28500
+    },
+    {
+      "epoch": 8.316359822885108,
+      "grad_norm": 0.33222460746765137,
+      "learning_rate": 0.00035102975495915985,
+      "loss": 3.3655,
+      "step": 28550
+    },
+    {
+      "epoch": 8.330925192262876,
+      "grad_norm": 0.3473198115825653,
+      "learning_rate": 0.00035059218203033833,
+      "loss": 3.3765,
+      "step": 28600
+    },
+    {
+      "epoch": 8.345490561640643,
+      "grad_norm": 0.3525267541408539,
+      "learning_rate": 0.0003501546091015169,
+      "loss": 3.3692,
+      "step": 28650
+    },
+    {
+      "epoch": 8.36005593101841,
+      "grad_norm": 0.3442334234714508,
+      "learning_rate": 0.00034971703617269547,
+      "loss": 3.3892,
+      "step": 28700
+    },
+    {
+      "epoch": 8.374621300396178,
+      "grad_norm": 0.35674968361854553,
+      "learning_rate": 0.0003492794632438739,
+      "loss": 3.378,
+      "step": 28750
+    },
+    {
+      "epoch": 8.389186669773945,
+      "grad_norm": 0.34580376744270325,
+      "learning_rate": 0.0003488418903150525,
+      "loss": 3.3797,
+      "step": 28800
+    },
+    {
+      "epoch": 8.403752039151712,
+      "grad_norm": 0.34604698419570923,
+      "learning_rate": 0.00034840431738623103,
+      "loss": 3.3674,
+      "step": 28850
+    },
+    {
+      "epoch": 8.41831740852948,
+      "grad_norm": 0.3592158854007721,
+      "learning_rate": 0.0003479667444574095,
+      "loss": 3.377,
+      "step": 28900
+    },
+    {
+      "epoch": 8.432882777907247,
+      "grad_norm": 0.359283447265625,
+      "learning_rate": 0.00034752917152858806,
+      "loss": 3.3884,
+      "step": 28950
+    },
+    {
+      "epoch": 8.447448147285016,
+      "grad_norm": 0.34626027941703796,
+      "learning_rate": 0.0003470915985997666,
+      "loss": 3.379,
+      "step": 29000
+    },
+    {
+      "epoch": 8.447448147285016,
+      "eval_accuracy": 0.3679311103805677,
+      "eval_loss": 3.57328462600708,
+      "eval_runtime": 180.157,
+      "eval_samples_per_second": 92.386,
+      "eval_steps_per_second": 5.778,
+      "step": 29000
+    },
+    {
+      "epoch": 8.462013516662783,
+      "grad_norm": 0.3666757643222809,
+      "learning_rate": 0.00034665402567094514,
+      "loss": 3.3796,
+      "step": 29050
+    },
+    {
+      "epoch": 8.47657888604055,
+      "grad_norm": 0.3576624393463135,
+      "learning_rate": 0.0003462164527421236,
+      "loss": 3.3846,
+      "step": 29100
+    },
+    {
+      "epoch": 8.491144255418318,
+      "grad_norm": 0.36986008286476135,
+      "learning_rate": 0.00034577887981330216,
+      "loss": 3.3789,
+      "step": 29150
+    },
+    {
+      "epoch": 8.505709624796085,
+      "grad_norm": 0.34708988666534424,
+      "learning_rate": 0.00034534130688448076,
+      "loss": 3.3752,
+      "step": 29200
+    },
+    {
+      "epoch": 8.520274994173853,
+      "grad_norm": 0.36563989520072937,
+      "learning_rate": 0.00034490373395565924,
+      "loss": 3.3733,
+      "step": 29250
+    },
+    {
+      "epoch": 8.53484036355162,
+      "grad_norm": 0.36509010195732117,
+      "learning_rate": 0.0003444661610268378,
+      "loss": 3.3893,
+      "step": 29300
+    },
+    {
+      "epoch": 8.549405732929387,
+      "grad_norm": 0.3598864674568176,
+      "learning_rate": 0.0003440285880980163,
+      "loss": 3.3822,
+      "step": 29350
+    },
+    {
+      "epoch": 8.563971102307155,
+      "grad_norm": 0.3573833405971527,
+      "learning_rate": 0.0003435910151691948,
+      "loss": 3.3875,
+      "step": 29400
+    },
+    {
+      "epoch": 8.578536471684922,
+      "grad_norm": 0.33812621235847473,
+      "learning_rate": 0.00034315344224037335,
+      "loss": 3.3907,
+      "step": 29450
+    },
+    {
+      "epoch": 8.593101841062689,
+      "grad_norm": 0.3446572422981262,
+      "learning_rate": 0.0003427158693115519,
+      "loss": 3.3985,
+      "step": 29500
+    },
+    {
+      "epoch": 8.607667210440457,
+      "grad_norm": 0.34378212690353394,
+      "learning_rate": 0.00034227829638273043,
+      "loss": 3.3868,
+      "step": 29550
+    },
+    {
+      "epoch": 8.622232579818224,
+      "grad_norm": 0.35360199213027954,
+      "learning_rate": 0.0003418407234539089,
+      "loss": 3.4024,
+      "step": 29600
+    },
+    {
+      "epoch": 8.63679794919599,
+      "grad_norm": 0.35581034421920776,
+      "learning_rate": 0.0003414031505250875,
+      "loss": 3.3898,
+      "step": 29650
+    },
+    {
+      "epoch": 8.65136331857376,
+      "grad_norm": 0.35615333914756775,
+      "learning_rate": 0.00034096557759626605,
+      "loss": 3.3959,
+      "step": 29700
+    },
+    {
+      "epoch": 8.665928687951526,
+      "grad_norm": 0.35061442852020264,
+      "learning_rate": 0.00034052800466744453,
+      "loss": 3.39,
+      "step": 29750
+    },
+    {
+      "epoch": 8.680494057329295,
+      "grad_norm": 0.3618820905685425,
+      "learning_rate": 0.0003400904317386231,
+      "loss": 3.3843,
+      "step": 29800
+    },
+    {
+      "epoch": 8.695059426707061,
+      "grad_norm": 0.34694838523864746,
+      "learning_rate": 0.0003396528588098016,
+      "loss": 3.3842,
+      "step": 29850
+    },
+    {
+      "epoch": 8.709624796084828,
+      "grad_norm": 0.34534159302711487,
+      "learning_rate": 0.0003392152858809801,
+      "loss": 3.392,
+      "step": 29900
+    },
+    {
+      "epoch": 8.724190165462597,
+      "grad_norm": 0.35484299063682556,
+      "learning_rate": 0.00033877771295215864,
+      "loss": 3.3917,
+      "step": 29950
+    },
+    {
+      "epoch": 8.738755534840363,
+      "grad_norm": 0.34446921944618225,
+      "learning_rate": 0.0003383401400233372,
+      "loss": 3.3926,
+      "step": 30000
+    },
+    {
+      "epoch": 8.738755534840363,
+      "eval_accuracy": 0.3686057656808563,
+      "eval_loss": 3.563647985458374,
+      "eval_runtime": 180.6198,
+      "eval_samples_per_second": 92.149,
+      "eval_steps_per_second": 5.763,
+      "step": 30000
+    },
+    {
+      "epoch": 8.753320904218132,
+      "grad_norm": 0.38022834062576294,
+      "learning_rate": 0.0003379025670945158,
+      "loss": 3.3971,
+      "step": 30050
+    },
+    {
+      "epoch": 8.767886273595899,
+      "grad_norm": 0.3438722491264343,
+      "learning_rate": 0.00033746499416569426,
+      "loss": 3.3964,
+      "step": 30100
+    },
+    {
+      "epoch": 8.782451642973665,
+      "grad_norm": 0.3680688142776489,
+      "learning_rate": 0.0003370274212368728,
+      "loss": 3.4007,
+      "step": 30150
+    },
+    {
+      "epoch": 8.797017012351434,
+      "grad_norm": 0.3484303653240204,
+      "learning_rate": 0.00033658984830805134,
+      "loss": 3.3908,
+      "step": 30200
+    },
+    {
+      "epoch": 8.8115823817292,
+      "grad_norm": 0.34887251257896423,
+      "learning_rate": 0.0003361522753792298,
+      "loss": 3.3864,
+      "step": 30250
+    },
+    {
+      "epoch": 8.826147751106967,
+      "grad_norm": 0.38353490829467773,
+      "learning_rate": 0.00033571470245040837,
+      "loss": 3.3924,
+      "step": 30300
+    },
+    {
+      "epoch": 8.840713120484736,
+      "grad_norm": 0.3619522750377655,
+      "learning_rate": 0.0003352771295215869,
+      "loss": 3.3943,
+      "step": 30350
+    },
+    {
+      "epoch": 8.855278489862503,
+      "grad_norm": 0.34908801317214966,
+      "learning_rate": 0.0003348395565927654,
+      "loss": 3.4001,
+      "step": 30400
+    },
+    {
+      "epoch": 8.86984385924027,
+      "grad_norm": 0.3426980972290039,
+      "learning_rate": 0.00033440198366394393,
+      "loss": 3.3924,
+      "step": 30450
+    },
+    {
+      "epoch": 8.884409228618038,
+      "grad_norm": 0.3510425090789795,
+      "learning_rate": 0.0003339644107351225,
+      "loss": 3.3992,
+      "step": 30500
+    },
+    {
+      "epoch": 8.898974597995805,
+      "grad_norm": 0.3653899133205414,
+      "learning_rate": 0.00033352683780630107,
+      "loss": 3.3945,
+      "step": 30550
+    },
+    {
+      "epoch": 8.913539967373573,
+      "grad_norm": 0.3542352318763733,
+      "learning_rate": 0.00033308926487747955,
+      "loss": 3.4124,
+      "step": 30600
+    },
+    {
+      "epoch": 8.92810533675134,
+      "grad_norm": 0.3585004508495331,
+      "learning_rate": 0.0003326516919486581,
+      "loss": 3.3896,
+      "step": 30650
+    },
+    {
+      "epoch": 8.942670706129107,
+      "grad_norm": 0.3748304843902588,
+      "learning_rate": 0.00033221411901983663,
+      "loss": 3.3986,
+      "step": 30700
+    },
+    {
+      "epoch": 8.957236075506875,
+      "grad_norm": 0.3452504277229309,
+      "learning_rate": 0.0003317765460910151,
+      "loss": 3.3955,
+      "step": 30750
+    },
+    {
+      "epoch": 8.971801444884642,
+      "grad_norm": 0.3590230941772461,
+      "learning_rate": 0.00033133897316219366,
+      "loss": 3.396,
+      "step": 30800
+    },
+    {
+      "epoch": 8.986366814262409,
+      "grad_norm": 0.3469237685203552,
+      "learning_rate": 0.0003309014002333722,
+      "loss": 3.3886,
+      "step": 30850
+    },
+    {
+      "epoch": 9.000873922162667,
+      "grad_norm": 0.34952783584594727,
+      "learning_rate": 0.0003304638273045507,
+      "loss": 3.3905,
+      "step": 30900
+    },
+    {
+      "epoch": 9.015439291540433,
+      "grad_norm": 0.3712822198867798,
+      "learning_rate": 0.0003300262543757292,
+      "loss": 3.2828,
+      "step": 30950
+    },
+    {
+      "epoch": 9.0300046609182,
+      "grad_norm": 0.371115505695343,
+      "learning_rate": 0.0003295886814469078,
+      "loss": 3.305,
+      "step": 31000
+    },
+    {
+      "epoch": 9.0300046609182,
+      "eval_accuracy": 0.36886643330036484,
+      "eval_loss": 3.5686051845550537,
+      "eval_runtime": 181.9038,
+      "eval_samples_per_second": 91.499,
+      "eval_steps_per_second": 5.723,
+      "step": 31000
+    },
+    {
+      "epoch": 9.044570030295969,
+      "grad_norm": 0.3616848289966583,
+      "learning_rate": 0.00032915110851808636,
+      "loss": 3.2979,
+      "step": 31050
+    },
+    {
+      "epoch": 9.059135399673735,
+      "grad_norm": 0.3446025550365448,
+      "learning_rate": 0.00032871353558926484,
+      "loss": 3.3074,
+      "step": 31100
+    },
+    {
+      "epoch": 9.073700769051504,
+      "grad_norm": 0.36741337180137634,
+      "learning_rate": 0.0003282759626604434,
+      "loss": 3.2965,
+      "step": 31150
+    },
+    {
+      "epoch": 9.08826613842927,
+      "grad_norm": 0.3401558995246887,
+      "learning_rate": 0.0003278383897316219,
+      "loss": 3.305,
+      "step": 31200
+    },
+    {
+      "epoch": 9.102831507807037,
+      "grad_norm": 0.364580363035202,
+      "learning_rate": 0.0003274008168028004,
+      "loss": 3.3159,
+      "step": 31250
+    },
+    {
+      "epoch": 9.117396877184806,
+      "grad_norm": 0.3511641025543213,
+      "learning_rate": 0.00032696324387397895,
+      "loss": 3.3232,
+      "step": 31300
+    },
+    {
+      "epoch": 9.131962246562573,
+      "grad_norm": 0.3950608968734741,
+      "learning_rate": 0.0003265256709451575,
+      "loss": 3.3135,
+      "step": 31350
+    },
+    {
+      "epoch": 9.14652761594034,
+      "grad_norm": 0.35723525285720825,
+      "learning_rate": 0.00032608809801633597,
+      "loss": 3.3117,
+      "step": 31400
+    },
+    {
+      "epoch": 9.161092985318108,
+      "grad_norm": 0.38060232996940613,
+      "learning_rate": 0.00032565052508751457,
+      "loss": 3.3211,
+      "step": 31450
+    },
+    {
+      "epoch": 9.175658354695875,
+      "grad_norm": 0.3600994944572449,
+      "learning_rate": 0.0003252129521586931,
+      "loss": 3.3195,
+      "step": 31500
+    },
+    {
+      "epoch": 9.190223724073643,
+      "grad_norm": 0.38114050030708313,
+      "learning_rate": 0.00032477537922987165,
+      "loss": 3.3272,
+      "step": 31550
+    },
+    {
+      "epoch": 9.20478909345141,
+      "grad_norm": 0.35321810841560364,
+      "learning_rate": 0.00032433780630105013,
+      "loss": 3.3198,
+      "step": 31600
+    },
+    {
+      "epoch": 9.219354462829177,
+      "grad_norm": 0.3709951639175415,
+      "learning_rate": 0.00032390023337222867,
+      "loss": 3.3285,
+      "step": 31650
+    },
+    {
+      "epoch": 9.233919832206945,
+      "grad_norm": 0.3782629668712616,
+      "learning_rate": 0.0003234626604434072,
+      "loss": 3.3391,
+      "step": 31700
+    },
+    {
+      "epoch": 9.248485201584712,
+      "grad_norm": 0.3684213161468506,
+      "learning_rate": 0.0003230250875145857,
+      "loss": 3.3321,
+      "step": 31750
+    },
+    {
+      "epoch": 9.263050570962479,
+      "grad_norm": 0.37817445397377014,
+      "learning_rate": 0.00032258751458576424,
+      "loss": 3.3253,
+      "step": 31800
+    },
+    {
+      "epoch": 9.277615940340247,
+      "grad_norm": 0.37698403000831604,
+      "learning_rate": 0.00032214994165694283,
+      "loss": 3.3263,
+      "step": 31850
+    },
+    {
+      "epoch": 9.292181309718014,
+      "grad_norm": 0.35898423194885254,
+      "learning_rate": 0.0003217123687281213,
+      "loss": 3.3407,
+      "step": 31900
+    },
+    {
+      "epoch": 9.306746679095783,
+      "grad_norm": 0.37121668457984924,
+      "learning_rate": 0.00032127479579929986,
+      "loss": 3.3469,
+      "step": 31950
+    },
+    {
+      "epoch": 9.32131204847355,
+      "grad_norm": 0.3602243959903717,
+      "learning_rate": 0.0003208372228704784,
+      "loss": 3.333,
+      "step": 32000
+    },
+    {
+      "epoch": 9.32131204847355,
+      "eval_accuracy": 0.36881940260672325,
+      "eval_loss": 3.568206787109375,
+      "eval_runtime": 181.6399,
+      "eval_samples_per_second": 91.632,
+      "eval_steps_per_second": 5.731,
+      "step": 32000
+    },
+    {
+      "epoch": 9.335877417851316,
+      "grad_norm": 0.3777805268764496,
+      "learning_rate": 0.00032039964994165694,
+      "loss": 3.3404,
+      "step": 32050
+    },
+    {
+      "epoch": 9.350442787229085,
+      "grad_norm": 0.36622655391693115,
+      "learning_rate": 0.0003199620770128354,
+      "loss": 3.3462,
+      "step": 32100
+    },
+    {
+      "epoch": 9.365008156606851,
+      "grad_norm": 0.3432258069515228,
+      "learning_rate": 0.00031952450408401396,
+      "loss": 3.3499,
+      "step": 32150
+    },
+    {
+      "epoch": 9.379573525984618,
+      "grad_norm": 0.3571391999721527,
+      "learning_rate": 0.0003190869311551925,
+      "loss": 3.3437,
+      "step": 32200
+    },
+    {
+      "epoch": 9.394138895362387,
+      "grad_norm": 0.3796580731868744,
+      "learning_rate": 0.000318649358226371,
+      "loss": 3.3445,
+      "step": 32250
+    },
+    {
+      "epoch": 9.408704264740154,
+      "grad_norm": 0.3999924659729004,
+      "learning_rate": 0.0003182117852975496,
+      "loss": 3.3398,
+      "step": 32300
+    },
+    {
+      "epoch": 9.423269634117922,
+      "grad_norm": 0.3521633744239807,
+      "learning_rate": 0.0003177742123687281,
+      "loss": 3.3518,
+      "step": 32350
+    },
+    {
+      "epoch": 9.437835003495689,
+      "grad_norm": 0.34816059470176697,
+      "learning_rate": 0.0003173366394399066,
+      "loss": 3.3498,
+      "step": 32400
+    },
+    {
+      "epoch": 9.452400372873456,
+      "grad_norm": 0.3519940674304962,
+      "learning_rate": 0.00031689906651108515,
+      "loss": 3.3491,
+      "step": 32450
+    },
+    {
+      "epoch": 9.466965742251224,
+      "grad_norm": 0.366641104221344,
+      "learning_rate": 0.0003164614935822637,
+      "loss": 3.3575,
+      "step": 32500
+    },
+    {
+      "epoch": 9.48153111162899,
+      "grad_norm": 0.3859027028083801,
+      "learning_rate": 0.00031602392065344223,
+      "loss": 3.3477,
+      "step": 32550
+    },
+    {
+      "epoch": 9.496096481006758,
+      "grad_norm": 0.3514662981033325,
+      "learning_rate": 0.0003155863477246207,
+      "loss": 3.3536,
+      "step": 32600
+    },
+    {
+      "epoch": 9.510661850384526,
+      "grad_norm": 0.37433597445487976,
+      "learning_rate": 0.00031514877479579925,
+      "loss": 3.3606,
+      "step": 32650
+    },
+    {
+      "epoch": 9.525227219762293,
+      "grad_norm": 0.3747974932193756,
+      "learning_rate": 0.00031471120186697785,
+      "loss": 3.3621,
+      "step": 32700
+    },
+    {
+      "epoch": 9.53979258914006,
+      "grad_norm": 0.38271790742874146,
+      "learning_rate": 0.00031427362893815633,
+      "loss": 3.3599,
+      "step": 32750
+    },
+    {
+      "epoch": 9.554357958517828,
+      "grad_norm": 0.3738161027431488,
+      "learning_rate": 0.0003138360560093349,
+      "loss": 3.35,
+      "step": 32800
+    },
+    {
+      "epoch": 9.568923327895595,
+      "grad_norm": 0.37082603573799133,
+      "learning_rate": 0.0003133984830805134,
+      "loss": 3.3593,
+      "step": 32850
+    },
+    {
+      "epoch": 9.583488697273363,
+      "grad_norm": 0.38742882013320923,
+      "learning_rate": 0.0003129609101516919,
+      "loss": 3.3513,
+      "step": 32900
+    },
+    {
+      "epoch": 9.59805406665113,
+      "grad_norm": 0.36848726868629456,
+      "learning_rate": 0.00031252333722287044,
+      "loss": 3.3619,
+      "step": 32950
+    },
+    {
+      "epoch": 9.612619436028897,
+      "grad_norm": 0.34450680017471313,
+      "learning_rate": 0.000312085764294049,
+      "loss": 3.3523,
+      "step": 33000
+    },
+    {
+      "epoch": 9.612619436028897,
+      "eval_accuracy": 0.36961469163620253,
+      "eval_loss": 3.5594406127929688,
+      "eval_runtime": 181.5027,
+      "eval_samples_per_second": 91.701,
+      "eval_steps_per_second": 5.735,
+      "step": 33000
+    },
+    {
+      "epoch": 9.627184805406666,
+      "grad_norm": 0.34740373492240906,
+      "learning_rate": 0.0003116481913652275,
+      "loss": 3.3523,
+      "step": 33050
+    },
+    {
+      "epoch": 9.641750174784432,
+      "grad_norm": 0.3639390468597412,
+      "learning_rate": 0.000311210618436406,
+      "loss": 3.3569,
+      "step": 33100
+    },
+    {
+      "epoch": 9.6563155441622,
+      "grad_norm": 0.3668532073497772,
+      "learning_rate": 0.0003107730455075846,
+      "loss": 3.3582,
+      "step": 33150
+    },
+    {
+      "epoch": 9.670880913539968,
+      "grad_norm": 0.3689277172088623,
+      "learning_rate": 0.00031033547257876314,
+      "loss": 3.3615,
+      "step": 33200
+    },
+    {
+      "epoch": 9.685446282917734,
+      "grad_norm": 0.3605565130710602,
+      "learning_rate": 0.0003098978996499416,
+      "loss": 3.3501,
+      "step": 33250
+    },
+    {
+      "epoch": 9.700011652295503,
+      "grad_norm": 0.3678613007068634,
+      "learning_rate": 0.00030946032672112016,
+      "loss": 3.3653,
+      "step": 33300
+    },
+    {
+      "epoch": 9.71457702167327,
+      "grad_norm": 0.360675185918808,
+      "learning_rate": 0.0003090227537922987,
+      "loss": 3.3637,
+      "step": 33350
+    },
+    {
+      "epoch": 9.729142391051036,
+      "grad_norm": 0.3719678819179535,
+      "learning_rate": 0.0003085851808634772,
+      "loss": 3.357,
+      "step": 33400
+    },
+    {
+      "epoch": 9.743707760428805,
+      "grad_norm": 0.3562043607234955,
+      "learning_rate": 0.00030814760793465573,
+      "loss": 3.3531,
+      "step": 33450
+    },
+    {
+      "epoch": 9.758273129806572,
+      "grad_norm": 0.37112271785736084,
+      "learning_rate": 0.00030771003500583427,
+      "loss": 3.3629,
+      "step": 33500
+    },
+    {
+      "epoch": 9.772838499184338,
+      "grad_norm": 0.3823767900466919,
+      "learning_rate": 0.00030727246207701286,
+      "loss": 3.3711,
+      "step": 33550
+    },
+    {
+      "epoch": 9.787403868562107,
+      "grad_norm": 0.3594043552875519,
+      "learning_rate": 0.0003068348891481913,
+      "loss": 3.3712,
+      "step": 33600
+    },
+    {
+      "epoch": 9.801969237939874,
+      "grad_norm": 0.3566214442253113,
+      "learning_rate": 0.0003063973162193699,
+      "loss": 3.3842,
+      "step": 33650
+    },
+    {
+      "epoch": 9.816534607317642,
+      "grad_norm": 0.36310461163520813,
+      "learning_rate": 0.00030595974329054843,
+      "loss": 3.3596,
+      "step": 33700
+    },
+    {
+      "epoch": 9.831099976695409,
+      "grad_norm": 0.36038920283317566,
+      "learning_rate": 0.0003055221703617269,
+      "loss": 3.3677,
+      "step": 33750
+    },
+    {
+      "epoch": 9.845665346073176,
+      "grad_norm": 0.34875422716140747,
+      "learning_rate": 0.00030508459743290546,
+      "loss": 3.3738,
+      "step": 33800
+    },
+    {
+      "epoch": 9.860230715450944,
+      "grad_norm": 0.3687998056411743,
+      "learning_rate": 0.000304647024504084,
+      "loss": 3.3725,
+      "step": 33850
+    },
+    {
+      "epoch": 9.874796084828711,
+      "grad_norm": 0.3492382764816284,
+      "learning_rate": 0.0003042094515752625,
+      "loss": 3.3642,
+      "step": 33900
+    },
+    {
+      "epoch": 9.88936145420648,
+      "grad_norm": 0.34819406270980835,
+      "learning_rate": 0.000303771878646441,
+      "loss": 3.3783,
+      "step": 33950
+    },
+    {
+      "epoch": 9.903926823584246,
+      "grad_norm": 0.36848151683807373,
+      "learning_rate": 0.00030333430571761956,
+      "loss": 3.3654,
+      "step": 34000
+    },
+    {
+      "epoch": 9.903926823584246,
+      "eval_accuracy": 0.3703816446727628,
+      "eval_loss": 3.5503089427948,
+      "eval_runtime": 181.4493,
+      "eval_samples_per_second": 91.728,
+      "eval_steps_per_second": 5.737,
+      "step": 34000
+    },
+    {
+      "epoch": 9.918492192962013,
+      "grad_norm": 0.39530327916145325,
+      "learning_rate": 0.00030289673278879816,
+      "loss": 3.3634,
+      "step": 34050
+    },
+    {
+      "epoch": 9.933057562339782,
+      "grad_norm": 0.3620380759239197,
+      "learning_rate": 0.00030245915985997664,
+      "loss": 3.3729,
+      "step": 34100
+    },
+    {
+      "epoch": 9.947622931717548,
+      "grad_norm": 0.356423020362854,
+      "learning_rate": 0.0003020215869311552,
+      "loss": 3.3823,
+      "step": 34150
+    },
+    {
+      "epoch": 9.962188301095315,
+      "grad_norm": 0.35574576258659363,
+      "learning_rate": 0.0003015840140023337,
+      "loss": 3.3698,
+      "step": 34200
+    },
+    {
+      "epoch": 9.976753670473084,
+      "grad_norm": 0.3700348734855652,
+      "learning_rate": 0.0003011464410735122,
+      "loss": 3.368,
+      "step": 34250
+    },
+    {
+      "epoch": 9.99131903985085,
+      "grad_norm": 0.3582363724708557,
+      "learning_rate": 0.00030070886814469075,
+      "loss": 3.3747,
+      "step": 34300
+    },
+    {
+      "epoch": 10.005826147751106,
+      "grad_norm": 0.3664577901363373,
+      "learning_rate": 0.0003002712952158693,
+      "loss": 3.3242,
+      "step": 34350
+    },
+    {
+      "epoch": 10.020391517128875,
+      "grad_norm": 0.3791219890117645,
+      "learning_rate": 0.0002998337222870478,
+      "loss": 3.2693,
+      "step": 34400
+    },
+    {
+      "epoch": 10.034956886506642,
+      "grad_norm": 0.3724304437637329,
+      "learning_rate": 0.0002993961493582263,
+      "loss": 3.2642,
+      "step": 34450
+    },
+    {
+      "epoch": 10.049522255884408,
+      "grad_norm": 0.36520498991012573,
+      "learning_rate": 0.0002989585764294049,
+      "loss": 3.2756,
+      "step": 34500
+    },
+    {
+      "epoch": 10.064087625262177,
+      "grad_norm": 0.3840792179107666,
+      "learning_rate": 0.0002985210035005834,
+      "loss": 3.28,
+      "step": 34550
+    },
+    {
+      "epoch": 10.078652994639944,
+      "grad_norm": 0.3588644564151764,
+      "learning_rate": 0.00029808343057176193,
+      "loss": 3.2725,
+      "step": 34600
+    },
+    {
+      "epoch": 10.093218364017712,
+      "grad_norm": 0.3608386516571045,
+      "learning_rate": 0.00029764585764294047,
+      "loss": 3.2845,
+      "step": 34650
+    },
+    {
+      "epoch": 10.107783733395479,
+      "grad_norm": 0.37310636043548584,
+      "learning_rate": 0.00029720828471411896,
+      "loss": 3.2934,
+      "step": 34700
+    },
+    {
+      "epoch": 10.122349102773246,
+      "grad_norm": 0.3664185404777527,
+      "learning_rate": 0.00029677071178529755,
+      "loss": 3.2953,
+      "step": 34750
+    },
+    {
+      "epoch": 10.136914472151014,
+      "grad_norm": 0.3596240282058716,
+      "learning_rate": 0.00029633313885647604,
+      "loss": 3.304,
+      "step": 34800
+    },
+    {
+      "epoch": 10.151479841528781,
+      "grad_norm": 0.3951849341392517,
+      "learning_rate": 0.0002958955659276546,
+      "loss": 3.2999,
+      "step": 34850
+    },
+    {
+      "epoch": 10.166045210906548,
+      "grad_norm": 0.38322994112968445,
+      "learning_rate": 0.0002954579929988331,
+      "loss": 3.304,
+      "step": 34900
+    },
+    {
+      "epoch": 10.180610580284316,
+      "grad_norm": 0.36491626501083374,
+      "learning_rate": 0.00029502042007001166,
+      "loss": 3.2911,
+      "step": 34950
+    },
+    {
+      "epoch": 10.195175949662083,
+      "grad_norm": 0.37527546286582947,
+      "learning_rate": 0.0002945828471411902,
+      "loss": 3.2834,
+      "step": 35000
+    },
+    {
+      "epoch": 10.195175949662083,
+      "eval_accuracy": 0.3699247414840347,
+      "eval_loss": 3.5616824626922607,
+      "eval_runtime": 180.3891,
+      "eval_samples_per_second": 92.267,
+      "eval_steps_per_second": 5.771,
+      "step": 35000
+    },
+    {
+      "epoch": 10.209741319039852,
+      "grad_norm": 0.3800257742404938,
+      "learning_rate": 0.0002941452742123687,
+      "loss": 3.2981,
+      "step": 35050
+    },
+    {
+      "epoch": 10.224306688417618,
+      "grad_norm": 0.3631065785884857,
+      "learning_rate": 0.0002937077012835472,
+      "loss": 3.3006,
+      "step": 35100
+    },
+    {
+      "epoch": 10.238872057795385,
+      "grad_norm": 0.3786700963973999,
+      "learning_rate": 0.00029327012835472576,
+      "loss": 3.3138,
+      "step": 35150
+    },
+    {
+      "epoch": 10.253437427173154,
+      "grad_norm": 0.35197684168815613,
+      "learning_rate": 0.0002928325554259043,
+      "loss": 3.3003,
+      "step": 35200
+    },
+    {
+      "epoch": 10.26800279655092,
+      "grad_norm": 0.36957064270973206,
+      "learning_rate": 0.00029239498249708284,
+      "loss": 3.3105,
+      "step": 35250
+    },
+    {
+      "epoch": 10.282568165928687,
+      "grad_norm": 0.37074217200279236,
+      "learning_rate": 0.00029195740956826133,
+      "loss": 3.3123,
+      "step": 35300
+    },
+    {
+      "epoch": 10.297133535306456,
+      "grad_norm": 0.3711046278476715,
+      "learning_rate": 0.0002915198366394399,
+      "loss": 3.302,
+      "step": 35350
+    },
+    {
+      "epoch": 10.311698904684222,
+      "grad_norm": 0.3888838589191437,
+      "learning_rate": 0.0002910822637106184,
+      "loss": 3.3077,
+      "step": 35400
+    },
+    {
+      "epoch": 10.326264274061991,
+      "grad_norm": 0.3660491704940796,
+      "learning_rate": 0.00029064469078179695,
+      "loss": 3.3186,
+      "step": 35450
+    },
+    {
+      "epoch": 10.340829643439758,
+      "grad_norm": 0.3750646412372589,
+      "learning_rate": 0.0002902071178529755,
+      "loss": 3.3088,
+      "step": 35500
+    },
+    {
+      "epoch": 10.355395012817525,
+      "grad_norm": 0.3611460030078888,
+      "learning_rate": 0.000289769544924154,
+      "loss": 3.3126,
+      "step": 35550
+    },
+    {
+      "epoch": 10.369960382195293,
+      "grad_norm": 0.3784548342227936,
+      "learning_rate": 0.00028933197199533257,
+      "loss": 3.3199,
+      "step": 35600
+    },
+    {
+      "epoch": 10.38452575157306,
+      "grad_norm": 0.3654816746711731,
+      "learning_rate": 0.00028889439906651105,
+      "loss": 3.317,
+      "step": 35650
+    },
+    {
+      "epoch": 10.399091120950827,
+      "grad_norm": 0.3819401264190674,
+      "learning_rate": 0.0002884568261376896,
+      "loss": 3.3174,
+      "step": 35700
+    },
+    {
+      "epoch": 10.413656490328595,
+      "grad_norm": 0.3685275912284851,
+      "learning_rate": 0.00028801925320886813,
+      "loss": 3.3172,
+      "step": 35750
+    },
+    {
+      "epoch": 10.428221859706362,
+      "grad_norm": 0.3687780201435089,
+      "learning_rate": 0.0002875816802800466,
+      "loss": 3.3185,
+      "step": 35800
+    },
+    {
+      "epoch": 10.44278722908413,
+      "grad_norm": 0.3637807369232178,
+      "learning_rate": 0.0002871441073512252,
+      "loss": 3.3257,
+      "step": 35850
+    },
+    {
+      "epoch": 10.457352598461897,
+      "grad_norm": 0.3877573013305664,
+      "learning_rate": 0.0002867065344224037,
+      "loss": 3.3316,
+      "step": 35900
+    },
+    {
+      "epoch": 10.471917967839664,
+      "grad_norm": 0.37709304690361023,
+      "learning_rate": 0.00028626896149358224,
+      "loss": 3.3028,
+      "step": 35950
+    },
+    {
+      "epoch": 10.486483337217432,
+      "grad_norm": 0.36883544921875,
+      "learning_rate": 0.0002858313885647608,
+      "loss": 3.3313,
+      "step": 36000
+    },
+    {
+      "epoch": 10.486483337217432,
+      "eval_accuracy": 0.37047523575310953,
+      "eval_loss": 3.555171489715576,
+      "eval_runtime": 180.6124,
+      "eval_samples_per_second": 92.153,
+      "eval_steps_per_second": 5.764,
+      "step": 36000
+    },
+    {
+      "epoch": 10.5010487065952,
+      "grad_norm": 0.37332433462142944,
+      "learning_rate": 0.0002853938156359393,
+      "loss": 3.3147,
+      "step": 36050
+    },
+    {
+      "epoch": 10.515614075972966,
+      "grad_norm": 0.3772258758544922,
+      "learning_rate": 0.00028495624270711786,
+      "loss": 3.3325,
+      "step": 36100
+    },
+    {
+      "epoch": 10.530179445350734,
+      "grad_norm": 0.35568490624427795,
+      "learning_rate": 0.00028451866977829634,
+      "loss": 3.3381,
+      "step": 36150
+    },
+    {
+      "epoch": 10.544744814728501,
+      "grad_norm": 0.3858466148376465,
+      "learning_rate": 0.0002840810968494749,
+      "loss": 3.3242,
+      "step": 36200
+    },
+    {
+      "epoch": 10.55931018410627,
+      "grad_norm": 0.3936407268047333,
+      "learning_rate": 0.0002836435239206534,
+      "loss": 3.3213,
+      "step": 36250
+    },
+    {
+      "epoch": 10.573875553484037,
+      "grad_norm": 0.3783574104309082,
+      "learning_rate": 0.00028320595099183196,
+      "loss": 3.3174,
+      "step": 36300
+    },
+    {
+      "epoch": 10.588440922861803,
+      "grad_norm": 0.3611924648284912,
+      "learning_rate": 0.0002827683780630105,
+      "loss": 3.3281,
+      "step": 36350
+    },
+    {
+      "epoch": 10.603006292239572,
+      "grad_norm": 0.36673375964164734,
+      "learning_rate": 0.000282330805134189,
+      "loss": 3.3274,
+      "step": 36400
+    },
+    {
+      "epoch": 10.617571661617339,
+      "grad_norm": 0.3864386975765228,
+      "learning_rate": 0.00028189323220536753,
+      "loss": 3.3263,
+      "step": 36450
+    },
+    {
+      "epoch": 10.632137030995105,
+      "grad_norm": 0.37186652421951294,
+      "learning_rate": 0.00028145565927654607,
+      "loss": 3.3256,
+      "step": 36500
+    },
+    {
+      "epoch": 10.646702400372874,
+      "grad_norm": 0.3645637333393097,
+      "learning_rate": 0.0002810180863477246,
+      "loss": 3.3304,
+      "step": 36550
+    },
+    {
+      "epoch": 10.66126776975064,
+      "grad_norm": 0.3960283696651459,
+      "learning_rate": 0.00028058051341890315,
+      "loss": 3.3293,
+      "step": 36600
+    },
+    {
+      "epoch": 10.675833139128407,
+      "grad_norm": 0.3968350291252136,
+      "learning_rate": 0.00028014294049008164,
+      "loss": 3.3289,
+      "step": 36650
+    },
+    {
+      "epoch": 10.690398508506176,
+      "grad_norm": 0.3649657666683197,
+      "learning_rate": 0.0002797053675612602,
+      "loss": 3.326,
+      "step": 36700
+    },
+    {
+      "epoch": 10.704963877883943,
+      "grad_norm": 0.366464227437973,
+      "learning_rate": 0.0002792677946324387,
+      "loss": 3.3453,
+      "step": 36750
+    },
+    {
+      "epoch": 10.719529247261711,
+      "grad_norm": 0.36643803119659424,
+      "learning_rate": 0.00027883022170361726,
+      "loss": 3.3272,
+      "step": 36800
+    },
+    {
+      "epoch": 10.734094616639478,
+      "grad_norm": 0.35845255851745605,
+      "learning_rate": 0.0002783926487747958,
+      "loss": 3.3341,
+      "step": 36850
+    },
+    {
+      "epoch": 10.748659986017245,
+      "grad_norm": 0.3823663294315338,
+      "learning_rate": 0.0002779550758459743,
+      "loss": 3.3404,
+      "step": 36900
+    },
+    {
+      "epoch": 10.763225355395013,
+      "grad_norm": 0.36972370743751526,
+      "learning_rate": 0.0002775175029171528,
+      "loss": 3.3372,
+      "step": 36950
+    },
+    {
+      "epoch": 10.77779072477278,
+      "grad_norm": 0.3613353967666626,
+      "learning_rate": 0.00027707992998833136,
+      "loss": 3.3399,
+      "step": 37000
+    },
+    {
+      "epoch": 10.77779072477278,
+      "eval_accuracy": 0.37111332468909186,
+      "eval_loss": 3.54584002494812,
+      "eval_runtime": 180.3064,
+      "eval_samples_per_second": 92.31,
+      "eval_steps_per_second": 5.774,
+      "step": 37000
+    },
+    {
+      "epoch": 10.792356094150549,
+      "grad_norm": 0.37549999356269836,
+      "learning_rate": 0.0002766423570595099,
+      "loss": 3.3357,
+      "step": 37050
+    },
+    {
+      "epoch": 10.806921463528315,
+      "grad_norm": 0.38684558868408203,
+      "learning_rate": 0.00027620478413068844,
+      "loss": 3.3365,
+      "step": 37100
+    },
+    {
+      "epoch": 10.821486832906082,
+      "grad_norm": 0.3710017204284668,
+      "learning_rate": 0.000275767211201867,
+      "loss": 3.3438,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83605220228385,
+      "grad_norm": 0.3841908872127533,
+      "learning_rate": 0.00027532963827304547,
+      "loss": 3.3451,
+      "step": 37200
+    },
+    {
+      "epoch": 10.850617571661617,
+      "grad_norm": 0.37406450510025024,
+      "learning_rate": 0.000274892065344224,
+      "loss": 3.3508,
+      "step": 37250
+    },
+    {
+      "epoch": 10.865182941039384,
+      "grad_norm": 0.37421655654907227,
+      "learning_rate": 0.00027445449241540255,
+      "loss": 3.3346,
+      "step": 37300
+    },
+    {
+      "epoch": 10.879748310417153,
+      "grad_norm": 0.3724828064441681,
+      "learning_rate": 0.0002740169194865811,
+      "loss": 3.3251,
+      "step": 37350
+    },
+    {
+      "epoch": 10.89431367979492,
+      "grad_norm": 0.37316784262657166,
+      "learning_rate": 0.0002735793465577596,
+      "loss": 3.3407,
+      "step": 37400
+    },
+    {
+      "epoch": 10.908879049172686,
+      "grad_norm": 0.35748419165611267,
+      "learning_rate": 0.0002731417736289381,
+      "loss": 3.3403,
+      "step": 37450
+    },
+    {
+      "epoch": 10.923444418550455,
+      "grad_norm": 0.3694682717323303,
+      "learning_rate": 0.00027270420070011665,
+      "loss": 3.3413,
+      "step": 37500
+    },
+    {
+      "epoch": 10.938009787928221,
+      "grad_norm": 0.37885811924934387,
+      "learning_rate": 0.0002722666277712952,
+      "loss": 3.3426,
+      "step": 37550
+    },
+    {
+      "epoch": 10.95257515730599,
+      "grad_norm": 0.38499969244003296,
+      "learning_rate": 0.00027182905484247373,
+      "loss": 3.3389,
+      "step": 37600
+    },
+    {
+      "epoch": 10.967140526683757,
+      "grad_norm": 0.36556389927864075,
+      "learning_rate": 0.00027139148191365227,
+      "loss": 3.3459,
+      "step": 37650
+    },
+    {
+      "epoch": 10.981705896061523,
+      "grad_norm": 0.3706257939338684,
+      "learning_rate": 0.00027095390898483076,
+      "loss": 3.3406,
+      "step": 37700
+    },
+    {
+      "epoch": 10.996271265439292,
+      "grad_norm": 0.36823248863220215,
+      "learning_rate": 0.0002705163360560093,
+      "loss": 3.3361,
+      "step": 37750
+    },
+    {
+      "epoch": 11.010778373339548,
+      "grad_norm": 0.35125017166137695,
+      "learning_rate": 0.00027007876312718784,
+      "loss": 3.2779,
+      "step": 37800
+    },
+    {
+      "epoch": 11.025343742717315,
+      "grad_norm": 0.3946673274040222,
+      "learning_rate": 0.0002696411901983664,
+      "loss": 3.2391,
+      "step": 37850
+    },
+    {
+      "epoch": 11.039909112095083,
+      "grad_norm": 0.37063831090927124,
+      "learning_rate": 0.0002692036172695449,
+      "loss": 3.2478,
+      "step": 37900
+    },
+    {
+      "epoch": 11.05447448147285,
+      "grad_norm": 0.36030757427215576,
+      "learning_rate": 0.0002687660443407234,
+      "loss": 3.2521,
+      "step": 37950
+    },
+    {
+      "epoch": 11.069039850850617,
+      "grad_norm": 0.3777706027030945,
+      "learning_rate": 0.00026832847141190194,
+      "loss": 3.2605,
+      "step": 38000
+    },
+    {
+      "epoch": 11.069039850850617,
+      "eval_accuracy": 0.370730142112647,
+      "eval_loss": 3.558288097381592,
+      "eval_runtime": 180.3247,
+      "eval_samples_per_second": 92.3,
+      "eval_steps_per_second": 5.773,
+      "step": 38000
+    },
+    {
+      "epoch": 11.083605220228385,
+      "grad_norm": 0.3810880482196808,
+      "learning_rate": 0.0002678908984830805,
+      "loss": 3.2545,
+      "step": 38050
+    },
+    {
+      "epoch": 11.098170589606152,
+      "grad_norm": 0.38010433316230774,
+      "learning_rate": 0.000267453325554259,
+      "loss": 3.2619,
+      "step": 38100
+    },
+    {
+      "epoch": 11.11273595898392,
+      "grad_norm": 0.38559970259666443,
+      "learning_rate": 0.00026701575262543756,
+      "loss": 3.26,
+      "step": 38150
+    },
+    {
+      "epoch": 11.127301328361687,
+      "grad_norm": 0.37837737798690796,
+      "learning_rate": 0.00026657817969661605,
+      "loss": 3.2626,
+      "step": 38200
+    },
+    {
+      "epoch": 11.141866697739454,
+      "grad_norm": 0.3784601092338562,
+      "learning_rate": 0.00026614060676779464,
+      "loss": 3.258,
+      "step": 38250
+    },
+    {
+      "epoch": 11.156432067117223,
+      "grad_norm": 0.35845887660980225,
+      "learning_rate": 0.00026570303383897313,
+      "loss": 3.2586,
+      "step": 38300
+    },
+    {
+      "epoch": 11.17099743649499,
+      "grad_norm": 0.37323319911956787,
+      "learning_rate": 0.00026526546091015167,
+      "loss": 3.2596,
+      "step": 38350
+    },
+    {
+      "epoch": 11.185562805872756,
+      "grad_norm": 0.4025420546531677,
+      "learning_rate": 0.0002648278879813302,
+      "loss": 3.2722,
+      "step": 38400
+    },
+    {
+      "epoch": 11.200128175250525,
+      "grad_norm": 0.3721407651901245,
+      "learning_rate": 0.0002643903150525087,
+      "loss": 3.2737,
+      "step": 38450
+    },
+    {
+      "epoch": 11.214693544628291,
+      "grad_norm": 0.3787324130535126,
+      "learning_rate": 0.0002639527421236873,
+      "loss": 3.267,
+      "step": 38500
+    },
+    {
+      "epoch": 11.22925891400606,
+      "grad_norm": 0.385883092880249,
+      "learning_rate": 0.0002635151691948658,
+      "loss": 3.2813,
+      "step": 38550
+    },
+    {
+      "epoch": 11.243824283383827,
+      "grad_norm": 0.3823045492172241,
+      "learning_rate": 0.0002630775962660443,
+      "loss": 3.2806,
+      "step": 38600
+    },
+    {
+      "epoch": 11.258389652761593,
+      "grad_norm": 0.3888196349143982,
+      "learning_rate": 0.00026264002333722285,
+      "loss": 3.2669,
+      "step": 38650
+    },
+    {
+      "epoch": 11.272955022139362,
+      "grad_norm": 0.4065677523612976,
+      "learning_rate": 0.00026220245040840134,
+      "loss": 3.2815,
+      "step": 38700
+    },
+    {
+      "epoch": 11.287520391517129,
+      "grad_norm": 0.36197319626808167,
+      "learning_rate": 0.00026176487747957993,
+      "loss": 3.2846,
+      "step": 38750
+    },
+    {
+      "epoch": 11.302085760894895,
+      "grad_norm": 0.379085510969162,
+      "learning_rate": 0.0002613273045507584,
+      "loss": 3.2861,
+      "step": 38800
+    },
+    {
+      "epoch": 11.316651130272664,
+      "grad_norm": 0.39975711703300476,
+      "learning_rate": 0.00026088973162193696,
+      "loss": 3.3018,
+      "step": 38850
+    },
+    {
+      "epoch": 11.33121649965043,
+      "grad_norm": 0.390550434589386,
+      "learning_rate": 0.0002604521586931155,
+      "loss": 3.2864,
+      "step": 38900
+    },
+    {
+      "epoch": 11.3457818690282,
+      "grad_norm": 0.40258410573005676,
+      "learning_rate": 0.00026001458576429404,
+      "loss": 3.2845,
+      "step": 38950
+    },
+    {
+      "epoch": 11.360347238405966,
+      "grad_norm": 0.36794647574424744,
+      "learning_rate": 0.0002595770128354726,
+      "loss": 3.2906,
+      "step": 39000
+    },
+    {
+      "epoch": 11.360347238405966,
+      "eval_accuracy": 0.37129404012940964,
+      "eval_loss": 3.552013397216797,
+      "eval_runtime": 180.3711,
+      "eval_samples_per_second": 92.276,
+      "eval_steps_per_second": 5.771,
+      "step": 39000
+    },
+    {
+      "epoch": 11.374912607783733,
+      "grad_norm": 0.3978714048862457,
+      "learning_rate": 0.00025913943990665106,
+      "loss": 3.2952,
+      "step": 39050
+    },
+    {
+      "epoch": 11.389477977161501,
+      "grad_norm": 0.3712661862373352,
+      "learning_rate": 0.0002587018669778296,
+      "loss": 3.2887,
+      "step": 39100
+    },
+    {
+      "epoch": 11.404043346539268,
+      "grad_norm": 0.3962024748325348,
+      "learning_rate": 0.00025826429404900814,
+      "loss": 3.2818,
+      "step": 39150
+    },
+    {
+      "epoch": 11.418608715917035,
+      "grad_norm": 0.3791441321372986,
+      "learning_rate": 0.0002578267211201867,
+      "loss": 3.2947,
+      "step": 39200
+    },
+    {
+      "epoch": 11.433174085294803,
+      "grad_norm": 0.38361239433288574,
+      "learning_rate": 0.0002573891481913652,
+      "loss": 3.3016,
+      "step": 39250
+    },
+    {
+      "epoch": 11.44773945467257,
+      "grad_norm": 0.3753799498081207,
+      "learning_rate": 0.0002569515752625437,
+      "loss": 3.291,
+      "step": 39300
+    },
+    {
+      "epoch": 11.462304824050339,
+      "grad_norm": 0.37291768193244934,
+      "learning_rate": 0.0002565140023337223,
+      "loss": 3.2858,
+      "step": 39350
+    },
+    {
+      "epoch": 11.476870193428105,
+      "grad_norm": 0.382899671792984,
+      "learning_rate": 0.0002560764294049008,
+      "loss": 3.2902,
+      "step": 39400
+    },
+    {
+      "epoch": 11.491435562805872,
+      "grad_norm": 0.38682591915130615,
+      "learning_rate": 0.00025563885647607933,
+      "loss": 3.3095,
+      "step": 39450
+    },
+    {
+      "epoch": 11.50600093218364,
+      "grad_norm": 0.39052340388298035,
+      "learning_rate": 0.00025520128354725787,
+      "loss": 3.3024,
+      "step": 39500
+    },
+    {
+      "epoch": 11.520566301561407,
+      "grad_norm": 0.38648220896720886,
+      "learning_rate": 0.00025476371061843636,
+      "loss": 3.3037,
+      "step": 39550
+    },
+    {
+      "epoch": 11.535131670939174,
+      "grad_norm": 0.37323859333992004,
+      "learning_rate": 0.00025432613768961495,
+      "loss": 3.2976,
+      "step": 39600
+    },
+    {
+      "epoch": 11.549697040316943,
+      "grad_norm": 0.3768618106842041,
+      "learning_rate": 0.00025388856476079343,
+      "loss": 3.3111,
+      "step": 39650
+    },
+    {
+      "epoch": 11.56426240969471,
+      "grad_norm": 0.3987623155117035,
+      "learning_rate": 0.000253450991831972,
+      "loss": 3.301,
+      "step": 39700
+    },
+    {
+      "epoch": 11.578827779072478,
+      "grad_norm": 0.42070692777633667,
+      "learning_rate": 0.0002530134189031505,
+      "loss": 3.3047,
+      "step": 39750
+    },
+    {
+      "epoch": 11.593393148450245,
+      "grad_norm": 0.37431833148002625,
+      "learning_rate": 0.000252575845974329,
+      "loss": 3.3051,
+      "step": 39800
+    },
+    {
+      "epoch": 11.607958517828012,
+      "grad_norm": 0.3832058310508728,
+      "learning_rate": 0.0002521382730455076,
+      "loss": 3.2968,
+      "step": 39850
+    },
+    {
+      "epoch": 11.62252388720578,
+      "grad_norm": 0.3984127342700958,
+      "learning_rate": 0.0002517007001166861,
+      "loss": 3.3157,
+      "step": 39900
+    },
+    {
+      "epoch": 11.637089256583547,
+      "grad_norm": 0.40139371156692505,
+      "learning_rate": 0.0002512631271878646,
+      "loss": 3.2996,
+      "step": 39950
+    },
+    {
+      "epoch": 11.651654625961314,
+      "grad_norm": 0.3891284465789795,
+      "learning_rate": 0.00025082555425904316,
+      "loss": 3.3088,
+      "step": 40000
+    },
+    {
+      "epoch": 11.651654625961314,
+      "eval_accuracy": 0.3715696399941494,
+      "eval_loss": 3.5464000701904297,
+      "eval_runtime": 180.4742,
+      "eval_samples_per_second": 92.224,
+      "eval_steps_per_second": 5.768,
+      "step": 40000
+    },
+    {
+      "epoch": 11.666219995339082,
+      "grad_norm": 0.37767043709754944,
+      "learning_rate": 0.0002503879813302217,
+      "loss": 3.2951,
+      "step": 40050
+    },
+    {
+      "epoch": 11.680785364716849,
+      "grad_norm": 0.40471839904785156,
+      "learning_rate": 0.00024995040840140024,
+      "loss": 3.2946,
+      "step": 40100
+    },
+    {
+      "epoch": 11.695350734094617,
+      "grad_norm": 0.3795658051967621,
+      "learning_rate": 0.0002495128354725787,
+      "loss": 3.3187,
+      "step": 40150
+    },
+    {
+      "epoch": 11.709916103472384,
+      "grad_norm": 0.3852717876434326,
+      "learning_rate": 0.00024907526254375727,
+      "loss": 3.2955,
+      "step": 40200
+    },
+    {
+      "epoch": 11.724481472850151,
+      "grad_norm": 0.37112799286842346,
+      "learning_rate": 0.0002486376896149358,
+      "loss": 3.3023,
+      "step": 40250
+    },
+    {
+      "epoch": 11.73904684222792,
+      "grad_norm": 0.37619829177856445,
+      "learning_rate": 0.00024820011668611435,
+      "loss": 3.3103,
+      "step": 40300
+    },
+    {
+      "epoch": 11.753612211605686,
+      "grad_norm": 0.3923087418079376,
+      "learning_rate": 0.0002477625437572929,
+      "loss": 3.3131,
+      "step": 40350
+    },
+    {
+      "epoch": 11.768177580983453,
+      "grad_norm": 0.3909642696380615,
+      "learning_rate": 0.00024732497082847137,
+      "loss": 3.3085,
+      "step": 40400
+    },
+    {
+      "epoch": 11.782742950361222,
+      "grad_norm": 0.3891732096672058,
+      "learning_rate": 0.0002468873978996499,
+      "loss": 3.3118,
+      "step": 40450
+    },
+    {
+      "epoch": 11.797308319738988,
+      "grad_norm": 0.39520296454429626,
+      "learning_rate": 0.00024644982497082845,
+      "loss": 3.3134,
+      "step": 40500
+    },
+    {
+      "epoch": 11.811873689116755,
+      "grad_norm": 0.3944683074951172,
+      "learning_rate": 0.000246012252042007,
+      "loss": 3.3071,
+      "step": 40550
+    },
+    {
+      "epoch": 11.826439058494524,
+      "grad_norm": 0.3806307315826416,
+      "learning_rate": 0.00024557467911318553,
+      "loss": 3.3058,
+      "step": 40600
+    },
+    {
+      "epoch": 11.84100442787229,
+      "grad_norm": 0.38682928681373596,
+      "learning_rate": 0.000245137106184364,
+      "loss": 3.3067,
+      "step": 40650
+    },
+    {
+      "epoch": 11.855569797250059,
+      "grad_norm": 0.3885536789894104,
+      "learning_rate": 0.00024469953325554256,
+      "loss": 3.3188,
+      "step": 40700
+    },
+    {
+      "epoch": 11.870135166627826,
+      "grad_norm": 0.39508283138275146,
+      "learning_rate": 0.0002442619603267211,
+      "loss": 3.3167,
+      "step": 40750
+    },
+    {
+      "epoch": 11.884700536005592,
+      "grad_norm": 0.37365779280662537,
+      "learning_rate": 0.00024382438739789964,
+      "loss": 3.2989,
+      "step": 40800
+    },
+    {
+      "epoch": 11.899265905383361,
+      "grad_norm": 0.37982553243637085,
+      "learning_rate": 0.00024338681446907818,
+      "loss": 3.3162,
+      "step": 40850
+    },
+    {
+      "epoch": 11.913831274761128,
+      "grad_norm": 0.3698308765888214,
+      "learning_rate": 0.0002429492415402567,
+      "loss": 3.3203,
+      "step": 40900
+    },
+    {
+      "epoch": 11.928396644138896,
+      "grad_norm": 0.3770948052406311,
+      "learning_rate": 0.0002425116686114352,
+      "loss": 3.315,
+      "step": 40950
+    },
+    {
+      "epoch": 11.942962013516663,
+      "grad_norm": 0.3643822968006134,
+      "learning_rate": 0.00024207409568261377,
+      "loss": 3.3188,
+      "step": 41000
+    },
+    {
+      "epoch": 11.942962013516663,
+      "eval_accuracy": 0.37253788439949564,
+      "eval_loss": 3.536294460296631,
+      "eval_runtime": 180.6316,
+      "eval_samples_per_second": 92.143,
+      "eval_steps_per_second": 5.763,
+      "step": 41000
+    },
+    {
+      "epoch": 11.95752738289443,
+      "grad_norm": 0.37310171127319336,
+      "learning_rate": 0.00024163652275379228,
+      "loss": 3.3067,
+      "step": 41050
+    },
+    {
+      "epoch": 11.972092752272198,
+      "grad_norm": 0.3831028342247009,
+      "learning_rate": 0.00024119894982497082,
+      "loss": 3.3263,
+      "step": 41100
+    },
+    {
+      "epoch": 11.986658121649965,
+      "grad_norm": 0.3900957405567169,
+      "learning_rate": 0.00024076137689614933,
+      "loss": 3.3106,
+      "step": 41150
+    },
+    {
+      "epoch": 12.001165229550221,
+      "grad_norm": 0.3873760998249054,
+      "learning_rate": 0.00024032380396732785,
+      "loss": 3.3022,
+      "step": 41200
+    },
+    {
+      "epoch": 12.01573059892799,
+      "grad_norm": 0.3788856863975525,
+      "learning_rate": 0.00023988623103850641,
+      "loss": 3.2189,
+      "step": 41250
+    },
+    {
+      "epoch": 12.030295968305756,
+      "grad_norm": 0.4146612584590912,
+      "learning_rate": 0.00023944865810968493,
+      "loss": 3.2262,
+      "step": 41300
+    },
+    {
+      "epoch": 12.044861337683523,
+      "grad_norm": 0.3976421356201172,
+      "learning_rate": 0.00023901108518086347,
+      "loss": 3.2397,
+      "step": 41350
+    },
+    {
+      "epoch": 12.059426707061291,
+      "grad_norm": 0.3815682828426361,
+      "learning_rate": 0.00023857351225204198,
+      "loss": 3.2293,
+      "step": 41400
+    },
+    {
+      "epoch": 12.073992076439058,
+      "grad_norm": 0.3816235363483429,
+      "learning_rate": 0.0002381359393232205,
+      "loss": 3.2318,
+      "step": 41450
+    },
+    {
+      "epoch": 12.088557445816827,
+      "grad_norm": 0.3730505108833313,
+      "learning_rate": 0.00023769836639439906,
+      "loss": 3.2306,
+      "step": 41500
+    },
+    {
+      "epoch": 12.103122815194594,
+      "grad_norm": 0.36907413601875305,
+      "learning_rate": 0.00023726079346557757,
+      "loss": 3.2172,
+      "step": 41550
+    },
+    {
+      "epoch": 12.11768818457236,
+      "grad_norm": 0.3938505947589874,
+      "learning_rate": 0.0002368232205367561,
+      "loss": 3.2349,
+      "step": 41600
+    },
+    {
+      "epoch": 12.132253553950129,
+      "grad_norm": 0.39459192752838135,
+      "learning_rate": 0.00023638564760793463,
+      "loss": 3.244,
+      "step": 41650
+    },
+    {
+      "epoch": 12.146818923327896,
+      "grad_norm": 0.3762718141078949,
+      "learning_rate": 0.00023594807467911317,
+      "loss": 3.2336,
+      "step": 41700
+    },
+    {
+      "epoch": 12.161384292705662,
+      "grad_norm": 0.38366296887397766,
+      "learning_rate": 0.0002355105017502917,
+      "loss": 3.2468,
+      "step": 41750
+    },
+    {
+      "epoch": 12.17594966208343,
+      "grad_norm": 0.37330591678619385,
+      "learning_rate": 0.00023507292882147022,
+      "loss": 3.2367,
+      "step": 41800
+    },
+    {
+      "epoch": 12.190515031461198,
+      "grad_norm": 0.39677342772483826,
+      "learning_rate": 0.00023463535589264876,
+      "loss": 3.2597,
+      "step": 41850
+    },
+    {
+      "epoch": 12.205080400838966,
+      "grad_norm": 0.3829995393753052,
+      "learning_rate": 0.0002341977829638273,
+      "loss": 3.2453,
+      "step": 41900
+    },
+    {
+      "epoch": 12.219645770216733,
+      "grad_norm": 0.40625911951065063,
+      "learning_rate": 0.0002337602100350058,
+      "loss": 3.2457,
+      "step": 41950
+    },
+    {
+      "epoch": 12.2342111395945,
+      "grad_norm": 0.3920283019542694,
+      "learning_rate": 0.00023332263710618435,
+      "loss": 3.2678,
+      "step": 42000
+    },
+    {
+      "epoch": 12.2342111395945,
+      "eval_accuracy": 0.3720842733593225,
+      "eval_loss": 3.546815872192383,
+      "eval_runtime": 180.3312,
+      "eval_samples_per_second": 92.297,
+      "eval_steps_per_second": 5.773,
+      "step": 42000
+    },
+    {
+      "epoch": 12.248776508972268,
+      "grad_norm": 0.398946613073349,
+      "learning_rate": 0.00023288506417736286,
+      "loss": 3.245,
+      "step": 42050
+    },
+    {
+      "epoch": 12.263341878350035,
+      "grad_norm": 0.3961947560310364,
+      "learning_rate": 0.00023244749124854143,
+      "loss": 3.2517,
+      "step": 42100
+    },
+    {
+      "epoch": 12.277907247727802,
+      "grad_norm": 0.3835267722606659,
+      "learning_rate": 0.00023200991831971994,
+      "loss": 3.2522,
+      "step": 42150
+    },
+    {
+      "epoch": 12.29247261710557,
+      "grad_norm": 0.42905566096305847,
+      "learning_rate": 0.00023157234539089846,
+      "loss": 3.2587,
+      "step": 42200
+    },
+    {
+      "epoch": 12.307037986483337,
+      "grad_norm": 0.39819803833961487,
+      "learning_rate": 0.000231134772462077,
+      "loss": 3.2556,
+      "step": 42250
+    },
+    {
+      "epoch": 12.321603355861104,
+      "grad_norm": 0.393216997385025,
+      "learning_rate": 0.0002306971995332555,
+      "loss": 3.2738,
+      "step": 42300
+    },
+    {
+      "epoch": 12.336168725238872,
+      "grad_norm": 0.3680713176727295,
+      "learning_rate": 0.00023025962660443408,
+      "loss": 3.2608,
+      "step": 42350
+    },
+    {
+      "epoch": 12.350734094616639,
+      "grad_norm": 0.3907005488872528,
+      "learning_rate": 0.0002298220536756126,
+      "loss": 3.2592,
+      "step": 42400
+    },
+    {
+      "epoch": 12.365299463994408,
+      "grad_norm": 0.39694586396217346,
+      "learning_rate": 0.0002293844807467911,
+      "loss": 3.2655,
+      "step": 42450
+    },
+    {
+      "epoch": 12.379864833372174,
+      "grad_norm": 0.3920033276081085,
+      "learning_rate": 0.00022894690781796964,
+      "loss": 3.2694,
+      "step": 42500
+    },
+    {
+      "epoch": 12.394430202749941,
+      "grad_norm": 0.38897332549095154,
+      "learning_rate": 0.00022850933488914815,
+      "loss": 3.2831,
+      "step": 42550
+    },
+    {
+      "epoch": 12.40899557212771,
+      "grad_norm": 0.3850444257259369,
+      "learning_rate": 0.00022807176196032672,
+      "loss": 3.2698,
+      "step": 42600
+    },
+    {
+      "epoch": 12.423560941505476,
+      "grad_norm": 0.39484626054763794,
+      "learning_rate": 0.00022763418903150523,
+      "loss": 3.2621,
+      "step": 42650
+    },
+    {
+      "epoch": 12.438126310883243,
+      "grad_norm": 0.4130299985408783,
+      "learning_rate": 0.00022719661610268375,
+      "loss": 3.2765,
+      "step": 42700
+    },
+    {
+      "epoch": 12.452691680261012,
+      "grad_norm": 0.3730163872241974,
+      "learning_rate": 0.0002267590431738623,
+      "loss": 3.2681,
+      "step": 42750
+    },
+    {
+      "epoch": 12.467257049638778,
+      "grad_norm": 0.3927021026611328,
+      "learning_rate": 0.00022632147024504083,
+      "loss": 3.2684,
+      "step": 42800
+    },
+    {
+      "epoch": 12.481822419016547,
+      "grad_norm": 0.39260363578796387,
+      "learning_rate": 0.00022588389731621937,
+      "loss": 3.2652,
+      "step": 42850
+    },
+    {
+      "epoch": 12.496387788394314,
+      "grad_norm": 0.3787255883216858,
+      "learning_rate": 0.00022544632438739788,
+      "loss": 3.2619,
+      "step": 42900
+    },
+    {
+      "epoch": 12.51095315777208,
+      "grad_norm": 0.38174960017204285,
+      "learning_rate": 0.0002250087514585764,
+      "loss": 3.2742,
+      "step": 42950
+    },
+    {
+      "epoch": 12.525518527149849,
+      "grad_norm": 0.39155444502830505,
+      "learning_rate": 0.00022457117852975496,
+      "loss": 3.2818,
+      "step": 43000
+    },
+    {
+      "epoch": 12.525518527149849,
+      "eval_accuracy": 0.37251695574082516,
+      "eval_loss": 3.541425943374634,
+      "eval_runtime": 180.4575,
+      "eval_samples_per_second": 92.232,
+      "eval_steps_per_second": 5.769,
+      "step": 43000
+    },
+    {
+      "epoch": 12.540083896527616,
+      "grad_norm": 0.3748781979084015,
+      "learning_rate": 0.00022413360560093347,
+      "loss": 3.28,
+      "step": 43050
+    },
+    {
+      "epoch": 12.554649265905383,
+      "grad_norm": 0.3864782452583313,
+      "learning_rate": 0.000223696032672112,
+      "loss": 3.2746,
+      "step": 43100
+    },
+    {
+      "epoch": 12.569214635283151,
+      "grad_norm": 0.39516115188598633,
+      "learning_rate": 0.00022325845974329053,
+      "loss": 3.2664,
+      "step": 43150
+    },
+    {
+      "epoch": 12.583780004660918,
+      "grad_norm": 0.3874489367008209,
+      "learning_rate": 0.00022282088681446904,
+      "loss": 3.2765,
+      "step": 43200
+    },
+    {
+      "epoch": 12.598345374038686,
+      "grad_norm": 0.4148963689804077,
+      "learning_rate": 0.0002223833138856476,
+      "loss": 3.2833,
+      "step": 43250
+    },
+    {
+      "epoch": 12.612910743416453,
+      "grad_norm": 0.38245537877082825,
+      "learning_rate": 0.00022194574095682612,
+      "loss": 3.2826,
+      "step": 43300
+    },
+    {
+      "epoch": 12.62747611279422,
+      "grad_norm": 0.3959484100341797,
+      "learning_rate": 0.00022150816802800466,
+      "loss": 3.2772,
+      "step": 43350
+    },
+    {
+      "epoch": 12.642041482171988,
+      "grad_norm": 0.3956339359283447,
+      "learning_rate": 0.00022107059509918317,
+      "loss": 3.2741,
+      "step": 43400
+    },
+    {
+      "epoch": 12.656606851549755,
+      "grad_norm": 0.3839803636074066,
+      "learning_rate": 0.00022063302217036168,
+      "loss": 3.2662,
+      "step": 43450
+    },
+    {
+      "epoch": 12.671172220927522,
+      "grad_norm": 0.40059152245521545,
+      "learning_rate": 0.00022019544924154025,
+      "loss": 3.2851,
+      "step": 43500
+    },
+    {
+      "epoch": 12.68573759030529,
+      "grad_norm": 0.3880845904350281,
+      "learning_rate": 0.00021975787631271876,
+      "loss": 3.2854,
+      "step": 43550
+    },
+    {
+      "epoch": 12.700302959683057,
+      "grad_norm": 0.3912261128425598,
+      "learning_rate": 0.0002193203033838973,
+      "loss": 3.2838,
+      "step": 43600
+    },
+    {
+      "epoch": 12.714868329060826,
+      "grad_norm": 0.41812238097190857,
+      "learning_rate": 0.00021888273045507582,
+      "loss": 3.2902,
+      "step": 43650
+    },
+    {
+      "epoch": 12.729433698438593,
+      "grad_norm": 0.3847753703594208,
+      "learning_rate": 0.00021844515752625436,
+      "loss": 3.2726,
+      "step": 43700
+    },
+    {
+      "epoch": 12.74399906781636,
+      "grad_norm": 0.3847730755805969,
+      "learning_rate": 0.0002180075845974329,
+      "loss": 3.2761,
+      "step": 43750
+    },
+    {
+      "epoch": 12.758564437194128,
+      "grad_norm": 0.3814358413219452,
+      "learning_rate": 0.0002175700116686114,
+      "loss": 3.2816,
+      "step": 43800
+    },
+    {
+      "epoch": 12.773129806571895,
+      "grad_norm": 0.39806804060935974,
+      "learning_rate": 0.00021713243873978995,
+      "loss": 3.2889,
+      "step": 43850
+    },
+    {
+      "epoch": 12.787695175949661,
+      "grad_norm": 0.3872688412666321,
+      "learning_rate": 0.0002166948658109685,
+      "loss": 3.2889,
+      "step": 43900
+    },
+    {
+      "epoch": 12.80226054532743,
+      "grad_norm": 0.3840930461883545,
+      "learning_rate": 0.000216257292882147,
+      "loss": 3.2767,
+      "step": 43950
+    },
+    {
+      "epoch": 12.816825914705197,
+      "grad_norm": 0.3970656096935272,
+      "learning_rate": 0.00021581971995332554,
+      "loss": 3.2772,
+      "step": 44000
+    },
+    {
+      "epoch": 12.816825914705197,
+      "eval_accuracy": 0.37332153333229867,
+      "eval_loss": 3.5360467433929443,
+      "eval_runtime": 180.2447,
+      "eval_samples_per_second": 92.341,
+      "eval_steps_per_second": 5.775,
+      "step": 44000
+    },
+    {
+      "epoch": 12.831391284082965,
+      "grad_norm": 0.3944132328033447,
+      "learning_rate": 0.00021538214702450405,
+      "loss": 3.2893,
+      "step": 44050
+    },
+    {
+      "epoch": 12.845956653460732,
+      "grad_norm": 0.40921568870544434,
+      "learning_rate": 0.00021494457409568262,
+      "loss": 3.2811,
+      "step": 44100
+    },
+    {
+      "epoch": 12.860522022838499,
+      "grad_norm": 0.37589746713638306,
+      "learning_rate": 0.00021450700116686113,
+      "loss": 3.278,
+      "step": 44150
+    },
+    {
+      "epoch": 12.875087392216267,
+      "grad_norm": 0.4068247377872467,
+      "learning_rate": 0.00021406942823803965,
+      "loss": 3.2848,
+      "step": 44200
+    },
+    {
+      "epoch": 12.889652761594034,
+      "grad_norm": 0.41013479232788086,
+      "learning_rate": 0.0002136318553092182,
+      "loss": 3.2929,
+      "step": 44250
+    },
+    {
+      "epoch": 12.9042181309718,
+      "grad_norm": 0.39379021525382996,
+      "learning_rate": 0.0002131942823803967,
+      "loss": 3.2898,
+      "step": 44300
+    },
+    {
+      "epoch": 12.91878350034957,
+      "grad_norm": 0.38993388414382935,
+      "learning_rate": 0.00021275670945157527,
+      "loss": 3.287,
+      "step": 44350
+    },
+    {
+      "epoch": 12.933348869727336,
+      "grad_norm": 0.4032069146633148,
+      "learning_rate": 0.00021231913652275378,
+      "loss": 3.2932,
+      "step": 44400
+    },
+    {
+      "epoch": 12.947914239105105,
+      "grad_norm": 0.40004608035087585,
+      "learning_rate": 0.0002118815635939323,
+      "loss": 3.2954,
+      "step": 44450
+    },
+    {
+      "epoch": 12.962479608482871,
+      "grad_norm": 0.39480239152908325,
+      "learning_rate": 0.00021144399066511083,
+      "loss": 3.2967,
+      "step": 44500
+    },
+    {
+      "epoch": 12.977044977860638,
+      "grad_norm": 0.4099850058555603,
+      "learning_rate": 0.00021100641773628935,
+      "loss": 3.2809,
+      "step": 44550
+    },
+    {
+      "epoch": 12.991610347238407,
+      "grad_norm": 0.38592153787612915,
+      "learning_rate": 0.0002105688448074679,
+      "loss": 3.2948,
+      "step": 44600
+    },
+    {
+      "epoch": 13.006117455138662,
+      "grad_norm": 0.3878653049468994,
+      "learning_rate": 0.00021013127187864643,
+      "loss": 3.2447,
+      "step": 44650
+    },
+    {
+      "epoch": 13.02068282451643,
+      "grad_norm": 0.40472573041915894,
+      "learning_rate": 0.00020969369894982494,
+      "loss": 3.1872,
+      "step": 44700
+    },
+    {
+      "epoch": 13.035248193894198,
+      "grad_norm": 0.38480067253112793,
+      "learning_rate": 0.00020925612602100348,
+      "loss": 3.1966,
+      "step": 44750
+    },
+    {
+      "epoch": 13.049813563271965,
+      "grad_norm": 0.4043852388858795,
+      "learning_rate": 0.00020881855309218202,
+      "loss": 3.1928,
+      "step": 44800
+    },
+    {
+      "epoch": 13.064378932649731,
+      "grad_norm": 0.3920169174671173,
+      "learning_rate": 0.00020838098016336056,
+      "loss": 3.2142,
+      "step": 44850
+    },
+    {
+      "epoch": 13.0789443020275,
+      "grad_norm": 0.4085189402103424,
+      "learning_rate": 0.00020794340723453907,
+      "loss": 3.1996,
+      "step": 44900
+    },
+    {
+      "epoch": 13.093509671405267,
+      "grad_norm": 0.39081132411956787,
+      "learning_rate": 0.00020750583430571758,
+      "loss": 3.2162,
+      "step": 44950
+    },
+    {
+      "epoch": 13.108075040783035,
+      "grad_norm": 0.4104847311973572,
+      "learning_rate": 0.00020706826137689615,
+      "loss": 3.215,
+      "step": 45000
+    },
+    {
+      "epoch": 13.108075040783035,
+      "eval_accuracy": 0.3724899130919812,
+      "eval_loss": 3.5466861724853516,
+      "eval_runtime": 180.2123,
+      "eval_samples_per_second": 92.358,
+      "eval_steps_per_second": 5.777,
+      "step": 45000
+    },
+    {
+      "epoch": 13.122640410160802,
+      "grad_norm": 0.40783169865608215,
+      "learning_rate": 0.00020663068844807466,
+      "loss": 3.217,
+      "step": 45050
+    },
+    {
+      "epoch": 13.137205779538569,
+      "grad_norm": 0.3994167149066925,
+      "learning_rate": 0.0002061931155192532,
+      "loss": 3.214,
+      "step": 45100
+    },
+    {
+      "epoch": 13.151771148916337,
+      "grad_norm": 0.41038912534713745,
+      "learning_rate": 0.00020575554259043172,
+      "loss": 3.2265,
+      "step": 45150
+    },
+    {
+      "epoch": 13.166336518294104,
+      "grad_norm": 0.3970767557621002,
+      "learning_rate": 0.00020531796966161023,
+      "loss": 3.2219,
+      "step": 45200
+    },
+    {
+      "epoch": 13.18090188767187,
+      "grad_norm": 0.4076697528362274,
+      "learning_rate": 0.0002048803967327888,
+      "loss": 3.2133,
+      "step": 45250
+    },
+    {
+      "epoch": 13.19546725704964,
+      "grad_norm": 0.40613362193107605,
+      "learning_rate": 0.0002044428238039673,
+      "loss": 3.22,
+      "step": 45300
+    },
+    {
+      "epoch": 13.210032626427406,
+      "grad_norm": 0.39395052194595337,
+      "learning_rate": 0.00020400525087514585,
+      "loss": 3.2279,
+      "step": 45350
+    },
+    {
+      "epoch": 13.224597995805174,
+      "grad_norm": 0.3916940987110138,
+      "learning_rate": 0.00020356767794632436,
+      "loss": 3.2346,
+      "step": 45400
+    },
+    {
+      "epoch": 13.239163365182941,
+      "grad_norm": 0.41231533885002136,
+      "learning_rate": 0.00020313010501750287,
+      "loss": 3.2357,
+      "step": 45450
+    },
+    {
+      "epoch": 13.253728734560708,
+      "grad_norm": 0.4182799160480499,
+      "learning_rate": 0.00020269253208868144,
+      "loss": 3.2334,
+      "step": 45500
+    },
+    {
+      "epoch": 13.268294103938477,
+      "grad_norm": 0.4099382162094116,
+      "learning_rate": 0.00020225495915985995,
+      "loss": 3.2341,
+      "step": 45550
+    },
+    {
+      "epoch": 13.282859473316243,
+      "grad_norm": 0.4044232666492462,
+      "learning_rate": 0.0002018173862310385,
+      "loss": 3.2213,
+      "step": 45600
+    },
+    {
+      "epoch": 13.29742484269401,
+      "grad_norm": 0.39154335856437683,
+      "learning_rate": 0.000201379813302217,
+      "loss": 3.237,
+      "step": 45650
+    },
+    {
+      "epoch": 13.311990212071779,
+      "grad_norm": 0.4079340398311615,
+      "learning_rate": 0.00020094224037339555,
+      "loss": 3.2376,
+      "step": 45700
+    },
+    {
+      "epoch": 13.326555581449545,
+      "grad_norm": 0.39542028307914734,
+      "learning_rate": 0.0002005046674445741,
+      "loss": 3.2315,
+      "step": 45750
+    },
+    {
+      "epoch": 13.341120950827314,
+      "grad_norm": 0.39488768577575684,
+      "learning_rate": 0.0002000670945157526,
+      "loss": 3.2401,
+      "step": 45800
+    },
+    {
+      "epoch": 13.35568632020508,
+      "grad_norm": 0.41860339045524597,
+      "learning_rate": 0.00019962952158693114,
+      "loss": 3.2385,
+      "step": 45850
+    },
+    {
+      "epoch": 13.370251689582847,
+      "grad_norm": 0.4021410644054413,
+      "learning_rate": 0.00019919194865810968,
+      "loss": 3.2472,
+      "step": 45900
+    },
+    {
+      "epoch": 13.384817058960616,
+      "grad_norm": 0.3935169279575348,
+      "learning_rate": 0.0001987543757292882,
+      "loss": 3.2474,
+      "step": 45950
+    },
+    {
+      "epoch": 13.399382428338383,
+      "grad_norm": 0.4164498448371887,
+      "learning_rate": 0.00019831680280046673,
+      "loss": 3.2448,
+      "step": 46000
+    },
+    {
+      "epoch": 13.399382428338383,
+      "eval_accuracy": 0.37297574015729884,
+      "eval_loss": 3.5430777072906494,
+      "eval_runtime": 180.1561,
+      "eval_samples_per_second": 92.387,
+      "eval_steps_per_second": 5.778,
+      "step": 46000
+    },
+    {
+      "epoch": 13.41394779771615,
+      "grad_norm": 0.4161559045314789,
+      "learning_rate": 0.00019787922987164524,
+      "loss": 3.2397,
+      "step": 46050
+    },
+    {
+      "epoch": 13.428513167093918,
+      "grad_norm": 0.40776827931404114,
+      "learning_rate": 0.0001974416569428238,
+      "loss": 3.2325,
+      "step": 46100
+    },
+    {
+      "epoch": 13.443078536471685,
+      "grad_norm": 0.3878330886363983,
+      "learning_rate": 0.00019700408401400232,
+      "loss": 3.2609,
+      "step": 46150
+    },
+    {
+      "epoch": 13.457643905849451,
+      "grad_norm": 0.40034887194633484,
+      "learning_rate": 0.00019656651108518084,
+      "loss": 3.2584,
+      "step": 46200
+    },
+    {
+      "epoch": 13.47220927522722,
+      "grad_norm": 0.40647125244140625,
+      "learning_rate": 0.00019612893815635938,
+      "loss": 3.2431,
+      "step": 46250
+    },
+    {
+      "epoch": 13.486774644604987,
+      "grad_norm": 0.3935099244117737,
+      "learning_rate": 0.0001956913652275379,
+      "loss": 3.2455,
+      "step": 46300
+    },
+    {
+      "epoch": 13.501340013982755,
+      "grad_norm": 0.3952663540840149,
+      "learning_rate": 0.00019525379229871646,
+      "loss": 3.2482,
+      "step": 46350
+    },
+    {
+      "epoch": 13.515905383360522,
+      "grad_norm": 0.390480637550354,
+      "learning_rate": 0.00019481621936989497,
+      "loss": 3.2544,
+      "step": 46400
+    },
+    {
+      "epoch": 13.530470752738289,
+      "grad_norm": 0.40572217106819153,
+      "learning_rate": 0.00019437864644107348,
+      "loss": 3.2502,
+      "step": 46450
+    },
+    {
+      "epoch": 13.545036122116057,
+      "grad_norm": 0.38214248418807983,
+      "learning_rate": 0.00019394107351225202,
+      "loss": 3.2427,
+      "step": 46500
+    },
+    {
+      "epoch": 13.559601491493824,
+      "grad_norm": 0.4259106516838074,
+      "learning_rate": 0.00019350350058343054,
+      "loss": 3.2479,
+      "step": 46550
+    },
+    {
+      "epoch": 13.574166860871593,
+      "grad_norm": 0.3941766917705536,
+      "learning_rate": 0.0001930659276546091,
+      "loss": 3.2628,
+      "step": 46600
+    },
+    {
+      "epoch": 13.58873223024936,
+      "grad_norm": 0.40022504329681396,
+      "learning_rate": 0.00019262835472578762,
+      "loss": 3.2478,
+      "step": 46650
+    },
+    {
+      "epoch": 13.603297599627126,
+      "grad_norm": 0.3927033841609955,
+      "learning_rate": 0.00019219078179696613,
+      "loss": 3.2597,
+      "step": 46700
+    },
+    {
+      "epoch": 13.617862969004895,
+      "grad_norm": 0.4204312562942505,
+      "learning_rate": 0.00019175320886814467,
+      "loss": 3.2552,
+      "step": 46750
+    },
+    {
+      "epoch": 13.632428338382661,
+      "grad_norm": 0.4014910161495209,
+      "learning_rate": 0.0001913156359393232,
+      "loss": 3.2582,
+      "step": 46800
+    },
+    {
+      "epoch": 13.646993707760428,
+      "grad_norm": 0.3960302770137787,
+      "learning_rate": 0.00019087806301050175,
+      "loss": 3.247,
+      "step": 46850
+    },
+    {
+      "epoch": 13.661559077138197,
+      "grad_norm": 0.40421754121780396,
+      "learning_rate": 0.00019044049008168026,
+      "loss": 3.2509,
+      "step": 46900
+    },
+    {
+      "epoch": 13.676124446515963,
+      "grad_norm": 0.4028851091861725,
+      "learning_rate": 0.00019000291715285877,
+      "loss": 3.2603,
+      "step": 46950
+    },
+    {
+      "epoch": 13.69068981589373,
+      "grad_norm": 0.4152960181236267,
+      "learning_rate": 0.00018956534422403734,
+      "loss": 3.2716,
+      "step": 47000
+    },
+    {
+      "epoch": 13.69068981589373,
+      "eval_accuracy": 0.37343640580151827,
+      "eval_loss": 3.536942481994629,
+      "eval_runtime": 180.1541,
+      "eval_samples_per_second": 92.388,
+      "eval_steps_per_second": 5.778,
+      "step": 47000
+    },
+    {
+      "epoch": 13.705255185271499,
+      "grad_norm": 0.40029028058052063,
+      "learning_rate": 0.00018912777129521585,
+      "loss": 3.2608,
+      "step": 47050
+    },
+    {
+      "epoch": 13.719820554649266,
+      "grad_norm": 0.4005506634712219,
+      "learning_rate": 0.0001886901983663944,
+      "loss": 3.2562,
+      "step": 47100
+    },
+    {
+      "epoch": 13.734385924027034,
+      "grad_norm": 0.4043956398963928,
+      "learning_rate": 0.0001882526254375729,
+      "loss": 3.2553,
+      "step": 47150
+    },
+    {
+      "epoch": 13.7489512934048,
+      "grad_norm": 0.393660306930542,
+      "learning_rate": 0.00018781505250875142,
+      "loss": 3.2504,
+      "step": 47200
+    },
+    {
+      "epoch": 13.763516662782568,
+      "grad_norm": 0.41873812675476074,
+      "learning_rate": 0.00018737747957992999,
+      "loss": 3.2641,
+      "step": 47250
+    },
+    {
+      "epoch": 13.778082032160336,
+      "grad_norm": 0.39937934279441833,
+      "learning_rate": 0.0001869399066511085,
+      "loss": 3.2601,
+      "step": 47300
+    },
+    {
+      "epoch": 13.792647401538103,
+      "grad_norm": 0.39644569158554077,
+      "learning_rate": 0.00018650233372228704,
+      "loss": 3.2579,
+      "step": 47350
+    },
+    {
+      "epoch": 13.80721277091587,
+      "grad_norm": 0.4110250174999237,
+      "learning_rate": 0.00018606476079346555,
+      "loss": 3.2545,
+      "step": 47400
+    },
+    {
+      "epoch": 13.821778140293638,
+      "grad_norm": 0.39572134613990784,
+      "learning_rate": 0.00018562718786464406,
+      "loss": 3.2551,
+      "step": 47450
+    },
+    {
+      "epoch": 13.836343509671405,
+      "grad_norm": 0.40120694041252136,
+      "learning_rate": 0.00018518961493582263,
+      "loss": 3.2497,
+      "step": 47500
+    },
+    {
+      "epoch": 13.850908879049173,
+      "grad_norm": 0.3942031264305115,
+      "learning_rate": 0.00018475204200700114,
+      "loss": 3.2592,
+      "step": 47550
+    },
+    {
+      "epoch": 13.86547424842694,
+      "grad_norm": 0.4140487611293793,
+      "learning_rate": 0.00018431446907817968,
+      "loss": 3.2552,
+      "step": 47600
+    },
+    {
+      "epoch": 13.880039617804707,
+      "grad_norm": 0.39110127091407776,
+      "learning_rate": 0.0001838768961493582,
+      "loss": 3.261,
+      "step": 47650
+    },
+    {
+      "epoch": 13.894604987182475,
+      "grad_norm": 0.4091663360595703,
+      "learning_rate": 0.00018343932322053674,
+      "loss": 3.2709,
+      "step": 47700
+    },
+    {
+      "epoch": 13.909170356560242,
+      "grad_norm": 0.39773812890052795,
+      "learning_rate": 0.00018300175029171528,
+      "loss": 3.2645,
+      "step": 47750
+    },
+    {
+      "epoch": 13.923735725938009,
+      "grad_norm": 0.4022299647331238,
+      "learning_rate": 0.0001825641773628938,
+      "loss": 3.2597,
+      "step": 47800
+    },
+    {
+      "epoch": 13.938301095315778,
+      "grad_norm": 0.3977898061275482,
+      "learning_rate": 0.00018212660443407233,
+      "loss": 3.2697,
+      "step": 47850
+    },
+    {
+      "epoch": 13.952866464693544,
+      "grad_norm": 0.38834723830223083,
+      "learning_rate": 0.00018168903150525087,
+      "loss": 3.2585,
+      "step": 47900
+    },
+    {
+      "epoch": 13.967431834071313,
+      "grad_norm": 0.3896270990371704,
+      "learning_rate": 0.00018125145857642938,
+      "loss": 3.2654,
+      "step": 47950
+    },
+    {
+      "epoch": 13.98199720344908,
+      "grad_norm": 0.41397517919540405,
+      "learning_rate": 0.00018081388564760792,
+      "loss": 3.2672,
+      "step": 48000
+    },
+    {
+      "epoch": 13.98199720344908,
+      "eval_accuracy": 0.37412834488172014,
+      "eval_loss": 3.5272507667541504,
+      "eval_runtime": 180.2142,
+      "eval_samples_per_second": 92.357,
+      "eval_steps_per_second": 5.776,
+      "step": 48000
+    },
+    {
+      "epoch": 13.996562572826846,
+      "grad_norm": 0.3925948739051819,
+      "learning_rate": 0.00018037631271878644,
+      "loss": 3.2785,
+      "step": 48050
+    },
+    {
+      "epoch": 14.011069680727104,
+      "grad_norm": 0.39326012134552,
+      "learning_rate": 0.000179938739789965,
+      "loss": 3.2021,
+      "step": 48100
+    },
+    {
+      "epoch": 14.02563505010487,
+      "grad_norm": 0.40781304240226746,
+      "learning_rate": 0.00017950116686114352,
+      "loss": 3.1782,
+      "step": 48150
+    },
+    {
+      "epoch": 14.040200419482638,
+      "grad_norm": 0.3889636695384979,
+      "learning_rate": 0.00017906359393232203,
+      "loss": 3.1885,
+      "step": 48200
+    },
+    {
+      "epoch": 14.054765788860406,
+      "grad_norm": 0.4008404314517975,
+      "learning_rate": 0.00017862602100350057,
+      "loss": 3.1821,
+      "step": 48250
+    },
+    {
+      "epoch": 14.069331158238173,
+      "grad_norm": 0.4058891832828522,
+      "learning_rate": 0.00017818844807467908,
+      "loss": 3.1926,
+      "step": 48300
+    },
+    {
+      "epoch": 14.08389652761594,
+      "grad_norm": 0.3980492949485779,
+      "learning_rate": 0.00017775087514585765,
+      "loss": 3.1981,
+      "step": 48350
+    },
+    {
+      "epoch": 14.098461896993708,
+      "grad_norm": 0.4085221588611603,
+      "learning_rate": 0.00017731330221703616,
+      "loss": 3.1935,
+      "step": 48400
+    },
+    {
+      "epoch": 14.113027266371475,
+      "grad_norm": 0.41492384672164917,
+      "learning_rate": 0.00017687572928821467,
+      "loss": 3.194,
+      "step": 48450
+    },
+    {
+      "epoch": 14.127592635749243,
+      "grad_norm": 0.4290497899055481,
+      "learning_rate": 0.00017643815635939321,
+      "loss": 3.1947,
+      "step": 48500
+    },
+    {
+      "epoch": 14.14215800512701,
+      "grad_norm": 0.42287999391555786,
+      "learning_rate": 0.00017600058343057173,
+      "loss": 3.1991,
+      "step": 48550
+    },
+    {
+      "epoch": 14.156723374504777,
+      "grad_norm": 0.39472466707229614,
+      "learning_rate": 0.0001755630105017503,
+      "loss": 3.2116,
+      "step": 48600
+    },
+    {
+      "epoch": 14.171288743882545,
+      "grad_norm": 0.4188964068889618,
+      "learning_rate": 0.0001751254375729288,
+      "loss": 3.1942,
+      "step": 48650
+    },
+    {
+      "epoch": 14.185854113260312,
+      "grad_norm": 0.4070267975330353,
+      "learning_rate": 0.00017468786464410732,
+      "loss": 3.1969,
+      "step": 48700
+    },
+    {
+      "epoch": 14.200419482638079,
+      "grad_norm": 0.40462633967399597,
+      "learning_rate": 0.00017425029171528586,
+      "loss": 3.2017,
+      "step": 48750
+    },
+    {
+      "epoch": 14.214984852015847,
+      "grad_norm": 0.40400370955467224,
+      "learning_rate": 0.0001738127187864644,
+      "loss": 3.2093,
+      "step": 48800
+    },
+    {
+      "epoch": 14.229550221393614,
+      "grad_norm": 0.3998878002166748,
+      "learning_rate": 0.00017337514585764294,
+      "loss": 3.2056,
+      "step": 48850
+    },
+    {
+      "epoch": 14.244115590771383,
+      "grad_norm": 0.3977794945240021,
+      "learning_rate": 0.00017293757292882145,
+      "loss": 3.2038,
+      "step": 48900
+    },
+    {
+      "epoch": 14.25868096014915,
+      "grad_norm": 0.4316108226776123,
+      "learning_rate": 0.00017249999999999996,
+      "loss": 3.2037,
+      "step": 48950
+    },
+    {
+      "epoch": 14.273246329526916,
+      "grad_norm": 0.41260573267936707,
+      "learning_rate": 0.00017206242707117853,
+      "loss": 3.2114,
+      "step": 49000
+    },
+    {
+      "epoch": 14.273246329526916,
+      "eval_accuracy": 0.37353940302059335,
+      "eval_loss": 3.538419723510742,
+      "eval_runtime": 180.2527,
+      "eval_samples_per_second": 92.337,
+      "eval_steps_per_second": 5.775,
+      "step": 49000
+    },
+    {
+      "epoch": 14.287811698904685,
+      "grad_norm": 0.4174029231071472,
+      "learning_rate": 0.00017162485414235704,
+      "loss": 3.2237,
+      "step": 49050
+    },
+    {
+      "epoch": 14.302377068282452,
+      "grad_norm": 0.42132076621055603,
+      "learning_rate": 0.00017118728121353558,
+      "loss": 3.2049,
+      "step": 49100
+    },
+    {
+      "epoch": 14.316942437660218,
+      "grad_norm": 0.41422000527381897,
+      "learning_rate": 0.0001707497082847141,
+      "loss": 3.208,
+      "step": 49150
+    },
+    {
+      "epoch": 14.331507807037987,
+      "grad_norm": 0.4296468198299408,
+      "learning_rate": 0.0001703121353558926,
+      "loss": 3.2076,
+      "step": 49200
+    },
+    {
+      "epoch": 14.346073176415754,
+      "grad_norm": 0.40375787019729614,
+      "learning_rate": 0.00016987456242707118,
+      "loss": 3.2195,
+      "step": 49250
+    },
+    {
+      "epoch": 14.360638545793522,
+      "grad_norm": 0.4078134298324585,
+      "learning_rate": 0.0001694369894982497,
+      "loss": 3.2231,
+      "step": 49300
+    },
+    {
+      "epoch": 14.375203915171289,
+      "grad_norm": 0.4103347063064575,
+      "learning_rate": 0.00016899941656942823,
+      "loss": 3.224,
+      "step": 49350
+    },
+    {
+      "epoch": 14.389769284549056,
+      "grad_norm": 0.4056347906589508,
+      "learning_rate": 0.00016856184364060674,
+      "loss": 3.2211,
+      "step": 49400
+    },
+    {
+      "epoch": 14.404334653926824,
+      "grad_norm": 0.43045109510421753,
+      "learning_rate": 0.00016812427071178528,
+      "loss": 3.2178,
+      "step": 49450
+    },
+    {
+      "epoch": 14.418900023304591,
+      "grad_norm": 0.4060124158859253,
+      "learning_rate": 0.00016768669778296382,
+      "loss": 3.2132,
+      "step": 49500
+    },
+    {
+      "epoch": 14.433465392682358,
+      "grad_norm": 0.40384456515312195,
+      "learning_rate": 0.00016724912485414234,
+      "loss": 3.2172,
+      "step": 49550
+    },
+    {
+      "epoch": 14.448030762060126,
+      "grad_norm": 0.40116435289382935,
+      "learning_rate": 0.00016681155192532088,
+      "loss": 3.2099,
+      "step": 49600
+    },
+    {
+      "epoch": 14.462596131437893,
+      "grad_norm": 0.4094943404197693,
+      "learning_rate": 0.00016637397899649942,
+      "loss": 3.2127,
+      "step": 49650
+    },
+    {
+      "epoch": 14.477161500815662,
+      "grad_norm": 0.40145185589790344,
+      "learning_rate": 0.00016593640606767793,
+      "loss": 3.22,
+      "step": 49700
+    },
+    {
+      "epoch": 14.491726870193428,
+      "grad_norm": 0.42102572321891785,
+      "learning_rate": 0.00016549883313885647,
+      "loss": 3.2253,
+      "step": 49750
+    },
+    {
+      "epoch": 14.506292239571195,
+      "grad_norm": 0.41271886229515076,
+      "learning_rate": 0.00016506126021003498,
+      "loss": 3.207,
+      "step": 49800
+    },
+    {
+      "epoch": 14.520857608948964,
+      "grad_norm": 0.41741323471069336,
+      "learning_rate": 0.00016462368728121355,
+      "loss": 3.2322,
+      "step": 49850
+    },
+    {
+      "epoch": 14.53542297832673,
+      "grad_norm": 0.40796613693237305,
+      "learning_rate": 0.00016418611435239206,
+      "loss": 3.2178,
+      "step": 49900
+    },
+    {
+      "epoch": 14.549988347704497,
+      "grad_norm": 0.4142317771911621,
+      "learning_rate": 0.00016374854142357057,
+      "loss": 3.2196,
+      "step": 49950
+    },
+    {
+      "epoch": 14.564553717082266,
+      "grad_norm": 0.4134737253189087,
+      "learning_rate": 0.0001633109684947491,
+      "loss": 3.229,
+      "step": 50000
+    },
+    {
+      "epoch": 14.564553717082266,
+      "eval_accuracy": 0.3739324620427029,
+      "eval_loss": 3.535773515701294,
+      "eval_runtime": 180.2641,
+      "eval_samples_per_second": 92.331,
+      "eval_steps_per_second": 5.775,
+      "step": 50000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 68660,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 2
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.045105940496384e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}