diff --git "a/last_to_drop_frequency_40817/checkpoint-40000/trainer_state.json" "b/last_to_drop_frequency_40817/checkpoint-40000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/last_to_drop_frequency_40817/checkpoint-40000/trainer_state.json"
@@ -0,0 +1,6003 @@
+{
+  "best_global_step": 40000,
+  "best_metric": 3.5595271587371826,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/last_to_drop_frequency_40817/checkpoint-40000",
+  "epoch": 11.644982819870712,
+  "eval_steps": 1000,
+  "global_step": 40000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.014559431599790344,
+      "grad_norm": 1.5551334619522095,
+      "learning_rate": 0.000294,
+      "loss": 8.4667,
+      "step": 50
+    },
+    {
+      "epoch": 0.029118863199580687,
+      "grad_norm": 0.7336986064910889,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7245,
+      "step": 100
+    },
+    {
+      "epoch": 0.043678294799371034,
+      "grad_norm": 0.4792507588863373,
+      "learning_rate": 0.0005998287212350713,
+      "loss": 6.3255,
+      "step": 150
+    },
+    {
+      "epoch": 0.058237726399161374,
+      "grad_norm": 0.47392821311950684,
+      "learning_rate": 0.0005996539469851441,
+      "loss": 6.1138,
+      "step": 200
+    },
+    {
+      "epoch": 0.07279715799895171,
+      "grad_norm": 0.442217618227005,
+      "learning_rate": 0.000599479172735217,
+      "loss": 5.9746,
+      "step": 250
+    },
+    {
+      "epoch": 0.08735658959874207,
+      "grad_norm": 0.4978708028793335,
+      "learning_rate": 0.0005993043984852897,
+      "loss": 5.8573,
+      "step": 300
+    },
+    {
+      "epoch": 0.10191602119853241,
+      "grad_norm": 0.5078408122062683,
+      "learning_rate": 0.0005991296242353626,
+      "loss": 5.7377,
+      "step": 350
+    },
+    {
+      "epoch": 0.11647545279832275,
+      "grad_norm": 0.4501552879810333,
+      "learning_rate": 0.0005989548499854355,
+      "loss": 5.613,
+      "step": 400
+    },
+    {
+      "epoch": 0.1310348843981131,
+      "grad_norm": 0.41562119126319885,
+      "learning_rate": 0.0005987800757355083,
+      "loss": 5.5049,
+      "step": 450
+    },
+    {
+      "epoch": 0.14559431599790343,
+      "grad_norm": 0.39685097336769104,
+      "learning_rate": 0.0005986053014855811,
+      "loss": 5.4153,
+      "step": 500
+    },
+    {
+      "epoch": 0.1601537475976938,
+      "grad_norm": 0.4735598862171173,
+      "learning_rate": 0.000598430527235654,
+      "loss": 5.3339,
+      "step": 550
+    },
+    {
+      "epoch": 0.17471317919748414,
+      "grad_norm": 0.4490765929222107,
+      "learning_rate": 0.0005982557529857267,
+      "loss": 5.2571,
+      "step": 600
+    },
+    {
+      "epoch": 0.18927261079727448,
+      "grad_norm": 0.5662270188331604,
+      "learning_rate": 0.0005980809787357995,
+      "loss": 5.183,
+      "step": 650
+    },
+    {
+      "epoch": 0.20383204239706482,
+      "grad_norm": 0.4178728759288788,
+      "learning_rate": 0.0005979062044858724,
+      "loss": 5.1337,
+      "step": 700
+    },
+    {
+      "epoch": 0.21839147399685516,
+      "grad_norm": 0.4277268648147583,
+      "learning_rate": 0.0005977314302359452,
+      "loss": 5.082,
+      "step": 750
+    },
+    {
+      "epoch": 0.2329509055966455,
+      "grad_norm": 0.49093976616859436,
+      "learning_rate": 0.0005975566559860181,
+      "loss": 5.0414,
+      "step": 800
+    },
+    {
+      "epoch": 0.24751033719643586,
+      "grad_norm": 0.40832236409187317,
+      "learning_rate": 0.0005973818817360908,
+      "loss": 4.9782,
+      "step": 850
+    },
+    {
+      "epoch": 0.2620697687962262,
+      "grad_norm": 0.42992544174194336,
+      "learning_rate": 0.0005972071074861636,
+      "loss": 4.9343,
+      "step": 900
+    },
+    {
+      "epoch": 0.2766292003960165,
+      "grad_norm": 0.5416184067726135,
+      "learning_rate": 0.0005970323332362365,
+      "loss": 4.8685,
+      "step": 950
+    },
+    {
+      "epoch": 0.29118863199580686,
+      "grad_norm": 0.5198241472244263,
+      "learning_rate": 0.0005968575589863093,
+      "loss": 4.849,
+      "step": 1000
+    },
+    {
+      "epoch": 0.29118863199580686,
+      "eval_accuracy": 0.25379510529217636,
+      "eval_loss": 4.761143684387207,
+      "eval_runtime": 183.641,
+      "eval_samples_per_second": 90.628,
+      "eval_steps_per_second": 5.669,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30574806359559725,
+      "grad_norm": 0.47911056876182556,
+      "learning_rate": 0.0005966827847363822,
+      "loss": 4.7726,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3203074951953876,
+      "grad_norm": 0.4758392572402954,
+      "learning_rate": 0.000596508010486455,
+      "loss": 4.7537,
+      "step": 1100
+    },
+    {
+      "epoch": 0.33486692679517793,
+      "grad_norm": 0.47129762172698975,
+      "learning_rate": 0.0005963332362365277,
+      "loss": 4.7115,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3494263583949683,
+      "grad_norm": 0.42803141474723816,
+      "learning_rate": 0.0005961584619866006,
+      "loss": 4.6751,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3639857899947586,
+      "grad_norm": 0.4740878641605377,
+      "learning_rate": 0.0005959836877366734,
+      "loss": 4.6417,
+      "step": 1250
+    },
+    {
+      "epoch": 0.37854522159454895,
+      "grad_norm": 0.40221309661865234,
+      "learning_rate": 0.0005958089134867463,
+      "loss": 4.6053,
+      "step": 1300
+    },
+    {
+      "epoch": 0.3931046531943393,
+      "grad_norm": 0.44672706723213196,
+      "learning_rate": 0.0005956341392368191,
+      "loss": 4.5801,
+      "step": 1350
+    },
+    {
+      "epoch": 0.40766408479412963,
+      "grad_norm": 0.4823697507381439,
+      "learning_rate": 0.0005954593649868918,
+      "loss": 4.5599,
+      "step": 1400
+    },
+    {
+      "epoch": 0.42222351639392,
+      "grad_norm": 0.5122449398040771,
+      "learning_rate": 0.0005952845907369647,
+      "loss": 4.5344,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4367829479937103,
+      "grad_norm": 0.4088864028453827,
+      "learning_rate": 0.0005951098164870375,
+      "loss": 4.4951,
+      "step": 1500
+    },
+    {
+      "epoch": 0.45134237959350065,
+      "grad_norm": 0.40731462836265564,
+      "learning_rate": 0.0005949350422371104,
+      "loss": 4.5018,
+      "step": 1550
+    },
+    {
+      "epoch": 0.465901811193291,
+      "grad_norm": 0.4263319671154022,
+      "learning_rate": 0.0005947602679871832,
+      "loss": 4.4755,
+      "step": 1600
+    },
+    {
+      "epoch": 0.48046124279308133,
+      "grad_norm": 0.38340768218040466,
+      "learning_rate": 0.000594585493737256,
+      "loss": 4.4569,
+      "step": 1650
+    },
+    {
+      "epoch": 0.49502067439287173,
+      "grad_norm": 0.3979549705982208,
+      "learning_rate": 0.0005944107194873288,
+      "loss": 4.4444,
+      "step": 1700
+    },
+    {
+      "epoch": 0.509580105992662,
+      "grad_norm": 0.4176700711250305,
+      "learning_rate": 0.0005942359452374016,
+      "loss": 4.4173,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5241395375924524,
+      "grad_norm": 0.3926246464252472,
+      "learning_rate": 0.0005940611709874745,
+      "loss": 4.4004,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5386989691922427,
+      "grad_norm": 0.3877740502357483,
+      "learning_rate": 0.0005938863967375473,
+      "loss": 4.3869,
+      "step": 1850
+    },
+    {
+      "epoch": 0.553258400792033,
+      "grad_norm": 0.4315814971923828,
+      "learning_rate": 0.0005937116224876201,
+      "loss": 4.3636,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5678178323918234,
+      "grad_norm": 0.403978556394577,
+      "learning_rate": 0.000593536848237693,
+      "loss": 4.3628,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5823772639916137,
+      "grad_norm": 0.39984941482543945,
+      "learning_rate": 0.0005933620739877657,
+      "loss": 4.3402,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5823772639916137,
+      "eval_accuracy": 0.29928804185701036,
+      "eval_loss": 4.288719177246094,
+      "eval_runtime": 180.6232,
+      "eval_samples_per_second": 92.142,
+      "eval_steps_per_second": 5.763,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5969366955914042,
+      "grad_norm": 0.4173935055732727,
+      "learning_rate": 0.0005931872997378385,
+      "loss": 4.3411,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6114961271911945,
+      "grad_norm": 0.37241849303245544,
+      "learning_rate": 0.0005930125254879114,
+      "loss": 4.3243,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6260555587909848,
+      "grad_norm": 0.4274754822254181,
+      "learning_rate": 0.0005928377512379842,
+      "loss": 4.2883,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6406149903907752,
+      "grad_norm": 0.4375714063644409,
+      "learning_rate": 0.0005926629769880571,
+      "loss": 4.2941,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6551744219905655,
+      "grad_norm": 0.39245837926864624,
+      "learning_rate": 0.0005924882027381298,
+      "loss": 4.2863,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6697338535903559,
+      "grad_norm": 0.3508373498916626,
+      "learning_rate": 0.0005923134284882026,
+      "loss": 4.2683,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6842932851901462,
+      "grad_norm": 0.37966057658195496,
+      "learning_rate": 0.0005921386542382755,
+      "loss": 4.268,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6988527167899365,
+      "grad_norm": 0.4270515441894531,
+      "learning_rate": 0.0005919638799883483,
+      "loss": 4.2548,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7134121483897269,
+      "grad_norm": 0.36582618951797485,
+      "learning_rate": 0.0005917891057384212,
+      "loss": 4.2418,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7279715799895172,
+      "grad_norm": 0.3588745594024658,
+      "learning_rate": 0.000591614331488494,
+      "loss": 4.2315,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7425310115893076,
+      "grad_norm": 0.3805822730064392,
+      "learning_rate": 0.0005914395572385667,
+      "loss": 4.2263,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7570904431890979,
+      "grad_norm": 0.37862271070480347,
+      "learning_rate": 0.0005912647829886396,
+      "loss": 4.2177,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7716498747888882,
+      "grad_norm": 0.40694668889045715,
+      "learning_rate": 0.0005910900087387124,
+      "loss": 4.1886,
+      "step": 2650
+    },
+    {
+      "epoch": 0.7862093063886786,
+      "grad_norm": 0.3988340497016907,
+      "learning_rate": 0.0005909152344887853,
+      "loss": 4.1907,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8007687379884689,
+      "grad_norm": 0.4412493109703064,
+      "learning_rate": 0.0005907404602388581,
+      "loss": 4.1929,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8153281695882593,
+      "grad_norm": 0.37306517362594604,
+      "learning_rate": 0.0005905656859889308,
+      "loss": 4.1721,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8298876011880496,
+      "grad_norm": 0.36752834916114807,
+      "learning_rate": 0.0005903909117390037,
+      "loss": 4.1729,
+      "step": 2850
+    },
+    {
+      "epoch": 0.84444703278784,
+      "grad_norm": 0.38249292969703674,
+      "learning_rate": 0.0005902161374890766,
+      "loss": 4.17,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8590064643876303,
+      "grad_norm": 0.3479909598827362,
+      "learning_rate": 0.0005900413632391494,
+      "loss": 4.1629,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8735658959874206,
+      "grad_norm": 0.34885624051094055,
+      "learning_rate": 0.0005898665889892223,
+      "loss": 4.1563,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8735658959874206,
+      "eval_accuracy": 0.31519818808069494,
+      "eval_loss": 4.099164009094238,
+      "eval_runtime": 183.4247,
+      "eval_samples_per_second": 90.735,
+      "eval_steps_per_second": 5.675,
+      "step": 3000
+    },
+    {
+      "epoch": 0.888125327587211,
+      "grad_norm": 0.38681846857070923,
+      "learning_rate": 0.0005896918147392951,
+      "loss": 4.1567,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9026847591870013,
+      "grad_norm": 0.3432327210903168,
+      "learning_rate": 0.0005895170404893678,
+      "loss": 4.1293,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9172441907867916,
+      "grad_norm": 0.3937830626964569,
+      "learning_rate": 0.0005893422662394407,
+      "loss": 4.1285,
+      "step": 3150
+    },
+    {
+      "epoch": 0.931803622386582,
+      "grad_norm": 0.39171546697616577,
+      "learning_rate": 0.0005891674919895135,
+      "loss": 4.1279,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9463630539863723,
+      "grad_norm": 0.37026646733283997,
+      "learning_rate": 0.0005889927177395864,
+      "loss": 4.1106,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9609224855861627,
+      "grad_norm": 0.3460790812969208,
+      "learning_rate": 0.0005888179434896592,
+      "loss": 4.1132,
+      "step": 3300
+    },
+    {
+      "epoch": 0.975481917185953,
+      "grad_norm": 0.36886388063430786,
+      "learning_rate": 0.000588643169239732,
+      "loss": 4.0977,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9900413487857435,
+      "grad_norm": 0.36020082235336304,
+      "learning_rate": 0.0005884683949898048,
+      "loss": 4.0966,
+      "step": 3400
+    },
+    {
+      "epoch": 1.0043678294799372,
+      "grad_norm": 0.33763444423675537,
+      "learning_rate": 0.0005882936207398776,
+      "loss": 4.0577,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0189272610797275,
+      "grad_norm": 0.34525808691978455,
+      "learning_rate": 0.0005881188464899504,
+      "loss": 4.0248,
+      "step": 3500
+    },
+    {
+      "epoch": 1.0334866926795179,
+      "grad_norm": 0.37068355083465576,
+      "learning_rate": 0.0005879440722400233,
+      "loss": 4.0183,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0480461242793082,
+      "grad_norm": 0.34973421692848206,
+      "learning_rate": 0.0005877692979900961,
+      "loss": 4.0291,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0626055558790986,
+      "grad_norm": 0.3637358248233795,
+      "learning_rate": 0.000587594523740169,
+      "loss": 4.0199,
+      "step": 3650
+    },
+    {
+      "epoch": 1.077164987478889,
+      "grad_norm": 0.34920114278793335,
+      "learning_rate": 0.0005874197494902417,
+      "loss": 4.0247,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0917244190786792,
+      "grad_norm": 0.3420464098453522,
+      "learning_rate": 0.0005872449752403145,
+      "loss": 4.0189,
+      "step": 3750
+    },
+    {
+      "epoch": 1.1062838506784696,
+      "grad_norm": 0.34696176648139954,
+      "learning_rate": 0.0005870702009903874,
+      "loss": 4.0089,
+      "step": 3800
+    },
+    {
+      "epoch": 1.12084328227826,
+      "grad_norm": 0.3416752815246582,
+      "learning_rate": 0.0005868954267404602,
+      "loss": 3.9978,
+      "step": 3850
+    },
+    {
+      "epoch": 1.1354027138780503,
+      "grad_norm": 0.3729047179222107,
+      "learning_rate": 0.0005867206524905331,
+      "loss": 3.9976,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1499621454778406,
+      "grad_norm": 0.34707263112068176,
+      "learning_rate": 0.0005865458782406058,
+      "loss": 3.9927,
+      "step": 3950
+    },
+    {
+      "epoch": 1.164521577077631,
+      "grad_norm": 0.3424519896507263,
+      "learning_rate": 0.0005863711039906786,
+      "loss": 3.9798,
+      "step": 4000
+    },
+    {
+      "epoch": 1.164521577077631,
+      "eval_accuracy": 0.32528629009357674,
+      "eval_loss": 3.9908077716827393,
+      "eval_runtime": 180.5563,
+      "eval_samples_per_second": 92.176,
+      "eval_steps_per_second": 5.766,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1790810086774213,
+      "grad_norm": 0.3473677635192871,
+      "learning_rate": 0.0005861963297407515,
+      "loss": 3.9837,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1936404402772116,
+      "grad_norm": 0.3695130944252014,
+      "learning_rate": 0.0005860215554908243,
+      "loss": 3.9857,
+      "step": 4100
+    },
+    {
+      "epoch": 1.208199871877002,
+      "grad_norm": 0.3494517207145691,
+      "learning_rate": 0.0005858467812408972,
+      "loss": 3.9749,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2227593034767923,
+      "grad_norm": 0.3514440655708313,
+      "learning_rate": 0.00058567200699097,
+      "loss": 3.9773,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2373187350765826,
+      "grad_norm": 0.33939051628112793,
+      "learning_rate": 0.0005854972327410427,
+      "loss": 3.9868,
+      "step": 4250
+    },
+    {
+      "epoch": 1.251878166676373,
+      "grad_norm": 0.39269140362739563,
+      "learning_rate": 0.0005853224584911156,
+      "loss": 3.9676,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2664375982761633,
+      "grad_norm": 0.3487934470176697,
+      "learning_rate": 0.0005851476842411884,
+      "loss": 3.973,
+      "step": 4350
+    },
+    {
+      "epoch": 1.2809970298759537,
+      "grad_norm": 0.33803650736808777,
+      "learning_rate": 0.0005849729099912613,
+      "loss": 3.9805,
+      "step": 4400
+    },
+    {
+      "epoch": 1.295556461475744,
+      "grad_norm": 0.34375283122062683,
+      "learning_rate": 0.0005847981357413341,
+      "loss": 3.9729,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3101158930755343,
+      "grad_norm": 0.3429529070854187,
+      "learning_rate": 0.0005846233614914068,
+      "loss": 3.9492,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3246753246753247,
+      "grad_norm": 0.3482668399810791,
+      "learning_rate": 0.0005844485872414797,
+      "loss": 3.9654,
+      "step": 4550
+    },
+    {
+      "epoch": 1.339234756275115,
+      "grad_norm": 0.3361050486564636,
+      "learning_rate": 0.0005842738129915525,
+      "loss": 3.9693,
+      "step": 4600
+    },
+    {
+      "epoch": 1.3537941878749054,
+      "grad_norm": 0.34350207448005676,
+      "learning_rate": 0.0005840990387416253,
+      "loss": 3.9628,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3683536194746957,
+      "grad_norm": 0.35732749104499817,
+      "learning_rate": 0.0005839242644916982,
+      "loss": 3.9383,
+      "step": 4700
+    },
+    {
+      "epoch": 1.382913051074486,
+      "grad_norm": 0.32812654972076416,
+      "learning_rate": 0.000583749490241771,
+      "loss": 3.9402,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3974724826742764,
+      "grad_norm": 0.3359614312648773,
+      "learning_rate": 0.0005835747159918438,
+      "loss": 3.9409,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4120319142740667,
+      "grad_norm": 0.36291930079460144,
+      "learning_rate": 0.0005833999417419166,
+      "loss": 3.9373,
+      "step": 4850
+    },
+    {
+      "epoch": 1.426591345873857,
+      "grad_norm": 0.3357282876968384,
+      "learning_rate": 0.0005832251674919894,
+      "loss": 3.9373,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4411507774736474,
+      "grad_norm": 0.3662075996398926,
+      "learning_rate": 0.0005830503932420623,
+      "loss": 3.9326,
+      "step": 4950
+    },
+    {
+      "epoch": 1.4557102090734377,
+      "grad_norm": 0.3387506604194641,
+      "learning_rate": 0.0005828756189921351,
+      "loss": 3.9189,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4557102090734377,
+      "eval_accuracy": 0.3320894535210645,
+      "eval_loss": 3.91398549079895,
+      "eval_runtime": 185.101,
+      "eval_samples_per_second": 89.913,
+      "eval_steps_per_second": 5.624,
+      "step": 5000
+    },
+    {
+      "epoch": 1.470269640673228,
+      "grad_norm": 0.32989710569381714,
+      "learning_rate": 0.000582700844742208,
+      "loss": 3.9282,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4848290722730184,
+      "grad_norm": 0.3328815996646881,
+      "learning_rate": 0.0005825260704922807,
+      "loss": 3.9183,
+      "step": 5100
+    },
+    {
+      "epoch": 1.4993885038728088,
+      "grad_norm": 0.33961018919944763,
+      "learning_rate": 0.0005823512962423535,
+      "loss": 3.9253,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5139479354725993,
+      "grad_norm": 0.33562958240509033,
+      "learning_rate": 0.0005821765219924264,
+      "loss": 3.9222,
+      "step": 5200
+    },
+    {
+      "epoch": 1.5285073670723897,
+      "grad_norm": 0.3406899571418762,
+      "learning_rate": 0.0005820017477424992,
+      "loss": 3.9185,
+      "step": 5250
+    },
+    {
+      "epoch": 1.54306679867218,
+      "grad_norm": 0.3406858742237091,
+      "learning_rate": 0.0005818269734925721,
+      "loss": 3.9156,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5576262302719703,
+      "grad_norm": 0.34090015292167664,
+      "learning_rate": 0.0005816521992426448,
+      "loss": 3.8969,
+      "step": 5350
+    },
+    {
+      "epoch": 1.5721856618717607,
+      "grad_norm": 0.31158268451690674,
+      "learning_rate": 0.0005814774249927176,
+      "loss": 3.9143,
+      "step": 5400
+    },
+    {
+      "epoch": 1.586745093471551,
+      "grad_norm": 0.34926122426986694,
+      "learning_rate": 0.0005813026507427905,
+      "loss": 3.9132,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6013045250713414,
+      "grad_norm": 0.34333717823028564,
+      "learning_rate": 0.0005811278764928634,
+      "loss": 3.9041,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6158639566711317,
+      "grad_norm": 0.3164921998977661,
+      "learning_rate": 0.0005809531022429362,
+      "loss": 3.908,
+      "step": 5550
+    },
+    {
+      "epoch": 1.630423388270922,
+      "grad_norm": 0.3325600028038025,
+      "learning_rate": 0.0005807783279930091,
+      "loss": 3.8937,
+      "step": 5600
+    },
+    {
+      "epoch": 1.6449828198707124,
+      "grad_norm": 0.3716844916343689,
+      "learning_rate": 0.0005806035537430818,
+      "loss": 3.913,
+      "step": 5650
+    },
+    {
+      "epoch": 1.6595422514705027,
+      "grad_norm": 0.3302454352378845,
+      "learning_rate": 0.0005804287794931546,
+      "loss": 3.8894,
+      "step": 5700
+    },
+    {
+      "epoch": 1.674101683070293,
+      "grad_norm": 0.3286576271057129,
+      "learning_rate": 0.0005802540052432275,
+      "loss": 3.9061,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6886611146700834,
+      "grad_norm": 0.31899774074554443,
+      "learning_rate": 0.0005800792309933003,
+      "loss": 3.885,
+      "step": 5800
+    },
+    {
+      "epoch": 1.7032205462698737,
+      "grad_norm": 0.38346347212791443,
+      "learning_rate": 0.0005799044567433732,
+      "loss": 3.8978,
+      "step": 5850
+    },
+    {
+      "epoch": 1.717779977869664,
+      "grad_norm": 0.32501021027565,
+      "learning_rate": 0.000579729682493446,
+      "loss": 3.8928,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7323394094694544,
+      "grad_norm": 0.33264926075935364,
+      "learning_rate": 0.0005795549082435187,
+      "loss": 3.8917,
+      "step": 5950
+    },
+    {
+      "epoch": 1.7468988410692448,
+      "grad_norm": 0.35515546798706055,
+      "learning_rate": 0.0005793801339935916,
+      "loss": 3.8836,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7468988410692448,
+      "eval_accuracy": 0.33675024013551297,
+      "eval_loss": 3.8585171699523926,
+      "eval_runtime": 185.0399,
+      "eval_samples_per_second": 89.943,
+      "eval_steps_per_second": 5.626,
+      "step": 6000
+    },
+    {
+      "epoch": 1.761458272669035,
+      "grad_norm": 0.3250105679035187,
+      "learning_rate": 0.0005792053597436644,
+      "loss": 3.8774,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7760177042688254,
+      "grad_norm": 0.333280473947525,
+      "learning_rate": 0.0005790305854937372,
+      "loss": 3.8726,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7905771358686158,
+      "grad_norm": 0.32873275876045227,
+      "learning_rate": 0.0005788558112438101,
+      "loss": 3.8701,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8051365674684061,
+      "grad_norm": 0.3332742154598236,
+      "learning_rate": 0.0005786810369938828,
+      "loss": 3.8699,
+      "step": 6200
+    },
+    {
+      "epoch": 1.8196959990681965,
+      "grad_norm": 0.3222472369670868,
+      "learning_rate": 0.0005785062627439557,
+      "loss": 3.874,
+      "step": 6250
+    },
+    {
+      "epoch": 1.8342554306679868,
+      "grad_norm": 0.3324868381023407,
+      "learning_rate": 0.0005783314884940285,
+      "loss": 3.869,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8488148622677771,
+      "grad_norm": 0.32730036973953247,
+      "learning_rate": 0.0005781567142441013,
+      "loss": 3.8536,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8633742938675675,
+      "grad_norm": 0.3353622257709503,
+      "learning_rate": 0.0005779819399941742,
+      "loss": 3.869,
+      "step": 6400
+    },
+    {
+      "epoch": 1.8779337254673578,
+      "grad_norm": 0.33830076456069946,
+      "learning_rate": 0.000577807165744247,
+      "loss": 3.8726,
+      "step": 6450
+    },
+    {
+      "epoch": 1.8924931570671482,
+      "grad_norm": 0.31618306040763855,
+      "learning_rate": 0.0005776323914943198,
+      "loss": 3.8508,
+      "step": 6500
+    },
+    {
+      "epoch": 1.9070525886669385,
+      "grad_norm": 0.33165860176086426,
+      "learning_rate": 0.0005774576172443926,
+      "loss": 3.8566,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9216120202667288,
+      "grad_norm": 0.3387751579284668,
+      "learning_rate": 0.0005772828429944654,
+      "loss": 3.8548,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9361714518665192,
+      "grad_norm": 0.3364385664463043,
+      "learning_rate": 0.0005771080687445383,
+      "loss": 3.8539,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9507308834663095,
+      "grad_norm": 0.34390878677368164,
+      "learning_rate": 0.0005769332944946111,
+      "loss": 3.8631,
+      "step": 6700
+    },
+    {
+      "epoch": 1.9652903150660999,
+      "grad_norm": 0.3324083685874939,
+      "learning_rate": 0.0005767585202446839,
+      "loss": 3.8482,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9798497466658902,
+      "grad_norm": 0.32365697622299194,
+      "learning_rate": 0.0005765837459947567,
+      "loss": 3.8303,
+      "step": 6800
+    },
+    {
+      "epoch": 1.9944091782656805,
+      "grad_norm": 0.3342290222644806,
+      "learning_rate": 0.0005764089717448295,
+      "loss": 3.8508,
+      "step": 6850
+    },
+    {
+      "epoch": 2.0087356589598744,
+      "grad_norm": 0.3290010392665863,
+      "learning_rate": 0.0005762341974949024,
+      "loss": 3.7915,
+      "step": 6900
+    },
+    {
+      "epoch": 2.0232950905596647,
+      "grad_norm": 0.3240971565246582,
+      "learning_rate": 0.0005760594232449752,
+      "loss": 3.7587,
+      "step": 6950
+    },
+    {
+      "epoch": 2.037854522159455,
+      "grad_norm": 0.3391764163970947,
+      "learning_rate": 0.0005758846489950481,
+      "loss": 3.7526,
+      "step": 7000
+    },
+    {
+      "epoch": 2.037854522159455,
+      "eval_accuracy": 0.34109519666654636,
+      "eval_loss": 3.816195249557495,
+      "eval_runtime": 184.953,
+      "eval_samples_per_second": 89.985,
+      "eval_steps_per_second": 5.628,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0524139537592454,
+      "grad_norm": 0.33266958594322205,
+      "learning_rate": 0.0005757098747451208,
+      "loss": 3.7541,
+      "step": 7050
+    },
+    {
+      "epoch": 2.0669733853590357,
+      "grad_norm": 0.34850549697875977,
+      "learning_rate": 0.0005755351004951936,
+      "loss": 3.7518,
+      "step": 7100
+    },
+    {
+      "epoch": 2.081532816958826,
+      "grad_norm": 0.3229345679283142,
+      "learning_rate": 0.0005753603262452665,
+      "loss": 3.7485,
+      "step": 7150
+    },
+    {
+      "epoch": 2.0960922485586164,
+      "grad_norm": 0.31956946849823,
+      "learning_rate": 0.0005751855519953393,
+      "loss": 3.7446,
+      "step": 7200
+    },
+    {
+      "epoch": 2.1106516801584068,
+      "grad_norm": 0.3483135402202606,
+      "learning_rate": 0.0005750107777454121,
+      "loss": 3.76,
+      "step": 7250
+    },
+    {
+      "epoch": 2.125211111758197,
+      "grad_norm": 0.3251873850822449,
+      "learning_rate": 0.0005748360034954849,
+      "loss": 3.7494,
+      "step": 7300
+    },
+    {
+      "epoch": 2.1397705433579874,
+      "grad_norm": 0.3456031382083893,
+      "learning_rate": 0.0005746612292455577,
+      "loss": 3.7564,
+      "step": 7350
+    },
+    {
+      "epoch": 2.154329974957778,
+      "grad_norm": 0.3253571093082428,
+      "learning_rate": 0.0005744864549956306,
+      "loss": 3.7517,
+      "step": 7400
+    },
+    {
+      "epoch": 2.168889406557568,
+      "grad_norm": 0.322238564491272,
+      "learning_rate": 0.0005743116807457034,
+      "loss": 3.7582,
+      "step": 7450
+    },
+    {
+      "epoch": 2.1834488381573585,
+      "grad_norm": 0.33640897274017334,
+      "learning_rate": 0.0005741369064957762,
+      "loss": 3.7567,
+      "step": 7500
+    },
+    {
+      "epoch": 2.198008269757149,
+      "grad_norm": 0.3346073627471924,
+      "learning_rate": 0.0005739621322458491,
+      "loss": 3.7461,
+      "step": 7550
+    },
+    {
+      "epoch": 2.212567701356939,
+      "grad_norm": 0.3327328860759735,
+      "learning_rate": 0.0005737873579959218,
+      "loss": 3.7594,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2271271329567295,
+      "grad_norm": 0.3236997723579407,
+      "learning_rate": 0.0005736125837459947,
+      "loss": 3.7726,
+      "step": 7650
+    },
+    {
+      "epoch": 2.24168656455652,
+      "grad_norm": 0.33130574226379395,
+      "learning_rate": 0.0005734378094960675,
+      "loss": 3.7486,
+      "step": 7700
+    },
+    {
+      "epoch": 2.25624599615631,
+      "grad_norm": 0.34919485449790955,
+      "learning_rate": 0.0005732630352461403,
+      "loss": 3.7578,
+      "step": 7750
+    },
+    {
+      "epoch": 2.2708054277561005,
+      "grad_norm": 0.3182968497276306,
+      "learning_rate": 0.0005730882609962132,
+      "loss": 3.7604,
+      "step": 7800
+    },
+    {
+      "epoch": 2.285364859355891,
+      "grad_norm": 0.30436646938323975,
+      "learning_rate": 0.0005729134867462859,
+      "loss": 3.7412,
+      "step": 7850
+    },
+    {
+      "epoch": 2.299924290955681,
+      "grad_norm": 0.3302886188030243,
+      "learning_rate": 0.0005727387124963588,
+      "loss": 3.7515,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3144837225554715,
+      "grad_norm": 0.30620837211608887,
+      "learning_rate": 0.0005725639382464317,
+      "loss": 3.7695,
+      "step": 7950
+    },
+    {
+      "epoch": 2.329043154155262,
+      "grad_norm": 0.3169257640838623,
+      "learning_rate": 0.0005723891639965045,
+      "loss": 3.7682,
+      "step": 8000
+    },
+    {
+      "epoch": 2.329043154155262,
+      "eval_accuracy": 0.34396188967982283,
+      "eval_loss": 3.788954496383667,
+      "eval_runtime": 182.8165,
+      "eval_samples_per_second": 91.037,
+      "eval_steps_per_second": 5.694,
+      "step": 8000
+    },
+    {
+      "epoch": 2.343602585755052,
+      "grad_norm": 0.3280718922615051,
+      "learning_rate": 0.0005722143897465773,
+      "loss": 3.7452,
+      "step": 8050
+    },
+    {
+      "epoch": 2.3581620173548425,
+      "grad_norm": 0.3237084746360779,
+      "learning_rate": 0.0005720396154966502,
+      "loss": 3.762,
+      "step": 8100
+    },
+    {
+      "epoch": 2.372721448954633,
+      "grad_norm": 0.31791386008262634,
+      "learning_rate": 0.0005718648412467229,
+      "loss": 3.7504,
+      "step": 8150
+    },
+    {
+      "epoch": 2.3872808805544232,
+      "grad_norm": 0.32723358273506165,
+      "learning_rate": 0.0005716900669967958,
+      "loss": 3.7561,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4018403121542136,
+      "grad_norm": 0.3216814398765564,
+      "learning_rate": 0.0005715152927468686,
+      "loss": 3.7496,
+      "step": 8250
+    },
+    {
+      "epoch": 2.416399743754004,
+      "grad_norm": 0.32928794622421265,
+      "learning_rate": 0.0005713405184969414,
+      "loss": 3.7533,
+      "step": 8300
+    },
+    {
+      "epoch": 2.4309591753537942,
+      "grad_norm": 0.3223062753677368,
+      "learning_rate": 0.0005711657442470143,
+      "loss": 3.766,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4455186069535846,
+      "grad_norm": 0.3292803168296814,
+      "learning_rate": 0.000570990969997087,
+      "loss": 3.7502,
+      "step": 8400
+    },
+    {
+      "epoch": 2.460078038553375,
+      "grad_norm": 0.3402736783027649,
+      "learning_rate": 0.0005708161957471599,
+      "loss": 3.744,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4746374701531653,
+      "grad_norm": 0.3164720833301544,
+      "learning_rate": 0.0005706414214972327,
+      "loss": 3.7426,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4891969017529556,
+      "grad_norm": 0.33465683460235596,
+      "learning_rate": 0.0005704666472473055,
+      "loss": 3.756,
+      "step": 8550
+    },
+    {
+      "epoch": 2.503756333352746,
+      "grad_norm": 0.3301171362400055,
+      "learning_rate": 0.0005702918729973784,
+      "loss": 3.7448,
+      "step": 8600
+    },
+    {
+      "epoch": 2.5183157649525363,
+      "grad_norm": 0.3436541259288788,
+      "learning_rate": 0.0005701170987474512,
+      "loss": 3.7449,
+      "step": 8650
+    },
+    {
+      "epoch": 2.5328751965523266,
+      "grad_norm": 0.3333314061164856,
+      "learning_rate": 0.0005699423244975239,
+      "loss": 3.7381,
+      "step": 8700
+    },
+    {
+      "epoch": 2.547434628152117,
+      "grad_norm": 0.3258245885372162,
+      "learning_rate": 0.0005697675502475968,
+      "loss": 3.7338,
+      "step": 8750
+    },
+    {
+      "epoch": 2.5619940597519073,
+      "grad_norm": 0.34784647822380066,
+      "learning_rate": 0.0005695927759976696,
+      "loss": 3.734,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5765534913516976,
+      "grad_norm": 0.31109482049942017,
+      "learning_rate": 0.0005694180017477425,
+      "loss": 3.7372,
+      "step": 8850
+    },
+    {
+      "epoch": 2.591112922951488,
+      "grad_norm": 0.31201112270355225,
+      "learning_rate": 0.0005692432274978153,
+      "loss": 3.7499,
+      "step": 8900
+    },
+    {
+      "epoch": 2.6056723545512783,
+      "grad_norm": 0.31193050742149353,
+      "learning_rate": 0.000569068453247888,
+      "loss": 3.7385,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6202317861510687,
+      "grad_norm": 0.3446432948112488,
+      "learning_rate": 0.0005688936789979609,
+      "loss": 3.7477,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6202317861510687,
+      "eval_accuracy": 0.3468738524556142,
+      "eval_loss": 3.757246255874634,
+      "eval_runtime": 182.4423,
+      "eval_samples_per_second": 91.223,
+      "eval_steps_per_second": 5.706,
+      "step": 9000
+    },
+    {
+      "epoch": 2.634791217750859,
+      "grad_norm": 0.31883829832077026,
+      "learning_rate": 0.0005687189047480337,
+      "loss": 3.7364,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6493506493506493,
+      "grad_norm": 0.3273116946220398,
+      "learning_rate": 0.0005685441304981066,
+      "loss": 3.7312,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6639100809504397,
+      "grad_norm": 0.3443247973918915,
+      "learning_rate": 0.0005683693562481794,
+      "loss": 3.7366,
+      "step": 9150
+    },
+    {
+      "epoch": 2.67846951255023,
+      "grad_norm": 0.30951568484306335,
+      "learning_rate": 0.0005681945819982522,
+      "loss": 3.7425,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6930289441500204,
+      "grad_norm": 0.3140866756439209,
+      "learning_rate": 0.000568019807748325,
+      "loss": 3.7396,
+      "step": 9250
+    },
+    {
+      "epoch": 2.7075883757498107,
+      "grad_norm": 0.32707467675209045,
+      "learning_rate": 0.0005678450334983978,
+      "loss": 3.7348,
+      "step": 9300
+    },
+    {
+      "epoch": 2.722147807349601,
+      "grad_norm": 0.32110151648521423,
+      "learning_rate": 0.0005676702592484707,
+      "loss": 3.7223,
+      "step": 9350
+    },
+    {
+      "epoch": 2.7367072389493914,
+      "grad_norm": 0.3235968053340912,
+      "learning_rate": 0.0005674954849985435,
+      "loss": 3.7379,
+      "step": 9400
+    },
+    {
+      "epoch": 2.7512666705491817,
+      "grad_norm": 0.34924793243408203,
+      "learning_rate": 0.0005673207107486163,
+      "loss": 3.7503,
+      "step": 9450
+    },
+    {
+      "epoch": 2.765826102148972,
+      "grad_norm": 0.32524895668029785,
+      "learning_rate": 0.0005671459364986892,
+      "loss": 3.7302,
+      "step": 9500
+    },
+    {
+      "epoch": 2.7803855337487624,
+      "grad_norm": 0.3183753490447998,
+      "learning_rate": 0.0005669711622487619,
+      "loss": 3.7229,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7949449653485527,
+      "grad_norm": 0.31938815116882324,
+      "learning_rate": 0.0005667963879988348,
+      "loss": 3.7208,
+      "step": 9600
+    },
+    {
+      "epoch": 2.809504396948343,
+      "grad_norm": 0.3149973154067993,
+      "learning_rate": 0.0005666216137489076,
+      "loss": 3.7312,
+      "step": 9650
+    },
+    {
+      "epoch": 2.8240638285481334,
+      "grad_norm": 0.32664161920547485,
+      "learning_rate": 0.0005664468394989804,
+      "loss": 3.7436,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8386232601479238,
+      "grad_norm": 0.31149327754974365,
+      "learning_rate": 0.0005662720652490533,
+      "loss": 3.728,
+      "step": 9750
+    },
+    {
+      "epoch": 2.853182691747714,
+      "grad_norm": 0.3289666175842285,
+      "learning_rate": 0.000566097290999126,
+      "loss": 3.7286,
+      "step": 9800
+    },
+    {
+      "epoch": 2.8677421233475044,
+      "grad_norm": 0.3204244077205658,
+      "learning_rate": 0.0005659225167491988,
+      "loss": 3.7122,
+      "step": 9850
+    },
+    {
+      "epoch": 2.882301554947295,
+      "grad_norm": 0.33363139629364014,
+      "learning_rate": 0.0005657477424992717,
+      "loss": 3.7409,
+      "step": 9900
+    },
+    {
+      "epoch": 2.896860986547085,
+      "grad_norm": 0.3554539084434509,
+      "learning_rate": 0.0005655729682493445,
+      "loss": 3.7301,
+      "step": 9950
+    },
+    {
+      "epoch": 2.9114204181468755,
+      "grad_norm": 0.306832879781723,
+      "learning_rate": 0.0005653981939994174,
+      "loss": 3.73,
+      "step": 10000
+    },
+    {
+      "epoch": 2.9114204181468755,
+      "eval_accuracy": 0.3494360034301546,
+      "eval_loss": 3.729952573776245,
+      "eval_runtime": 181.5285,
+      "eval_samples_per_second": 91.683,
+      "eval_steps_per_second": 5.735,
+      "step": 10000
+    },
+    {
+      "epoch": 2.925979849746666,
+      "grad_norm": 0.31433573365211487,
+      "learning_rate": 0.0005652234197494902,
+      "loss": 3.7247,
+      "step": 10050
+    },
+    {
+      "epoch": 2.940539281346456,
+      "grad_norm": 0.3179089426994324,
+      "learning_rate": 0.0005650486454995629,
+      "loss": 3.7153,
+      "step": 10100
+    },
+    {
+      "epoch": 2.9550987129462465,
+      "grad_norm": 0.3196451961994171,
+      "learning_rate": 0.0005648738712496358,
+      "loss": 3.7189,
+      "step": 10150
+    },
+    {
+      "epoch": 2.969658144546037,
+      "grad_norm": 0.30295759439468384,
+      "learning_rate": 0.0005646990969997086,
+      "loss": 3.7165,
+      "step": 10200
+    },
+    {
+      "epoch": 2.984217576145827,
+      "grad_norm": 0.32530921697616577,
+      "learning_rate": 0.0005645243227497815,
+      "loss": 3.715,
+      "step": 10250
+    },
+    {
+      "epoch": 2.9987770077456175,
+      "grad_norm": 0.30198994278907776,
+      "learning_rate": 0.0005643495484998543,
+      "loss": 3.7192,
+      "step": 10300
+    },
+    {
+      "epoch": 3.0131034884398114,
+      "grad_norm": 0.31793293356895447,
+      "learning_rate": 0.000564174774249927,
+      "loss": 3.6316,
+      "step": 10350
+    },
+    {
+      "epoch": 3.0276629200396017,
+      "grad_norm": 0.3131251633167267,
+      "learning_rate": 0.0005639999999999999,
+      "loss": 3.6161,
+      "step": 10400
+    },
+    {
+      "epoch": 3.042222351639392,
+      "grad_norm": 0.3221314251422882,
+      "learning_rate": 0.0005638252257500727,
+      "loss": 3.6239,
+      "step": 10450
+    },
+    {
+      "epoch": 3.0567817832391824,
+      "grad_norm": 0.3299553096294403,
+      "learning_rate": 0.0005636504515001456,
+      "loss": 3.6255,
+      "step": 10500
+    },
+    {
+      "epoch": 3.0713412148389727,
+      "grad_norm": 0.3239217698574066,
+      "learning_rate": 0.0005634756772502185,
+      "loss": 3.6207,
+      "step": 10550
+    },
+    {
+      "epoch": 3.085900646438763,
+      "grad_norm": 0.3120846152305603,
+      "learning_rate": 0.0005633009030002913,
+      "loss": 3.6305,
+      "step": 10600
+    },
+    {
+      "epoch": 3.1004600780385534,
+      "grad_norm": 0.324990838766098,
+      "learning_rate": 0.000563126128750364,
+      "loss": 3.6298,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1150195096383437,
+      "grad_norm": 0.3125215172767639,
+      "learning_rate": 0.0005629513545004369,
+      "loss": 3.617,
+      "step": 10700
+    },
+    {
+      "epoch": 3.129578941238134,
+      "grad_norm": 0.3323279917240143,
+      "learning_rate": 0.0005627765802505097,
+      "loss": 3.6235,
+      "step": 10750
+    },
+    {
+      "epoch": 3.1441383728379244,
+      "grad_norm": 0.3290170133113861,
+      "learning_rate": 0.0005626018060005826,
+      "loss": 3.6227,
+      "step": 10800
+    },
+    {
+      "epoch": 3.1586978044377148,
+      "grad_norm": 0.3450184762477875,
+      "learning_rate": 0.0005624270317506554,
+      "loss": 3.64,
+      "step": 10850
+    },
+    {
+      "epoch": 3.173257236037505,
+      "grad_norm": 0.32774847745895386,
+      "learning_rate": 0.0005622522575007282,
+      "loss": 3.646,
+      "step": 10900
+    },
+    {
+      "epoch": 3.1878166676372954,
+      "grad_norm": 0.32285189628601074,
+      "learning_rate": 0.000562077483250801,
+      "loss": 3.643,
+      "step": 10950
+    },
+    {
+      "epoch": 3.2023760992370858,
+      "grad_norm": 0.3201664686203003,
+      "learning_rate": 0.0005619027090008738,
+      "loss": 3.6397,
+      "step": 11000
+    },
+    {
+      "epoch": 3.2023760992370858,
+      "eval_accuracy": 0.3513435653971105,
+      "eval_loss": 3.7152557373046875,
+      "eval_runtime": 180.6216,
+      "eval_samples_per_second": 92.143,
+      "eval_steps_per_second": 5.763,
+      "step": 11000
+    },
+    {
+      "epoch": 3.216935530836876,
+      "grad_norm": 0.32860246300697327,
+      "learning_rate": 0.0005617279347509467,
+      "loss": 3.6478,
+      "step": 11050
+    },
+    {
+      "epoch": 3.2314949624366665,
+      "grad_norm": 0.32338783144950867,
+      "learning_rate": 0.0005615531605010195,
+      "loss": 3.6419,
+      "step": 11100
+    },
+    {
+      "epoch": 3.246054394036457,
+      "grad_norm": 0.3216056823730469,
+      "learning_rate": 0.0005613783862510923,
+      "loss": 3.6497,
+      "step": 11150
+    },
+    {
+      "epoch": 3.260613825636247,
+      "grad_norm": 0.36512988805770874,
+      "learning_rate": 0.0005612036120011652,
+      "loss": 3.6238,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2751732572360375,
+      "grad_norm": 0.33006951212882996,
+      "learning_rate": 0.0005610288377512379,
+      "loss": 3.65,
+      "step": 11250
+    },
+    {
+      "epoch": 3.289732688835828,
+      "grad_norm": 0.32506290078163147,
+      "learning_rate": 0.0005608540635013107,
+      "loss": 3.6369,
+      "step": 11300
+    },
+    {
+      "epoch": 3.304292120435618,
+      "grad_norm": 0.3291010856628418,
+      "learning_rate": 0.0005606792892513836,
+      "loss": 3.644,
+      "step": 11350
+    },
+    {
+      "epoch": 3.3188515520354085,
+      "grad_norm": 0.3134164810180664,
+      "learning_rate": 0.0005605045150014564,
+      "loss": 3.6428,
+      "step": 11400
+    },
+    {
+      "epoch": 3.333410983635199,
+      "grad_norm": 0.3079008162021637,
+      "learning_rate": 0.0005603297407515293,
+      "loss": 3.638,
+      "step": 11450
+    },
+    {
+      "epoch": 3.347970415234989,
+      "grad_norm": 0.2959432899951935,
+      "learning_rate": 0.000560154966501602,
+      "loss": 3.6469,
+      "step": 11500
+    },
+    {
+      "epoch": 3.3625298468347795,
+      "grad_norm": 0.3210470676422119,
+      "learning_rate": 0.0005599801922516748,
+      "loss": 3.6441,
+      "step": 11550
+    },
+    {
+      "epoch": 3.37708927843457,
+      "grad_norm": 0.3303925395011902,
+      "learning_rate": 0.0005598054180017477,
+      "loss": 3.6448,
+      "step": 11600
+    },
+    {
+      "epoch": 3.39164871003436,
+      "grad_norm": 0.3426654040813446,
+      "learning_rate": 0.0005596306437518205,
+      "loss": 3.638,
+      "step": 11650
+    },
+    {
+      "epoch": 3.4062081416341505,
+      "grad_norm": 0.35107845067977905,
+      "learning_rate": 0.0005594558695018934,
+      "loss": 3.6483,
+      "step": 11700
+    },
+    {
+      "epoch": 3.420767573233941,
+      "grad_norm": 0.3188258111476898,
+      "learning_rate": 0.0005592810952519662,
+      "loss": 3.6422,
+      "step": 11750
+    },
+    {
+      "epoch": 3.435327004833731,
+      "grad_norm": 0.33043134212493896,
+      "learning_rate": 0.0005591063210020389,
+      "loss": 3.6448,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4498864364335216,
+      "grad_norm": 0.31511127948760986,
+      "learning_rate": 0.0005589315467521118,
+      "loss": 3.648,
+      "step": 11850
+    },
+    {
+      "epoch": 3.464445868033312,
+      "grad_norm": 0.3306327164173126,
+      "learning_rate": 0.0005587567725021846,
+      "loss": 3.6258,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4790052996331022,
+      "grad_norm": 0.3343588411808014,
+      "learning_rate": 0.0005585819982522575,
+      "loss": 3.646,
+      "step": 11950
+    },
+    {
+      "epoch": 3.4935647312328926,
+      "grad_norm": 0.3293665945529938,
+      "learning_rate": 0.0005584072240023303,
+      "loss": 3.6405,
+      "step": 12000
+    },
+    {
+      "epoch": 3.4935647312328926,
+      "eval_accuracy": 0.35307957260170497,
+      "eval_loss": 3.6972014904022217,
+      "eval_runtime": 181.5639,
+      "eval_samples_per_second": 91.665,
+      "eval_steps_per_second": 5.734,
+      "step": 12000
+    },
+    {
+      "epoch": 3.508124162832683,
+      "grad_norm": 0.3309422433376312,
+      "learning_rate": 0.000558232449752403,
+      "loss": 3.6445,
+      "step": 12050
+    },
+    {
+      "epoch": 3.5226835944324733,
+      "grad_norm": 0.3296276032924652,
+      "learning_rate": 0.0005580576755024759,
+      "loss": 3.6433,
+      "step": 12100
+    },
+    {
+      "epoch": 3.5372430260322636,
+      "grad_norm": 0.3203052580356598,
+      "learning_rate": 0.0005578829012525487,
+      "loss": 3.6408,
+      "step": 12150
+    },
+    {
+      "epoch": 3.551802457632054,
+      "grad_norm": 0.31741246581077576,
+      "learning_rate": 0.0005577081270026216,
+      "loss": 3.6379,
+      "step": 12200
+    },
+    {
+      "epoch": 3.5663618892318443,
+      "grad_norm": 0.32449865341186523,
+      "learning_rate": 0.0005575333527526944,
+      "loss": 3.6515,
+      "step": 12250
+    },
+    {
+      "epoch": 3.5809213208316346,
+      "grad_norm": 0.3303356170654297,
+      "learning_rate": 0.0005573585785027672,
+      "loss": 3.6346,
+      "step": 12300
+    },
+    {
+      "epoch": 3.595480752431425,
+      "grad_norm": 0.3001437783241272,
+      "learning_rate": 0.00055718380425284,
+      "loss": 3.6476,
+      "step": 12350
+    },
+    {
+      "epoch": 3.6100401840312153,
+      "grad_norm": 0.3065738379955292,
+      "learning_rate": 0.0005570090300029128,
+      "loss": 3.6495,
+      "step": 12400
+    },
+    {
+      "epoch": 3.6245996156310056,
+      "grad_norm": 0.3155801594257355,
+      "learning_rate": 0.0005568342557529856,
+      "loss": 3.6398,
+      "step": 12450
+    },
+    {
+      "epoch": 3.639159047230796,
+      "grad_norm": 0.3072325587272644,
+      "learning_rate": 0.0005566594815030585,
+      "loss": 3.6446,
+      "step": 12500
+    },
+    {
+      "epoch": 3.6537184788305863,
+      "grad_norm": 0.331887811422348,
+      "learning_rate": 0.0005564847072531313,
+      "loss": 3.6402,
+      "step": 12550
+    },
+    {
+      "epoch": 3.6682779104303767,
+      "grad_norm": 0.30090418457984924,
+      "learning_rate": 0.0005563099330032042,
+      "loss": 3.6303,
+      "step": 12600
+    },
+    {
+      "epoch": 3.682837342030167,
+      "grad_norm": 0.3239140808582306,
+      "learning_rate": 0.0005561351587532769,
+      "loss": 3.6552,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6973967736299573,
+      "grad_norm": 0.320881724357605,
+      "learning_rate": 0.0005559603845033497,
+      "loss": 3.6356,
+      "step": 12700
+    },
+    {
+      "epoch": 3.7119562052297477,
+      "grad_norm": 0.3165138363838196,
+      "learning_rate": 0.0005557856102534226,
+      "loss": 3.6434,
+      "step": 12750
+    },
+    {
+      "epoch": 3.726515636829538,
+      "grad_norm": 0.3095230162143707,
+      "learning_rate": 0.0005556108360034954,
+      "loss": 3.6385,
+      "step": 12800
+    },
+    {
+      "epoch": 3.7410750684293284,
+      "grad_norm": 0.34694117307662964,
+      "learning_rate": 0.0005554360617535683,
+      "loss": 3.6463,
+      "step": 12850
+    },
+    {
+      "epoch": 3.755634500029119,
+      "grad_norm": 0.32559525966644287,
+      "learning_rate": 0.000555261287503641,
+      "loss": 3.6323,
+      "step": 12900
+    },
+    {
+      "epoch": 3.770193931628909,
+      "grad_norm": 0.3220575451850891,
+      "learning_rate": 0.0005550865132537138,
+      "loss": 3.6369,
+      "step": 12950
+    },
+    {
+      "epoch": 3.7847533632287,
+      "grad_norm": 0.31526488065719604,
+      "learning_rate": 0.0005549117390037867,
+      "loss": 3.6412,
+      "step": 13000
+    },
+    {
+      "epoch": 3.7847533632287,
+      "eval_accuracy": 0.35453631828429244,
+      "eval_loss": 3.682695150375366,
+      "eval_runtime": 183.1807,
+      "eval_samples_per_second": 90.856,
+      "eval_steps_per_second": 5.683,
+      "step": 13000
+    },
+    {
+      "epoch": 3.7993127948284897,
+      "grad_norm": 0.3306889832019806,
+      "learning_rate": 0.0005547369647538596,
+      "loss": 3.6618,
+      "step": 13050
+    },
+    {
+      "epoch": 3.8138722264282805,
+      "grad_norm": 0.33385586738586426,
+      "learning_rate": 0.0005545621905039324,
+      "loss": 3.6427,
+      "step": 13100
+    },
+    {
+      "epoch": 3.8284316580280704,
+      "grad_norm": 0.30829793214797974,
+      "learning_rate": 0.0005543874162540053,
+      "loss": 3.6345,
+      "step": 13150
+    },
+    {
+      "epoch": 3.842991089627861,
+      "grad_norm": 0.3245658576488495,
+      "learning_rate": 0.000554212642004078,
+      "loss": 3.6519,
+      "step": 13200
+    },
+    {
+      "epoch": 3.857550521227651,
+      "grad_norm": 0.29873931407928467,
+      "learning_rate": 0.0005540378677541508,
+      "loss": 3.639,
+      "step": 13250
+    },
+    {
+      "epoch": 3.872109952827442,
+      "grad_norm": 0.3140360414981842,
+      "learning_rate": 0.0005538630935042237,
+      "loss": 3.644,
+      "step": 13300
+    },
+    {
+      "epoch": 3.8866693844272318,
+      "grad_norm": 0.31487107276916504,
+      "learning_rate": 0.0005536883192542965,
+      "loss": 3.6451,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9012288160270225,
+      "grad_norm": 0.31665652990341187,
+      "learning_rate": 0.0005535135450043694,
+      "loss": 3.63,
+      "step": 13400
+    },
+    {
+      "epoch": 3.9157882476268124,
+      "grad_norm": 0.3285450339317322,
+      "learning_rate": 0.0005533387707544422,
+      "loss": 3.6402,
+      "step": 13450
+    },
+    {
+      "epoch": 3.930347679226603,
+      "grad_norm": 0.3168368935585022,
+      "learning_rate": 0.0005531639965045149,
+      "loss": 3.6433,
+      "step": 13500
+    },
+    {
+      "epoch": 3.944907110826393,
+      "grad_norm": 0.3096484839916229,
+      "learning_rate": 0.0005529892222545878,
+      "loss": 3.6292,
+      "step": 13550
+    },
+    {
+      "epoch": 3.959466542426184,
+      "grad_norm": 0.31400060653686523,
+      "learning_rate": 0.0005528144480046606,
+      "loss": 3.6337,
+      "step": 13600
+    },
+    {
+      "epoch": 3.974025974025974,
+      "grad_norm": 0.32995402812957764,
+      "learning_rate": 0.0005526396737547335,
+      "loss": 3.644,
+      "step": 13650
+    },
+    {
+      "epoch": 3.9885854056257646,
+      "grad_norm": 0.30545228719711304,
+      "learning_rate": 0.0005524648995048063,
+      "loss": 3.6337,
+      "step": 13700
+    },
+    {
+      "epoch": 4.002911886319958,
+      "grad_norm": 0.3340036869049072,
+      "learning_rate": 0.000552290125254879,
+      "loss": 3.6049,
+      "step": 13750
+    },
+    {
+      "epoch": 4.017471317919749,
+      "grad_norm": 0.3237653076648712,
+      "learning_rate": 0.0005521153510049519,
+      "loss": 3.5263,
+      "step": 13800
+    },
+    {
+      "epoch": 4.032030749519539,
+      "grad_norm": 0.33258405327796936,
+      "learning_rate": 0.0005519405767550247,
+      "loss": 3.5231,
+      "step": 13850
+    },
+    {
+      "epoch": 4.046590181119329,
+      "grad_norm": 0.33560073375701904,
+      "learning_rate": 0.0005517658025050975,
+      "loss": 3.5422,
+      "step": 13900
+    },
+    {
+      "epoch": 4.061149612719119,
+      "grad_norm": 0.32539400458335876,
+      "learning_rate": 0.0005515910282551704,
+      "loss": 3.5393,
+      "step": 13950
+    },
+    {
+      "epoch": 4.07570904431891,
+      "grad_norm": 0.3466116786003113,
+      "learning_rate": 0.0005514162540052432,
+      "loss": 3.5371,
+      "step": 14000
+    },
+    {
+      "epoch": 4.07570904431891,
+      "eval_accuracy": 0.35599906074061566,
+      "eval_loss": 3.6699209213256836,
+      "eval_runtime": 180.5976,
+      "eval_samples_per_second": 92.155,
+      "eval_steps_per_second": 5.764,
+      "step": 14000
+    },
+    {
+      "epoch": 4.0902684759187,
+      "grad_norm": 0.35234954953193665,
+      "learning_rate": 0.000551241479755316,
+      "loss": 3.5405,
+      "step": 14050
+    },
+    {
+      "epoch": 4.104827907518491,
+      "grad_norm": 0.3241097629070282,
+      "learning_rate": 0.0005510667055053888,
+      "loss": 3.5312,
+      "step": 14100
+    },
+    {
+      "epoch": 4.119387339118281,
+      "grad_norm": 0.35480767488479614,
+      "learning_rate": 0.0005508919312554616,
+      "loss": 3.541,
+      "step": 14150
+    },
+    {
+      "epoch": 4.1339467707180715,
+      "grad_norm": 0.31226274371147156,
+      "learning_rate": 0.0005507171570055345,
+      "loss": 3.5525,
+      "step": 14200
+    },
+    {
+      "epoch": 4.148506202317861,
+      "grad_norm": 0.3221980631351471,
+      "learning_rate": 0.0005505423827556073,
+      "loss": 3.5545,
+      "step": 14250
+    },
+    {
+      "epoch": 4.163065633917652,
+      "grad_norm": 0.33322617411613464,
+      "learning_rate": 0.0005503676085056802,
+      "loss": 3.5607,
+      "step": 14300
+    },
+    {
+      "epoch": 4.177625065517442,
+      "grad_norm": 0.31406116485595703,
+      "learning_rate": 0.0005501928342557529,
+      "loss": 3.5597,
+      "step": 14350
+    },
+    {
+      "epoch": 4.192184497117233,
+      "grad_norm": 0.30982154607772827,
+      "learning_rate": 0.0005500180600058257,
+      "loss": 3.5544,
+      "step": 14400
+    },
+    {
+      "epoch": 4.206743928717023,
+      "grad_norm": 0.31833505630493164,
+      "learning_rate": 0.0005498432857558986,
+      "loss": 3.5589,
+      "step": 14450
+    },
+    {
+      "epoch": 4.2213033603168135,
+      "grad_norm": 0.31112346053123474,
+      "learning_rate": 0.0005496685115059714,
+      "loss": 3.5535,
+      "step": 14500
+    },
+    {
+      "epoch": 4.235862791916603,
+      "grad_norm": 0.3102998733520508,
+      "learning_rate": 0.0005494937372560443,
+      "loss": 3.5584,
+      "step": 14550
+    },
+    {
+      "epoch": 4.250422223516394,
+      "grad_norm": 0.3442176878452301,
+      "learning_rate": 0.000549318963006117,
+      "loss": 3.5691,
+      "step": 14600
+    },
+    {
+      "epoch": 4.264981655116184,
+      "grad_norm": 0.3217466175556183,
+      "learning_rate": 0.0005491441887561898,
+      "loss": 3.5748,
+      "step": 14650
+    },
+    {
+      "epoch": 4.279541086715975,
+      "grad_norm": 0.32345715165138245,
+      "learning_rate": 0.0005489694145062627,
+      "loss": 3.5711,
+      "step": 14700
+    },
+    {
+      "epoch": 4.294100518315765,
+      "grad_norm": 0.31309959292411804,
+      "learning_rate": 0.0005487946402563355,
+      "loss": 3.5544,
+      "step": 14750
+    },
+    {
+      "epoch": 4.308659949915556,
+      "grad_norm": 0.31507858633995056,
+      "learning_rate": 0.0005486198660064084,
+      "loss": 3.5806,
+      "step": 14800
+    },
+    {
+      "epoch": 4.3232193815153455,
+      "grad_norm": 0.3113386631011963,
+      "learning_rate": 0.0005484450917564812,
+      "loss": 3.5698,
+      "step": 14850
+    },
+    {
+      "epoch": 4.337778813115136,
+      "grad_norm": 0.30662500858306885,
+      "learning_rate": 0.0005482703175065539,
+      "loss": 3.5684,
+      "step": 14900
+    },
+    {
+      "epoch": 4.352338244714926,
+      "grad_norm": 0.33159640431404114,
+      "learning_rate": 0.0005480955432566268,
+      "loss": 3.5681,
+      "step": 14950
+    },
+    {
+      "epoch": 4.366897676314717,
+      "grad_norm": 0.3497229218482971,
+      "learning_rate": 0.0005479207690066996,
+      "loss": 3.5768,
+      "step": 15000
+    },
+    {
+      "epoch": 4.366897676314717,
+      "eval_accuracy": 0.35680662627036064,
+      "eval_loss": 3.663057565689087,
+      "eval_runtime": 180.5674,
+      "eval_samples_per_second": 92.171,
+      "eval_steps_per_second": 5.765,
+      "step": 15000
+    },
+    {
+      "epoch": 4.381457107914507,
+      "grad_norm": 0.3152848184108734,
+      "learning_rate": 0.0005477459947567725,
+      "loss": 3.5651,
+      "step": 15050
+    },
+    {
+      "epoch": 4.396016539514298,
+      "grad_norm": 0.31485655903816223,
+      "learning_rate": 0.0005475712205068453,
+      "loss": 3.5724,
+      "step": 15100
+    },
+    {
+      "epoch": 4.4105759711140875,
+      "grad_norm": 0.3210237920284271,
+      "learning_rate": 0.000547396446256918,
+      "loss": 3.5743,
+      "step": 15150
+    },
+    {
+      "epoch": 4.425135402713878,
+      "grad_norm": 0.31647804379463196,
+      "learning_rate": 0.0005472216720069909,
+      "loss": 3.5643,
+      "step": 15200
+    },
+    {
+      "epoch": 4.439694834313668,
+      "grad_norm": 0.3220058083534241,
+      "learning_rate": 0.0005470468977570637,
+      "loss": 3.5777,
+      "step": 15250
+    },
+    {
+      "epoch": 4.454254265913459,
+      "grad_norm": 0.31475868821144104,
+      "learning_rate": 0.0005468721235071365,
+      "loss": 3.5759,
+      "step": 15300
+    },
+    {
+      "epoch": 4.468813697513249,
+      "grad_norm": 0.31258007884025574,
+      "learning_rate": 0.0005466973492572094,
+      "loss": 3.58,
+      "step": 15350
+    },
+    {
+      "epoch": 4.48337312911304,
+      "grad_norm": 0.3323783874511719,
+      "learning_rate": 0.0005465225750072822,
+      "loss": 3.5717,
+      "step": 15400
+    },
+    {
+      "epoch": 4.4979325607128295,
+      "grad_norm": 0.31647196412086487,
+      "learning_rate": 0.000546347800757355,
+      "loss": 3.5666,
+      "step": 15450
+    },
+    {
+      "epoch": 4.51249199231262,
+      "grad_norm": 0.3166157007217407,
+      "learning_rate": 0.0005461730265074279,
+      "loss": 3.5661,
+      "step": 15500
+    },
+    {
+      "epoch": 4.52705142391241,
+      "grad_norm": 0.33359718322753906,
+      "learning_rate": 0.0005459982522575007,
+      "loss": 3.581,
+      "step": 15550
+    },
+    {
+      "epoch": 4.541610855512201,
+      "grad_norm": 0.30880287289619446,
+      "learning_rate": 0.0005458234780075735,
+      "loss": 3.5767,
+      "step": 15600
+    },
+    {
+      "epoch": 4.556170287111991,
+      "grad_norm": 0.3321440517902374,
+      "learning_rate": 0.0005456487037576464,
+      "loss": 3.5927,
+      "step": 15650
+    },
+    {
+      "epoch": 4.570729718711782,
+      "grad_norm": 0.35169097781181335,
+      "learning_rate": 0.0005454739295077192,
+      "loss": 3.5777,
+      "step": 15700
+    },
+    {
+      "epoch": 4.585289150311572,
+      "grad_norm": 0.3210912048816681,
+      "learning_rate": 0.000545299155257792,
+      "loss": 3.5641,
+      "step": 15750
+    },
+    {
+      "epoch": 4.599848581911362,
+      "grad_norm": 0.3266526460647583,
+      "learning_rate": 0.0005451243810078648,
+      "loss": 3.5624,
+      "step": 15800
+    },
+    {
+      "epoch": 4.614408013511152,
+      "grad_norm": 0.3169322609901428,
+      "learning_rate": 0.0005449496067579376,
+      "loss": 3.582,
+      "step": 15850
+    },
+    {
+      "epoch": 4.628967445110943,
+      "grad_norm": 0.30979159474372864,
+      "learning_rate": 0.0005447748325080105,
+      "loss": 3.5808,
+      "step": 15900
+    },
+    {
+      "epoch": 4.643526876710733,
+      "grad_norm": 0.3104844391345978,
+      "learning_rate": 0.0005446000582580833,
+      "loss": 3.5779,
+      "step": 15950
+    },
+    {
+      "epoch": 4.658086308310524,
+      "grad_norm": 0.3167930543422699,
+      "learning_rate": 0.0005444252840081562,
+      "loss": 3.577,
+      "step": 16000
+    },
+    {
+      "epoch": 4.658086308310524,
+      "eval_accuracy": 0.3581025173162721,
+      "eval_loss": 3.6506025791168213,
+      "eval_runtime": 184.1909,
+      "eval_samples_per_second": 90.357,
+      "eval_steps_per_second": 5.652,
+      "step": 16000
+    },
+    {
+      "epoch": 4.672645739910314,
+      "grad_norm": 0.3104100823402405,
+      "learning_rate": 0.0005442505097582289,
+      "loss": 3.5755,
+      "step": 16050
+    },
+    {
+      "epoch": 4.687205171510104,
+      "grad_norm": 0.32251089811325073,
+      "learning_rate": 0.0005440757355083017,
+      "loss": 3.5785,
+      "step": 16100
+    },
+    {
+      "epoch": 4.701764603109894,
+      "grad_norm": 0.30579274892807007,
+      "learning_rate": 0.0005439009612583746,
+      "loss": 3.5736,
+      "step": 16150
+    },
+    {
+      "epoch": 4.716324034709685,
+      "grad_norm": 0.32924431562423706,
+      "learning_rate": 0.0005437261870084474,
+      "loss": 3.5859,
+      "step": 16200
+    },
+    {
+      "epoch": 4.730883466309475,
+      "grad_norm": 0.32339397072792053,
+      "learning_rate": 0.0005435514127585203,
+      "loss": 3.5714,
+      "step": 16250
+    },
+    {
+      "epoch": 4.745442897909266,
+      "grad_norm": 0.3301834762096405,
+      "learning_rate": 0.000543376638508593,
+      "loss": 3.581,
+      "step": 16300
+    },
+    {
+      "epoch": 4.760002329509056,
+      "grad_norm": 0.3323529064655304,
+      "learning_rate": 0.0005432018642586658,
+      "loss": 3.5745,
+      "step": 16350
+    },
+    {
+      "epoch": 4.7745617611088464,
+      "grad_norm": 0.31460458040237427,
+      "learning_rate": 0.0005430270900087387,
+      "loss": 3.5752,
+      "step": 16400
+    },
+    {
+      "epoch": 4.789121192708636,
+      "grad_norm": 0.30962061882019043,
+      "learning_rate": 0.0005428523157588115,
+      "loss": 3.5847,
+      "step": 16450
+    },
+    {
+      "epoch": 4.803680624308427,
+      "grad_norm": 0.31121689081192017,
+      "learning_rate": 0.0005426775415088843,
+      "loss": 3.581,
+      "step": 16500
+    },
+    {
+      "epoch": 4.818240055908217,
+      "grad_norm": 0.3271123170852661,
+      "learning_rate": 0.0005425027672589572,
+      "loss": 3.5747,
+      "step": 16550
+    },
+    {
+      "epoch": 4.832799487508008,
+      "grad_norm": 0.34155216813087463,
+      "learning_rate": 0.0005423279930090299,
+      "loss": 3.5757,
+      "step": 16600
+    },
+    {
+      "epoch": 4.847358919107798,
+      "grad_norm": 0.31826114654541016,
+      "learning_rate": 0.0005421532187591028,
+      "loss": 3.5863,
+      "step": 16650
+    },
+    {
+      "epoch": 4.8619183507075885,
+      "grad_norm": 0.3213462829589844,
+      "learning_rate": 0.0005419784445091756,
+      "loss": 3.5846,
+      "step": 16700
+    },
+    {
+      "epoch": 4.876477782307378,
+      "grad_norm": 0.3335978388786316,
+      "learning_rate": 0.0005418036702592484,
+      "loss": 3.5778,
+      "step": 16750
+    },
+    {
+      "epoch": 4.891037213907169,
+      "grad_norm": 0.32565537095069885,
+      "learning_rate": 0.0005416288960093213,
+      "loss": 3.5903,
+      "step": 16800
+    },
+    {
+      "epoch": 4.905596645506959,
+      "grad_norm": 0.31601616740226746,
+      "learning_rate": 0.000541454121759394,
+      "loss": 3.581,
+      "step": 16850
+    },
+    {
+      "epoch": 4.92015607710675,
+      "grad_norm": 0.3034924268722534,
+      "learning_rate": 0.0005412793475094669,
+      "loss": 3.5731,
+      "step": 16900
+    },
+    {
+      "epoch": 4.93471550870654,
+      "grad_norm": 0.30528074502944946,
+      "learning_rate": 0.0005411045732595397,
+      "loss": 3.5775,
+      "step": 16950
+    },
+    {
+      "epoch": 4.9492749403063305,
+      "grad_norm": 0.32346123456954956,
+      "learning_rate": 0.0005409297990096125,
+      "loss": 3.5711,
+      "step": 17000
+    },
+    {
+      "epoch": 4.9492749403063305,
+      "eval_accuracy": 0.3593712465046746,
+      "eval_loss": 3.6345937252044678,
+      "eval_runtime": 183.7217,
+      "eval_samples_per_second": 90.588,
+      "eval_steps_per_second": 5.666,
+      "step": 17000
+    },
+    {
+      "epoch": 4.96383437190612,
+      "grad_norm": 0.3116399049758911,
+      "learning_rate": 0.0005407550247596854,
+      "loss": 3.5657,
+      "step": 17050
+    },
+    {
+      "epoch": 4.978393803505911,
+      "grad_norm": 0.3291073143482208,
+      "learning_rate": 0.0005405802505097582,
+      "loss": 3.5751,
+      "step": 17100
+    },
+    {
+      "epoch": 4.992953235105701,
+      "grad_norm": 0.3149360716342926,
+      "learning_rate": 0.000540405476259831,
+      "loss": 3.5743,
+      "step": 17150
+    },
+    {
+      "epoch": 5.007279715799895,
+      "grad_norm": 0.3213154971599579,
+      "learning_rate": 0.0005402307020099038,
+      "loss": 3.5347,
+      "step": 17200
+    },
+    {
+      "epoch": 5.021839147399685,
+      "grad_norm": 0.3356756567955017,
+      "learning_rate": 0.0005400559277599766,
+      "loss": 3.4594,
+      "step": 17250
+    },
+    {
+      "epoch": 5.036398578999476,
+      "grad_norm": 0.3190675973892212,
+      "learning_rate": 0.0005398811535100495,
+      "loss": 3.4737,
+      "step": 17300
+    },
+    {
+      "epoch": 5.050958010599266,
+      "grad_norm": 0.30441927909851074,
+      "learning_rate": 0.0005397063792601223,
+      "loss": 3.4683,
+      "step": 17350
+    },
+    {
+      "epoch": 5.065517442199057,
+      "grad_norm": 0.3276670277118683,
+      "learning_rate": 0.0005395316050101951,
+      "loss": 3.4779,
+      "step": 17400
+    },
+    {
+      "epoch": 5.080076873798847,
+      "grad_norm": 0.3393913805484772,
+      "learning_rate": 0.0005393568307602679,
+      "loss": 3.4886,
+      "step": 17450
+    },
+    {
+      "epoch": 5.094636305398637,
+      "grad_norm": 0.33122798800468445,
+      "learning_rate": 0.0005391820565103407,
+      "loss": 3.4829,
+      "step": 17500
+    },
+    {
+      "epoch": 5.109195736998427,
+      "grad_norm": 0.32901448011398315,
+      "learning_rate": 0.0005390072822604136,
+      "loss": 3.4872,
+      "step": 17550
+    },
+    {
+      "epoch": 5.123755168598218,
+      "grad_norm": 0.3309627175331116,
+      "learning_rate": 0.0005388325080104864,
+      "loss": 3.4831,
+      "step": 17600
+    },
+    {
+      "epoch": 5.138314600198008,
+      "grad_norm": 0.32044172286987305,
+      "learning_rate": 0.0005386577337605593,
+      "loss": 3.4888,
+      "step": 17650
+    },
+    {
+      "epoch": 5.152874031797799,
+      "grad_norm": 0.3464089334011078,
+      "learning_rate": 0.000538482959510632,
+      "loss": 3.4972,
+      "step": 17700
+    },
+    {
+      "epoch": 5.167433463397589,
+      "grad_norm": 0.3171513080596924,
+      "learning_rate": 0.0005383081852607048,
+      "loss": 3.5026,
+      "step": 17750
+    },
+    {
+      "epoch": 5.1819928949973795,
+      "grad_norm": 0.3164452612400055,
+      "learning_rate": 0.0005381334110107777,
+      "loss": 3.4926,
+      "step": 17800
+    },
+    {
+      "epoch": 5.196552326597169,
+      "grad_norm": 0.32658103108406067,
+      "learning_rate": 0.0005379586367608505,
+      "loss": 3.5046,
+      "step": 17850
+    },
+    {
+      "epoch": 5.21111175819696,
+      "grad_norm": 0.32511815428733826,
+      "learning_rate": 0.0005377838625109233,
+      "loss": 3.4953,
+      "step": 17900
+    },
+    {
+      "epoch": 5.22567118979675,
+      "grad_norm": 0.343904972076416,
+      "learning_rate": 0.0005376090882609961,
+      "loss": 3.5065,
+      "step": 17950
+    },
+    {
+      "epoch": 5.240230621396541,
+      "grad_norm": 0.33408525586128235,
+      "learning_rate": 0.0005374343140110689,
+      "loss": 3.5066,
+      "step": 18000
+    },
+    {
+      "epoch": 5.240230621396541,
+      "eval_accuracy": 0.35997068871065013,
+      "eval_loss": 3.6377220153808594,
+      "eval_runtime": 181.2212,
+      "eval_samples_per_second": 91.838,
+      "eval_steps_per_second": 5.744,
+      "step": 18000
+    },
+    {
+      "epoch": 5.254790052996331,
+      "grad_norm": 0.3558831512928009,
+      "learning_rate": 0.0005372595397611418,
+      "loss": 3.5154,
+      "step": 18050
+    },
+    {
+      "epoch": 5.2693494845961215,
+      "grad_norm": 0.3240915536880493,
+      "learning_rate": 0.0005370847655112147,
+      "loss": 3.5104,
+      "step": 18100
+    },
+    {
+      "epoch": 5.283908916195911,
+      "grad_norm": 0.3641294538974762,
+      "learning_rate": 0.0005369099912612875,
+      "loss": 3.5125,
+      "step": 18150
+    },
+    {
+      "epoch": 5.298468347795702,
+      "grad_norm": 0.323595255613327,
+      "learning_rate": 0.0005367352170113603,
+      "loss": 3.5091,
+      "step": 18200
+    },
+    {
+      "epoch": 5.313027779395492,
+      "grad_norm": 0.31085318326950073,
+      "learning_rate": 0.0005365604427614331,
+      "loss": 3.5061,
+      "step": 18250
+    },
+    {
+      "epoch": 5.327587210995283,
+      "grad_norm": 0.3321459889411926,
+      "learning_rate": 0.0005363856685115059,
+      "loss": 3.5128,
+      "step": 18300
+    },
+    {
+      "epoch": 5.342146642595073,
+      "grad_norm": 0.3359740674495697,
+      "learning_rate": 0.0005362108942615788,
+      "loss": 3.5207,
+      "step": 18350
+    },
+    {
+      "epoch": 5.3567060741948636,
+      "grad_norm": 0.35164040327072144,
+      "learning_rate": 0.0005360361200116516,
+      "loss": 3.5206,
+      "step": 18400
+    },
+    {
+      "epoch": 5.3712655057946534,
+      "grad_norm": 0.33065569400787354,
+      "learning_rate": 0.0005358613457617244,
+      "loss": 3.5137,
+      "step": 18450
+    },
+    {
+      "epoch": 5.385824937394444,
+      "grad_norm": 0.31795698404312134,
+      "learning_rate": 0.0005356865715117973,
+      "loss": 3.5181,
+      "step": 18500
+    },
+    {
+      "epoch": 5.400384368994234,
+      "grad_norm": 0.3166426718235016,
+      "learning_rate": 0.00053551179726187,
+      "loss": 3.5129,
+      "step": 18550
+    },
+    {
+      "epoch": 5.414943800594025,
+      "grad_norm": 0.3113225996494293,
+      "learning_rate": 0.0005353370230119429,
+      "loss": 3.5224,
+      "step": 18600
+    },
+    {
+      "epoch": 5.429503232193815,
+      "grad_norm": 0.3037504553794861,
+      "learning_rate": 0.0005351622487620157,
+      "loss": 3.5212,
+      "step": 18650
+    },
+    {
+      "epoch": 5.444062663793606,
+      "grad_norm": 0.3170977830886841,
+      "learning_rate": 0.0005349874745120885,
+      "loss": 3.5185,
+      "step": 18700
+    },
+    {
+      "epoch": 5.4586220953933955,
+      "grad_norm": 0.3276199698448181,
+      "learning_rate": 0.0005348127002621614,
+      "loss": 3.5143,
+      "step": 18750
+    },
+    {
+      "epoch": 5.473181526993186,
+      "grad_norm": 0.35049423575401306,
+      "learning_rate": 0.0005346379260122341,
+      "loss": 3.5178,
+      "step": 18800
+    },
+    {
+      "epoch": 5.487740958592976,
+      "grad_norm": 0.3257882595062256,
+      "learning_rate": 0.000534463151762307,
+      "loss": 3.513,
+      "step": 18850
+    },
+    {
+      "epoch": 5.502300390192767,
+      "grad_norm": 0.3254280686378479,
+      "learning_rate": 0.0005342883775123798,
+      "loss": 3.5157,
+      "step": 18900
+    },
+    {
+      "epoch": 5.516859821792557,
+      "grad_norm": 0.35354653000831604,
+      "learning_rate": 0.0005341136032624526,
+      "loss": 3.5323,
+      "step": 18950
+    },
+    {
+      "epoch": 5.531419253392348,
+      "grad_norm": 0.3293665945529938,
+      "learning_rate": 0.0005339388290125255,
+      "loss": 3.5294,
+      "step": 19000
+    },
+    {
+      "epoch": 5.531419253392348,
+      "eval_accuracy": 0.36105410583223874,
+      "eval_loss": 3.6271042823791504,
+      "eval_runtime": 181.268,
+      "eval_samples_per_second": 91.814,
+      "eval_steps_per_second": 5.743,
+      "step": 19000
+    },
+    {
+      "epoch": 5.5459786849921375,
+      "grad_norm": 0.32479095458984375,
+      "learning_rate": 0.0005337640547625983,
+      "loss": 3.5257,
+      "step": 19050
+    },
+    {
+      "epoch": 5.560538116591928,
+      "grad_norm": 0.30282458662986755,
+      "learning_rate": 0.000533589280512671,
+      "loss": 3.5376,
+      "step": 19100
+    },
+    {
+      "epoch": 5.575097548191718,
+      "grad_norm": 0.3051811754703522,
+      "learning_rate": 0.0005334145062627439,
+      "loss": 3.5188,
+      "step": 19150
+    },
+    {
+      "epoch": 5.589656979791509,
+      "grad_norm": 0.34127405285835266,
+      "learning_rate": 0.0005332397320128167,
+      "loss": 3.5171,
+      "step": 19200
+    },
+    {
+      "epoch": 5.604216411391299,
+      "grad_norm": 0.3210941553115845,
+      "learning_rate": 0.0005330649577628896,
+      "loss": 3.5248,
+      "step": 19250
+    },
+    {
+      "epoch": 5.61877584299109,
+      "grad_norm": 0.3192020654678345,
+      "learning_rate": 0.0005328901835129624,
+      "loss": 3.533,
+      "step": 19300
+    },
+    {
+      "epoch": 5.6333352745908805,
+      "grad_norm": 0.34110450744628906,
+      "learning_rate": 0.0005327154092630351,
+      "loss": 3.5295,
+      "step": 19350
+    },
+    {
+      "epoch": 5.64789470619067,
+      "grad_norm": 0.3144545555114746,
+      "learning_rate": 0.000532540635013108,
+      "loss": 3.5335,
+      "step": 19400
+    },
+    {
+      "epoch": 5.66245413779046,
+      "grad_norm": 0.3245835304260254,
+      "learning_rate": 0.0005323658607631808,
+      "loss": 3.5229,
+      "step": 19450
+    },
+    {
+      "epoch": 5.677013569390251,
+      "grad_norm": 0.3528177738189697,
+      "learning_rate": 0.0005321910865132537,
+      "loss": 3.5209,
+      "step": 19500
+    },
+    {
+      "epoch": 5.691573000990042,
+      "grad_norm": 0.3312878906726837,
+      "learning_rate": 0.0005320163122633265,
+      "loss": 3.5321,
+      "step": 19550
+    },
+    {
+      "epoch": 5.706132432589832,
+      "grad_norm": 0.3077809512615204,
+      "learning_rate": 0.0005318415380133993,
+      "loss": 3.5183,
+      "step": 19600
+    },
+    {
+      "epoch": 5.720691864189622,
+      "grad_norm": 0.32409968972206116,
+      "learning_rate": 0.0005316667637634721,
+      "loss": 3.5276,
+      "step": 19650
+    },
+    {
+      "epoch": 5.735251295789412,
+      "grad_norm": 0.3110126852989197,
+      "learning_rate": 0.0005314919895135449,
+      "loss": 3.5253,
+      "step": 19700
+    },
+    {
+      "epoch": 5.749810727389203,
+      "grad_norm": 0.33343297243118286,
+      "learning_rate": 0.0005313172152636178,
+      "loss": 3.5272,
+      "step": 19750
+    },
+    {
+      "epoch": 5.764370158988993,
+      "grad_norm": 0.3315747082233429,
+      "learning_rate": 0.0005311424410136906,
+      "loss": 3.534,
+      "step": 19800
+    },
+    {
+      "epoch": 5.778929590588783,
+      "grad_norm": 0.3091914653778076,
+      "learning_rate": 0.0005309676667637634,
+      "loss": 3.5276,
+      "step": 19850
+    },
+    {
+      "epoch": 5.793489022188574,
+      "grad_norm": 0.30921050906181335,
+      "learning_rate": 0.0005307928925138363,
+      "loss": 3.5219,
+      "step": 19900
+    },
+    {
+      "epoch": 5.8080484537883645,
+      "grad_norm": 0.30907315015792847,
+      "learning_rate": 0.000530618118263909,
+      "loss": 3.534,
+      "step": 19950
+    },
+    {
+      "epoch": 5.822607885388154,
+      "grad_norm": 0.36628568172454834,
+      "learning_rate": 0.0005304433440139819,
+      "loss": 3.538,
+      "step": 20000
+    },
+    {
+      "epoch": 5.822607885388154,
+      "eval_accuracy": 0.36194950645964236,
+      "eval_loss": 3.61657452583313,
+      "eval_runtime": 183.6498,
+      "eval_samples_per_second": 90.624,
+      "eval_steps_per_second": 5.668,
+      "step": 20000
+    },
+    {
+      "epoch": 5.837167316987944,
+      "grad_norm": 0.3185259997844696,
+      "learning_rate": 0.0005302685697640547,
+      "loss": 3.5243,
+      "step": 20050
+    },
+    {
+      "epoch": 5.851726748587735,
+      "grad_norm": 0.3328113257884979,
+      "learning_rate": 0.0005300937955141275,
+      "loss": 3.5306,
+      "step": 20100
+    },
+    {
+      "epoch": 5.866286180187526,
+      "grad_norm": 0.31715288758277893,
+      "learning_rate": 0.0005299190212642004,
+      "loss": 3.5368,
+      "step": 20150
+    },
+    {
+      "epoch": 5.880845611787316,
+      "grad_norm": 0.3114943206310272,
+      "learning_rate": 0.0005297442470142731,
+      "loss": 3.5279,
+      "step": 20200
+    },
+    {
+      "epoch": 5.895405043387106,
+      "grad_norm": 0.3375224471092224,
+      "learning_rate": 0.000529569472764346,
+      "loss": 3.5214,
+      "step": 20250
+    },
+    {
+      "epoch": 5.9099644749868965,
+      "grad_norm": 0.29627102613449097,
+      "learning_rate": 0.0005293946985144188,
+      "loss": 3.5182,
+      "step": 20300
+    },
+    {
+      "epoch": 5.924523906586687,
+      "grad_norm": 0.33964815735816956,
+      "learning_rate": 0.0005292199242644916,
+      "loss": 3.541,
+      "step": 20350
+    },
+    {
+      "epoch": 5.939083338186477,
+      "grad_norm": 0.3077552914619446,
+      "learning_rate": 0.0005290451500145645,
+      "loss": 3.5246,
+      "step": 20400
+    },
+    {
+      "epoch": 5.953642769786267,
+      "grad_norm": 0.3167116641998291,
+      "learning_rate": 0.0005288703757646373,
+      "loss": 3.5294,
+      "step": 20450
+    },
+    {
+      "epoch": 5.968202201386058,
+      "grad_norm": 0.3327026665210724,
+      "learning_rate": 0.00052869560151471,
+      "loss": 3.5322,
+      "step": 20500
+    },
+    {
+      "epoch": 5.982761632985849,
+      "grad_norm": 0.3215795159339905,
+      "learning_rate": 0.0005285208272647829,
+      "loss": 3.5378,
+      "step": 20550
+    },
+    {
+      "epoch": 5.9973210645856385,
+      "grad_norm": 0.3464929759502411,
+      "learning_rate": 0.0005283460530148558,
+      "loss": 3.544,
+      "step": 20600
+    },
+    {
+      "epoch": 6.011647545279832,
+      "grad_norm": 0.37006425857543945,
+      "learning_rate": 0.0005281712787649286,
+      "loss": 3.4569,
+      "step": 20650
+    },
+    {
+      "epoch": 6.026206976879623,
+      "grad_norm": 0.32685425877571106,
+      "learning_rate": 0.0005279965045150015,
+      "loss": 3.4143,
+      "step": 20700
+    },
+    {
+      "epoch": 6.040766408479413,
+      "grad_norm": 0.31896543502807617,
+      "learning_rate": 0.0005278217302650743,
+      "loss": 3.4284,
+      "step": 20750
+    },
+    {
+      "epoch": 6.055325840079203,
+      "grad_norm": 0.3501061499118805,
+      "learning_rate": 0.000527646956015147,
+      "loss": 3.4257,
+      "step": 20800
+    },
+    {
+      "epoch": 6.069885271678993,
+      "grad_norm": 0.3293428421020508,
+      "learning_rate": 0.0005274721817652199,
+      "loss": 3.4253,
+      "step": 20850
+    },
+    {
+      "epoch": 6.084444703278784,
+      "grad_norm": 0.33916565775871277,
+      "learning_rate": 0.0005272974075152927,
+      "loss": 3.4532,
+      "step": 20900
+    },
+    {
+      "epoch": 6.099004134878574,
+      "grad_norm": 0.3229523301124573,
+      "learning_rate": 0.0005271226332653656,
+      "loss": 3.4476,
+      "step": 20950
+    },
+    {
+      "epoch": 6.113563566478365,
+      "grad_norm": 0.3364764153957367,
+      "learning_rate": 0.0005269478590154384,
+      "loss": 3.4415,
+      "step": 21000
+    },
+    {
+      "epoch": 6.113563566478365,
+      "eval_accuracy": 0.3621457538197391,
+      "eval_loss": 3.6198906898498535,
+      "eval_runtime": 183.4856,
+      "eval_samples_per_second": 90.705,
+      "eval_steps_per_second": 5.673,
+      "step": 21000
+    },
+    {
+      "epoch": 6.128122998078155,
+      "grad_norm": 0.3735044300556183,
+      "learning_rate": 0.0005267730847655111,
+      "loss": 3.4457,
+      "step": 21050
+    },
+    {
+      "epoch": 6.142682429677945,
+      "grad_norm": 0.34455105662345886,
+      "learning_rate": 0.000526598310515584,
+      "loss": 3.4528,
+      "step": 21100
+    },
+    {
+      "epoch": 6.157241861277735,
+      "grad_norm": 0.33916333317756653,
+      "learning_rate": 0.0005264235362656568,
+      "loss": 3.4609,
+      "step": 21150
+    },
+    {
+      "epoch": 6.171801292877526,
+      "grad_norm": 0.3121279180049896,
+      "learning_rate": 0.0005262487620157297,
+      "loss": 3.4478,
+      "step": 21200
+    },
+    {
+      "epoch": 6.186360724477316,
+      "grad_norm": 0.30740803480148315,
+      "learning_rate": 0.0005260739877658025,
+      "loss": 3.4448,
+      "step": 21250
+    },
+    {
+      "epoch": 6.200920156077107,
+      "grad_norm": 0.3505891263484955,
+      "learning_rate": 0.0005258992135158753,
+      "loss": 3.4585,
+      "step": 21300
+    },
+    {
+      "epoch": 6.215479587676897,
+      "grad_norm": 0.33900803327560425,
+      "learning_rate": 0.0005257244392659481,
+      "loss": 3.46,
+      "step": 21350
+    },
+    {
+      "epoch": 6.2300390192766875,
+      "grad_norm": 0.3224051892757416,
+      "learning_rate": 0.0005255496650160209,
+      "loss": 3.4557,
+      "step": 21400
+    },
+    {
+      "epoch": 6.244598450876477,
+      "grad_norm": 0.35417911410331726,
+      "learning_rate": 0.0005253748907660938,
+      "loss": 3.4583,
+      "step": 21450
+    },
+    {
+      "epoch": 6.259157882476268,
+      "grad_norm": 0.34107911586761475,
+      "learning_rate": 0.0005252001165161666,
+      "loss": 3.4659,
+      "step": 21500
+    },
+    {
+      "epoch": 6.273717314076059,
+      "grad_norm": 0.32315975427627563,
+      "learning_rate": 0.0005250253422662394,
+      "loss": 3.4613,
+      "step": 21550
+    },
+    {
+      "epoch": 6.288276745675849,
+      "grad_norm": 0.3344326615333557,
+      "learning_rate": 0.0005248505680163123,
+      "loss": 3.4729,
+      "step": 21600
+    },
+    {
+      "epoch": 6.302836177275639,
+      "grad_norm": 0.34388530254364014,
+      "learning_rate": 0.000524675793766385,
+      "loss": 3.4723,
+      "step": 21650
+    },
+    {
+      "epoch": 6.3173956088754295,
+      "grad_norm": 0.34264546632766724,
+      "learning_rate": 0.0005245010195164579,
+      "loss": 3.4751,
+      "step": 21700
+    },
+    {
+      "epoch": 6.33195504047522,
+      "grad_norm": 0.32228031754493713,
+      "learning_rate": 0.0005243262452665307,
+      "loss": 3.4586,
+      "step": 21750
+    },
+    {
+      "epoch": 6.34651447207501,
+      "grad_norm": 0.34229689836502075,
+      "learning_rate": 0.0005241514710166035,
+      "loss": 3.4657,
+      "step": 21800
+    },
+    {
+      "epoch": 6.3610739036748,
+      "grad_norm": 0.3267248570919037,
+      "learning_rate": 0.0005239766967666764,
+      "loss": 3.4747,
+      "step": 21850
+    },
+    {
+      "epoch": 6.375633335274591,
+      "grad_norm": 0.3363324999809265,
+      "learning_rate": 0.0005238019225167491,
+      "loss": 3.4736,
+      "step": 21900
+    },
+    {
+      "epoch": 6.390192766874382,
+      "grad_norm": 0.32636144757270813,
+      "learning_rate": 0.0005236271482668219,
+      "loss": 3.4731,
+      "step": 21950
+    },
+    {
+      "epoch": 6.4047521984741715,
+      "grad_norm": 0.3209141194820404,
+      "learning_rate": 0.0005234523740168948,
+      "loss": 3.475,
+      "step": 22000
+    },
+    {
+      "epoch": 6.4047521984741715,
+      "eval_accuracy": 0.36269357673806785,
+      "eval_loss": 3.6098952293395996,
+      "eval_runtime": 183.5371,
+      "eval_samples_per_second": 90.679,
+      "eval_steps_per_second": 5.672,
+      "step": 22000
+    },
+    {
+      "epoch": 6.419311630073962,
+      "grad_norm": 0.3223513066768646,
+      "learning_rate": 0.0005232775997669676,
+      "loss": 3.4759,
+      "step": 22050
+    },
+    {
+      "epoch": 6.433871061673752,
+      "grad_norm": 0.3284885585308075,
+      "learning_rate": 0.0005231028255170405,
+      "loss": 3.4796,
+      "step": 22100
+    },
+    {
+      "epoch": 6.448430493273543,
+      "grad_norm": 0.32980912923812866,
+      "learning_rate": 0.0005229280512671133,
+      "loss": 3.4839,
+      "step": 22150
+    },
+    {
+      "epoch": 6.462989924873333,
+      "grad_norm": 0.33856451511383057,
+      "learning_rate": 0.000522753277017186,
+      "loss": 3.4825,
+      "step": 22200
+    },
+    {
+      "epoch": 6.477549356473124,
+      "grad_norm": 0.3303597867488861,
+      "learning_rate": 0.0005225785027672589,
+      "loss": 3.4827,
+      "step": 22250
+    },
+    {
+      "epoch": 6.492108788072914,
+      "grad_norm": 0.32675686478614807,
+      "learning_rate": 0.0005224037285173317,
+      "loss": 3.4781,
+      "step": 22300
+    },
+    {
+      "epoch": 6.506668219672704,
+      "grad_norm": 0.3315143883228302,
+      "learning_rate": 0.0005222289542674046,
+      "loss": 3.4786,
+      "step": 22350
+    },
+    {
+      "epoch": 6.521227651272494,
+      "grad_norm": 0.35115185379981995,
+      "learning_rate": 0.0005220541800174774,
+      "loss": 3.4777,
+      "step": 22400
+    },
+    {
+      "epoch": 6.535787082872285,
+      "grad_norm": 0.32922348380088806,
+      "learning_rate": 0.0005218794057675501,
+      "loss": 3.4764,
+      "step": 22450
+    },
+    {
+      "epoch": 6.550346514472075,
+      "grad_norm": 0.32848137617111206,
+      "learning_rate": 0.000521704631517623,
+      "loss": 3.4864,
+      "step": 22500
+    },
+    {
+      "epoch": 6.564905946071866,
+      "grad_norm": 0.3455169200897217,
+      "learning_rate": 0.0005215298572676958,
+      "loss": 3.4872,
+      "step": 22550
+    },
+    {
+      "epoch": 6.579465377671656,
+      "grad_norm": 0.3491528034210205,
+      "learning_rate": 0.0005213550830177687,
+      "loss": 3.4941,
+      "step": 22600
+    },
+    {
+      "epoch": 6.594024809271446,
+      "grad_norm": 0.3292933404445648,
+      "learning_rate": 0.0005211803087678415,
+      "loss": 3.4849,
+      "step": 22650
+    },
+    {
+      "epoch": 6.608584240871236,
+      "grad_norm": 0.33583250641822815,
+      "learning_rate": 0.0005210055345179143,
+      "loss": 3.4787,
+      "step": 22700
+    },
+    {
+      "epoch": 6.623143672471027,
+      "grad_norm": 0.32590252161026,
+      "learning_rate": 0.0005208307602679871,
+      "loss": 3.4819,
+      "step": 22750
+    },
+    {
+      "epoch": 6.637703104070817,
+      "grad_norm": 0.34313255548477173,
+      "learning_rate": 0.0005206559860180599,
+      "loss": 3.4679,
+      "step": 22800
+    },
+    {
+      "epoch": 6.652262535670608,
+      "grad_norm": 0.3168715238571167,
+      "learning_rate": 0.0005204812117681328,
+      "loss": 3.4812,
+      "step": 22850
+    },
+    {
+      "epoch": 6.666821967270398,
+      "grad_norm": 0.33726438879966736,
+      "learning_rate": 0.0005203064375182056,
+      "loss": 3.4791,
+      "step": 22900
+    },
+    {
+      "epoch": 6.6813813988701884,
+      "grad_norm": 0.33907851576805115,
+      "learning_rate": 0.0005201316632682784,
+      "loss": 3.4817,
+      "step": 22950
+    },
+    {
+      "epoch": 6.695940830469978,
+      "grad_norm": 0.3657963275909424,
+      "learning_rate": 0.0005199568890183513,
+      "loss": 3.4859,
+      "step": 23000
+    },
+    {
+      "epoch": 6.695940830469978,
+      "eval_accuracy": 0.3636106127844396,
+      "eval_loss": 3.6045539379119873,
+      "eval_runtime": 184.0903,
+      "eval_samples_per_second": 90.407,
+      "eval_steps_per_second": 5.655,
+      "step": 23000
+    },
+    {
+      "epoch": 6.710500262069769,
+      "grad_norm": 0.3322959840297699,
+      "learning_rate": 0.000519782114768424,
+      "loss": 3.4824,
+      "step": 23050
+    },
+    {
+      "epoch": 6.725059693669559,
+      "grad_norm": 0.343662828207016,
+      "learning_rate": 0.0005196073405184969,
+      "loss": 3.4897,
+      "step": 23100
+    },
+    {
+      "epoch": 6.73961912526935,
+      "grad_norm": 0.32909801602363586,
+      "learning_rate": 0.0005194325662685697,
+      "loss": 3.4858,
+      "step": 23150
+    },
+    {
+      "epoch": 6.75417855686914,
+      "grad_norm": 0.3375694751739502,
+      "learning_rate": 0.0005192577920186426,
+      "loss": 3.4808,
+      "step": 23200
+    },
+    {
+      "epoch": 6.7687379884689305,
+      "grad_norm": 0.3138526678085327,
+      "learning_rate": 0.0005190830177687154,
+      "loss": 3.483,
+      "step": 23250
+    },
+    {
+      "epoch": 6.78329742006872,
+      "grad_norm": 0.3340669572353363,
+      "learning_rate": 0.0005189082435187883,
+      "loss": 3.4903,
+      "step": 23300
+    },
+    {
+      "epoch": 6.797856851668511,
+      "grad_norm": 0.3336253762245178,
+      "learning_rate": 0.000518733469268861,
+      "loss": 3.4864,
+      "step": 23350
+    },
+    {
+      "epoch": 6.812416283268301,
+      "grad_norm": 0.3235922157764435,
+      "learning_rate": 0.0005185586950189338,
+      "loss": 3.5037,
+      "step": 23400
+    },
+    {
+      "epoch": 6.826975714868092,
+      "grad_norm": 0.3445108234882355,
+      "learning_rate": 0.0005183839207690067,
+      "loss": 3.4908,
+      "step": 23450
+    },
+    {
+      "epoch": 6.841535146467882,
+      "grad_norm": 0.3229808211326599,
+      "learning_rate": 0.0005182091465190795,
+      "loss": 3.4906,
+      "step": 23500
+    },
+    {
+      "epoch": 6.8560945780676725,
+      "grad_norm": 0.29649391770362854,
+      "learning_rate": 0.0005180343722691524,
+      "loss": 3.4964,
+      "step": 23550
+    },
+    {
+      "epoch": 6.870654009667462,
+      "grad_norm": 0.3273935616016388,
+      "learning_rate": 0.0005178595980192251,
+      "loss": 3.493,
+      "step": 23600
+    },
+    {
+      "epoch": 6.885213441267253,
+      "grad_norm": 0.33352574706077576,
+      "learning_rate": 0.0005176848237692979,
+      "loss": 3.4915,
+      "step": 23650
+    },
+    {
+      "epoch": 6.899772872867043,
+      "grad_norm": 0.3277892768383026,
+      "learning_rate": 0.0005175100495193708,
+      "loss": 3.4986,
+      "step": 23700
+    },
+    {
+      "epoch": 6.914332304466834,
+      "grad_norm": 0.3182038366794586,
+      "learning_rate": 0.0005173352752694436,
+      "loss": 3.5041,
+      "step": 23750
+    },
+    {
+      "epoch": 6.928891736066624,
+      "grad_norm": 0.3153535723686218,
+      "learning_rate": 0.0005171605010195165,
+      "loss": 3.5035,
+      "step": 23800
+    },
+    {
+      "epoch": 6.943451167666415,
+      "grad_norm": 0.34128624200820923,
+      "learning_rate": 0.0005169857267695893,
+      "loss": 3.4943,
+      "step": 23850
+    },
+    {
+      "epoch": 6.9580105992662045,
+      "grad_norm": 0.3200225234031677,
+      "learning_rate": 0.000516810952519662,
+      "loss": 3.5004,
+      "step": 23900
+    },
+    {
+      "epoch": 6.972570030865995,
+      "grad_norm": 0.35053977370262146,
+      "learning_rate": 0.0005166361782697349,
+      "loss": 3.4938,
+      "step": 23950
+    },
+    {
+      "epoch": 6.987129462465785,
+      "grad_norm": 0.35640257596969604,
+      "learning_rate": 0.0005164614040198077,
+      "loss": 3.4951,
+      "step": 24000
+    },
+    {
+      "epoch": 6.987129462465785,
+      "eval_accuracy": 0.36452964775538993,
+      "eval_loss": 3.5928568840026855,
+      "eval_runtime": 184.1497,
+      "eval_samples_per_second": 90.378,
+      "eval_steps_per_second": 5.653,
+      "step": 24000
+    },
+    {
+      "epoch": 7.001455943159979,
+      "grad_norm": 0.3846636712551117,
+      "learning_rate": 0.0005162866297698806,
+      "loss": 3.4935,
+      "step": 24050
+    },
+    {
+      "epoch": 7.016015374759769,
+      "grad_norm": 0.3523205518722534,
+      "learning_rate": 0.0005161118555199534,
+      "loss": 3.3822,
+      "step": 24100
+    },
+    {
+      "epoch": 7.03057480635956,
+      "grad_norm": 0.36663973331451416,
+      "learning_rate": 0.0005159370812700261,
+      "loss": 3.3874,
+      "step": 24150
+    },
+    {
+      "epoch": 7.04513423795935,
+      "grad_norm": 0.38096940517425537,
+      "learning_rate": 0.000515762307020099,
+      "loss": 3.3899,
+      "step": 24200
+    },
+    {
+      "epoch": 7.059693669559141,
+      "grad_norm": 0.35516002774238586,
+      "learning_rate": 0.0005155875327701718,
+      "loss": 3.3847,
+      "step": 24250
+    },
+    {
+      "epoch": 7.074253101158931,
+      "grad_norm": 0.3651926815509796,
+      "learning_rate": 0.0005154127585202447,
+      "loss": 3.4049,
+      "step": 24300
+    },
+    {
+      "epoch": 7.0888125327587215,
+      "grad_norm": 0.36075493693351746,
+      "learning_rate": 0.0005152379842703175,
+      "loss": 3.3965,
+      "step": 24350
+    },
+    {
+      "epoch": 7.103371964358511,
+      "grad_norm": 0.38245540857315063,
+      "learning_rate": 0.0005150632100203903,
+      "loss": 3.4028,
+      "step": 24400
+    },
+    {
+      "epoch": 7.117931395958302,
+      "grad_norm": 0.32894188165664673,
+      "learning_rate": 0.0005148884357704631,
+      "loss": 3.3985,
+      "step": 24450
+    },
+    {
+      "epoch": 7.132490827558092,
+      "grad_norm": 0.3118518590927124,
+      "learning_rate": 0.0005147136615205359,
+      "loss": 3.4151,
+      "step": 24500
+    },
+    {
+      "epoch": 7.147050259157883,
+      "grad_norm": 0.3686443269252777,
+      "learning_rate": 0.0005145388872706087,
+      "loss": 3.4092,
+      "step": 24550
+    },
+    {
+      "epoch": 7.161609690757673,
+      "grad_norm": 0.35504400730133057,
+      "learning_rate": 0.0005143641130206816,
+      "loss": 3.4128,
+      "step": 24600
+    },
+    {
+      "epoch": 7.1761691223574635,
+      "grad_norm": 0.371929794549942,
+      "learning_rate": 0.0005141893387707544,
+      "loss": 3.4088,
+      "step": 24650
+    },
+    {
+      "epoch": 7.190728553957253,
+      "grad_norm": 0.35544171929359436,
+      "learning_rate": 0.0005140145645208272,
+      "loss": 3.4102,
+      "step": 24700
+    },
+    {
+      "epoch": 7.205287985557044,
+      "grad_norm": 0.32105565071105957,
+      "learning_rate": 0.0005138397902709,
+      "loss": 3.4146,
+      "step": 24750
+    },
+    {
+      "epoch": 7.219847417156834,
+      "grad_norm": 0.3172771632671356,
+      "learning_rate": 0.0005136650160209728,
+      "loss": 3.4218,
+      "step": 24800
+    },
+    {
+      "epoch": 7.234406848756625,
+      "grad_norm": 0.3447094261646271,
+      "learning_rate": 0.0005134902417710457,
+      "loss": 3.4251,
+      "step": 24850
+    },
+    {
+      "epoch": 7.248966280356415,
+      "grad_norm": 0.3414628505706787,
+      "learning_rate": 0.0005133154675211185,
+      "loss": 3.4205,
+      "step": 24900
+    },
+    {
+      "epoch": 7.2635257119562056,
+      "grad_norm": 0.36512497067451477,
+      "learning_rate": 0.0005131406932711914,
+      "loss": 3.4273,
+      "step": 24950
+    },
+    {
+      "epoch": 7.2780851435559955,
+      "grad_norm": 0.3672768771648407,
+      "learning_rate": 0.0005129659190212641,
+      "loss": 3.4339,
+      "step": 25000
+    },
+    {
+      "epoch": 7.2780851435559955,
+      "eval_accuracy": 0.36433104871931843,
+      "eval_loss": 3.5997819900512695,
+      "eval_runtime": 180.8992,
+      "eval_samples_per_second": 92.002,
+      "eval_steps_per_second": 5.755,
+      "step": 25000
+    },
+    {
+      "epoch": 7.292644575155786,
+      "grad_norm": 0.3394540846347809,
+      "learning_rate": 0.0005127911447713369,
+      "loss": 3.4232,
+      "step": 25050
+    },
+    {
+      "epoch": 7.307204006755576,
+      "grad_norm": 0.3045949637889862,
+      "learning_rate": 0.0005126163705214098,
+      "loss": 3.4267,
+      "step": 25100
+    },
+    {
+      "epoch": 7.321763438355367,
+      "grad_norm": 0.32903987169265747,
+      "learning_rate": 0.0005124415962714826,
+      "loss": 3.4341,
+      "step": 25150
+    },
+    {
+      "epoch": 7.336322869955157,
+      "grad_norm": 0.3628155589103699,
+      "learning_rate": 0.0005122668220215555,
+      "loss": 3.4336,
+      "step": 25200
+    },
+    {
+      "epoch": 7.350882301554948,
+      "grad_norm": 0.3750855624675751,
+      "learning_rate": 0.0005120920477716282,
+      "loss": 3.436,
+      "step": 25250
+    },
+    {
+      "epoch": 7.3654417331547375,
+      "grad_norm": 0.31662774085998535,
+      "learning_rate": 0.000511917273521701,
+      "loss": 3.4372,
+      "step": 25300
+    },
+    {
+      "epoch": 7.380001164754528,
+      "grad_norm": 0.3318006694316864,
+      "learning_rate": 0.0005117424992717739,
+      "loss": 3.4377,
+      "step": 25350
+    },
+    {
+      "epoch": 7.394560596354318,
+      "grad_norm": 0.3489433526992798,
+      "learning_rate": 0.0005115677250218467,
+      "loss": 3.4364,
+      "step": 25400
+    },
+    {
+      "epoch": 7.409120027954109,
+      "grad_norm": 0.3378850817680359,
+      "learning_rate": 0.0005113929507719196,
+      "loss": 3.4392,
+      "step": 25450
+    },
+    {
+      "epoch": 7.423679459553899,
+      "grad_norm": 0.3490906357765198,
+      "learning_rate": 0.0005112181765219924,
+      "loss": 3.4404,
+      "step": 25500
+    },
+    {
+      "epoch": 7.43823889115369,
+      "grad_norm": 0.33684709668159485,
+      "learning_rate": 0.0005110434022720651,
+      "loss": 3.4316,
+      "step": 25550
+    },
+    {
+      "epoch": 7.4527983227534795,
+      "grad_norm": 0.3533405363559723,
+      "learning_rate": 0.000510868628022138,
+      "loss": 3.4519,
+      "step": 25600
+    },
+    {
+      "epoch": 7.46735775435327,
+      "grad_norm": 0.364666610956192,
+      "learning_rate": 0.0005106938537722109,
+      "loss": 3.4428,
+      "step": 25650
+    },
+    {
+      "epoch": 7.48191718595306,
+      "grad_norm": 0.3563931882381439,
+      "learning_rate": 0.0005105190795222837,
+      "loss": 3.442,
+      "step": 25700
+    },
+    {
+      "epoch": 7.496476617552851,
+      "grad_norm": 0.35002008080482483,
+      "learning_rate": 0.0005103443052723565,
+      "loss": 3.4379,
+      "step": 25750
+    },
+    {
+      "epoch": 7.511036049152641,
+      "grad_norm": 0.3543298542499542,
+      "learning_rate": 0.0005101695310224294,
+      "loss": 3.457,
+      "step": 25800
+    },
+    {
+      "epoch": 7.525595480752432,
+      "grad_norm": 0.33176884055137634,
+      "learning_rate": 0.0005099947567725021,
+      "loss": 3.4399,
+      "step": 25850
+    },
+    {
+      "epoch": 7.540154912352222,
+      "grad_norm": 0.34475451707839966,
+      "learning_rate": 0.000509819982522575,
+      "loss": 3.4443,
+      "step": 25900
+    },
+    {
+      "epoch": 7.554714343952012,
+      "grad_norm": 0.33004602789878845,
+      "learning_rate": 0.0005096452082726478,
+      "loss": 3.4584,
+      "step": 25950
+    },
+    {
+      "epoch": 7.569273775551802,
+      "grad_norm": 0.3163653016090393,
+      "learning_rate": 0.0005094704340227206,
+      "loss": 3.45,
+      "step": 26000
+    },
+    {
+      "epoch": 7.569273775551802,
+      "eval_accuracy": 0.36467333515745,
+      "eval_loss": 3.5947012901306152,
+      "eval_runtime": 180.7981,
+      "eval_samples_per_second": 92.053,
+      "eval_steps_per_second": 5.758,
+      "step": 26000
+    },
+    {
+      "epoch": 7.583833207151593,
+      "grad_norm": 0.3281993567943573,
+      "learning_rate": 0.0005092956597727935,
+      "loss": 3.4512,
+      "step": 26050
+    },
+    {
+      "epoch": 7.598392638751383,
+      "grad_norm": 0.31753283739089966,
+      "learning_rate": 0.0005091208855228662,
+      "loss": 3.4646,
+      "step": 26100
+    },
+    {
+      "epoch": 7.612952070351174,
+      "grad_norm": 0.3362863063812256,
+      "learning_rate": 0.0005089461112729391,
+      "loss": 3.4544,
+      "step": 26150
+    },
+    {
+      "epoch": 7.627511501950964,
+      "grad_norm": 0.34793269634246826,
+      "learning_rate": 0.0005087713370230119,
+      "loss": 3.4481,
+      "step": 26200
+    },
+    {
+      "epoch": 7.642070933550754,
+      "grad_norm": 0.3432117700576782,
+      "learning_rate": 0.0005085965627730847,
+      "loss": 3.4591,
+      "step": 26250
+    },
+    {
+      "epoch": 7.656630365150544,
+      "grad_norm": 0.3630698323249817,
+      "learning_rate": 0.0005084217885231576,
+      "loss": 3.4618,
+      "step": 26300
+    },
+    {
+      "epoch": 7.671189796750335,
+      "grad_norm": 0.3361819088459015,
+      "learning_rate": 0.0005082470142732304,
+      "loss": 3.4646,
+      "step": 26350
+    },
+    {
+      "epoch": 7.685749228350125,
+      "grad_norm": 0.3673403263092041,
+      "learning_rate": 0.0005080722400233032,
+      "loss": 3.4539,
+      "step": 26400
+    },
+    {
+      "epoch": 7.700308659949916,
+      "grad_norm": 0.33987388014793396,
+      "learning_rate": 0.000507897465773376,
+      "loss": 3.4693,
+      "step": 26450
+    },
+    {
+      "epoch": 7.714868091549706,
+      "grad_norm": 0.32190704345703125,
+      "learning_rate": 0.0005077226915234488,
+      "loss": 3.4468,
+      "step": 26500
+    },
+    {
+      "epoch": 7.729427523149496,
+      "grad_norm": 0.3864888846874237,
+      "learning_rate": 0.0005075479172735217,
+      "loss": 3.4556,
+      "step": 26550
+    },
+    {
+      "epoch": 7.743986954749286,
+      "grad_norm": 0.3400271534919739,
+      "learning_rate": 0.0005073731430235945,
+      "loss": 3.4587,
+      "step": 26600
+    },
+    {
+      "epoch": 7.758546386349077,
+      "grad_norm": 0.3375173509120941,
+      "learning_rate": 0.0005071983687736674,
+      "loss": 3.4628,
+      "step": 26650
+    },
+    {
+      "epoch": 7.773105817948867,
+      "grad_norm": 0.3561650216579437,
+      "learning_rate": 0.0005070235945237401,
+      "loss": 3.4572,
+      "step": 26700
+    },
+    {
+      "epoch": 7.787665249548658,
+      "grad_norm": 0.3330904543399811,
+      "learning_rate": 0.0005068488202738129,
+      "loss": 3.4615,
+      "step": 26750
+    },
+    {
+      "epoch": 7.802224681148448,
+      "grad_norm": 0.3155699074268341,
+      "learning_rate": 0.0005066740460238858,
+      "loss": 3.4407,
+      "step": 26800
+    },
+    {
+      "epoch": 7.8167841127482385,
+      "grad_norm": 0.3466147780418396,
+      "learning_rate": 0.0005064992717739586,
+      "loss": 3.457,
+      "step": 26850
+    },
+    {
+      "epoch": 7.831343544348028,
+      "grad_norm": 0.3634095788002014,
+      "learning_rate": 0.0005063244975240315,
+      "loss": 3.4655,
+      "step": 26900
+    },
+    {
+      "epoch": 7.845902975947819,
+      "grad_norm": 0.3383113741874695,
+      "learning_rate": 0.0005061497232741042,
+      "loss": 3.4613,
+      "step": 26950
+    },
+    {
+      "epoch": 7.860462407547609,
+      "grad_norm": 0.33195146918296814,
+      "learning_rate": 0.000505974949024177,
+      "loss": 3.4628,
+      "step": 27000
+    },
+    {
+      "epoch": 7.860462407547609,
+      "eval_accuracy": 0.3657741546812521,
+      "eval_loss": 3.5823051929473877,
+      "eval_runtime": 183.1613,
+      "eval_samples_per_second": 90.865,
+      "eval_steps_per_second": 5.684,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8750218391474,
+      "grad_norm": 0.35151028633117676,
+      "learning_rate": 0.0005058001747742499,
+      "loss": 3.4647,
+      "step": 27050
+    },
+    {
+      "epoch": 7.88958127074719,
+      "grad_norm": 0.35772132873535156,
+      "learning_rate": 0.0005056254005243227,
+      "loss": 3.4699,
+      "step": 27100
+    },
+    {
+      "epoch": 7.9041407023469805,
+      "grad_norm": 0.3402451276779175,
+      "learning_rate": 0.0005054506262743955,
+      "loss": 3.4738,
+      "step": 27150
+    },
+    {
+      "epoch": 7.91870013394677,
+      "grad_norm": 0.33174848556518555,
+      "learning_rate": 0.0005052758520244684,
+      "loss": 3.4582,
+      "step": 27200
+    },
+    {
+      "epoch": 7.933259565546561,
+      "grad_norm": 0.33006104826927185,
+      "learning_rate": 0.0005051010777745411,
+      "loss": 3.4596,
+      "step": 27250
+    },
+    {
+      "epoch": 7.947818997146351,
+      "grad_norm": 0.347843199968338,
+      "learning_rate": 0.000504926303524614,
+      "loss": 3.4695,
+      "step": 27300
+    },
+    {
+      "epoch": 7.962378428746142,
+      "grad_norm": 0.32010769844055176,
+      "learning_rate": 0.0005047515292746868,
+      "loss": 3.4707,
+      "step": 27350
+    },
+    {
+      "epoch": 7.976937860345932,
+      "grad_norm": 0.3584131896495819,
+      "learning_rate": 0.0005045767550247596,
+      "loss": 3.4584,
+      "step": 27400
+    },
+    {
+      "epoch": 7.991497291945723,
+      "grad_norm": 0.3257739543914795,
+      "learning_rate": 0.0005044019807748325,
+      "loss": 3.4689,
+      "step": 27450
+    },
+    {
+      "epoch": 8.005823772639916,
+      "grad_norm": 0.33740708231925964,
+      "learning_rate": 0.0005042272065249052,
+      "loss": 3.4213,
+      "step": 27500
+    },
+    {
+      "epoch": 8.020383204239707,
+      "grad_norm": 0.33311763405799866,
+      "learning_rate": 0.0005040524322749781,
+      "loss": 3.3665,
+      "step": 27550
+    },
+    {
+      "epoch": 8.034942635839498,
+      "grad_norm": 0.32844987511634827,
+      "learning_rate": 0.0005038776580250509,
+      "loss": 3.3403,
+      "step": 27600
+    },
+    {
+      "epoch": 8.049502067439287,
+      "grad_norm": 0.33761027455329895,
+      "learning_rate": 0.0005037028837751237,
+      "loss": 3.3569,
+      "step": 27650
+    },
+    {
+      "epoch": 8.064061499039077,
+      "grad_norm": 0.35406097769737244,
+      "learning_rate": 0.0005035281095251966,
+      "loss": 3.3567,
+      "step": 27700
+    },
+    {
+      "epoch": 8.078620930638868,
+      "grad_norm": 0.38495901226997375,
+      "learning_rate": 0.0005033533352752694,
+      "loss": 3.3788,
+      "step": 27750
+    },
+    {
+      "epoch": 8.093180362238659,
+      "grad_norm": 0.331709086894989,
+      "learning_rate": 0.0005031785610253422,
+      "loss": 3.3591,
+      "step": 27800
+    },
+    {
+      "epoch": 8.107739793838448,
+      "grad_norm": 0.3502473533153534,
+      "learning_rate": 0.000503003786775415,
+      "loss": 3.3805,
+      "step": 27850
+    },
+    {
+      "epoch": 8.122299225438239,
+      "grad_norm": 0.3384426534175873,
+      "learning_rate": 0.0005028290125254878,
+      "loss": 3.3839,
+      "step": 27900
+    },
+    {
+      "epoch": 8.13685865703803,
+      "grad_norm": 0.36867547035217285,
+      "learning_rate": 0.0005026542382755607,
+      "loss": 3.3739,
+      "step": 27950
+    },
+    {
+      "epoch": 8.15141808863782,
+      "grad_norm": 0.342602014541626,
+      "learning_rate": 0.0005024794640256335,
+      "loss": 3.3902,
+      "step": 28000
+    },
+    {
+      "epoch": 8.15141808863782,
+      "eval_accuracy": 0.3654784314274215,
+      "eval_loss": 3.5930871963500977,
+      "eval_runtime": 181.7974,
+      "eval_samples_per_second": 91.547,
+      "eval_steps_per_second": 5.726,
+      "step": 28000
+    },
+    {
+      "epoch": 8.16597752023761,
+      "grad_norm": 0.34287703037261963,
+      "learning_rate": 0.0005023046897757064,
+      "loss": 3.3962,
+      "step": 28050
+    },
+    {
+      "epoch": 8.1805369518374,
+      "grad_norm": 0.3469769060611725,
+      "learning_rate": 0.0005021299155257791,
+      "loss": 3.3802,
+      "step": 28100
+    },
+    {
+      "epoch": 8.19509638343719,
+      "grad_norm": 0.33318281173706055,
+      "learning_rate": 0.000501955141275852,
+      "loss": 3.3841,
+      "step": 28150
+    },
+    {
+      "epoch": 8.209655815036982,
+      "grad_norm": 0.3634045720100403,
+      "learning_rate": 0.0005017803670259248,
+      "loss": 3.3948,
+      "step": 28200
+    },
+    {
+      "epoch": 8.22421524663677,
+      "grad_norm": 0.33881765604019165,
+      "learning_rate": 0.0005016055927759977,
+      "loss": 3.3884,
+      "step": 28250
+    },
+    {
+      "epoch": 8.238774678236561,
+      "grad_norm": 0.3295370638370514,
+      "learning_rate": 0.0005014308185260705,
+      "loss": 3.3949,
+      "step": 28300
+    },
+    {
+      "epoch": 8.253334109836352,
+      "grad_norm": 0.32435914874076843,
+      "learning_rate": 0.0005012560442761432,
+      "loss": 3.3864,
+      "step": 28350
+    },
+    {
+      "epoch": 8.267893541436143,
+      "grad_norm": 0.3608424663543701,
+      "learning_rate": 0.0005010812700262161,
+      "loss": 3.3904,
+      "step": 28400
+    },
+    {
+      "epoch": 8.282452973035932,
+      "grad_norm": 0.34927839040756226,
+      "learning_rate": 0.0005009064957762889,
+      "loss": 3.4077,
+      "step": 28450
+    },
+    {
+      "epoch": 8.297012404635723,
+      "grad_norm": 0.37262028455734253,
+      "learning_rate": 0.0005007317215263618,
+      "loss": 3.3987,
+      "step": 28500
+    },
+    {
+      "epoch": 8.311571836235514,
+      "grad_norm": 0.335907906293869,
+      "learning_rate": 0.0005005569472764346,
+      "loss": 3.4031,
+      "step": 28550
+    },
+    {
+      "epoch": 8.326131267835304,
+      "grad_norm": 0.32725778222084045,
+      "learning_rate": 0.0005003821730265074,
+      "loss": 3.4097,
+      "step": 28600
+    },
+    {
+      "epoch": 8.340690699435093,
+      "grad_norm": 0.34939050674438477,
+      "learning_rate": 0.0005002073987765802,
+      "loss": 3.4022,
+      "step": 28650
+    },
+    {
+      "epoch": 8.355250131034884,
+      "grad_norm": 0.329519659280777,
+      "learning_rate": 0.000500032624526653,
+      "loss": 3.395,
+      "step": 28700
+    },
+    {
+      "epoch": 8.369809562634675,
+      "grad_norm": 0.342352032661438,
+      "learning_rate": 0.0004998578502767259,
+      "loss": 3.4104,
+      "step": 28750
+    },
+    {
+      "epoch": 8.384368994234466,
+      "grad_norm": 0.33699142932891846,
+      "learning_rate": 0.0004996830760267987,
+      "loss": 3.4088,
+      "step": 28800
+    },
+    {
+      "epoch": 8.398928425834255,
+      "grad_norm": 0.3412262797355652,
+      "learning_rate": 0.0004995083017768715,
+      "loss": 3.4005,
+      "step": 28850
+    },
+    {
+      "epoch": 8.413487857434045,
+      "grad_norm": 0.3386961817741394,
+      "learning_rate": 0.0004993335275269444,
+      "loss": 3.4233,
+      "step": 28900
+    },
+    {
+      "epoch": 8.428047289033836,
+      "grad_norm": 0.3285767138004303,
+      "learning_rate": 0.0004991587532770171,
+      "loss": 3.4173,
+      "step": 28950
+    },
+    {
+      "epoch": 8.442606720633627,
+      "grad_norm": 0.36489197611808777,
+      "learning_rate": 0.00049898397902709,
+      "loss": 3.4052,
+      "step": 29000
+    },
+    {
+      "epoch": 8.442606720633627,
+      "eval_accuracy": 0.3661564196109552,
+      "eval_loss": 3.5855534076690674,
+      "eval_runtime": 181.6388,
+      "eval_samples_per_second": 91.627,
+      "eval_steps_per_second": 5.731,
+      "step": 29000
+    },
+    {
+      "epoch": 8.457166152233416,
+      "grad_norm": 0.33611413836479187,
+      "learning_rate": 0.0004988092047771628,
+      "loss": 3.4126,
+      "step": 29050
+    },
+    {
+      "epoch": 8.471725583833207,
+      "grad_norm": 0.353683203458786,
+      "learning_rate": 0.0004986344305272356,
+      "loss": 3.411,
+      "step": 29100
+    },
+    {
+      "epoch": 8.486285015432998,
+      "grad_norm": 0.3237438201904297,
+      "learning_rate": 0.0004984596562773085,
+      "loss": 3.4116,
+      "step": 29150
+    },
+    {
+      "epoch": 8.500844447032788,
+      "grad_norm": 0.3344637155532837,
+      "learning_rate": 0.0004982848820273812,
+      "loss": 3.4252,
+      "step": 29200
+    },
+    {
+      "epoch": 8.515403878632577,
+      "grad_norm": 0.36675405502319336,
+      "learning_rate": 0.0004981101077774541,
+      "loss": 3.4157,
+      "step": 29250
+    },
+    {
+      "epoch": 8.529963310232368,
+      "grad_norm": 0.3269510269165039,
+      "learning_rate": 0.0004979353335275269,
+      "loss": 3.4108,
+      "step": 29300
+    },
+    {
+      "epoch": 8.544522741832159,
+      "grad_norm": 0.32414621114730835,
+      "learning_rate": 0.0004977605592775997,
+      "loss": 3.4189,
+      "step": 29350
+    },
+    {
+      "epoch": 8.55908217343195,
+      "grad_norm": 0.34106162190437317,
+      "learning_rate": 0.0004975857850276726,
+      "loss": 3.413,
+      "step": 29400
+    },
+    {
+      "epoch": 8.573641605031739,
+      "grad_norm": 0.3581826388835907,
+      "learning_rate": 0.0004974110107777454,
+      "loss": 3.4321,
+      "step": 29450
+    },
+    {
+      "epoch": 8.58820103663153,
+      "grad_norm": 0.34445253014564514,
+      "learning_rate": 0.0004972362365278182,
+      "loss": 3.4358,
+      "step": 29500
+    },
+    {
+      "epoch": 8.60276046823132,
+      "grad_norm": 0.3163345158100128,
+      "learning_rate": 0.000497061462277891,
+      "loss": 3.4149,
+      "step": 29550
+    },
+    {
+      "epoch": 8.617319899831111,
+      "grad_norm": 0.3480691909790039,
+      "learning_rate": 0.0004968866880279638,
+      "loss": 3.4231,
+      "step": 29600
+    },
+    {
+      "epoch": 8.6318793314309,
+      "grad_norm": 0.35285916924476624,
+      "learning_rate": 0.0004967119137780367,
+      "loss": 3.438,
+      "step": 29650
+    },
+    {
+      "epoch": 8.646438763030691,
+      "grad_norm": 0.3620506823062897,
+      "learning_rate": 0.0004965371395281095,
+      "loss": 3.428,
+      "step": 29700
+    },
+    {
+      "epoch": 8.660998194630482,
+      "grad_norm": 0.3322892189025879,
+      "learning_rate": 0.0004963623652781822,
+      "loss": 3.4191,
+      "step": 29750
+    },
+    {
+      "epoch": 8.675557626230272,
+      "grad_norm": 0.3197033405303955,
+      "learning_rate": 0.0004961875910282551,
+      "loss": 3.4174,
+      "step": 29800
+    },
+    {
+      "epoch": 8.690117057830061,
+      "grad_norm": 0.34963804483413696,
+      "learning_rate": 0.0004960128167783279,
+      "loss": 3.4357,
+      "step": 29850
+    },
+    {
+      "epoch": 8.704676489429852,
+      "grad_norm": 0.3247370421886444,
+      "learning_rate": 0.0004958380425284008,
+      "loss": 3.421,
+      "step": 29900
+    },
+    {
+      "epoch": 8.719235921029643,
+      "grad_norm": 0.32233262062072754,
+      "learning_rate": 0.0004956632682784736,
+      "loss": 3.4173,
+      "step": 29950
+    },
+    {
+      "epoch": 8.733795352629434,
+      "grad_norm": 0.36941850185394287,
+      "learning_rate": 0.0004954884940285464,
+      "loss": 3.4335,
+      "step": 30000
+    },
+    {
+      "epoch": 8.733795352629434,
+      "eval_accuracy": 0.36665227049024096,
+      "eval_loss": 3.578705310821533,
+      "eval_runtime": 182.0872,
+      "eval_samples_per_second": 91.401,
+      "eval_steps_per_second": 5.717,
+      "step": 30000
+    },
+    {
+      "epoch": 8.748354784229225,
+      "grad_norm": 0.3275495767593384,
+      "learning_rate": 0.0004953137197786192,
+      "loss": 3.4322,
+      "step": 30050
+    },
+    {
+      "epoch": 8.762914215829014,
+      "grad_norm": 0.3445492684841156,
+      "learning_rate": 0.000495138945528692,
+      "loss": 3.4295,
+      "step": 30100
+    },
+    {
+      "epoch": 8.777473647428804,
+      "grad_norm": 0.364786297082901,
+      "learning_rate": 0.0004949641712787649,
+      "loss": 3.4453,
+      "step": 30150
+    },
+    {
+      "epoch": 8.792033079028595,
+      "grad_norm": 0.3113223612308502,
+      "learning_rate": 0.0004947893970288377,
+      "loss": 3.4291,
+      "step": 30200
+    },
+    {
+      "epoch": 8.806592510628384,
+      "grad_norm": 0.3300077021121979,
+      "learning_rate": 0.0004946146227789105,
+      "loss": 3.4139,
+      "step": 30250
+    },
+    {
+      "epoch": 8.821151942228175,
+      "grad_norm": 0.3236207067966461,
+      "learning_rate": 0.0004944398485289834,
+      "loss": 3.4379,
+      "step": 30300
+    },
+    {
+      "epoch": 8.835711373827966,
+      "grad_norm": 0.34814879298210144,
+      "learning_rate": 0.0004942650742790561,
+      "loss": 3.4444,
+      "step": 30350
+    },
+    {
+      "epoch": 8.850270805427757,
+      "grad_norm": 0.35743340849876404,
+      "learning_rate": 0.000494090300029129,
+      "loss": 3.4288,
+      "step": 30400
+    },
+    {
+      "epoch": 8.864830237027547,
+      "grad_norm": 0.3275567889213562,
+      "learning_rate": 0.0004939155257792018,
+      "loss": 3.4436,
+      "step": 30450
+    },
+    {
+      "epoch": 8.879389668627336,
+      "grad_norm": 0.3158799707889557,
+      "learning_rate": 0.0004937407515292746,
+      "loss": 3.4395,
+      "step": 30500
+    },
+    {
+      "epoch": 8.893949100227127,
+      "grad_norm": 0.33471766114234924,
+      "learning_rate": 0.0004935659772793475,
+      "loss": 3.4325,
+      "step": 30550
+    },
+    {
+      "epoch": 8.908508531826918,
+      "grad_norm": 0.34228864312171936,
+      "learning_rate": 0.0004933912030294202,
+      "loss": 3.427,
+      "step": 30600
+    },
+    {
+      "epoch": 8.923067963426707,
+      "grad_norm": 0.3427802324295044,
+      "learning_rate": 0.0004932164287794931,
+      "loss": 3.4251,
+      "step": 30650
+    },
+    {
+      "epoch": 8.937627395026498,
+      "grad_norm": 0.35321247577667236,
+      "learning_rate": 0.000493041654529566,
+      "loss": 3.4431,
+      "step": 30700
+    },
+    {
+      "epoch": 8.952186826626289,
+      "grad_norm": 0.3708088994026184,
+      "learning_rate": 0.0004928668802796388,
+      "loss": 3.4538,
+      "step": 30750
+    },
+    {
+      "epoch": 8.96674625822608,
+      "grad_norm": 0.34218692779541016,
+      "learning_rate": 0.0004926921060297116,
+      "loss": 3.4313,
+      "step": 30800
+    },
+    {
+      "epoch": 8.98130568982587,
+      "grad_norm": 0.35662880539894104,
+      "learning_rate": 0.0004925173317797845,
+      "loss": 3.4317,
+      "step": 30850
+    },
+    {
+      "epoch": 8.995865121425659,
+      "grad_norm": 0.3506118655204773,
+      "learning_rate": 0.0004923425575298572,
+      "loss": 3.4502,
+      "step": 30900
+    },
+    {
+      "epoch": 9.010191602119853,
+      "grad_norm": 0.34358587861061096,
+      "learning_rate": 0.0004921677832799301,
+      "loss": 3.3637,
+      "step": 30950
+    },
+    {
+      "epoch": 9.024751033719644,
+      "grad_norm": 0.3556085526943207,
+      "learning_rate": 0.0004919930090300029,
+      "loss": 3.3189,
+      "step": 31000
+    },
+    {
+      "epoch": 9.024751033719644,
+      "eval_accuracy": 0.36717398980524946,
+      "eval_loss": 3.5799267292022705,
+      "eval_runtime": 181.8055,
+      "eval_samples_per_second": 91.543,
+      "eval_steps_per_second": 5.726,
+      "step": 31000
+    },
+    {
+      "epoch": 9.039310465319433,
+      "grad_norm": 0.34578433632850647,
+      "learning_rate": 0.0004918182347800757,
+      "loss": 3.3231,
+      "step": 31050
+    },
+    {
+      "epoch": 9.053869896919224,
+      "grad_norm": 0.3525102138519287,
+      "learning_rate": 0.0004916434605301486,
+      "loss": 3.3193,
+      "step": 31100
+    },
+    {
+      "epoch": 9.068429328519015,
+      "grad_norm": 0.3447619080543518,
+      "learning_rate": 0.0004914686862802213,
+      "loss": 3.3293,
+      "step": 31150
+    },
+    {
+      "epoch": 9.082988760118806,
+      "grad_norm": 0.316193550825119,
+      "learning_rate": 0.0004912939120302941,
+      "loss": 3.35,
+      "step": 31200
+    },
+    {
+      "epoch": 9.097548191718595,
+      "grad_norm": 0.3357117772102356,
+      "learning_rate": 0.000491119137780367,
+      "loss": 3.3503,
+      "step": 31250
+    },
+    {
+      "epoch": 9.112107623318385,
+      "grad_norm": 0.3565595746040344,
+      "learning_rate": 0.0004909443635304398,
+      "loss": 3.3394,
+      "step": 31300
+    },
+    {
+      "epoch": 9.126667054918176,
+      "grad_norm": 0.35598695278167725,
+      "learning_rate": 0.0004907695892805127,
+      "loss": 3.3571,
+      "step": 31350
+    },
+    {
+      "epoch": 9.141226486517967,
+      "grad_norm": 0.3496910035610199,
+      "learning_rate": 0.0004905948150305855,
+      "loss": 3.354,
+      "step": 31400
+    },
+    {
+      "epoch": 9.155785918117756,
+      "grad_norm": 0.34782034158706665,
+      "learning_rate": 0.0004904200407806582,
+      "loss": 3.3431,
+      "step": 31450
+    },
+    {
+      "epoch": 9.170345349717547,
+      "grad_norm": 0.34046244621276855,
+      "learning_rate": 0.0004902452665307311,
+      "loss": 3.3657,
+      "step": 31500
+    },
+    {
+      "epoch": 9.184904781317337,
+      "grad_norm": 0.37150949239730835,
+      "learning_rate": 0.0004900704922808039,
+      "loss": 3.3665,
+      "step": 31550
+    },
+    {
+      "epoch": 9.199464212917128,
+      "grad_norm": 0.36348044872283936,
+      "learning_rate": 0.0004898957180308768,
+      "loss": 3.3567,
+      "step": 31600
+    },
+    {
+      "epoch": 9.214023644516917,
+      "grad_norm": 0.3551836311817169,
+      "learning_rate": 0.0004897209437809496,
+      "loss": 3.3674,
+      "step": 31650
+    },
+    {
+      "epoch": 9.228583076116708,
+      "grad_norm": 0.3500552475452423,
+      "learning_rate": 0.0004895461695310223,
+      "loss": 3.3814,
+      "step": 31700
+    },
+    {
+      "epoch": 9.243142507716499,
+      "grad_norm": 0.3479650318622589,
+      "learning_rate": 0.0004893713952810952,
+      "loss": 3.3613,
+      "step": 31750
+    },
+    {
+      "epoch": 9.25770193931629,
+      "grad_norm": 0.3503901958465576,
+      "learning_rate": 0.000489196621031168,
+      "loss": 3.3602,
+      "step": 31800
+    },
+    {
+      "epoch": 9.272261370916079,
+      "grad_norm": 0.33610227704048157,
+      "learning_rate": 0.0004890218467812409,
+      "loss": 3.3631,
+      "step": 31850
+    },
+    {
+      "epoch": 9.28682080251587,
+      "grad_norm": 0.3341948091983795,
+      "learning_rate": 0.0004888470725313137,
+      "loss": 3.3609,
+      "step": 31900
+    },
+    {
+      "epoch": 9.30138023411566,
+      "grad_norm": 0.3447319567203522,
+      "learning_rate": 0.0004886722982813865,
+      "loss": 3.3727,
+      "step": 31950
+    },
+    {
+      "epoch": 9.315939665715451,
+      "grad_norm": 0.32863977551460266,
+      "learning_rate": 0.0004884975240314593,
+      "loss": 3.3782,
+      "step": 32000
+    },
+    {
+      "epoch": 9.315939665715451,
+      "eval_accuracy": 0.366884968827947,
+      "eval_loss": 3.581573724746704,
+      "eval_runtime": 182.0337,
+      "eval_samples_per_second": 91.428,
+      "eval_steps_per_second": 5.719,
+      "step": 32000
+    },
+    {
+      "epoch": 9.33049909731524,
+      "grad_norm": 0.3508942127227783,
+      "learning_rate": 0.0004883227497815321,
+      "loss": 3.3778,
+      "step": 32050
+    },
+    {
+      "epoch": 9.34505852891503,
+      "grad_norm": 0.3674251437187195,
+      "learning_rate": 0.00048814797553160496,
+      "loss": 3.3807,
+      "step": 32100
+    },
+    {
+      "epoch": 9.359617960514822,
+      "grad_norm": 0.3387126922607422,
+      "learning_rate": 0.0004879732012816778,
+      "loss": 3.3823,
+      "step": 32150
+    },
+    {
+      "epoch": 9.374177392114612,
+      "grad_norm": 0.3542914390563965,
+      "learning_rate": 0.0004877984270317506,
+      "loss": 3.398,
+      "step": 32200
+    },
+    {
+      "epoch": 9.388736823714403,
+      "grad_norm": 0.354044109582901,
+      "learning_rate": 0.0004876236527818234,
+      "loss": 3.3764,
+      "step": 32250
+    },
+    {
+      "epoch": 9.403296255314192,
+      "grad_norm": 0.3662169575691223,
+      "learning_rate": 0.00048744887853189624,
+      "loss": 3.3919,
+      "step": 32300
+    },
+    {
+      "epoch": 9.417855686913983,
+      "grad_norm": 0.33728882670402527,
+      "learning_rate": 0.00048727410428196907,
+      "loss": 3.383,
+      "step": 32350
+    },
+    {
+      "epoch": 9.432415118513774,
+      "grad_norm": 0.32222864031791687,
+      "learning_rate": 0.0004870993300320419,
+      "loss": 3.3877,
+      "step": 32400
+    },
+    {
+      "epoch": 9.446974550113563,
+      "grad_norm": 0.3222348988056183,
+      "learning_rate": 0.00048692455578211474,
+      "loss": 3.3822,
+      "step": 32450
+    },
+    {
+      "epoch": 9.461533981713353,
+      "grad_norm": 0.3391883671283722,
+      "learning_rate": 0.0004867497815321875,
+      "loss": 3.3887,
+      "step": 32500
+    },
+    {
+      "epoch": 9.476093413313144,
+      "grad_norm": 0.3517501652240753,
+      "learning_rate": 0.00048657500728226035,
+      "loss": 3.3825,
+      "step": 32550
+    },
+    {
+      "epoch": 9.490652844912935,
+      "grad_norm": 0.3315829932689667,
+      "learning_rate": 0.0004864002330323332,
+      "loss": 3.3849,
+      "step": 32600
+    },
+    {
+      "epoch": 9.505212276512726,
+      "grad_norm": 0.33583584427833557,
+      "learning_rate": 0.000486225458782406,
+      "loss": 3.3938,
+      "step": 32650
+    },
+    {
+      "epoch": 9.519771708112515,
+      "grad_norm": 0.3496243357658386,
+      "learning_rate": 0.0004860506845324788,
+      "loss": 3.3901,
+      "step": 32700
+    },
+    {
+      "epoch": 9.534331139712306,
+      "grad_norm": 0.34915950894355774,
+      "learning_rate": 0.0004858759102825516,
+      "loss": 3.402,
+      "step": 32750
+    },
+    {
+      "epoch": 9.548890571312096,
+      "grad_norm": 0.3658216893672943,
+      "learning_rate": 0.00048570113603262446,
+      "loss": 3.391,
+      "step": 32800
+    },
+    {
+      "epoch": 9.563450002911885,
+      "grad_norm": 0.3504136800765991,
+      "learning_rate": 0.0004855263617826973,
+      "loss": 3.3906,
+      "step": 32850
+    },
+    {
+      "epoch": 9.578009434511676,
+      "grad_norm": 0.33254560828208923,
+      "learning_rate": 0.0004853515875327701,
+      "loss": 3.4056,
+      "step": 32900
+    },
+    {
+      "epoch": 9.592568866111467,
+      "grad_norm": 0.34906646609306335,
+      "learning_rate": 0.0004851768132828429,
+      "loss": 3.4075,
+      "step": 32950
+    },
+    {
+      "epoch": 9.607128297711258,
+      "grad_norm": 0.34559518098831177,
+      "learning_rate": 0.00048500203903291574,
+      "loss": 3.4026,
+      "step": 33000
+    },
+    {
+      "epoch": 9.607128297711258,
+      "eval_accuracy": 0.36745666125742,
+      "eval_loss": 3.5726640224456787,
+      "eval_runtime": 181.7757,
+      "eval_samples_per_second": 91.558,
+      "eval_steps_per_second": 5.727,
+      "step": 33000
+    },
+    {
+      "epoch": 9.621687729311049,
+      "grad_norm": 0.3735829293727875,
+      "learning_rate": 0.00048482726478298857,
+      "loss": 3.4065,
+      "step": 33050
+    },
+    {
+      "epoch": 9.636247160910838,
+      "grad_norm": 0.3518868684768677,
+      "learning_rate": 0.0004846524905330614,
+      "loss": 3.4036,
+      "step": 33100
+    },
+    {
+      "epoch": 9.650806592510628,
+      "grad_norm": 0.3787810802459717,
+      "learning_rate": 0.00048447771628313424,
+      "loss": 3.4012,
+      "step": 33150
+    },
+    {
+      "epoch": 9.66536602411042,
+      "grad_norm": 0.36960500478744507,
+      "learning_rate": 0.0004843029420332071,
+      "loss": 3.408,
+      "step": 33200
+    },
+    {
+      "epoch": 9.67992545571021,
+      "grad_norm": 0.34325626492500305,
+      "learning_rate": 0.0004841281677832799,
+      "loss": 3.4017,
+      "step": 33250
+    },
+    {
+      "epoch": 9.694484887309999,
+      "grad_norm": 0.3455840051174164,
+      "learning_rate": 0.00048395339353335273,
+      "loss": 3.4139,
+      "step": 33300
+    },
+    {
+      "epoch": 9.70904431890979,
+      "grad_norm": 0.35434481501579285,
+      "learning_rate": 0.00048377861928342557,
+      "loss": 3.3996,
+      "step": 33350
+    },
+    {
+      "epoch": 9.72360375050958,
+      "grad_norm": 0.33681508898735046,
+      "learning_rate": 0.0004836038450334984,
+      "loss": 3.4125,
+      "step": 33400
+    },
+    {
+      "epoch": 9.738163182109371,
+      "grad_norm": 0.35238656401634216,
+      "learning_rate": 0.0004834290707835712,
+      "loss": 3.4157,
+      "step": 33450
+    },
+    {
+      "epoch": 9.75272261370916,
+      "grad_norm": 0.37718260288238525,
+      "learning_rate": 0.000483254296533644,
+      "loss": 3.4033,
+      "step": 33500
+    },
+    {
+      "epoch": 9.767282045308951,
+      "grad_norm": 0.3434363901615143,
+      "learning_rate": 0.00048307952228371685,
+      "loss": 3.4143,
+      "step": 33550
+    },
+    {
+      "epoch": 9.781841476908742,
+      "grad_norm": 0.34627440571784973,
+      "learning_rate": 0.0004829047480337897,
+      "loss": 3.4043,
+      "step": 33600
+    },
+    {
+      "epoch": 9.796400908508533,
+      "grad_norm": 0.33534497022628784,
+      "learning_rate": 0.0004827299737838625,
+      "loss": 3.4029,
+      "step": 33650
+    },
+    {
+      "epoch": 9.810960340108322,
+      "grad_norm": 0.3508129417896271,
+      "learning_rate": 0.0004825551995339353,
+      "loss": 3.406,
+      "step": 33700
+    },
+    {
+      "epoch": 9.825519771708112,
+      "grad_norm": 0.34650343656539917,
+      "learning_rate": 0.0004823804252840081,
+      "loss": 3.404,
+      "step": 33750
+    },
+    {
+      "epoch": 9.840079203307903,
+      "grad_norm": 0.33442333340644836,
+      "learning_rate": 0.00048220565103408096,
+      "loss": 3.4015,
+      "step": 33800
+    },
+    {
+      "epoch": 9.854638634907694,
+      "grad_norm": 0.3506050407886505,
+      "learning_rate": 0.0004820308767841538,
+      "loss": 3.4156,
+      "step": 33850
+    },
+    {
+      "epoch": 9.869198066507483,
+      "grad_norm": 0.341828316450119,
+      "learning_rate": 0.0004818561025342266,
+      "loss": 3.4171,
+      "step": 33900
+    },
+    {
+      "epoch": 9.883757498107274,
+      "grad_norm": 0.3377910554409027,
+      "learning_rate": 0.0004816813282842994,
+      "loss": 3.4102,
+      "step": 33950
+    },
+    {
+      "epoch": 9.898316929707065,
+      "grad_norm": 0.35400837659835815,
+      "learning_rate": 0.00048150655403437223,
+      "loss": 3.4082,
+      "step": 34000
+    },
+    {
+      "epoch": 9.898316929707065,
+      "eval_accuracy": 0.36838721944064684,
+      "eval_loss": 3.5640623569488525,
+      "eval_runtime": 182.8947,
+      "eval_samples_per_second": 90.998,
+      "eval_steps_per_second": 5.692,
+      "step": 34000
+    },
+    {
+      "epoch": 9.912876361306855,
+      "grad_norm": 0.3472040593624115,
+      "learning_rate": 0.00048133177978444507,
+      "loss": 3.4179,
+      "step": 34050
+    },
+    {
+      "epoch": 9.927435792906644,
+      "grad_norm": 0.3496232032775879,
+      "learning_rate": 0.0004811570055345179,
+      "loss": 3.4113,
+      "step": 34100
+    },
+    {
+      "epoch": 9.941995224506435,
+      "grad_norm": 0.33684638142585754,
+      "learning_rate": 0.0004809822312845907,
+      "loss": 3.4137,
+      "step": 34150
+    },
+    {
+      "epoch": 9.956554656106226,
+      "grad_norm": 0.34335857629776,
+      "learning_rate": 0.0004808074570346635,
+      "loss": 3.4172,
+      "step": 34200
+    },
+    {
+      "epoch": 9.971114087706017,
+      "grad_norm": 0.34269091486930847,
+      "learning_rate": 0.00048063268278473634,
+      "loss": 3.4183,
+      "step": 34250
+    },
+    {
+      "epoch": 9.985673519305806,
+      "grad_norm": 0.3301508128643036,
+      "learning_rate": 0.0004804579085348092,
+      "loss": 3.4135,
+      "step": 34300
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.8190501928329468,
+      "learning_rate": 0.000480283134284882,
+      "loss": 3.4084,
+      "step": 34350
+    },
+    {
+      "epoch": 10.01455943159979,
+      "grad_norm": 0.34881967306137085,
+      "learning_rate": 0.0004801083600349548,
+      "loss": 3.3121,
+      "step": 34400
+    },
+    {
+      "epoch": 10.029118863199582,
+      "grad_norm": 0.3504365086555481,
+      "learning_rate": 0.0004799335857850276,
+      "loss": 3.3012,
+      "step": 34450
+    },
+    {
+      "epoch": 10.04367829479937,
+      "grad_norm": 0.3723757565021515,
+      "learning_rate": 0.00047975881153510046,
+      "loss": 3.3123,
+      "step": 34500
+    },
+    {
+      "epoch": 10.058237726399161,
+      "grad_norm": 0.3652939200401306,
+      "learning_rate": 0.0004795840372851733,
+      "loss": 3.3082,
+      "step": 34550
+    },
+    {
+      "epoch": 10.072797157998952,
+      "grad_norm": 0.36539286375045776,
+      "learning_rate": 0.00047940926303524607,
+      "loss": 3.3053,
+      "step": 34600
+    },
+    {
+      "epoch": 10.087356589598743,
+      "grad_norm": 0.34552112221717834,
+      "learning_rate": 0.0004792344887853189,
+      "loss": 3.3203,
+      "step": 34650
+    },
+    {
+      "epoch": 10.101916021198532,
+      "grad_norm": 0.34289079904556274,
+      "learning_rate": 0.00047905971453539173,
+      "loss": 3.3335,
+      "step": 34700
+    },
+    {
+      "epoch": 10.116475452798323,
+      "grad_norm": 0.34614643454551697,
+      "learning_rate": 0.00047888494028546457,
+      "loss": 3.3293,
+      "step": 34750
+    },
+    {
+      "epoch": 10.131034884398114,
+      "grad_norm": 0.365692675113678,
+      "learning_rate": 0.0004787101660355374,
+      "loss": 3.3347,
+      "step": 34800
+    },
+    {
+      "epoch": 10.145594315997904,
+      "grad_norm": 0.3478696644306183,
+      "learning_rate": 0.0004785353917856102,
+      "loss": 3.3419,
+      "step": 34850
+    },
+    {
+      "epoch": 10.160153747597693,
+      "grad_norm": 0.345829576253891,
+      "learning_rate": 0.000478360617535683,
+      "loss": 3.3263,
+      "step": 34900
+    },
+    {
+      "epoch": 10.174713179197484,
+      "grad_norm": 0.4017032980918884,
+      "learning_rate": 0.00047818584328575584,
+      "loss": 3.3348,
+      "step": 34950
+    },
+    {
+      "epoch": 10.189272610797275,
+      "grad_norm": 0.34451884031295776,
+      "learning_rate": 0.0004780110690358287,
+      "loss": 3.3356,
+      "step": 35000
+    },
+    {
+      "epoch": 10.189272610797275,
+      "eval_accuracy": 0.36771605111744,
+      "eval_loss": 3.5778610706329346,
+      "eval_runtime": 183.2483,
+      "eval_samples_per_second": 90.822,
+      "eval_steps_per_second": 5.681,
+      "step": 35000
+    },
+    {
+      "epoch": 10.203832042397066,
+      "grad_norm": 0.35025554895401,
+      "learning_rate": 0.0004778362947859015,
+      "loss": 3.3442,
+      "step": 35050
+    },
+    {
+      "epoch": 10.218391473996855,
+      "grad_norm": 0.34518471360206604,
+      "learning_rate": 0.0004776615205359743,
+      "loss": 3.3374,
+      "step": 35100
+    },
+    {
+      "epoch": 10.232950905596645,
+      "grad_norm": 0.35896578431129456,
+      "learning_rate": 0.0004774867462860471,
+      "loss": 3.3453,
+      "step": 35150
+    },
+    {
+      "epoch": 10.247510337196436,
+      "grad_norm": 0.3396795094013214,
+      "learning_rate": 0.00047731197203611995,
+      "loss": 3.3457,
+      "step": 35200
+    },
+    {
+      "epoch": 10.262069768796227,
+      "grad_norm": 0.3721248209476471,
+      "learning_rate": 0.0004771371977861928,
+      "loss": 3.3458,
+      "step": 35250
+    },
+    {
+      "epoch": 10.276629200396016,
+      "grad_norm": 0.3700907230377197,
+      "learning_rate": 0.00047696242353626557,
+      "loss": 3.3381,
+      "step": 35300
+    },
+    {
+      "epoch": 10.291188631995807,
+      "grad_norm": 0.3764047622680664,
+      "learning_rate": 0.0004767876492863384,
+      "loss": 3.3418,
+      "step": 35350
+    },
+    {
+      "epoch": 10.305748063595598,
+      "grad_norm": 0.3617747724056244,
+      "learning_rate": 0.00047661287503641123,
+      "loss": 3.347,
+      "step": 35400
+    },
+    {
+      "epoch": 10.320307495195388,
+      "grad_norm": 0.34759700298309326,
+      "learning_rate": 0.00047643810078648407,
+      "loss": 3.3512,
+      "step": 35450
+    },
+    {
+      "epoch": 10.334866926795177,
+      "grad_norm": 0.35689282417297363,
+      "learning_rate": 0.0004762633265365569,
+      "loss": 3.3663,
+      "step": 35500
+    },
+    {
+      "epoch": 10.349426358394968,
+      "grad_norm": 0.32792720198631287,
+      "learning_rate": 0.0004760885522866297,
+      "loss": 3.3568,
+      "step": 35550
+    },
+    {
+      "epoch": 10.363985789994759,
+      "grad_norm": 0.3390996754169464,
+      "learning_rate": 0.0004759137780367025,
+      "loss": 3.3689,
+      "step": 35600
+    },
+    {
+      "epoch": 10.37854522159455,
+      "grad_norm": 0.35693955421447754,
+      "learning_rate": 0.00047573900378677534,
+      "loss": 3.3575,
+      "step": 35650
+    },
+    {
+      "epoch": 10.393104653194339,
+      "grad_norm": 0.3452168405056,
+      "learning_rate": 0.00047556422953684823,
+      "loss": 3.3642,
+      "step": 35700
+    },
+    {
+      "epoch": 10.40766408479413,
+      "grad_norm": 0.370328426361084,
+      "learning_rate": 0.00047538945528692106,
+      "loss": 3.3595,
+      "step": 35750
+    },
+    {
+      "epoch": 10.42222351639392,
+      "grad_norm": 0.37136757373809814,
+      "learning_rate": 0.0004752146810369939,
+      "loss": 3.346,
+      "step": 35800
+    },
+    {
+      "epoch": 10.436782947993711,
+      "grad_norm": 0.3773367702960968,
+      "learning_rate": 0.0004750399067870667,
+      "loss": 3.3645,
+      "step": 35850
+    },
+    {
+      "epoch": 10.4513423795935,
+      "grad_norm": 0.3447873592376709,
+      "learning_rate": 0.0004748651325371395,
+      "loss": 3.3598,
+      "step": 35900
+    },
+    {
+      "epoch": 10.46590181119329,
+      "grad_norm": 0.355688214302063,
+      "learning_rate": 0.00047469035828721234,
+      "loss": 3.3672,
+      "step": 35950
+    },
+    {
+      "epoch": 10.480461242793082,
+      "grad_norm": 0.3678136169910431,
+      "learning_rate": 0.0004745155840372852,
+      "loss": 3.3828,
+      "step": 36000
+    },
+    {
+      "epoch": 10.480461242793082,
+      "eval_accuracy": 0.36819038416155636,
+      "eval_loss": 3.568837881088257,
+      "eval_runtime": 183.926,
+      "eval_samples_per_second": 90.487,
+      "eval_steps_per_second": 5.66,
+      "step": 36000
+    },
+    {
+      "epoch": 10.495020674392872,
+      "grad_norm": 0.3659283220767975,
+      "learning_rate": 0.00047434080978735795,
+      "loss": 3.3647,
+      "step": 36050
+    },
+    {
+      "epoch": 10.509580105992661,
+      "grad_norm": 0.3798047602176666,
+      "learning_rate": 0.0004741660355374308,
+      "loss": 3.3631,
+      "step": 36100
+    },
+    {
+      "epoch": 10.524139537592452,
+      "grad_norm": 0.3466806411743164,
+      "learning_rate": 0.0004739912612875036,
+      "loss": 3.359,
+      "step": 36150
+    },
+    {
+      "epoch": 10.538698969192243,
+      "grad_norm": 0.35511037707328796,
+      "learning_rate": 0.00047381648703757645,
+      "loss": 3.3755,
+      "step": 36200
+    },
+    {
+      "epoch": 10.553258400792034,
+      "grad_norm": 0.3418614864349365,
+      "learning_rate": 0.0004736417127876493,
+      "loss": 3.3799,
+      "step": 36250
+    },
+    {
+      "epoch": 10.567817832391823,
+      "grad_norm": 0.38244953751564026,
+      "learning_rate": 0.00047346693853772206,
+      "loss": 3.382,
+      "step": 36300
+    },
+    {
+      "epoch": 10.582377263991614,
+      "grad_norm": 0.3323763310909271,
+      "learning_rate": 0.0004732921642877949,
+      "loss": 3.3828,
+      "step": 36350
+    },
+    {
+      "epoch": 10.596936695591404,
+      "grad_norm": 0.3437618315219879,
+      "learning_rate": 0.00047311739003786773,
+      "loss": 3.391,
+      "step": 36400
+    },
+    {
+      "epoch": 10.611496127191195,
+      "grad_norm": 0.36182549595832825,
+      "learning_rate": 0.00047294261578794056,
+      "loss": 3.3829,
+      "step": 36450
+    },
+    {
+      "epoch": 10.626055558790984,
+      "grad_norm": 0.38253724575042725,
+      "learning_rate": 0.0004727678415380134,
+      "loss": 3.3803,
+      "step": 36500
+    },
+    {
+      "epoch": 10.640614990390775,
+      "grad_norm": 0.36465519666671753,
+      "learning_rate": 0.0004725930672880862,
+      "loss": 3.3703,
+      "step": 36550
+    },
+    {
+      "epoch": 10.655174421990566,
+      "grad_norm": 0.3479657769203186,
+      "learning_rate": 0.000472418293038159,
+      "loss": 3.3709,
+      "step": 36600
+    },
+    {
+      "epoch": 10.669733853590357,
+      "grad_norm": 0.3454592227935791,
+      "learning_rate": 0.00047224351878823184,
+      "loss": 3.3876,
+      "step": 36650
+    },
+    {
+      "epoch": 10.684293285190146,
+      "grad_norm": 0.34455588459968567,
+      "learning_rate": 0.0004720687445383047,
+      "loss": 3.3788,
+      "step": 36700
+    },
+    {
+      "epoch": 10.698852716789936,
+      "grad_norm": 0.357598215341568,
+      "learning_rate": 0.00047189397028837745,
+      "loss": 3.3759,
+      "step": 36750
+    },
+    {
+      "epoch": 10.713412148389727,
+      "grad_norm": 0.36810582876205444,
+      "learning_rate": 0.0004717191960384503,
+      "loss": 3.3817,
+      "step": 36800
+    },
+    {
+      "epoch": 10.727971579989518,
+      "grad_norm": 0.37969326972961426,
+      "learning_rate": 0.0004715444217885231,
+      "loss": 3.3845,
+      "step": 36850
+    },
+    {
+      "epoch": 10.742531011589307,
+      "grad_norm": 0.362560898065567,
+      "learning_rate": 0.00047136964753859595,
+      "loss": 3.397,
+      "step": 36900
+    },
+    {
+      "epoch": 10.757090443189098,
+      "grad_norm": 0.36402398347854614,
+      "learning_rate": 0.0004711948732886688,
+      "loss": 3.3797,
+      "step": 36950
+    },
+    {
+      "epoch": 10.771649874788888,
+      "grad_norm": 0.3478822410106659,
+      "learning_rate": 0.00047102009903874156,
+      "loss": 3.3911,
+      "step": 37000
+    },
+    {
+      "epoch": 10.771649874788888,
+      "eval_accuracy": 0.368904117819907,
+      "eval_loss": 3.5626118183135986,
+      "eval_runtime": 183.8574,
+      "eval_samples_per_second": 90.521,
+      "eval_steps_per_second": 5.662,
+      "step": 37000
+    },
+    {
+      "epoch": 10.78620930638868,
+      "grad_norm": 0.34672781825065613,
+      "learning_rate": 0.0004708453247888144,
+      "loss": 3.3796,
+      "step": 37050
+    },
+    {
+      "epoch": 10.800768737988468,
+      "grad_norm": 0.35510483384132385,
+      "learning_rate": 0.00047067055053888723,
+      "loss": 3.3921,
+      "step": 37100
+    },
+    {
+      "epoch": 10.815328169588259,
+      "grad_norm": 0.3330132067203522,
+      "learning_rate": 0.00047049577628896006,
+      "loss": 3.3707,
+      "step": 37150
+    },
+    {
+      "epoch": 10.82988760118805,
+      "grad_norm": 0.35041606426239014,
+      "learning_rate": 0.0004703210020390329,
+      "loss": 3.3993,
+      "step": 37200
+    },
+    {
+      "epoch": 10.84444703278784,
+      "grad_norm": 0.34748944640159607,
+      "learning_rate": 0.0004701462277891057,
+      "loss": 3.3854,
+      "step": 37250
+    },
+    {
+      "epoch": 10.85900646438763,
+      "grad_norm": 0.3505236506462097,
+      "learning_rate": 0.0004699714535391785,
+      "loss": 3.3933,
+      "step": 37300
+    },
+    {
+      "epoch": 10.87356589598742,
+      "grad_norm": 0.3472146689891815,
+      "learning_rate": 0.00046979667928925134,
+      "loss": 3.3877,
+      "step": 37350
+    },
+    {
+      "epoch": 10.888125327587211,
+      "grad_norm": 0.33038902282714844,
+      "learning_rate": 0.0004696219050393242,
+      "loss": 3.3872,
+      "step": 37400
+    },
+    {
+      "epoch": 10.902684759187002,
+      "grad_norm": 0.33716917037963867,
+      "learning_rate": 0.00046944713078939695,
+      "loss": 3.3962,
+      "step": 37450
+    },
+    {
+      "epoch": 10.917244190786791,
+      "grad_norm": 0.3526748716831207,
+      "learning_rate": 0.0004692723565394698,
+      "loss": 3.3928,
+      "step": 37500
+    },
+    {
+      "epoch": 10.931803622386582,
+      "grad_norm": 0.36475178599357605,
+      "learning_rate": 0.0004690975822895426,
+      "loss": 3.3842,
+      "step": 37550
+    },
+    {
+      "epoch": 10.946363053986373,
+      "grad_norm": 0.36359477043151855,
+      "learning_rate": 0.00046892280803961545,
+      "loss": 3.401,
+      "step": 37600
+    },
+    {
+      "epoch": 10.960922485586163,
+      "grad_norm": 0.35189494490623474,
+      "learning_rate": 0.0004687480337896883,
+      "loss": 3.3937,
+      "step": 37650
+    },
+    {
+      "epoch": 10.975481917185952,
+      "grad_norm": 0.3400118350982666,
+      "learning_rate": 0.00046857325953976106,
+      "loss": 3.3934,
+      "step": 37700
+    },
+    {
+      "epoch": 10.990041348785743,
+      "grad_norm": 0.3473895490169525,
+      "learning_rate": 0.0004683984852898339,
+      "loss": 3.3902,
+      "step": 37750
+    },
+    {
+      "epoch": 11.004367829479937,
+      "grad_norm": 0.3693157732486725,
+      "learning_rate": 0.00046822371103990673,
+      "loss": 3.3584,
+      "step": 37800
+    },
+    {
+      "epoch": 11.018927261079728,
+      "grad_norm": 0.34884193539619446,
+      "learning_rate": 0.00046804893678997956,
+      "loss": 3.2712,
+      "step": 37850
+    },
+    {
+      "epoch": 11.033486692679517,
+      "grad_norm": 0.331039696931839,
+      "learning_rate": 0.00046787416254005234,
+      "loss": 3.2856,
+      "step": 37900
+    },
+    {
+      "epoch": 11.048046124279308,
+      "grad_norm": 0.34825077652931213,
+      "learning_rate": 0.0004676993882901252,
+      "loss": 3.2941,
+      "step": 37950
+    },
+    {
+      "epoch": 11.062605555879099,
+      "grad_norm": 0.3396894633769989,
+      "learning_rate": 0.000467524614040198,
+      "loss": 3.2861,
+      "step": 38000
+    },
+    {
+      "epoch": 11.062605555879099,
+      "eval_accuracy": 0.36855336534826616,
+      "eval_loss": 3.5719475746154785,
+      "eval_runtime": 180.4716,
+      "eval_samples_per_second": 92.219,
+      "eval_steps_per_second": 5.768,
+      "step": 38000
+    },
+    {
+      "epoch": 11.07716498747889,
+      "grad_norm": 0.41541653871536255,
+      "learning_rate": 0.00046734983979027084,
+      "loss": 3.2967,
+      "step": 38050
+    },
+    {
+      "epoch": 11.091724419078679,
+      "grad_norm": 0.34760013222694397,
+      "learning_rate": 0.00046717506554034367,
+      "loss": 3.2988,
+      "step": 38100
+    },
+    {
+      "epoch": 11.10628385067847,
+      "grad_norm": 0.3493053913116455,
+      "learning_rate": 0.00046700029129041645,
+      "loss": 3.2994,
+      "step": 38150
+    },
+    {
+      "epoch": 11.12084328227826,
+      "grad_norm": 0.35706987977027893,
+      "learning_rate": 0.0004668255170404893,
+      "loss": 3.315,
+      "step": 38200
+    },
+    {
+      "epoch": 11.135402713878051,
+      "grad_norm": 0.3363507390022278,
+      "learning_rate": 0.00046665074279056217,
+      "loss": 3.3071,
+      "step": 38250
+    },
+    {
+      "epoch": 11.14996214547784,
+      "grad_norm": 0.3618837296962738,
+      "learning_rate": 0.000466475968540635,
+      "loss": 3.3106,
+      "step": 38300
+    },
+    {
+      "epoch": 11.16452157707763,
+      "grad_norm": 0.33892515301704407,
+      "learning_rate": 0.00046630119429070784,
+      "loss": 3.3133,
+      "step": 38350
+    },
+    {
+      "epoch": 11.179081008677421,
+      "grad_norm": 0.33202266693115234,
+      "learning_rate": 0.00046612642004078067,
+      "loss": 3.3235,
+      "step": 38400
+    },
+    {
+      "epoch": 11.193640440277212,
+      "grad_norm": 0.3930901288986206,
+      "learning_rate": 0.00046595164579085345,
+      "loss": 3.3299,
+      "step": 38450
+    },
+    {
+      "epoch": 11.208199871877001,
+      "grad_norm": 0.4052780568599701,
+      "learning_rate": 0.0004657768715409263,
+      "loss": 3.3197,
+      "step": 38500
+    },
+    {
+      "epoch": 11.222759303476792,
+      "grad_norm": 0.3582177460193634,
+      "learning_rate": 0.0004656020972909991,
+      "loss": 3.3247,
+      "step": 38550
+    },
+    {
+      "epoch": 11.237318735076583,
+      "grad_norm": 0.3405052423477173,
+      "learning_rate": 0.00046542732304107195,
+      "loss": 3.3235,
+      "step": 38600
+    },
+    {
+      "epoch": 11.251878166676374,
+      "grad_norm": 0.32738906145095825,
+      "learning_rate": 0.0004652525487911447,
+      "loss": 3.3191,
+      "step": 38650
+    },
+    {
+      "epoch": 11.266437598276163,
+      "grad_norm": 0.36800041794776917,
+      "learning_rate": 0.00046507777454121756,
+      "loss": 3.3328,
+      "step": 38700
+    },
+    {
+      "epoch": 11.280997029875953,
+      "grad_norm": 0.37207457423210144,
+      "learning_rate": 0.0004649030002912904,
+      "loss": 3.3304,
+      "step": 38750
+    },
+    {
+      "epoch": 11.295556461475744,
+      "grad_norm": 0.36415359377861023,
+      "learning_rate": 0.0004647282260413632,
+      "loss": 3.3409,
+      "step": 38800
+    },
+    {
+      "epoch": 11.310115893075535,
+      "grad_norm": 0.3438774049282074,
+      "learning_rate": 0.00046455345179143606,
+      "loss": 3.3288,
+      "step": 38850
+    },
+    {
+      "epoch": 11.324675324675324,
+      "grad_norm": 0.3514200448989868,
+      "learning_rate": 0.00046437867754150884,
+      "loss": 3.331,
+      "step": 38900
+    },
+    {
+      "epoch": 11.339234756275115,
+      "grad_norm": 0.34444525837898254,
+      "learning_rate": 0.00046420390329158167,
+      "loss": 3.3253,
+      "step": 38950
+    },
+    {
+      "epoch": 11.353794187874906,
+      "grad_norm": 0.34927886724472046,
+      "learning_rate": 0.0004640291290416545,
+      "loss": 3.3361,
+      "step": 39000
+    },
+    {
+      "epoch": 11.353794187874906,
+      "eval_accuracy": 0.36882192674458786,
+      "eval_loss": 3.568171739578247,
+      "eval_runtime": 180.5055,
+      "eval_samples_per_second": 92.202,
+      "eval_steps_per_second": 5.767,
+      "step": 39000
+    },
+    {
+      "epoch": 11.368353619474696,
+      "grad_norm": 0.34765294194221497,
+      "learning_rate": 0.00046385435479172734,
+      "loss": 3.3385,
+      "step": 39050
+    },
+    {
+      "epoch": 11.382913051074485,
+      "grad_norm": 0.37567201256752014,
+      "learning_rate": 0.00046367958054180017,
+      "loss": 3.3372,
+      "step": 39100
+    },
+    {
+      "epoch": 11.397472482674276,
+      "grad_norm": 0.3298972547054291,
+      "learning_rate": 0.00046350480629187295,
+      "loss": 3.3449,
+      "step": 39150
+    },
+    {
+      "epoch": 11.412031914274067,
+      "grad_norm": 0.3385719656944275,
+      "learning_rate": 0.0004633300320419458,
+      "loss": 3.3381,
+      "step": 39200
+    },
+    {
+      "epoch": 11.426591345873858,
+      "grad_norm": 0.3834417164325714,
+      "learning_rate": 0.0004631552577920186,
+      "loss": 3.3453,
+      "step": 39250
+    },
+    {
+      "epoch": 11.441150777473647,
+      "grad_norm": 0.36645200848579407,
+      "learning_rate": 0.00046298048354209145,
+      "loss": 3.3585,
+      "step": 39300
+    },
+    {
+      "epoch": 11.455710209073438,
+      "grad_norm": 0.3596128523349762,
+      "learning_rate": 0.0004628057092921642,
+      "loss": 3.3505,
+      "step": 39350
+    },
+    {
+      "epoch": 11.470269640673228,
+      "grad_norm": 0.37306201457977295,
+      "learning_rate": 0.00046263093504223706,
+      "loss": 3.3489,
+      "step": 39400
+    },
+    {
+      "epoch": 11.484829072273019,
+      "grad_norm": 0.40729859471321106,
+      "learning_rate": 0.0004624561607923099,
+      "loss": 3.3555,
+      "step": 39450
+    },
+    {
+      "epoch": 11.499388503872808,
+      "grad_norm": 0.35100769996643066,
+      "learning_rate": 0.0004622813865423827,
+      "loss": 3.3443,
+      "step": 39500
+    },
+    {
+      "epoch": 11.513947935472599,
+      "grad_norm": 0.347989559173584,
+      "learning_rate": 0.00046210661229245556,
+      "loss": 3.3647,
+      "step": 39550
+    },
+    {
+      "epoch": 11.52850736707239,
+      "grad_norm": 0.35340970754623413,
+      "learning_rate": 0.00046193183804252834,
+      "loss": 3.346,
+      "step": 39600
+    },
+    {
+      "epoch": 11.54306679867218,
+      "grad_norm": 0.3439280092716217,
+      "learning_rate": 0.00046175706379260117,
+      "loss": 3.3613,
+      "step": 39650
+    },
+    {
+      "epoch": 11.55762623027197,
+      "grad_norm": 0.34520137310028076,
+      "learning_rate": 0.000461582289542674,
+      "loss": 3.347,
+      "step": 39700
+    },
+    {
+      "epoch": 11.57218566187176,
+      "grad_norm": 0.3320297598838806,
+      "learning_rate": 0.00046140751529274684,
+      "loss": 3.3489,
+      "step": 39750
+    },
+    {
+      "epoch": 11.586745093471551,
+      "grad_norm": 0.35040003061294556,
+      "learning_rate": 0.00046123274104281967,
+      "loss": 3.3462,
+      "step": 39800
+    },
+    {
+      "epoch": 11.601304525071342,
+      "grad_norm": 0.3691483438014984,
+      "learning_rate": 0.00046105796679289245,
+      "loss": 3.3593,
+      "step": 39850
+    },
+    {
+      "epoch": 11.61586395667113,
+      "grad_norm": 0.3896438777446747,
+      "learning_rate": 0.0004608831925429653,
+      "loss": 3.3616,
+      "step": 39900
+    },
+    {
+      "epoch": 11.630423388270922,
+      "grad_norm": 0.36567434668540955,
+      "learning_rate": 0.0004607084182930381,
+      "loss": 3.3553,
+      "step": 39950
+    },
+    {
+      "epoch": 11.644982819870712,
+      "grad_norm": 0.343128502368927,
+      "learning_rate": 0.00046053364404311095,
+      "loss": 3.361,
+      "step": 40000
+    },
+    {
+      "epoch": 11.644982819870712,
+      "eval_accuracy": 0.36932248097582326,
+      "eval_loss": 3.5595271587371826,
+      "eval_runtime": 180.4161,
+      "eval_samples_per_second": 92.248,
+      "eval_steps_per_second": 5.77,
+      "step": 40000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171750,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.35936181747712e+17,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}