diff --git "a/resemble_to_drop_frequency_3591/checkpoint-50000/trainer_state.json" "b/resemble_to_drop_frequency_3591/checkpoint-50000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/resemble_to_drop_frequency_3591/checkpoint-50000/trainer_state.json"
@@ -0,0 +1,7493 @@
+{
+  "best_global_step": 48000,
+  "best_metric": 3.543606996536255,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/resemble_to_drop_frequency_3591/checkpoint-40000",
+  "epoch": 14.560312190576038,
+  "eval_steps": 1000,
+  "global_step": 50000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.014561127613722406,
+      "grad_norm": 1.0968835353851318,
+      "learning_rate": 0.000294,
+      "loss": 8.4166,
+      "step": 50
+    },
+    {
+      "epoch": 0.029122255227444813,
+      "grad_norm": 0.6513845920562744,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7305,
+      "step": 100
+    },
+    {
+      "epoch": 0.04368338284116722,
+      "grad_norm": 0.5097090601921082,
+      "learning_rate": 0.0005998286713286713,
+      "loss": 6.3288,
+      "step": 150
+    },
+    {
+      "epoch": 0.058244510454889625,
+      "grad_norm": 0.42854636907577515,
+      "learning_rate": 0.0005996538461538461,
+      "loss": 6.1467,
+      "step": 200
+    },
+    {
+      "epoch": 0.07280563806861204,
+      "grad_norm": 0.5062187314033508,
+      "learning_rate": 0.0005994790209790209,
+      "loss": 5.9942,
+      "step": 250
+    },
+    {
+      "epoch": 0.08736676568233444,
+      "grad_norm": 0.39580509066581726,
+      "learning_rate": 0.0005993041958041958,
+      "loss": 5.8474,
+      "step": 300
+    },
+    {
+      "epoch": 0.10192789329605685,
+      "grad_norm": 0.4567234516143799,
+      "learning_rate": 0.0005991293706293705,
+      "loss": 5.7198,
+      "step": 350
+    },
+    {
+      "epoch": 0.11648902090977925,
+      "grad_norm": 0.5529834628105164,
+      "learning_rate": 0.0005989545454545454,
+      "loss": 5.6138,
+      "step": 400
+    },
+    {
+      "epoch": 0.13105014852350166,
+      "grad_norm": 0.4305090010166168,
+      "learning_rate": 0.0005987797202797202,
+      "loss": 5.4936,
+      "step": 450
+    },
+    {
+      "epoch": 0.14561127613722408,
+      "grad_norm": 0.45783543586730957,
+      "learning_rate": 0.000598604895104895,
+      "loss": 5.4016,
+      "step": 500
+    },
+    {
+      "epoch": 0.16017240375094646,
+      "grad_norm": 0.4605506658554077,
+      "learning_rate": 0.0005984300699300698,
+      "loss": 5.3398,
+      "step": 550
+    },
+    {
+      "epoch": 0.17473353136466888,
+      "grad_norm": 0.582597553730011,
+      "learning_rate": 0.0005982552447552447,
+      "loss": 5.2616,
+      "step": 600
+    },
+    {
+      "epoch": 0.1892946589783913,
+      "grad_norm": 0.47175198793411255,
+      "learning_rate": 0.0005980804195804195,
+      "loss": 5.1831,
+      "step": 650
+    },
+    {
+      "epoch": 0.2038557865921137,
+      "grad_norm": 0.5068243145942688,
+      "learning_rate": 0.0005979055944055943,
+      "loss": 5.1231,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184169142058361,
+      "grad_norm": 0.4487193822860718,
+      "learning_rate": 0.0005977307692307691,
+      "loss": 5.0754,
+      "step": 750
+    },
+    {
+      "epoch": 0.2329780418195585,
+      "grad_norm": 0.51069575548172,
+      "learning_rate": 0.000597555944055944,
+      "loss": 5.0179,
+      "step": 800
+    },
+    {
+      "epoch": 0.24753916943328091,
+      "grad_norm": 0.5181578397750854,
+      "learning_rate": 0.0005973811188811188,
+      "loss": 4.9711,
+      "step": 850
+    },
+    {
+      "epoch": 0.2621002970470033,
+      "grad_norm": 0.5237641334533691,
+      "learning_rate": 0.0005972062937062936,
+      "loss": 4.9212,
+      "step": 900
+    },
+    {
+      "epoch": 0.27666142466072574,
+      "grad_norm": 0.5211111903190613,
+      "learning_rate": 0.0005970314685314685,
+      "loss": 4.869,
+      "step": 950
+    },
+    {
+      "epoch": 0.29122255227444815,
+      "grad_norm": 0.44638922810554504,
+      "learning_rate": 0.0005968566433566433,
+      "loss": 4.8185,
+      "step": 1000
+    },
+    {
+      "epoch": 0.29122255227444815,
+      "eval_accuracy": 0.2571676232670328,
+      "eval_loss": 4.735123634338379,
+      "eval_runtime": 181.933,
+      "eval_samples_per_second": 91.468,
+      "eval_steps_per_second": 5.722,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30578367988817057,
+      "grad_norm": 0.4172416627407074,
+      "learning_rate": 0.0005966818181818181,
+      "loss": 4.7639,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3203448075018929,
+      "grad_norm": 0.4456934928894043,
+      "learning_rate": 0.0005965069930069929,
+      "loss": 4.7177,
+      "step": 1100
+    },
+    {
+      "epoch": 0.33490593511561534,
+      "grad_norm": 0.47358137369155884,
+      "learning_rate": 0.0005963321678321677,
+      "loss": 4.693,
+      "step": 1150
+    },
+    {
+      "epoch": 0.34946706272933775,
+      "grad_norm": 0.4430226683616638,
+      "learning_rate": 0.0005961573426573425,
+      "loss": 4.659,
+      "step": 1200
+    },
+    {
+      "epoch": 0.36402819034306017,
+      "grad_norm": 0.4078950583934784,
+      "learning_rate": 0.0005959825174825174,
+      "loss": 4.6245,
+      "step": 1250
+    },
+    {
+      "epoch": 0.3785893179567826,
+      "grad_norm": 0.44066765904426575,
+      "learning_rate": 0.0005958076923076922,
+      "loss": 4.5934,
+      "step": 1300
+    },
+    {
+      "epoch": 0.393150445570505,
+      "grad_norm": 0.43180859088897705,
+      "learning_rate": 0.000595632867132867,
+      "loss": 4.5672,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4077115731842274,
+      "grad_norm": 0.41051167249679565,
+      "learning_rate": 0.0005954580419580418,
+      "loss": 4.552,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4222727007979498,
+      "grad_norm": 0.4435499906539917,
+      "learning_rate": 0.0005952832167832168,
+      "loss": 4.5272,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4368338284116722,
+      "grad_norm": 0.4016467332839966,
+      "learning_rate": 0.0005951083916083916,
+      "loss": 4.4965,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4513949560253946,
+      "grad_norm": 0.42740699648857117,
+      "learning_rate": 0.0005949335664335664,
+      "loss": 4.4734,
+      "step": 1550
+    },
+    {
+      "epoch": 0.465956083639117,
+      "grad_norm": 0.41071462631225586,
+      "learning_rate": 0.0005947587412587413,
+      "loss": 4.4678,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4805172112528394,
+      "grad_norm": 0.41863691806793213,
+      "learning_rate": 0.0005945839160839161,
+      "loss": 4.4373,
+      "step": 1650
+    },
+    {
+      "epoch": 0.49507833886656183,
+      "grad_norm": 0.4083499610424042,
+      "learning_rate": 0.0005944090909090909,
+      "loss": 4.4181,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5096394664802842,
+      "grad_norm": 0.45506301522254944,
+      "learning_rate": 0.0005942342657342657,
+      "loss": 4.4017,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5242005940940067,
+      "grad_norm": 0.4013260006904602,
+      "learning_rate": 0.0005940594405594406,
+      "loss": 4.3976,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5387617217077291,
+      "grad_norm": 0.37785711884498596,
+      "learning_rate": 0.0005938846153846153,
+      "loss": 4.3778,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5533228493214515,
+      "grad_norm": 0.436885267496109,
+      "learning_rate": 0.0005937097902097902,
+      "loss": 4.3769,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5678839769351739,
+      "grad_norm": 0.41713714599609375,
+      "learning_rate": 0.000593534965034965,
+      "loss": 4.3597,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5824451045488963,
+      "grad_norm": 0.40815940499305725,
+      "learning_rate": 0.0005933601398601398,
+      "loss": 4.3311,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5824451045488963,
+      "eval_accuracy": 0.2994343186746337,
+      "eval_loss": 4.2821149826049805,
+      "eval_runtime": 182.0522,
+      "eval_samples_per_second": 91.408,
+      "eval_steps_per_second": 5.718,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5970062321626187,
+      "grad_norm": 0.3702036440372467,
+      "learning_rate": 0.0005931853146853146,
+      "loss": 4.3181,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6115673597763411,
+      "grad_norm": 0.3854442238807678,
+      "learning_rate": 0.0005930104895104895,
+      "loss": 4.308,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6261284873900634,
+      "grad_norm": 0.3813020884990692,
+      "learning_rate": 0.0005928356643356643,
+      "loss": 4.31,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6406896150037859,
+      "grad_norm": 0.3868289291858673,
+      "learning_rate": 0.0005926608391608391,
+      "loss": 4.3031,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6552507426175083,
+      "grad_norm": 0.3773530423641205,
+      "learning_rate": 0.000592486013986014,
+      "loss": 4.2951,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6698118702312307,
+      "grad_norm": 0.43276000022888184,
+      "learning_rate": 0.0005923111888111888,
+      "loss": 4.2716,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6843729978449531,
+      "grad_norm": 0.4105272591114044,
+      "learning_rate": 0.0005921363636363636,
+      "loss": 4.269,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6989341254586755,
+      "grad_norm": 0.381144255399704,
+      "learning_rate": 0.0005919615384615384,
+      "loss": 4.2447,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7134952530723979,
+      "grad_norm": 0.5546244978904724,
+      "learning_rate": 0.0005917867132867133,
+      "loss": 4.2573,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7280563806861203,
+      "grad_norm": 0.37709176540374756,
+      "learning_rate": 0.0005916118881118881,
+      "loss": 4.2385,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7426175082998427,
+      "grad_norm": 0.39420562982559204,
+      "learning_rate": 0.0005914370629370629,
+      "loss": 4.235,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7571786359135652,
+      "grad_norm": 0.3711323142051697,
+      "learning_rate": 0.0005912622377622377,
+      "loss": 4.2138,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7717397635272876,
+      "grad_norm": 0.37468209862709045,
+      "learning_rate": 0.0005910874125874125,
+      "loss": 4.2063,
+      "step": 2650
+    },
+    {
+      "epoch": 0.78630089114101,
+      "grad_norm": 0.36328959465026855,
+      "learning_rate": 0.0005909125874125873,
+      "loss": 4.1915,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8008620187547324,
+      "grad_norm": 0.3596959412097931,
+      "learning_rate": 0.0005907377622377622,
+      "loss": 4.1859,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8154231463684548,
+      "grad_norm": 0.3694506287574768,
+      "learning_rate": 0.000590562937062937,
+      "loss": 4.17,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8299842739821772,
+      "grad_norm": 0.37155982851982117,
+      "learning_rate": 0.0005903881118881118,
+      "loss": 4.1644,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8445454015958996,
+      "grad_norm": 0.3630830943584442,
+      "learning_rate": 0.0005902132867132867,
+      "loss": 4.1669,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8591065292096219,
+      "grad_norm": 0.37551137804985046,
+      "learning_rate": 0.0005900384615384615,
+      "loss": 4.1477,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8736676568233444,
+      "grad_norm": 0.38218334317207336,
+      "learning_rate": 0.0005898636363636363,
+      "loss": 4.1448,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8736676568233444,
+      "eval_accuracy": 0.315657070793131,
+      "eval_loss": 4.097928524017334,
+      "eval_runtime": 180.4968,
+      "eval_samples_per_second": 92.196,
+      "eval_steps_per_second": 5.767,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8882287844370668,
+      "grad_norm": 0.3394276201725006,
+      "learning_rate": 0.0005896888111888111,
+      "loss": 4.1382,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9027899120507892,
+      "grad_norm": 0.35621699690818787,
+      "learning_rate": 0.000589513986013986,
+      "loss": 4.1264,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9173510396645116,
+      "grad_norm": 0.40414878726005554,
+      "learning_rate": 0.0005893391608391608,
+      "loss": 4.1358,
+      "step": 3150
+    },
+    {
+      "epoch": 0.931912167278234,
+      "grad_norm": 0.34163355827331543,
+      "learning_rate": 0.0005891643356643356,
+      "loss": 4.1084,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9464732948919564,
+      "grad_norm": 0.34562134742736816,
+      "learning_rate": 0.0005889895104895104,
+      "loss": 4.1068,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9610344225056788,
+      "grad_norm": 0.34311237931251526,
+      "learning_rate": 0.0005888146853146853,
+      "loss": 4.1128,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9755955501194012,
+      "grad_norm": 0.3517252206802368,
+      "learning_rate": 0.00058863986013986,
+      "loss": 4.0946,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9901566777331237,
+      "grad_norm": 0.36632663011550903,
+      "learning_rate": 0.0005884650349650349,
+      "loss": 4.1048,
+      "step": 3400
+    },
+    {
+      "epoch": 1.004659560836391,
+      "grad_norm": 0.3214505910873413,
+      "learning_rate": 0.0005882902097902097,
+      "loss": 4.068,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0192206884501136,
+      "grad_norm": 0.3312755525112152,
+      "learning_rate": 0.0005881153846153845,
+      "loss": 4.0171,
+      "step": 3500
+    },
+    {
+      "epoch": 1.033781816063836,
+      "grad_norm": 0.35106778144836426,
+      "learning_rate": 0.0005879405594405594,
+      "loss": 4.0175,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0483429436775584,
+      "grad_norm": 0.3634989857673645,
+      "learning_rate": 0.0005877657342657342,
+      "loss": 4.007,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0629040712912807,
+      "grad_norm": 0.3441219627857208,
+      "learning_rate": 0.000587590909090909,
+      "loss": 4.0123,
+      "step": 3650
+    },
+    {
+      "epoch": 1.0774651989050033,
+      "grad_norm": 0.33224573731422424,
+      "learning_rate": 0.0005874160839160838,
+      "loss": 4.0016,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0920263265187256,
+      "grad_norm": 0.34425127506256104,
+      "learning_rate": 0.0005872412587412587,
+      "loss": 4.0066,
+      "step": 3750
+    },
+    {
+      "epoch": 1.106587454132448,
+      "grad_norm": 0.34838539361953735,
+      "learning_rate": 0.0005870664335664335,
+      "loss": 4.0148,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1211485817461704,
+      "grad_norm": 0.3374408781528473,
+      "learning_rate": 0.0005868916083916083,
+      "loss": 3.9959,
+      "step": 3850
+    },
+    {
+      "epoch": 1.135709709359893,
+      "grad_norm": 0.35629168152809143,
+      "learning_rate": 0.0005867167832167831,
+      "loss": 3.9961,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1502708369736152,
+      "grad_norm": 0.34352248907089233,
+      "learning_rate": 0.000586541958041958,
+      "loss": 3.9805,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1648319645873377,
+      "grad_norm": 0.3234029710292816,
+      "learning_rate": 0.0005863671328671328,
+      "loss": 3.9959,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1648319645873377,
+      "eval_accuracy": 0.3252184881351332,
+      "eval_loss": 3.9894917011260986,
+      "eval_runtime": 180.4599,
+      "eval_samples_per_second": 92.214,
+      "eval_steps_per_second": 5.769,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17939309220106,
+      "grad_norm": 0.3508160710334778,
+      "learning_rate": 0.0005861923076923076,
+      "loss": 3.9981,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1939542198147826,
+      "grad_norm": 0.35192403197288513,
+      "learning_rate": 0.0005860174825174824,
+      "loss": 3.9802,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2085153474285049,
+      "grad_norm": 0.33519187569618225,
+      "learning_rate": 0.0005858426573426573,
+      "loss": 3.9831,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2230764750422272,
+      "grad_norm": 0.3435879349708557,
+      "learning_rate": 0.000585667832167832,
+      "loss": 3.9778,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2376376026559497,
+      "grad_norm": 0.3758980333805084,
+      "learning_rate": 0.000585493006993007,
+      "loss": 3.9786,
+      "step": 4250
+    },
+    {
+      "epoch": 1.2521987302696722,
+      "grad_norm": 0.3644406199455261,
+      "learning_rate": 0.0005853181818181817,
+      "loss": 3.9794,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2667598578833945,
+      "grad_norm": 0.34182295203208923,
+      "learning_rate": 0.0005851433566433565,
+      "loss": 3.9657,
+      "step": 4350
+    },
+    {
+      "epoch": 1.2813209854971168,
+      "grad_norm": 0.35043004155158997,
+      "learning_rate": 0.0005849685314685315,
+      "loss": 3.9696,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2958821131108393,
+      "grad_norm": 0.3384864926338196,
+      "learning_rate": 0.0005847937062937063,
+      "loss": 3.9709,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3104432407245616,
+      "grad_norm": 0.3442433178424835,
+      "learning_rate": 0.0005846188811188811,
+      "loss": 3.9592,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3250043683382842,
+      "grad_norm": 0.34257060289382935,
+      "learning_rate": 0.0005844440559440559,
+      "loss": 3.9582,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3395654959520065,
+      "grad_norm": 0.35144466161727905,
+      "learning_rate": 0.0005842692307692308,
+      "loss": 3.9495,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354126623565729,
+      "grad_norm": 0.3375908136367798,
+      "learning_rate": 0.0005840944055944056,
+      "loss": 3.9506,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3686877511794513,
+      "grad_norm": 0.3664558231830597,
+      "learning_rate": 0.0005839195804195804,
+      "loss": 3.9559,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3832488787931738,
+      "grad_norm": 0.3706004321575165,
+      "learning_rate": 0.0005837447552447552,
+      "loss": 3.9544,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3978100064068961,
+      "grad_norm": 0.36268237233161926,
+      "learning_rate": 0.0005835699300699301,
+      "loss": 3.9448,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4123711340206184,
+      "grad_norm": 0.3499133884906769,
+      "learning_rate": 0.0005833951048951048,
+      "loss": 3.941,
+      "step": 4850
+    },
+    {
+      "epoch": 1.426932261634341,
+      "grad_norm": 0.33515897393226624,
+      "learning_rate": 0.0005832202797202797,
+      "loss": 3.9399,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4414933892480635,
+      "grad_norm": 0.3366197645664215,
+      "learning_rate": 0.0005830454545454546,
+      "loss": 3.9426,
+      "step": 4950
+    },
+    {
+      "epoch": 1.4560545168617858,
+      "grad_norm": 0.32890358567237854,
+      "learning_rate": 0.0005828706293706293,
+      "loss": 3.9195,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4560545168617858,
+      "eval_accuracy": 0.33210278858796755,
+      "eval_loss": 3.913426637649536,
+      "eval_runtime": 180.5893,
+      "eval_samples_per_second": 92.148,
+      "eval_steps_per_second": 5.764,
+      "step": 5000
+    },
+    {
+      "epoch": 1.470615644475508,
+      "grad_norm": 0.32962432503700256,
+      "learning_rate": 0.0005826958041958042,
+      "loss": 3.9236,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4851767720892306,
+      "grad_norm": 0.32921549677848816,
+      "learning_rate": 0.000582520979020979,
+      "loss": 3.9185,
+      "step": 5100
+    },
+    {
+      "epoch": 1.4997378997029531,
+      "grad_norm": 0.3333084285259247,
+      "learning_rate": 0.0005823461538461538,
+      "loss": 3.9211,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5142990273166754,
+      "grad_norm": 0.33431023359298706,
+      "learning_rate": 0.0005821713286713286,
+      "loss": 3.9171,
+      "step": 5200
+    },
+    {
+      "epoch": 1.5288601549303977,
+      "grad_norm": 0.3071958124637604,
+      "learning_rate": 0.0005819965034965035,
+      "loss": 3.9336,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5434212825441203,
+      "grad_norm": 0.33107051253318787,
+      "learning_rate": 0.0005818216783216783,
+      "loss": 3.9241,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5579824101578428,
+      "grad_norm": 0.3333437442779541,
+      "learning_rate": 0.0005816468531468531,
+      "loss": 3.9097,
+      "step": 5350
+    },
+    {
+      "epoch": 1.572543537771565,
+      "grad_norm": 0.30821335315704346,
+      "learning_rate": 0.0005814720279720279,
+      "loss": 3.9082,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5871046653852874,
+      "grad_norm": 0.33230161666870117,
+      "learning_rate": 0.0005812972027972028,
+      "loss": 3.9124,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6016657929990097,
+      "grad_norm": 0.32524222135543823,
+      "learning_rate": 0.0005811223776223776,
+      "loss": 3.8887,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6162269206127322,
+      "grad_norm": 0.3314615786075592,
+      "learning_rate": 0.0005809475524475524,
+      "loss": 3.8882,
+      "step": 5550
+    },
+    {
+      "epoch": 1.6307880482264547,
+      "grad_norm": 0.32449811697006226,
+      "learning_rate": 0.0005807727272727272,
+      "loss": 3.9018,
+      "step": 5600
+    },
+    {
+      "epoch": 1.645349175840177,
+      "grad_norm": 0.3151806890964508,
+      "learning_rate": 0.0005805979020979021,
+      "loss": 3.9122,
+      "step": 5650
+    },
+    {
+      "epoch": 1.6599103034538993,
+      "grad_norm": 0.31618261337280273,
+      "learning_rate": 0.0005804230769230769,
+      "loss": 3.8943,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6744714310676219,
+      "grad_norm": 0.31507301330566406,
+      "learning_rate": 0.0005802482517482517,
+      "loss": 3.8947,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6890325586813444,
+      "grad_norm": 0.3101233243942261,
+      "learning_rate": 0.0005800734265734265,
+      "loss": 3.8899,
+      "step": 5800
+    },
+    {
+      "epoch": 1.7035936862950667,
+      "grad_norm": 0.3337361514568329,
+      "learning_rate": 0.0005798986013986013,
+      "loss": 3.8903,
+      "step": 5850
+    },
+    {
+      "epoch": 1.718154813908789,
+      "grad_norm": 0.3244958519935608,
+      "learning_rate": 0.0005797237762237762,
+      "loss": 3.8904,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7327159415225115,
+      "grad_norm": 0.3190813958644867,
+      "learning_rate": 0.000579548951048951,
+      "loss": 3.8707,
+      "step": 5950
+    },
+    {
+      "epoch": 1.747277069136234,
+      "grad_norm": 0.30000555515289307,
+      "learning_rate": 0.0005793741258741258,
+      "loss": 3.8847,
+      "step": 6000
+    },
+    {
+      "epoch": 1.747277069136234,
+      "eval_accuracy": 0.3372700416567149,
+      "eval_loss": 3.8552145957946777,
+      "eval_runtime": 180.7902,
+      "eval_samples_per_second": 92.046,
+      "eval_steps_per_second": 5.758,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7618381967499563,
+      "grad_norm": 0.3228197395801544,
+      "learning_rate": 0.0005791993006993006,
+      "loss": 3.8787,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7763993243636786,
+      "grad_norm": 0.3813265264034271,
+      "learning_rate": 0.0005790244755244755,
+      "loss": 3.8758,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7909604519774012,
+      "grad_norm": 0.31867480278015137,
+      "learning_rate": 0.0005788496503496503,
+      "loss": 3.8588,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8055215795911237,
+      "grad_norm": 0.3052184581756592,
+      "learning_rate": 0.0005786748251748251,
+      "loss": 3.8693,
+      "step": 6200
+    },
+    {
+      "epoch": 1.820082707204846,
+      "grad_norm": 0.33260828256607056,
+      "learning_rate": 0.0005784999999999999,
+      "loss": 3.8806,
+      "step": 6250
+    },
+    {
+      "epoch": 1.8346438348185683,
+      "grad_norm": 0.3406659960746765,
+      "learning_rate": 0.0005783251748251748,
+      "loss": 3.8749,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8492049624322906,
+      "grad_norm": 0.3197321891784668,
+      "learning_rate": 0.0005781503496503496,
+      "loss": 3.8612,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8637660900460131,
+      "grad_norm": 0.3296874761581421,
+      "learning_rate": 0.0005779755244755244,
+      "loss": 3.8534,
+      "step": 6400
+    },
+    {
+      "epoch": 1.8783272176597356,
+      "grad_norm": 0.33814582228660583,
+      "learning_rate": 0.0005778006993006993,
+      "loss": 3.867,
+      "step": 6450
+    },
+    {
+      "epoch": 1.892888345273458,
+      "grad_norm": 0.3142564594745636,
+      "learning_rate": 0.000577625874125874,
+      "loss": 3.8651,
+      "step": 6500
+    },
+    {
+      "epoch": 1.9074494728871803,
+      "grad_norm": 0.32239586114883423,
+      "learning_rate": 0.0005774510489510489,
+      "loss": 3.8486,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9220106005009028,
+      "grad_norm": 0.3052440583705902,
+      "learning_rate": 0.0005772762237762237,
+      "loss": 3.8551,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9365717281146253,
+      "grad_norm": 0.34700754284858704,
+      "learning_rate": 0.0005771013986013985,
+      "loss": 3.8433,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9511328557283476,
+      "grad_norm": 0.32743698358535767,
+      "learning_rate": 0.0005769265734265733,
+      "loss": 3.8523,
+      "step": 6700
+    },
+    {
+      "epoch": 1.96569398334207,
+      "grad_norm": 0.3276400864124298,
+      "learning_rate": 0.0005767517482517482,
+      "loss": 3.8501,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9802551109557924,
+      "grad_norm": 0.31817659735679626,
+      "learning_rate": 0.000576576923076923,
+      "loss": 3.8395,
+      "step": 6800
+    },
+    {
+      "epoch": 1.994816238569515,
+      "grad_norm": 0.30320706963539124,
+      "learning_rate": 0.0005764020979020978,
+      "loss": 3.8473,
+      "step": 6850
+    },
+    {
+      "epoch": 2.009319121672782,
+      "grad_norm": 0.33266910910606384,
+      "learning_rate": 0.0005762272727272726,
+      "loss": 3.7834,
+      "step": 6900
+    },
+    {
+      "epoch": 2.023880249286505,
+      "grad_norm": 0.3259655237197876,
+      "learning_rate": 0.0005760524475524475,
+      "loss": 3.7468,
+      "step": 6950
+    },
+    {
+      "epoch": 2.038441376900227,
+      "grad_norm": 0.32959362864494324,
+      "learning_rate": 0.0005758776223776223,
+      "loss": 3.7518,
+      "step": 7000
+    },
+    {
+      "epoch": 2.038441376900227,
+      "eval_accuracy": 0.3410902104309129,
+      "eval_loss": 3.8155248165130615,
+      "eval_runtime": 180.6704,
+      "eval_samples_per_second": 92.107,
+      "eval_steps_per_second": 5.762,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0530025045139495,
+      "grad_norm": 0.3127276301383972,
+      "learning_rate": 0.0005757027972027971,
+      "loss": 3.7567,
+      "step": 7050
+    },
+    {
+      "epoch": 2.067563632127672,
+      "grad_norm": 0.3236212432384491,
+      "learning_rate": 0.000575527972027972,
+      "loss": 3.7529,
+      "step": 7100
+    },
+    {
+      "epoch": 2.0821247597413945,
+      "grad_norm": 0.32352373003959656,
+      "learning_rate": 0.0005753531468531468,
+      "loss": 3.7582,
+      "step": 7150
+    },
+    {
+      "epoch": 2.096685887355117,
+      "grad_norm": 0.3106796145439148,
+      "learning_rate": 0.0005751783216783216,
+      "loss": 3.769,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111247014968839,
+      "grad_norm": 0.3407052755355835,
+      "learning_rate": 0.0005750034965034964,
+      "loss": 3.7593,
+      "step": 7250
+    },
+    {
+      "epoch": 2.1258081425825615,
+      "grad_norm": 0.32309308648109436,
+      "learning_rate": 0.0005748286713286712,
+      "loss": 3.7436,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140369270196284,
+      "grad_norm": 0.3216754198074341,
+      "learning_rate": 0.000574653846153846,
+      "loss": 3.7569,
+      "step": 7350
+    },
+    {
+      "epoch": 2.1549303978100065,
+      "grad_norm": 0.32464084029197693,
+      "learning_rate": 0.000574479020979021,
+      "loss": 3.7569,
+      "step": 7400
+    },
+    {
+      "epoch": 2.169491525423729,
+      "grad_norm": 0.31773078441619873,
+      "learning_rate": 0.0005743041958041958,
+      "loss": 3.7452,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184052653037451,
+      "grad_norm": 0.33485111594200134,
+      "learning_rate": 0.0005741293706293706,
+      "loss": 3.7562,
+      "step": 7500
+    },
+    {
+      "epoch": 2.198613780651174,
+      "grad_norm": 0.31402260065078735,
+      "learning_rate": 0.0005739545454545454,
+      "loss": 3.7598,
+      "step": 7550
+    },
+    {
+      "epoch": 2.213174908264896,
+      "grad_norm": 0.3134409785270691,
+      "learning_rate": 0.0005737797202797203,
+      "loss": 3.7545,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2277360358786185,
+      "grad_norm": 0.29751038551330566,
+      "learning_rate": 0.0005736048951048951,
+      "loss": 3.7576,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2422971634923408,
+      "grad_norm": 0.3134411573410034,
+      "learning_rate": 0.0005734300699300699,
+      "loss": 3.7487,
+      "step": 7700
+    },
+    {
+      "epoch": 2.256858291106063,
+      "grad_norm": 0.30340442061424255,
+      "learning_rate": 0.0005732552447552448,
+      "loss": 3.7514,
+      "step": 7750
+    },
+    {
+      "epoch": 2.271419418719786,
+      "grad_norm": 0.3226783871650696,
+      "learning_rate": 0.0005730804195804196,
+      "loss": 3.7542,
+      "step": 7800
+    },
+    {
+      "epoch": 2.285980546333508,
+      "grad_norm": 0.331408828496933,
+      "learning_rate": 0.0005729055944055944,
+      "loss": 3.7647,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3005416739472304,
+      "grad_norm": 0.3096162974834442,
+      "learning_rate": 0.0005727307692307692,
+      "loss": 3.7471,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3151028015609527,
+      "grad_norm": 0.3096749186515808,
+      "learning_rate": 0.0005725559440559441,
+      "loss": 3.7573,
+      "step": 7950
+    },
+    {
+      "epoch": 2.3296639291746755,
+      "grad_norm": 0.3250366151332855,
+      "learning_rate": 0.0005723811188811188,
+      "loss": 3.7545,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3296639291746755,
+      "eval_accuracy": 0.3441775089018693,
+      "eval_loss": 3.785076856613159,
+      "eval_runtime": 182.534,
+      "eval_samples_per_second": 91.167,
+      "eval_steps_per_second": 5.703,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3442250567883978,
+      "grad_norm": 0.334833025932312,
+      "learning_rate": 0.0005722062937062937,
+      "loss": 3.7573,
+      "step": 8050
+    },
+    {
+      "epoch": 2.35878618440212,
+      "grad_norm": 0.3055669963359833,
+      "learning_rate": 0.0005720314685314685,
+      "loss": 3.7556,
+      "step": 8100
+    },
+    {
+      "epoch": 2.3733473120158424,
+      "grad_norm": 0.31652265787124634,
+      "learning_rate": 0.0005718566433566433,
+      "loss": 3.7427,
+      "step": 8150
+    },
+    {
+      "epoch": 2.387908439629565,
+      "grad_norm": 0.31866180896759033,
+      "learning_rate": 0.0005716818181818181,
+      "loss": 3.7445,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4024695672432874,
+      "grad_norm": 0.3108992278575897,
+      "learning_rate": 0.000571506993006993,
+      "loss": 3.7576,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4170306948570097,
+      "grad_norm": 0.32130104303359985,
+      "learning_rate": 0.0005713321678321678,
+      "loss": 3.749,
+      "step": 8300
+    },
+    {
+      "epoch": 2.431591822470732,
+      "grad_norm": 0.3208393156528473,
+      "learning_rate": 0.0005711573426573426,
+      "loss": 3.7552,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4461529500844543,
+      "grad_norm": 0.33237773180007935,
+      "learning_rate": 0.0005709825174825175,
+      "loss": 3.7437,
+      "step": 8400
+    },
+    {
+      "epoch": 2.460714077698177,
+      "grad_norm": 0.329561322927475,
+      "learning_rate": 0.0005708076923076923,
+      "loss": 3.7517,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4752752053118994,
+      "grad_norm": 0.3131866753101349,
+      "learning_rate": 0.0005706328671328671,
+      "loss": 3.7548,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4898363329256217,
+      "grad_norm": 0.3359507620334625,
+      "learning_rate": 0.0005704580419580419,
+      "loss": 3.7339,
+      "step": 8550
+    },
+    {
+      "epoch": 2.5043974605393444,
+      "grad_norm": 0.32358288764953613,
+      "learning_rate": 0.0005702832167832168,
+      "loss": 3.7537,
+      "step": 8600
+    },
+    {
+      "epoch": 2.5189585881530667,
+      "grad_norm": 0.3077840805053711,
+      "learning_rate": 0.0005701083916083916,
+      "loss": 3.7491,
+      "step": 8650
+    },
+    {
+      "epoch": 2.533519715766789,
+      "grad_norm": 0.306542307138443,
+      "learning_rate": 0.0005699335664335664,
+      "loss": 3.7501,
+      "step": 8700
+    },
+    {
+      "epoch": 2.5480808433805113,
+      "grad_norm": 0.31777438521385193,
+      "learning_rate": 0.0005697587412587412,
+      "loss": 3.7588,
+      "step": 8750
+    },
+    {
+      "epoch": 2.5626419709942336,
+      "grad_norm": 0.2995713949203491,
+      "learning_rate": 0.000569583916083916,
+      "loss": 3.7534,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5772030986079564,
+      "grad_norm": 0.28951001167297363,
+      "learning_rate": 0.0005694090909090908,
+      "loss": 3.7181,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5917642262216787,
+      "grad_norm": 0.3006434142589569,
+      "learning_rate": 0.0005692342657342657,
+      "loss": 3.7345,
+      "step": 8900
+    },
+    {
+      "epoch": 2.606325353835401,
+      "grad_norm": 0.341970831155777,
+      "learning_rate": 0.0005690594405594405,
+      "loss": 3.7367,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6208864814491233,
+      "grad_norm": 0.33957773447036743,
+      "learning_rate": 0.0005688846153846153,
+      "loss": 3.7406,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6208864814491233,
+      "eval_accuracy": 0.3470423120882088,
+      "eval_loss": 3.755228042602539,
+      "eval_runtime": 182.4889,
+      "eval_samples_per_second": 91.189,
+      "eval_steps_per_second": 5.704,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6354476090628456,
+      "grad_norm": 0.3154286742210388,
+      "learning_rate": 0.0005687097902097901,
+      "loss": 3.7347,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6500087366765683,
+      "grad_norm": 0.3290162682533264,
+      "learning_rate": 0.000568534965034965,
+      "loss": 3.7492,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6645698642902906,
+      "grad_norm": 0.3163735568523407,
+      "learning_rate": 0.0005683601398601398,
+      "loss": 3.7292,
+      "step": 9150
+    },
+    {
+      "epoch": 2.679130991904013,
+      "grad_norm": 0.31252047419548035,
+      "learning_rate": 0.0005681853146853146,
+      "loss": 3.7412,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6936921195177357,
+      "grad_norm": 0.31522566080093384,
+      "learning_rate": 0.0005680104895104895,
+      "loss": 3.7257,
+      "step": 9250
+    },
+    {
+      "epoch": 2.708253247131458,
+      "grad_norm": 0.320932000875473,
+      "learning_rate": 0.0005678356643356643,
+      "loss": 3.7274,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7228143747451803,
+      "grad_norm": 0.30979084968566895,
+      "learning_rate": 0.0005676608391608391,
+      "loss": 3.7346,
+      "step": 9350
+    },
+    {
+      "epoch": 2.7373755023589026,
+      "grad_norm": 0.31152546405792236,
+      "learning_rate": 0.0005674860139860139,
+      "loss": 3.725,
+      "step": 9400
+    },
+    {
+      "epoch": 2.751936629972625,
+      "grad_norm": 0.30036696791648865,
+      "learning_rate": 0.0005673111888111888,
+      "loss": 3.7413,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7664977575863476,
+      "grad_norm": 0.2985321879386902,
+      "learning_rate": 0.0005671363636363635,
+      "loss": 3.7299,
+      "step": 9500
+    },
+    {
+      "epoch": 2.78105888520007,
+      "grad_norm": 0.29972919821739197,
+      "learning_rate": 0.0005669615384615384,
+      "loss": 3.7392,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7956200128137922,
+      "grad_norm": 0.3080502152442932,
+      "learning_rate": 0.0005667867132867132,
+      "loss": 3.7244,
+      "step": 9600
+    },
+    {
+      "epoch": 2.8101811404275145,
+      "grad_norm": 0.3224172294139862,
+      "learning_rate": 0.000566611888111888,
+      "loss": 3.7353,
+      "step": 9650
+    },
+    {
+      "epoch": 2.824742268041237,
+      "grad_norm": 0.29166194796562195,
+      "learning_rate": 0.0005664370629370628,
+      "loss": 3.7261,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8393033956549596,
+      "grad_norm": 0.3214750587940216,
+      "learning_rate": 0.0005662622377622377,
+      "loss": 3.7281,
+      "step": 9750
+    },
+    {
+      "epoch": 2.853864523268682,
+      "grad_norm": 0.3150324821472168,
+      "learning_rate": 0.0005660874125874125,
+      "loss": 3.7247,
+      "step": 9800
+    },
+    {
+      "epoch": 2.868425650882404,
+      "grad_norm": 0.31625422835350037,
+      "learning_rate": 0.0005659125874125873,
+      "loss": 3.7244,
+      "step": 9850
+    },
+    {
+      "epoch": 2.882986778496127,
+      "grad_norm": 0.3253997564315796,
+      "learning_rate": 0.0005657377622377622,
+      "loss": 3.7113,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8975479061098492,
+      "grad_norm": 0.2979266941547394,
+      "learning_rate": 0.000565562937062937,
+      "loss": 3.7287,
+      "step": 9950
+    },
+    {
+      "epoch": 2.9121090337235715,
+      "grad_norm": 0.28829896450042725,
+      "learning_rate": 0.0005653881118881118,
+      "loss": 3.7104,
+      "step": 10000
+    },
+    {
+      "epoch": 2.9121090337235715,
+      "eval_accuracy": 0.3495516167304694,
+      "eval_loss": 3.725857734680176,
+      "eval_runtime": 182.3878,
+      "eval_samples_per_second": 91.24,
+      "eval_steps_per_second": 5.708,
+      "step": 10000
+    },
+    {
+      "epoch": 2.926670161337294,
+      "grad_norm": 0.3077658414840698,
+      "learning_rate": 0.0005652132867132866,
+      "loss": 3.7278,
+      "step": 10050
+    },
+    {
+      "epoch": 2.941231288951016,
+      "grad_norm": 0.2965807020664215,
+      "learning_rate": 0.0005650384615384615,
+      "loss": 3.7136,
+      "step": 10100
+    },
+    {
+      "epoch": 2.955792416564739,
+      "grad_norm": 0.3523092269897461,
+      "learning_rate": 0.0005648636363636363,
+      "loss": 3.7209,
+      "step": 10150
+    },
+    {
+      "epoch": 2.970353544178461,
+      "grad_norm": 0.32154616713523865,
+      "learning_rate": 0.0005646888111888111,
+      "loss": 3.71,
+      "step": 10200
+    },
+    {
+      "epoch": 2.9849146717921835,
+      "grad_norm": 0.32815372943878174,
+      "learning_rate": 0.000564513986013986,
+      "loss": 3.7295,
+      "step": 10250
+    },
+    {
+      "epoch": 2.9994757994059063,
+      "grad_norm": 0.3026379346847534,
+      "learning_rate": 0.0005643391608391607,
+      "loss": 3.7079,
+      "step": 10300
+    },
+    {
+      "epoch": 3.0139786825091734,
+      "grad_norm": 0.32696130871772766,
+      "learning_rate": 0.0005641643356643355,
+      "loss": 3.6198,
+      "step": 10350
+    },
+    {
+      "epoch": 3.0285398101228957,
+      "grad_norm": 0.3219397962093353,
+      "learning_rate": 0.0005639895104895105,
+      "loss": 3.6018,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0431009377366185,
+      "grad_norm": 0.32359954714775085,
+      "learning_rate": 0.0005638146853146853,
+      "loss": 3.6216,
+      "step": 10450
+    },
+    {
+      "epoch": 3.057662065350341,
+      "grad_norm": 0.33311325311660767,
+      "learning_rate": 0.0005636398601398601,
+      "loss": 3.6286,
+      "step": 10500
+    },
+    {
+      "epoch": 3.072223192964063,
+      "grad_norm": 0.3124459385871887,
+      "learning_rate": 0.000563465034965035,
+      "loss": 3.6296,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0867843205777854,
+      "grad_norm": 0.3220648169517517,
+      "learning_rate": 0.0005632902097902098,
+      "loss": 3.6115,
+      "step": 10600
+    },
+    {
+      "epoch": 3.101345448191508,
+      "grad_norm": 0.3088878095149994,
+      "learning_rate": 0.0005631153846153846,
+      "loss": 3.6037,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1159065758052304,
+      "grad_norm": 0.30444589257240295,
+      "learning_rate": 0.0005629405594405594,
+      "loss": 3.6469,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1304677034189528,
+      "grad_norm": 0.32744741439819336,
+      "learning_rate": 0.0005627657342657343,
+      "loss": 3.6301,
+      "step": 10750
+    },
+    {
+      "epoch": 3.145028831032675,
+      "grad_norm": 0.3285087049007416,
+      "learning_rate": 0.0005625909090909091,
+      "loss": 3.6354,
+      "step": 10800
+    },
+    {
+      "epoch": 3.1595899586463974,
+      "grad_norm": 0.3226669430732727,
+      "learning_rate": 0.0005624160839160839,
+      "loss": 3.6348,
+      "step": 10850
+    },
+    {
+      "epoch": 3.17415108626012,
+      "grad_norm": 0.32627004384994507,
+      "learning_rate": 0.0005622412587412587,
+      "loss": 3.6261,
+      "step": 10900
+    },
+    {
+      "epoch": 3.1887122138738424,
+      "grad_norm": 0.3208872377872467,
+      "learning_rate": 0.0005620664335664336,
+      "loss": 3.6344,
+      "step": 10950
+    },
+    {
+      "epoch": 3.2032733414875647,
+      "grad_norm": 0.3367385268211365,
+      "learning_rate": 0.0005618916083916083,
+      "loss": 3.6416,
+      "step": 11000
+    },
+    {
+      "epoch": 3.2032733414875647,
+      "eval_accuracy": 0.35108838648700996,
+      "eval_loss": 3.7168452739715576,
+      "eval_runtime": 182.5337,
+      "eval_samples_per_second": 91.167,
+      "eval_steps_per_second": 5.703,
+      "step": 11000
+    },
+    {
+      "epoch": 3.217834469101287,
+      "grad_norm": 0.32166630029678345,
+      "learning_rate": 0.0005617167832167832,
+      "loss": 3.6309,
+      "step": 11050
+    },
+    {
+      "epoch": 3.2323955967150098,
+      "grad_norm": 0.32037273049354553,
+      "learning_rate": 0.000561541958041958,
+      "loss": 3.6488,
+      "step": 11100
+    },
+    {
+      "epoch": 3.246956724328732,
+      "grad_norm": 0.32968729734420776,
+      "learning_rate": 0.0005613671328671328,
+      "loss": 3.6462,
+      "step": 11150
+    },
+    {
+      "epoch": 3.2615178519424544,
+      "grad_norm": 0.3250342905521393,
+      "learning_rate": 0.0005611923076923077,
+      "loss": 3.6474,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2760789795561767,
+      "grad_norm": 0.3201673924922943,
+      "learning_rate": 0.0005610174825174825,
+      "loss": 3.6506,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2906401071698994,
+      "grad_norm": 0.3127792477607727,
+      "learning_rate": 0.0005608426573426573,
+      "loss": 3.6342,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3052012347836217,
+      "grad_norm": 0.3302961587905884,
+      "learning_rate": 0.0005606678321678321,
+      "loss": 3.6471,
+      "step": 11350
+    },
+    {
+      "epoch": 3.319762362397344,
+      "grad_norm": 0.32816413044929504,
+      "learning_rate": 0.000560493006993007,
+      "loss": 3.6385,
+      "step": 11400
+    },
+    {
+      "epoch": 3.3343234900110663,
+      "grad_norm": 0.31450170278549194,
+      "learning_rate": 0.0005603181818181818,
+      "loss": 3.6462,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3488846176247886,
+      "grad_norm": 0.32757627964019775,
+      "learning_rate": 0.0005601433566433566,
+      "loss": 3.646,
+      "step": 11500
+    },
+    {
+      "epoch": 3.3634457452385114,
+      "grad_norm": 0.32431358098983765,
+      "learning_rate": 0.0005599685314685314,
+      "loss": 3.6413,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3780068728522337,
+      "grad_norm": 0.3174758851528168,
+      "learning_rate": 0.0005597937062937063,
+      "loss": 3.643,
+      "step": 11600
+    },
+    {
+      "epoch": 3.392568000465956,
+      "grad_norm": 0.3019546866416931,
+      "learning_rate": 0.0005596188811188811,
+      "loss": 3.6347,
+      "step": 11650
+    },
+    {
+      "epoch": 3.4071291280796787,
+      "grad_norm": 0.3143378794193268,
+      "learning_rate": 0.0005594440559440559,
+      "loss": 3.6397,
+      "step": 11700
+    },
+    {
+      "epoch": 3.421690255693401,
+      "grad_norm": 0.3293830454349518,
+      "learning_rate": 0.0005592692307692307,
+      "loss": 3.6487,
+      "step": 11750
+    },
+    {
+      "epoch": 3.4362513833071233,
+      "grad_norm": 0.3159070909023285,
+      "learning_rate": 0.0005590944055944055,
+      "loss": 3.6391,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4508125109208456,
+      "grad_norm": 0.31916487216949463,
+      "learning_rate": 0.0005589195804195803,
+      "loss": 3.6553,
+      "step": 11850
+    },
+    {
+      "epoch": 3.465373638534568,
+      "grad_norm": 0.30976319313049316,
+      "learning_rate": 0.0005587447552447552,
+      "loss": 3.6392,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4799347661482907,
+      "grad_norm": 0.3121052086353302,
+      "learning_rate": 0.00055856993006993,
+      "loss": 3.6477,
+      "step": 11950
+    },
+    {
+      "epoch": 3.494495893762013,
+      "grad_norm": 0.31760597229003906,
+      "learning_rate": 0.0005583951048951048,
+      "loss": 3.6462,
+      "step": 12000
+    },
+    {
+      "epoch": 3.494495893762013,
+      "eval_accuracy": 0.3529633678918372,
+      "eval_loss": 3.695136785507202,
+      "eval_runtime": 181.8852,
+      "eval_samples_per_second": 91.492,
+      "eval_steps_per_second": 5.723,
+      "step": 12000
+    },
+    {
+      "epoch": 3.5090570213757353,
+      "grad_norm": 0.324222594499588,
+      "learning_rate": 0.0005582202797202797,
+      "loss": 3.6489,
+      "step": 12050
+    },
+    {
+      "epoch": 3.523618148989458,
+      "grad_norm": 0.3073160946369171,
+      "learning_rate": 0.0005580454545454545,
+      "loss": 3.6491,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53817927660318,
+      "grad_norm": 0.333292692899704,
+      "learning_rate": 0.0005578706293706293,
+      "loss": 3.6485,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5527404042169026,
+      "grad_norm": 0.32193732261657715,
+      "learning_rate": 0.0005576958041958041,
+      "loss": 3.6356,
+      "step": 12200
+    },
+    {
+      "epoch": 3.567301531830625,
+      "grad_norm": 0.305073618888855,
+      "learning_rate": 0.000557520979020979,
+      "loss": 3.6421,
+      "step": 12250
+    },
+    {
+      "epoch": 3.5818626594443472,
+      "grad_norm": 0.30666157603263855,
+      "learning_rate": 0.0005573461538461538,
+      "loss": 3.6479,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59642378705807,
+      "grad_norm": 0.30662357807159424,
+      "learning_rate": 0.0005571713286713286,
+      "loss": 3.6384,
+      "step": 12350
+    },
+    {
+      "epoch": 3.6109849146717923,
+      "grad_norm": 0.3077661991119385,
+      "learning_rate": 0.0005569965034965034,
+      "loss": 3.6267,
+      "step": 12400
+    },
+    {
+      "epoch": 3.6255460422855146,
+      "grad_norm": 0.3115411698818207,
+      "learning_rate": 0.0005568216783216783,
+      "loss": 3.6399,
+      "step": 12450
+    },
+    {
+      "epoch": 3.640107169899237,
+      "grad_norm": 0.347708523273468,
+      "learning_rate": 0.000556646853146853,
+      "loss": 3.6376,
+      "step": 12500
+    },
+    {
+      "epoch": 3.654668297512959,
+      "grad_norm": 0.33384427428245544,
+      "learning_rate": 0.0005564720279720279,
+      "loss": 3.6411,
+      "step": 12550
+    },
+    {
+      "epoch": 3.669229425126682,
+      "grad_norm": 0.31211602687835693,
+      "learning_rate": 0.0005562972027972027,
+      "loss": 3.6455,
+      "step": 12600
+    },
+    {
+      "epoch": 3.6837905527404042,
+      "grad_norm": 0.3280918002128601,
+      "learning_rate": 0.0005561223776223775,
+      "loss": 3.6432,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6983516803541265,
+      "grad_norm": 0.30643153190612793,
+      "learning_rate": 0.0005559475524475524,
+      "loss": 3.6345,
+      "step": 12700
+    },
+    {
+      "epoch": 3.7129128079678493,
+      "grad_norm": 0.32905712723731995,
+      "learning_rate": 0.0005557727272727272,
+      "loss": 3.6532,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7274739355815716,
+      "grad_norm": 0.2936493158340454,
+      "learning_rate": 0.000555597902097902,
+      "loss": 3.6437,
+      "step": 12800
+    },
+    {
+      "epoch": 3.742035063195294,
+      "grad_norm": 0.3418334126472473,
+      "learning_rate": 0.0005554230769230768,
+      "loss": 3.6344,
+      "step": 12850
+    },
+    {
+      "epoch": 3.756596190809016,
+      "grad_norm": 0.32170310616493225,
+      "learning_rate": 0.0005552482517482517,
+      "loss": 3.6389,
+      "step": 12900
+    },
+    {
+      "epoch": 3.7711573184227385,
+      "grad_norm": 0.3068753480911255,
+      "learning_rate": 0.0005550734265734265,
+      "loss": 3.6438,
+      "step": 12950
+    },
+    {
+      "epoch": 3.7857184460364612,
+      "grad_norm": 0.31144753098487854,
+      "learning_rate": 0.0005548986013986013,
+      "loss": 3.6498,
+      "step": 13000
+    },
+    {
+      "epoch": 3.7857184460364612,
+      "eval_accuracy": 0.35463855041264525,
+      "eval_loss": 3.6793668270111084,
+      "eval_runtime": 181.7036,
+      "eval_samples_per_second": 91.583,
+      "eval_steps_per_second": 5.729,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8002795736501835,
+      "grad_norm": 0.3221241235733032,
+      "learning_rate": 0.0005547237762237761,
+      "loss": 3.6364,
+      "step": 13050
+    },
+    {
+      "epoch": 3.814840701263906,
+      "grad_norm": 0.3083803653717041,
+      "learning_rate": 0.000554548951048951,
+      "loss": 3.6327,
+      "step": 13100
+    },
+    {
+      "epoch": 3.829401828877628,
+      "grad_norm": 0.3062104880809784,
+      "learning_rate": 0.0005543741258741258,
+      "loss": 3.6334,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8439629564913504,
+      "grad_norm": 0.3108167350292206,
+      "learning_rate": 0.0005541993006993006,
+      "loss": 3.6318,
+      "step": 13200
+    },
+    {
+      "epoch": 3.858524084105073,
+      "grad_norm": 0.3138327896595001,
+      "learning_rate": 0.0005540244755244756,
+      "loss": 3.6419,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8730852117187955,
+      "grad_norm": 0.3089863061904907,
+      "learning_rate": 0.0005538496503496502,
+      "loss": 3.6278,
+      "step": 13300
+    },
+    {
+      "epoch": 3.887646339332518,
+      "grad_norm": 0.301481693983078,
+      "learning_rate": 0.0005536748251748252,
+      "loss": 3.6352,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9022074669462405,
+      "grad_norm": 0.315318763256073,
+      "learning_rate": 0.0005535,
+      "loss": 3.6306,
+      "step": 13400
+    },
+    {
+      "epoch": 3.916768594559963,
+      "grad_norm": 0.3293091058731079,
+      "learning_rate": 0.0005533251748251748,
+      "loss": 3.6413,
+      "step": 13450
+    },
+    {
+      "epoch": 3.931329722173685,
+      "grad_norm": 0.30823683738708496,
+      "learning_rate": 0.0005531503496503496,
+      "loss": 3.6494,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9458908497874075,
+      "grad_norm": 0.30845707654953003,
+      "learning_rate": 0.0005529755244755245,
+      "loss": 3.6276,
+      "step": 13550
+    },
+    {
+      "epoch": 3.9604519774011298,
+      "grad_norm": 0.30484089255332947,
+      "learning_rate": 0.0005528006993006993,
+      "loss": 3.6249,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9750131050148525,
+      "grad_norm": 0.31132036447525024,
+      "learning_rate": 0.0005526258741258741,
+      "loss": 3.6481,
+      "step": 13650
+    },
+    {
+      "epoch": 3.989574232628575,
+      "grad_norm": 0.2920059859752655,
+      "learning_rate": 0.0005524510489510489,
+      "loss": 3.6361,
+      "step": 13700
+    },
+    {
+      "epoch": 4.004077115731842,
+      "grad_norm": 0.31949788331985474,
+      "learning_rate": 0.0005522762237762238,
+      "loss": 3.6005,
+      "step": 13750
+    },
+    {
+      "epoch": 4.018638243345564,
+      "grad_norm": 0.3100167214870453,
+      "learning_rate": 0.0005521013986013986,
+      "loss": 3.5291,
+      "step": 13800
+    },
+    {
+      "epoch": 4.033199370959287,
+      "grad_norm": 0.3215591609477997,
+      "learning_rate": 0.0005519265734265734,
+      "loss": 3.5248,
+      "step": 13850
+    },
+    {
+      "epoch": 4.04776049857301,
+      "grad_norm": 0.31551459431648254,
+      "learning_rate": 0.0005517517482517482,
+      "loss": 3.5267,
+      "step": 13900
+    },
+    {
+      "epoch": 4.062321626186732,
+      "grad_norm": 0.3320501744747162,
+      "learning_rate": 0.0005515769230769231,
+      "loss": 3.5426,
+      "step": 13950
+    },
+    {
+      "epoch": 4.076882753800454,
+      "grad_norm": 0.32519182562828064,
+      "learning_rate": 0.0005514020979020979,
+      "loss": 3.533,
+      "step": 14000
+    },
+    {
+      "epoch": 4.076882753800454,
+      "eval_accuracy": 0.3559558824307633,
+      "eval_loss": 3.6741702556610107,
+      "eval_runtime": 181.4446,
+      "eval_samples_per_second": 91.714,
+      "eval_steps_per_second": 5.737,
+      "step": 14000
+    },
+    {
+      "epoch": 4.091443881414177,
+      "grad_norm": 0.3130313754081726,
+      "learning_rate": 0.0005512272727272727,
+      "loss": 3.5326,
+      "step": 14050
+    },
+    {
+      "epoch": 4.106005009027899,
+      "grad_norm": 0.315218061208725,
+      "learning_rate": 0.0005510524475524475,
+      "loss": 3.5588,
+      "step": 14100
+    },
+    {
+      "epoch": 4.120566136641622,
+      "grad_norm": 0.3174503743648529,
+      "learning_rate": 0.0005508776223776223,
+      "loss": 3.5427,
+      "step": 14150
+    },
+    {
+      "epoch": 4.135127264255344,
+      "grad_norm": 0.2998291254043579,
+      "learning_rate": 0.0005507027972027972,
+      "loss": 3.56,
+      "step": 14200
+    },
+    {
+      "epoch": 4.149688391869066,
+      "grad_norm": 0.31294891238212585,
+      "learning_rate": 0.000550527972027972,
+      "loss": 3.5353,
+      "step": 14250
+    },
+    {
+      "epoch": 4.164249519482789,
+      "grad_norm": 0.30971232056617737,
+      "learning_rate": 0.0005503531468531468,
+      "loss": 3.5362,
+      "step": 14300
+    },
+    {
+      "epoch": 4.178810647096511,
+      "grad_norm": 0.3312567174434662,
+      "learning_rate": 0.0005501783216783216,
+      "loss": 3.5566,
+      "step": 14350
+    },
+    {
+      "epoch": 4.193371774710234,
+      "grad_norm": 0.3150828778743744,
+      "learning_rate": 0.0005500034965034965,
+      "loss": 3.558,
+      "step": 14400
+    },
+    {
+      "epoch": 4.207932902323956,
+      "grad_norm": 0.3072719871997833,
+      "learning_rate": 0.0005498286713286713,
+      "loss": 3.5581,
+      "step": 14450
+    },
+    {
+      "epoch": 4.222494029937678,
+      "grad_norm": 0.34312841296195984,
+      "learning_rate": 0.0005496538461538461,
+      "loss": 3.5609,
+      "step": 14500
+    },
+    {
+      "epoch": 4.237055157551401,
+      "grad_norm": 0.33278995752334595,
+      "learning_rate": 0.0005494790209790209,
+      "loss": 3.5608,
+      "step": 14550
+    },
+    {
+      "epoch": 4.251616285165123,
+      "grad_norm": 0.31530410051345825,
+      "learning_rate": 0.0005493041958041958,
+      "loss": 3.559,
+      "step": 14600
+    },
+    {
+      "epoch": 4.266177412778846,
+      "grad_norm": 0.33409106731414795,
+      "learning_rate": 0.0005491293706293706,
+      "loss": 3.5706,
+      "step": 14650
+    },
+    {
+      "epoch": 4.280738540392568,
+      "grad_norm": 0.31736454367637634,
+      "learning_rate": 0.0005489545454545454,
+      "loss": 3.5602,
+      "step": 14700
+    },
+    {
+      "epoch": 4.29529966800629,
+      "grad_norm": 0.30737069249153137,
+      "learning_rate": 0.0005487797202797203,
+      "loss": 3.5574,
+      "step": 14750
+    },
+    {
+      "epoch": 4.309860795620013,
+      "grad_norm": 0.32270538806915283,
+      "learning_rate": 0.000548604895104895,
+      "loss": 3.5568,
+      "step": 14800
+    },
+    {
+      "epoch": 4.324421923233735,
+      "grad_norm": 0.3326343297958374,
+      "learning_rate": 0.0005484300699300699,
+      "loss": 3.5657,
+      "step": 14850
+    },
+    {
+      "epoch": 4.338983050847458,
+      "grad_norm": 0.32561352849006653,
+      "learning_rate": 0.0005482552447552447,
+      "loss": 3.5686,
+      "step": 14900
+    },
+    {
+      "epoch": 4.35354417846118,
+      "grad_norm": 0.3027591407299042,
+      "learning_rate": 0.0005480804195804195,
+      "loss": 3.566,
+      "step": 14950
+    },
+    {
+      "epoch": 4.368105306074902,
+      "grad_norm": 0.31824353337287903,
+      "learning_rate": 0.0005479055944055943,
+      "loss": 3.5749,
+      "step": 15000
+    },
+    {
+      "epoch": 4.368105306074902,
+      "eval_accuracy": 0.35690054660694104,
+      "eval_loss": 3.66287899017334,
+      "eval_runtime": 181.6353,
+      "eval_samples_per_second": 91.618,
+      "eval_steps_per_second": 5.731,
+      "step": 15000
+    },
+    {
+      "epoch": 4.382666433688625,
+      "grad_norm": 0.3168264329433441,
+      "learning_rate": 0.0005477307692307692,
+      "loss": 3.5774,
+      "step": 15050
+    },
+    {
+      "epoch": 4.397227561302348,
+      "grad_norm": 0.3169305920600891,
+      "learning_rate": 0.000547555944055944,
+      "loss": 3.5736,
+      "step": 15100
+    },
+    {
+      "epoch": 4.41178868891607,
+      "grad_norm": 0.3356797397136688,
+      "learning_rate": 0.0005473811188811188,
+      "loss": 3.5811,
+      "step": 15150
+    },
+    {
+      "epoch": 4.426349816529792,
+      "grad_norm": 0.31550532579421997,
+      "learning_rate": 0.0005472062937062936,
+      "loss": 3.5761,
+      "step": 15200
+    },
+    {
+      "epoch": 4.440910944143514,
+      "grad_norm": 0.3461132049560547,
+      "learning_rate": 0.0005470314685314685,
+      "loss": 3.5665,
+      "step": 15250
+    },
+    {
+      "epoch": 4.455472071757237,
+      "grad_norm": 0.31411316990852356,
+      "learning_rate": 0.0005468566433566433,
+      "loss": 3.5696,
+      "step": 15300
+    },
+    {
+      "epoch": 4.47003319937096,
+      "grad_norm": 0.3277323544025421,
+      "learning_rate": 0.0005466818181818181,
+      "loss": 3.5694,
+      "step": 15350
+    },
+    {
+      "epoch": 4.4845943269846815,
+      "grad_norm": 0.31850603222846985,
+      "learning_rate": 0.000546506993006993,
+      "loss": 3.5765,
+      "step": 15400
+    },
+    {
+      "epoch": 4.499155454598404,
+      "grad_norm": 0.33567726612091064,
+      "learning_rate": 0.0005463321678321678,
+      "loss": 3.5728,
+      "step": 15450
+    },
+    {
+      "epoch": 4.513716582212126,
+      "grad_norm": 0.33373895287513733,
+      "learning_rate": 0.0005461573426573426,
+      "loss": 3.5827,
+      "step": 15500
+    },
+    {
+      "epoch": 4.528277709825849,
+      "grad_norm": 0.33843833208084106,
+      "learning_rate": 0.0005459825174825174,
+      "loss": 3.5677,
+      "step": 15550
+    },
+    {
+      "epoch": 4.542838837439572,
+      "grad_norm": 0.30723223090171814,
+      "learning_rate": 0.0005458076923076922,
+      "loss": 3.5769,
+      "step": 15600
+    },
+    {
+      "epoch": 4.5573999650532935,
+      "grad_norm": 0.3198756277561188,
+      "learning_rate": 0.000545632867132867,
+      "loss": 3.5735,
+      "step": 15650
+    },
+    {
+      "epoch": 4.571961092667016,
+      "grad_norm": 0.32363468408584595,
+      "learning_rate": 0.0005454580419580419,
+      "loss": 3.5751,
+      "step": 15700
+    },
+    {
+      "epoch": 4.586522220280738,
+      "grad_norm": 0.34676221013069153,
+      "learning_rate": 0.0005452832167832167,
+      "loss": 3.5806,
+      "step": 15750
+    },
+    {
+      "epoch": 4.601083347894461,
+      "grad_norm": 0.32409968972206116,
+      "learning_rate": 0.0005451083916083915,
+      "loss": 3.5709,
+      "step": 15800
+    },
+    {
+      "epoch": 4.615644475508184,
+      "grad_norm": 0.31708860397338867,
+      "learning_rate": 0.0005449335664335663,
+      "loss": 3.5786,
+      "step": 15850
+    },
+    {
+      "epoch": 4.630205603121905,
+      "grad_norm": 0.3069595694541931,
+      "learning_rate": 0.0005447587412587412,
+      "loss": 3.5719,
+      "step": 15900
+    },
+    {
+      "epoch": 4.644766730735628,
+      "grad_norm": 0.30829885601997375,
+      "learning_rate": 0.000544583916083916,
+      "loss": 3.5832,
+      "step": 15950
+    },
+    {
+      "epoch": 4.659327858349351,
+      "grad_norm": 0.3094317317008972,
+      "learning_rate": 0.0005444090909090908,
+      "loss": 3.5851,
+      "step": 16000
+    },
+    {
+      "epoch": 4.659327858349351,
+      "eval_accuracy": 0.3582172906354063,
+      "eval_loss": 3.6480820178985596,
+      "eval_runtime": 181.5728,
+      "eval_samples_per_second": 91.649,
+      "eval_steps_per_second": 5.733,
+      "step": 16000
+    },
+    {
+      "epoch": 4.673888985963073,
+      "grad_norm": 0.3228926360607147,
+      "learning_rate": 0.0005442342657342657,
+      "loss": 3.5847,
+      "step": 16050
+    },
+    {
+      "epoch": 4.6884501135767955,
+      "grad_norm": 0.31402644515037537,
+      "learning_rate": 0.0005440594405594405,
+      "loss": 3.5727,
+      "step": 16100
+    },
+    {
+      "epoch": 4.703011241190518,
+      "grad_norm": 0.32076942920684814,
+      "learning_rate": 0.0005438846153846153,
+      "loss": 3.5834,
+      "step": 16150
+    },
+    {
+      "epoch": 4.71757236880424,
+      "grad_norm": 0.32715633511543274,
+      "learning_rate": 0.0005437097902097901,
+      "loss": 3.5792,
+      "step": 16200
+    },
+    {
+      "epoch": 4.732133496417963,
+      "grad_norm": 0.33358898758888245,
+      "learning_rate": 0.0005435349650349651,
+      "loss": 3.5713,
+      "step": 16250
+    },
+    {
+      "epoch": 4.746694624031685,
+      "grad_norm": 0.31136220693588257,
+      "learning_rate": 0.0005433601398601397,
+      "loss": 3.5786,
+      "step": 16300
+    },
+    {
+      "epoch": 4.7612557516454075,
+      "grad_norm": 0.3148178160190582,
+      "learning_rate": 0.0005431853146853147,
+      "loss": 3.5716,
+      "step": 16350
+    },
+    {
+      "epoch": 4.77581687925913,
+      "grad_norm": 0.321322500705719,
+      "learning_rate": 0.0005430104895104895,
+      "loss": 3.572,
+      "step": 16400
+    },
+    {
+      "epoch": 4.790378006872852,
+      "grad_norm": 0.3095892071723938,
+      "learning_rate": 0.0005428356643356643,
+      "loss": 3.5684,
+      "step": 16450
+    },
+    {
+      "epoch": 4.804939134486575,
+      "grad_norm": 0.30092310905456543,
+      "learning_rate": 0.0005426608391608391,
+      "loss": 3.5772,
+      "step": 16500
+    },
+    {
+      "epoch": 4.819500262100297,
+      "grad_norm": 0.3169448673725128,
+      "learning_rate": 0.000542486013986014,
+      "loss": 3.5718,
+      "step": 16550
+    },
+    {
+      "epoch": 4.834061389714019,
+      "grad_norm": 0.31526780128479004,
+      "learning_rate": 0.0005423111888111888,
+      "loss": 3.5614,
+      "step": 16600
+    },
+    {
+      "epoch": 4.848622517327742,
+      "grad_norm": 0.3070712983608246,
+      "learning_rate": 0.0005421363636363636,
+      "loss": 3.5931,
+      "step": 16650
+    },
+    {
+      "epoch": 4.863183644941464,
+      "grad_norm": 0.3159180283546448,
+      "learning_rate": 0.0005419615384615385,
+      "loss": 3.58,
+      "step": 16700
+    },
+    {
+      "epoch": 4.877744772555187,
+      "grad_norm": 0.3154292702674866,
+      "learning_rate": 0.0005417867132867133,
+      "loss": 3.587,
+      "step": 16750
+    },
+    {
+      "epoch": 4.892305900168909,
+      "grad_norm": 0.3371017277240753,
+      "learning_rate": 0.0005416118881118881,
+      "loss": 3.5729,
+      "step": 16800
+    },
+    {
+      "epoch": 4.906867027782631,
+      "grad_norm": 0.3221917748451233,
+      "learning_rate": 0.0005414370629370629,
+      "loss": 3.5721,
+      "step": 16850
+    },
+    {
+      "epoch": 4.921428155396354,
+      "grad_norm": 0.30847296118736267,
+      "learning_rate": 0.0005412622377622378,
+      "loss": 3.59,
+      "step": 16900
+    },
+    {
+      "epoch": 4.935989283010076,
+      "grad_norm": 0.3217724859714508,
+      "learning_rate": 0.0005410874125874126,
+      "loss": 3.5694,
+      "step": 16950
+    },
+    {
+      "epoch": 4.950550410623799,
+      "grad_norm": 0.319769948720932,
+      "learning_rate": 0.0005409125874125874,
+      "loss": 3.5783,
+      "step": 17000
+    },
+    {
+      "epoch": 4.950550410623799,
+      "eval_accuracy": 0.3593890364154928,
+      "eval_loss": 3.6356794834136963,
+      "eval_runtime": 181.6809,
+      "eval_samples_per_second": 91.595,
+      "eval_steps_per_second": 5.73,
+      "step": 17000
+    },
+    {
+      "epoch": 4.9651115382375215,
+      "grad_norm": 0.33256304264068604,
+      "learning_rate": 0.0005407377622377622,
+      "loss": 3.5772,
+      "step": 17050
+    },
+    {
+      "epoch": 4.979672665851243,
+      "grad_norm": 0.31527116894721985,
+      "learning_rate": 0.000540562937062937,
+      "loss": 3.5829,
+      "step": 17100
+    },
+    {
+      "epoch": 4.994233793464966,
+      "grad_norm": 0.3263448476791382,
+      "learning_rate": 0.0005403881118881118,
+      "loss": 3.5755,
+      "step": 17150
+    },
+    {
+      "epoch": 5.008736676568233,
+      "grad_norm": 0.30820319056510925,
+      "learning_rate": 0.0005402132867132867,
+      "loss": 3.5017,
+      "step": 17200
+    },
+    {
+      "epoch": 5.023297804181956,
+      "grad_norm": 0.324663370847702,
+      "learning_rate": 0.0005400384615384615,
+      "loss": 3.4585,
+      "step": 17250
+    },
+    {
+      "epoch": 5.037858931795678,
+      "grad_norm": 0.33028483390808105,
+      "learning_rate": 0.0005398636363636363,
+      "loss": 3.4698,
+      "step": 17300
+    },
+    {
+      "epoch": 5.052420059409401,
+      "grad_norm": 0.30313462018966675,
+      "learning_rate": 0.0005396888111888111,
+      "loss": 3.471,
+      "step": 17350
+    },
+    {
+      "epoch": 5.066981187023123,
+      "grad_norm": 0.3187935948371887,
+      "learning_rate": 0.000539513986013986,
+      "loss": 3.4774,
+      "step": 17400
+    },
+    {
+      "epoch": 5.081542314636845,
+      "grad_norm": 0.3407951295375824,
+      "learning_rate": 0.0005393391608391608,
+      "loss": 3.4883,
+      "step": 17450
+    },
+    {
+      "epoch": 5.096103442250568,
+      "grad_norm": 0.3357192575931549,
+      "learning_rate": 0.0005391643356643356,
+      "loss": 3.4888,
+      "step": 17500
+    },
+    {
+      "epoch": 5.110664569864291,
+      "grad_norm": 0.3398742079734802,
+      "learning_rate": 0.0005389895104895105,
+      "loss": 3.4936,
+      "step": 17550
+    },
+    {
+      "epoch": 5.125225697478013,
+      "grad_norm": 0.3227720558643341,
+      "learning_rate": 0.0005388146853146853,
+      "loss": 3.4852,
+      "step": 17600
+    },
+    {
+      "epoch": 5.139786825091735,
+      "grad_norm": 0.3157457113265991,
+      "learning_rate": 0.0005386398601398601,
+      "loss": 3.4914,
+      "step": 17650
+    },
+    {
+      "epoch": 5.154347952705457,
+      "grad_norm": 0.3167363405227661,
+      "learning_rate": 0.0005384650349650349,
+      "loss": 3.4853,
+      "step": 17700
+    },
+    {
+      "epoch": 5.16890908031918,
+      "grad_norm": 0.31787964701652527,
+      "learning_rate": 0.0005382902097902098,
+      "loss": 3.4914,
+      "step": 17750
+    },
+    {
+      "epoch": 5.183470207932903,
+      "grad_norm": 0.3270482122898102,
+      "learning_rate": 0.0005381153846153845,
+      "loss": 3.5016,
+      "step": 17800
+    },
+    {
+      "epoch": 5.1980313355466246,
+      "grad_norm": 0.3301296830177307,
+      "learning_rate": 0.0005379405594405594,
+      "loss": 3.5044,
+      "step": 17850
+    },
+    {
+      "epoch": 5.212592463160347,
+      "grad_norm": 0.3604094386100769,
+      "learning_rate": 0.0005377657342657342,
+      "loss": 3.5047,
+      "step": 17900
+    },
+    {
+      "epoch": 5.227153590774069,
+      "grad_norm": 0.3370974361896515,
+      "learning_rate": 0.000537590909090909,
+      "loss": 3.5027,
+      "step": 17950
+    },
+    {
+      "epoch": 5.241714718387792,
+      "grad_norm": 0.35089635848999023,
+      "learning_rate": 0.0005374160839160838,
+      "loss": 3.4935,
+      "step": 18000
+    },
+    {
+      "epoch": 5.241714718387792,
+      "eval_accuracy": 0.36011414525531743,
+      "eval_loss": 3.6362621784210205,
+      "eval_runtime": 181.5644,
+      "eval_samples_per_second": 91.653,
+      "eval_steps_per_second": 5.734,
+      "step": 18000
+    },
+    {
+      "epoch": 5.256275846001515,
+      "grad_norm": 0.3056846857070923,
+      "learning_rate": 0.0005372412587412587,
+      "loss": 3.5168,
+      "step": 18050
+    },
+    {
+      "epoch": 5.2708369736152365,
+      "grad_norm": 0.32178497314453125,
+      "learning_rate": 0.0005370664335664335,
+      "loss": 3.5152,
+      "step": 18100
+    },
+    {
+      "epoch": 5.285398101228959,
+      "grad_norm": 0.33543261885643005,
+      "learning_rate": 0.0005368916083916083,
+      "loss": 3.5213,
+      "step": 18150
+    },
+    {
+      "epoch": 5.299959228842681,
+      "grad_norm": 0.31193044781684875,
+      "learning_rate": 0.0005367167832167832,
+      "loss": 3.5102,
+      "step": 18200
+    },
+    {
+      "epoch": 5.314520356456404,
+      "grad_norm": 0.31134116649627686,
+      "learning_rate": 0.000536541958041958,
+      "loss": 3.5019,
+      "step": 18250
+    },
+    {
+      "epoch": 5.329081484070127,
+      "grad_norm": 0.3064328730106354,
+      "learning_rate": 0.0005363671328671328,
+      "loss": 3.5082,
+      "step": 18300
+    },
+    {
+      "epoch": 5.3436426116838485,
+      "grad_norm": 0.3232489228248596,
+      "learning_rate": 0.0005361923076923076,
+      "loss": 3.5088,
+      "step": 18350
+    },
+    {
+      "epoch": 5.358203739297571,
+      "grad_norm": 0.3162420690059662,
+      "learning_rate": 0.0005360174825174825,
+      "loss": 3.5201,
+      "step": 18400
+    },
+    {
+      "epoch": 5.372764866911294,
+      "grad_norm": 0.29694801568984985,
+      "learning_rate": 0.0005358426573426573,
+      "loss": 3.5179,
+      "step": 18450
+    },
+    {
+      "epoch": 5.387325994525016,
+      "grad_norm": 0.35571807622909546,
+      "learning_rate": 0.0005356678321678321,
+      "loss": 3.5171,
+      "step": 18500
+    },
+    {
+      "epoch": 5.401887122138739,
+      "grad_norm": 0.32287997007369995,
+      "learning_rate": 0.0005354930069930069,
+      "loss": 3.507,
+      "step": 18550
+    },
+    {
+      "epoch": 5.41644824975246,
+      "grad_norm": 0.3234080970287323,
+      "learning_rate": 0.0005353181818181817,
+      "loss": 3.5237,
+      "step": 18600
+    },
+    {
+      "epoch": 5.431009377366183,
+      "grad_norm": 0.3096522092819214,
+      "learning_rate": 0.0005351433566433565,
+      "loss": 3.5226,
+      "step": 18650
+    },
+    {
+      "epoch": 5.445570504979906,
+      "grad_norm": 0.3458673357963562,
+      "learning_rate": 0.0005349685314685314,
+      "loss": 3.5318,
+      "step": 18700
+    },
+    {
+      "epoch": 5.460131632593628,
+      "grad_norm": 0.341791033744812,
+      "learning_rate": 0.0005347937062937062,
+      "loss": 3.5091,
+      "step": 18750
+    },
+    {
+      "epoch": 5.4746927602073505,
+      "grad_norm": 0.3296269476413727,
+      "learning_rate": 0.000534618881118881,
+      "loss": 3.5258,
+      "step": 18800
+    },
+    {
+      "epoch": 5.489253887821073,
+      "grad_norm": 0.3013812005519867,
+      "learning_rate": 0.0005344440559440559,
+      "loss": 3.5136,
+      "step": 18850
+    },
+    {
+      "epoch": 5.503815015434795,
+      "grad_norm": 0.30301058292388916,
+      "learning_rate": 0.0005342692307692307,
+      "loss": 3.522,
+      "step": 18900
+    },
+    {
+      "epoch": 5.518376143048518,
+      "grad_norm": 0.33094391226768494,
+      "learning_rate": 0.0005340944055944055,
+      "loss": 3.5293,
+      "step": 18950
+    },
+    {
+      "epoch": 5.53293727066224,
+      "grad_norm": 0.3317716717720032,
+      "learning_rate": 0.0005339195804195803,
+      "loss": 3.5133,
+      "step": 19000
+    },
+    {
+      "epoch": 5.53293727066224,
+      "eval_accuracy": 0.36096437829325656,
+      "eval_loss": 3.627488613128662,
+      "eval_runtime": 181.4247,
+      "eval_samples_per_second": 91.724,
+      "eval_steps_per_second": 5.738,
+      "step": 19000
+    },
+    {
+      "epoch": 5.5474983982759625,
+      "grad_norm": 0.32238489389419556,
+      "learning_rate": 0.0005337447552447552,
+      "loss": 3.5201,
+      "step": 19050
+    },
+    {
+      "epoch": 5.562059525889685,
+      "grad_norm": 0.30963394045829773,
+      "learning_rate": 0.00053356993006993,
+      "loss": 3.5149,
+      "step": 19100
+    },
+    {
+      "epoch": 5.576620653503407,
+      "grad_norm": 0.3175974190235138,
+      "learning_rate": 0.0005333951048951048,
+      "loss": 3.5136,
+      "step": 19150
+    },
+    {
+      "epoch": 5.59118178111713,
+      "grad_norm": 0.32928696274757385,
+      "learning_rate": 0.0005332202797202796,
+      "loss": 3.5253,
+      "step": 19200
+    },
+    {
+      "epoch": 5.605742908730852,
+      "grad_norm": 0.3137303292751312,
+      "learning_rate": 0.0005330454545454546,
+      "loss": 3.5249,
+      "step": 19250
+    },
+    {
+      "epoch": 5.620304036344574,
+      "grad_norm": 0.332732617855072,
+      "learning_rate": 0.0005328706293706292,
+      "loss": 3.5172,
+      "step": 19300
+    },
+    {
+      "epoch": 5.634865163958297,
+      "grad_norm": 0.34113192558288574,
+      "learning_rate": 0.0005326958041958042,
+      "loss": 3.5251,
+      "step": 19350
+    },
+    {
+      "epoch": 5.649426291572019,
+      "grad_norm": 0.3195572793483734,
+      "learning_rate": 0.000532520979020979,
+      "loss": 3.5343,
+      "step": 19400
+    },
+    {
+      "epoch": 5.663987419185742,
+      "grad_norm": 0.3289816975593567,
+      "learning_rate": 0.0005323461538461538,
+      "loss": 3.5341,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6785485467994645,
+      "grad_norm": 0.31563058495521545,
+      "learning_rate": 0.0005321713286713287,
+      "loss": 3.5192,
+      "step": 19500
+    },
+    {
+      "epoch": 5.693109674413186,
+      "grad_norm": 0.3364487588405609,
+      "learning_rate": 0.0005319965034965035,
+      "loss": 3.5278,
+      "step": 19550
+    },
+    {
+      "epoch": 5.707670802026909,
+      "grad_norm": 0.31011462211608887,
+      "learning_rate": 0.0005318216783216783,
+      "loss": 3.5285,
+      "step": 19600
+    },
+    {
+      "epoch": 5.722231929640631,
+      "grad_norm": 0.31973445415496826,
+      "learning_rate": 0.0005316468531468531,
+      "loss": 3.5257,
+      "step": 19650
+    },
+    {
+      "epoch": 5.736793057254354,
+      "grad_norm": 0.33663949370384216,
+      "learning_rate": 0.000531472027972028,
+      "loss": 3.5427,
+      "step": 19700
+    },
+    {
+      "epoch": 5.7513541848680765,
+      "grad_norm": 0.320126473903656,
+      "learning_rate": 0.0005312972027972028,
+      "loss": 3.5276,
+      "step": 19750
+    },
+    {
+      "epoch": 5.765915312481798,
+      "grad_norm": 0.3229815363883972,
+      "learning_rate": 0.0005311223776223776,
+      "loss": 3.5301,
+      "step": 19800
+    },
+    {
+      "epoch": 5.780476440095521,
+      "grad_norm": 0.3471143841743469,
+      "learning_rate": 0.0005309475524475524,
+      "loss": 3.518,
+      "step": 19850
+    },
+    {
+      "epoch": 5.795037567709244,
+      "grad_norm": 0.33333975076675415,
+      "learning_rate": 0.0005307727272727273,
+      "loss": 3.5302,
+      "step": 19900
+    },
+    {
+      "epoch": 5.809598695322966,
+      "grad_norm": 0.31919199228286743,
+      "learning_rate": 0.0005305979020979021,
+      "loss": 3.5216,
+      "step": 19950
+    },
+    {
+      "epoch": 5.824159822936688,
+      "grad_norm": 0.2982664704322815,
+      "learning_rate": 0.0005304230769230769,
+      "loss": 3.5425,
+      "step": 20000
+    },
+    {
+      "epoch": 5.824159822936688,
+      "eval_accuracy": 0.3616781977317476,
+      "eval_loss": 3.6140241622924805,
+      "eval_runtime": 181.7884,
+      "eval_samples_per_second": 91.54,
+      "eval_steps_per_second": 5.726,
+      "step": 20000
+    },
+    {
+      "epoch": 5.83872095055041,
+      "grad_norm": 0.3217616081237793,
+      "learning_rate": 0.0005302482517482517,
+      "loss": 3.5286,
+      "step": 20050
+    },
+    {
+      "epoch": 5.853282078164133,
+      "grad_norm": 0.3139144480228424,
+      "learning_rate": 0.0005300734265734265,
+      "loss": 3.5229,
+      "step": 20100
+    },
+    {
+      "epoch": 5.867843205777856,
+      "grad_norm": 0.32186374068260193,
+      "learning_rate": 0.0005298986013986013,
+      "loss": 3.526,
+      "step": 20150
+    },
+    {
+      "epoch": 5.882404333391578,
+      "grad_norm": 0.3202000856399536,
+      "learning_rate": 0.0005297237762237762,
+      "loss": 3.5309,
+      "step": 20200
+    },
+    {
+      "epoch": 5.8969654610053,
+      "grad_norm": 0.3569493591785431,
+      "learning_rate": 0.000529548951048951,
+      "loss": 3.5113,
+      "step": 20250
+    },
+    {
+      "epoch": 5.911526588619022,
+      "grad_norm": 0.302033394575119,
+      "learning_rate": 0.0005293741258741258,
+      "loss": 3.5269,
+      "step": 20300
+    },
+    {
+      "epoch": 5.926087716232745,
+      "grad_norm": 0.31015071272850037,
+      "learning_rate": 0.0005291993006993007,
+      "loss": 3.5308,
+      "step": 20350
+    },
+    {
+      "epoch": 5.940648843846468,
+      "grad_norm": 0.3325614929199219,
+      "learning_rate": 0.0005290244755244755,
+      "loss": 3.5259,
+      "step": 20400
+    },
+    {
+      "epoch": 5.95520997146019,
+      "grad_norm": 0.3196668326854706,
+      "learning_rate": 0.0005288496503496503,
+      "loss": 3.5339,
+      "step": 20450
+    },
+    {
+      "epoch": 5.969771099073912,
+      "grad_norm": 0.313598096370697,
+      "learning_rate": 0.0005286748251748251,
+      "loss": 3.527,
+      "step": 20500
+    },
+    {
+      "epoch": 5.984332226687634,
+      "grad_norm": 0.32459649443626404,
+      "learning_rate": 0.0005285,
+      "loss": 3.5347,
+      "step": 20550
+    },
+    {
+      "epoch": 5.998893354301357,
+      "grad_norm": 0.31021371483802795,
+      "learning_rate": 0.0005283251748251748,
+      "loss": 3.5267,
+      "step": 20600
+    },
+    {
+      "epoch": 6.013396237404625,
+      "grad_norm": 0.3184387683868408,
+      "learning_rate": 0.0005281503496503496,
+      "loss": 3.4286,
+      "step": 20650
+    },
+    {
+      "epoch": 6.027957365018347,
+      "grad_norm": 0.3275751769542694,
+      "learning_rate": 0.0005279755244755244,
+      "loss": 3.4175,
+      "step": 20700
+    },
+    {
+      "epoch": 6.04251849263207,
+      "grad_norm": 0.320161372423172,
+      "learning_rate": 0.0005278006993006993,
+      "loss": 3.4261,
+      "step": 20750
+    },
+    {
+      "epoch": 6.0570796202457915,
+      "grad_norm": 0.33726227283477783,
+      "learning_rate": 0.000527625874125874,
+      "loss": 3.427,
+      "step": 20800
+    },
+    {
+      "epoch": 6.071640747859514,
+      "grad_norm": 0.3396624028682709,
+      "learning_rate": 0.0005274510489510489,
+      "loss": 3.4261,
+      "step": 20850
+    },
+    {
+      "epoch": 6.086201875473237,
+      "grad_norm": 0.3232564628124237,
+      "learning_rate": 0.0005272762237762238,
+      "loss": 3.4347,
+      "step": 20900
+    },
+    {
+      "epoch": 6.100763003086959,
+      "grad_norm": 0.31716442108154297,
+      "learning_rate": 0.0005271013986013985,
+      "loss": 3.4299,
+      "step": 20950
+    },
+    {
+      "epoch": 6.115324130700682,
+      "grad_norm": 0.32577764987945557,
+      "learning_rate": 0.0005269265734265734,
+      "loss": 3.4302,
+      "step": 21000
+    },
+    {
+      "epoch": 6.115324130700682,
+      "eval_accuracy": 0.3622031548937614,
+      "eval_loss": 3.619544267654419,
+      "eval_runtime": 181.3777,
+      "eval_samples_per_second": 91.748,
+      "eval_steps_per_second": 5.739,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1298852583144035,
+      "grad_norm": 0.3372443914413452,
+      "learning_rate": 0.0005267517482517482,
+      "loss": 3.451,
+      "step": 21050
+    },
+    {
+      "epoch": 6.144446385928126,
+      "grad_norm": 0.32983914017677307,
+      "learning_rate": 0.000526576923076923,
+      "loss": 3.4585,
+      "step": 21100
+    },
+    {
+      "epoch": 6.159007513541849,
+      "grad_norm": 0.3527901768684387,
+      "learning_rate": 0.0005264020979020978,
+      "loss": 3.4515,
+      "step": 21150
+    },
+    {
+      "epoch": 6.173568641155571,
+      "grad_norm": 0.32249078154563904,
+      "learning_rate": 0.0005262272727272727,
+      "loss": 3.4478,
+      "step": 21200
+    },
+    {
+      "epoch": 6.1881297687692935,
+      "grad_norm": 0.3154282867908478,
+      "learning_rate": 0.0005260524475524475,
+      "loss": 3.4408,
+      "step": 21250
+    },
+    {
+      "epoch": 6.202690896383016,
+      "grad_norm": 0.3283529281616211,
+      "learning_rate": 0.0005258776223776223,
+      "loss": 3.4585,
+      "step": 21300
+    },
+    {
+      "epoch": 6.217252023996738,
+      "grad_norm": 0.3528301417827606,
+      "learning_rate": 0.0005257027972027971,
+      "loss": 3.4687,
+      "step": 21350
+    },
+    {
+      "epoch": 6.231813151610461,
+      "grad_norm": 0.3371451497077942,
+      "learning_rate": 0.000525527972027972,
+      "loss": 3.4452,
+      "step": 21400
+    },
+    {
+      "epoch": 6.246374279224183,
+      "grad_norm": 0.3369438350200653,
+      "learning_rate": 0.0005253531468531468,
+      "loss": 3.4643,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2609354068379055,
+      "grad_norm": 0.3266349732875824,
+      "learning_rate": 0.0005251783216783216,
+      "loss": 3.4547,
+      "step": 21500
+    },
+    {
+      "epoch": 6.275496534451628,
+      "grad_norm": 0.32149258255958557,
+      "learning_rate": 0.0005250034965034965,
+      "loss": 3.4577,
+      "step": 21550
+    },
+    {
+      "epoch": 6.29005766206535,
+      "grad_norm": 0.31332188844680786,
+      "learning_rate": 0.0005248286713286712,
+      "loss": 3.4526,
+      "step": 21600
+    },
+    {
+      "epoch": 6.304618789679073,
+      "grad_norm": 0.34418395161628723,
+      "learning_rate": 0.0005246538461538461,
+      "loss": 3.4707,
+      "step": 21650
+    },
+    {
+      "epoch": 6.319179917292795,
+      "grad_norm": 0.3074207305908203,
+      "learning_rate": 0.0005244790209790209,
+      "loss": 3.4713,
+      "step": 21700
+    },
+    {
+      "epoch": 6.3337410449065175,
+      "grad_norm": 0.3123386800289154,
+      "learning_rate": 0.0005243041958041957,
+      "loss": 3.466,
+      "step": 21750
+    },
+    {
+      "epoch": 6.34830217252024,
+      "grad_norm": 0.32567399740219116,
+      "learning_rate": 0.0005241293706293705,
+      "loss": 3.4607,
+      "step": 21800
+    },
+    {
+      "epoch": 6.362863300133962,
+      "grad_norm": 0.3354513645172119,
+      "learning_rate": 0.0005239545454545454,
+      "loss": 3.4662,
+      "step": 21850
+    },
+    {
+      "epoch": 6.377424427747685,
+      "grad_norm": 0.32310134172439575,
+      "learning_rate": 0.0005237797202797202,
+      "loss": 3.473,
+      "step": 21900
+    },
+    {
+      "epoch": 6.391985555361408,
+      "grad_norm": 0.34556859731674194,
+      "learning_rate": 0.000523604895104895,
+      "loss": 3.4771,
+      "step": 21950
+    },
+    {
+      "epoch": 6.406546682975129,
+      "grad_norm": 0.3282891809940338,
+      "learning_rate": 0.0005234300699300698,
+      "loss": 3.4781,
+      "step": 22000
+    },
+    {
+      "epoch": 6.406546682975129,
+      "eval_accuracy": 0.36261039652728605,
+      "eval_loss": 3.612433910369873,
+      "eval_runtime": 179.6978,
+      "eval_samples_per_second": 92.605,
+      "eval_steps_per_second": 5.793,
+      "step": 22000
+    },
+    {
+      "epoch": 6.421107810588852,
+      "grad_norm": 0.3479907810688019,
+      "learning_rate": 0.0005232552447552447,
+      "loss": 3.4734,
+      "step": 22050
+    },
+    {
+      "epoch": 6.435668938202574,
+      "grad_norm": 0.36589229106903076,
+      "learning_rate": 0.0005230804195804195,
+      "loss": 3.4748,
+      "step": 22100
+    },
+    {
+      "epoch": 6.450230065816297,
+      "grad_norm": 0.33497488498687744,
+      "learning_rate": 0.0005229055944055943,
+      "loss": 3.4702,
+      "step": 22150
+    },
+    {
+      "epoch": 6.4647911934300195,
+      "grad_norm": 0.3153390884399414,
+      "learning_rate": 0.0005227307692307691,
+      "loss": 3.4821,
+      "step": 22200
+    },
+    {
+      "epoch": 6.479352321043741,
+      "grad_norm": 0.34096068143844604,
+      "learning_rate": 0.0005225559440559441,
+      "loss": 3.4717,
+      "step": 22250
+    },
+    {
+      "epoch": 6.493913448657464,
+      "grad_norm": 0.3129245340824127,
+      "learning_rate": 0.0005223811188811189,
+      "loss": 3.4865,
+      "step": 22300
+    },
+    {
+      "epoch": 6.508474576271187,
+      "grad_norm": 0.32267722487449646,
+      "learning_rate": 0.0005222062937062937,
+      "loss": 3.4701,
+      "step": 22350
+    },
+    {
+      "epoch": 6.523035703884909,
+      "grad_norm": 0.31626445055007935,
+      "learning_rate": 0.0005220314685314686,
+      "loss": 3.4743,
+      "step": 22400
+    },
+    {
+      "epoch": 6.5375968314986315,
+      "grad_norm": 0.3249252438545227,
+      "learning_rate": 0.0005218566433566433,
+      "loss": 3.4776,
+      "step": 22450
+    },
+    {
+      "epoch": 6.552157959112353,
+      "grad_norm": 0.3492892384529114,
+      "learning_rate": 0.0005216818181818182,
+      "loss": 3.4883,
+      "step": 22500
+    },
+    {
+      "epoch": 6.566719086726076,
+      "grad_norm": 0.3136710524559021,
+      "learning_rate": 0.000521506993006993,
+      "loss": 3.4739,
+      "step": 22550
+    },
+    {
+      "epoch": 6.581280214339799,
+      "grad_norm": 0.3537392020225525,
+      "learning_rate": 0.0005213321678321678,
+      "loss": 3.4942,
+      "step": 22600
+    },
+    {
+      "epoch": 6.595841341953521,
+      "grad_norm": 0.31021302938461304,
+      "learning_rate": 0.0005211573426573426,
+      "loss": 3.4792,
+      "step": 22650
+    },
+    {
+      "epoch": 6.610402469567243,
+      "grad_norm": 0.29873308539390564,
+      "learning_rate": 0.0005209825174825175,
+      "loss": 3.4877,
+      "step": 22700
+    },
+    {
+      "epoch": 6.624963597180965,
+      "grad_norm": 0.32095155119895935,
+      "learning_rate": 0.0005208076923076923,
+      "loss": 3.4752,
+      "step": 22750
+    },
+    {
+      "epoch": 6.639524724794688,
+      "grad_norm": 0.31786495447158813,
+      "learning_rate": 0.0005206328671328671,
+      "loss": 3.4854,
+      "step": 22800
+    },
+    {
+      "epoch": 6.654085852408411,
+      "grad_norm": 0.34261924028396606,
+      "learning_rate": 0.0005204580419580419,
+      "loss": 3.4853,
+      "step": 22850
+    },
+    {
+      "epoch": 6.668646980022133,
+      "grad_norm": 0.3314582407474518,
+      "learning_rate": 0.0005202832167832168,
+      "loss": 3.4919,
+      "step": 22900
+    },
+    {
+      "epoch": 6.683208107635855,
+      "grad_norm": 0.3397805094718933,
+      "learning_rate": 0.0005201083916083916,
+      "loss": 3.4908,
+      "step": 22950
+    },
+    {
+      "epoch": 6.697769235249577,
+      "grad_norm": 0.33193162083625793,
+      "learning_rate": 0.0005199335664335664,
+      "loss": 3.4873,
+      "step": 23000
+    },
+    {
+      "epoch": 6.697769235249577,
+      "eval_accuracy": 0.363417235928849,
+      "eval_loss": 3.6028225421905518,
+      "eval_runtime": 179.443,
+      "eval_samples_per_second": 92.737,
+      "eval_steps_per_second": 5.801,
+      "step": 23000
+    },
+    {
+      "epoch": 6.7123303628633,
+      "grad_norm": 0.30862730741500854,
+      "learning_rate": 0.0005197587412587413,
+      "loss": 3.4914,
+      "step": 23050
+    },
+    {
+      "epoch": 6.726891490477023,
+      "grad_norm": 0.3028578460216522,
+      "learning_rate": 0.0005195839160839161,
+      "loss": 3.4778,
+      "step": 23100
+    },
+    {
+      "epoch": 6.741452618090745,
+      "grad_norm": 0.3111686706542969,
+      "learning_rate": 0.0005194090909090909,
+      "loss": 3.4941,
+      "step": 23150
+    },
+    {
+      "epoch": 6.756013745704467,
+      "grad_norm": 0.3356785476207733,
+      "learning_rate": 0.0005192342657342657,
+      "loss": 3.4939,
+      "step": 23200
+    },
+    {
+      "epoch": 6.77057487331819,
+      "grad_norm": 0.3174416720867157,
+      "learning_rate": 0.0005190594405594405,
+      "loss": 3.4902,
+      "step": 23250
+    },
+    {
+      "epoch": 6.785136000931912,
+      "grad_norm": 0.33599984645843506,
+      "learning_rate": 0.0005188846153846153,
+      "loss": 3.4959,
+      "step": 23300
+    },
+    {
+      "epoch": 6.799697128545635,
+      "grad_norm": 0.3432798385620117,
+      "learning_rate": 0.0005187097902097902,
+      "loss": 3.4871,
+      "step": 23350
+    },
+    {
+      "epoch": 6.814258256159357,
+      "grad_norm": 0.3143886625766754,
+      "learning_rate": 0.000518534965034965,
+      "loss": 3.5018,
+      "step": 23400
+    },
+    {
+      "epoch": 6.828819383773079,
+      "grad_norm": 0.3041757047176361,
+      "learning_rate": 0.0005183601398601398,
+      "loss": 3.4996,
+      "step": 23450
+    },
+    {
+      "epoch": 6.843380511386802,
+      "grad_norm": 0.3227998912334442,
+      "learning_rate": 0.0005181853146853146,
+      "loss": 3.488,
+      "step": 23500
+    },
+    {
+      "epoch": 6.857941639000524,
+      "grad_norm": 0.3264809846878052,
+      "learning_rate": 0.0005180104895104895,
+      "loss": 3.5016,
+      "step": 23550
+    },
+    {
+      "epoch": 6.872502766614247,
+      "grad_norm": 0.30348044633865356,
+      "learning_rate": 0.0005178356643356643,
+      "loss": 3.4928,
+      "step": 23600
+    },
+    {
+      "epoch": 6.887063894227969,
+      "grad_norm": 0.30934104323387146,
+      "learning_rate": 0.0005176608391608391,
+      "loss": 3.481,
+      "step": 23650
+    },
+    {
+      "epoch": 6.901625021841691,
+      "grad_norm": 0.3080596327781677,
+      "learning_rate": 0.000517486013986014,
+      "loss": 3.4931,
+      "step": 23700
+    },
+    {
+      "epoch": 6.916186149455414,
+      "grad_norm": 0.3390050530433655,
+      "learning_rate": 0.0005173111888111888,
+      "loss": 3.4935,
+      "step": 23750
+    },
+    {
+      "epoch": 6.930747277069136,
+      "grad_norm": 0.33725789189338684,
+      "learning_rate": 0.0005171363636363636,
+      "loss": 3.4894,
+      "step": 23800
+    },
+    {
+      "epoch": 6.945308404682859,
+      "grad_norm": 0.3304024934768677,
+      "learning_rate": 0.0005169615384615384,
+      "loss": 3.4811,
+      "step": 23850
+    },
+    {
+      "epoch": 6.959869532296581,
+      "grad_norm": 0.3428155481815338,
+      "learning_rate": 0.0005167867132867133,
+      "loss": 3.4964,
+      "step": 23900
+    },
+    {
+      "epoch": 6.974430659910303,
+      "grad_norm": 0.34181949496269226,
+      "learning_rate": 0.000516611888111888,
+      "loss": 3.4981,
+      "step": 23950
+    },
+    {
+      "epoch": 6.988991787524026,
+      "grad_norm": 0.3134143352508545,
+      "learning_rate": 0.0005164370629370629,
+      "loss": 3.4938,
+      "step": 24000
+    },
+    {
+      "epoch": 6.988991787524026,
+      "eval_accuracy": 0.3644916106224329,
+      "eval_loss": 3.5912303924560547,
+      "eval_runtime": 179.7006,
+      "eval_samples_per_second": 92.604,
+      "eval_steps_per_second": 5.793,
+      "step": 24000
+    },
+    {
+      "epoch": 7.003494670627293,
+      "grad_norm": 0.3197336792945862,
+      "learning_rate": 0.0005162622377622377,
+      "loss": 3.4804,
+      "step": 24050
+    },
+    {
+      "epoch": 7.018055798241016,
+      "grad_norm": 0.34031471610069275,
+      "learning_rate": 0.0005160874125874125,
+      "loss": 3.389,
+      "step": 24100
+    },
+    {
+      "epoch": 7.032616925854738,
+      "grad_norm": 0.31004777550697327,
+      "learning_rate": 0.0005159125874125873,
+      "loss": 3.3799,
+      "step": 24150
+    },
+    {
+      "epoch": 7.0471780534684605,
+      "grad_norm": 0.343368798494339,
+      "learning_rate": 0.0005157377622377622,
+      "loss": 3.3855,
+      "step": 24200
+    },
+    {
+      "epoch": 7.061739181082183,
+      "grad_norm": 0.3274615406990051,
+      "learning_rate": 0.000515562937062937,
+      "loss": 3.3952,
+      "step": 24250
+    },
+    {
+      "epoch": 7.076300308695905,
+      "grad_norm": 0.32230237126350403,
+      "learning_rate": 0.0005153881118881118,
+      "loss": 3.3911,
+      "step": 24300
+    },
+    {
+      "epoch": 7.090861436309628,
+      "grad_norm": 0.32454419136047363,
+      "learning_rate": 0.0005152132867132867,
+      "loss": 3.4076,
+      "step": 24350
+    },
+    {
+      "epoch": 7.105422563923351,
+      "grad_norm": 0.33376428484916687,
+      "learning_rate": 0.0005150384615384615,
+      "loss": 3.4129,
+      "step": 24400
+    },
+    {
+      "epoch": 7.1199836915370724,
+      "grad_norm": 0.3614583909511566,
+      "learning_rate": 0.0005148636363636363,
+      "loss": 3.4057,
+      "step": 24450
+    },
+    {
+      "epoch": 7.134544819150795,
+      "grad_norm": 0.34300723671913147,
+      "learning_rate": 0.0005146888111888111,
+      "loss": 3.405,
+      "step": 24500
+    },
+    {
+      "epoch": 7.149105946764517,
+      "grad_norm": 0.343777596950531,
+      "learning_rate": 0.000514513986013986,
+      "loss": 3.4118,
+      "step": 24550
+    },
+    {
+      "epoch": 7.16366707437824,
+      "grad_norm": 0.3267875611782074,
+      "learning_rate": 0.0005143391608391608,
+      "loss": 3.4238,
+      "step": 24600
+    },
+    {
+      "epoch": 7.1782282019919625,
+      "grad_norm": 0.37025755643844604,
+      "learning_rate": 0.0005141643356643356,
+      "loss": 3.4162,
+      "step": 24650
+    },
+    {
+      "epoch": 7.192789329605684,
+      "grad_norm": 0.3431757390499115,
+      "learning_rate": 0.0005139895104895104,
+      "loss": 3.4131,
+      "step": 24700
+    },
+    {
+      "epoch": 7.207350457219407,
+      "grad_norm": 0.36377280950546265,
+      "learning_rate": 0.0005138146853146852,
+      "loss": 3.4085,
+      "step": 24750
+    },
+    {
+      "epoch": 7.22191158483313,
+      "grad_norm": 0.3307691216468811,
+      "learning_rate": 0.00051363986013986,
+      "loss": 3.4203,
+      "step": 24800
+    },
+    {
+      "epoch": 7.236472712446852,
+      "grad_norm": 0.35206225514411926,
+      "learning_rate": 0.0005134650349650349,
+      "loss": 3.4279,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2510338400605745,
+      "grad_norm": 0.336747944355011,
+      "learning_rate": 0.0005132902097902097,
+      "loss": 3.4326,
+      "step": 24900
+    },
+    {
+      "epoch": 7.265594967674296,
+      "grad_norm": 0.3248525857925415,
+      "learning_rate": 0.0005131153846153845,
+      "loss": 3.4251,
+      "step": 24950
+    },
+    {
+      "epoch": 7.280156095288019,
+      "grad_norm": 0.3397109806537628,
+      "learning_rate": 0.0005129405594405594,
+      "loss": 3.4288,
+      "step": 25000
+    },
+    {
+      "epoch": 7.280156095288019,
+      "eval_accuracy": 0.3642418326179263,
+      "eval_loss": 3.5990543365478516,
+      "eval_runtime": 179.5973,
+      "eval_samples_per_second": 92.657,
+      "eval_steps_per_second": 5.796,
+      "step": 25000
+    },
+    {
+      "epoch": 7.294717222901742,
+      "grad_norm": 0.3284904956817627,
+      "learning_rate": 0.0005127657342657342,
+      "loss": 3.4113,
+      "step": 25050
+    },
+    {
+      "epoch": 7.309278350515464,
+      "grad_norm": 0.33549222350120544,
+      "learning_rate": 0.000512590909090909,
+      "loss": 3.4227,
+      "step": 25100
+    },
+    {
+      "epoch": 7.3238394781291865,
+      "grad_norm": 0.3276277780532837,
+      "learning_rate": 0.0005124160839160838,
+      "loss": 3.4321,
+      "step": 25150
+    },
+    {
+      "epoch": 7.338400605742908,
+      "grad_norm": 0.3157896101474762,
+      "learning_rate": 0.0005122412587412588,
+      "loss": 3.4382,
+      "step": 25200
+    },
+    {
+      "epoch": 7.352961733356631,
+      "grad_norm": 0.3331218957901001,
+      "learning_rate": 0.0005120664335664336,
+      "loss": 3.4405,
+      "step": 25250
+    },
+    {
+      "epoch": 7.367522860970354,
+      "grad_norm": 0.321426659822464,
+      "learning_rate": 0.0005118916083916084,
+      "loss": 3.4389,
+      "step": 25300
+    },
+    {
+      "epoch": 7.382083988584076,
+      "grad_norm": 0.3326101005077362,
+      "learning_rate": 0.0005117167832167832,
+      "loss": 3.4309,
+      "step": 25350
+    },
+    {
+      "epoch": 7.396645116197798,
+      "grad_norm": 0.3259352445602417,
+      "learning_rate": 0.0005115419580419581,
+      "loss": 3.4294,
+      "step": 25400
+    },
+    {
+      "epoch": 7.411206243811521,
+      "grad_norm": 0.32202550768852234,
+      "learning_rate": 0.0005113671328671328,
+      "loss": 3.439,
+      "step": 25450
+    },
+    {
+      "epoch": 7.425767371425243,
+      "grad_norm": 0.32014110684394836,
+      "learning_rate": 0.0005111923076923077,
+      "loss": 3.4449,
+      "step": 25500
+    },
+    {
+      "epoch": 7.440328499038966,
+      "grad_norm": 0.34263157844543457,
+      "learning_rate": 0.0005110174825174825,
+      "loss": 3.4429,
+      "step": 25550
+    },
+    {
+      "epoch": 7.454889626652688,
+      "grad_norm": 0.31470826268196106,
+      "learning_rate": 0.0005108426573426573,
+      "loss": 3.4333,
+      "step": 25600
+    },
+    {
+      "epoch": 7.46945075426641,
+      "grad_norm": 0.3453708589076996,
+      "learning_rate": 0.0005106678321678321,
+      "loss": 3.4381,
+      "step": 25650
+    },
+    {
+      "epoch": 7.484011881880133,
+      "grad_norm": 0.3306244909763336,
+      "learning_rate": 0.000510493006993007,
+      "loss": 3.4464,
+      "step": 25700
+    },
+    {
+      "epoch": 7.498573009493855,
+      "grad_norm": 0.3169175088405609,
+      "learning_rate": 0.0005103181818181818,
+      "loss": 3.4453,
+      "step": 25750
+    },
+    {
+      "epoch": 7.513134137107578,
+      "grad_norm": 0.317241370677948,
+      "learning_rate": 0.0005101433566433566,
+      "loss": 3.4517,
+      "step": 25800
+    },
+    {
+      "epoch": 7.5276952647213005,
+      "grad_norm": 0.3119760751724243,
+      "learning_rate": 0.0005099685314685315,
+      "loss": 3.4369,
+      "step": 25850
+    },
+    {
+      "epoch": 7.542256392335022,
+      "grad_norm": 0.3233243227005005,
+      "learning_rate": 0.0005097937062937063,
+      "loss": 3.4422,
+      "step": 25900
+    },
+    {
+      "epoch": 7.556817519948745,
+      "grad_norm": 0.3211483955383301,
+      "learning_rate": 0.0005096188811188811,
+      "loss": 3.4554,
+      "step": 25950
+    },
+    {
+      "epoch": 7.571378647562467,
+      "grad_norm": 0.3340722918510437,
+      "learning_rate": 0.0005094440559440559,
+      "loss": 3.4497,
+      "step": 26000
+    },
+    {
+      "epoch": 7.571378647562467,
+      "eval_accuracy": 0.3650712508221565,
+      "eval_loss": 3.5916457176208496,
+      "eval_runtime": 179.7062,
+      "eval_samples_per_second": 92.601,
+      "eval_steps_per_second": 5.793,
+      "step": 26000
+    },
+    {
+      "epoch": 7.58593977517619,
+      "grad_norm": 0.322214812040329,
+      "learning_rate": 0.0005092692307692308,
+      "loss": 3.4458,
+      "step": 26050
+    },
+    {
+      "epoch": 7.600500902789912,
+      "grad_norm": 0.35813722014427185,
+      "learning_rate": 0.0005090944055944056,
+      "loss": 3.4463,
+      "step": 26100
+    },
+    {
+      "epoch": 7.615062030403634,
+      "grad_norm": 0.3351564407348633,
+      "learning_rate": 0.0005089195804195804,
+      "loss": 3.453,
+      "step": 26150
+    },
+    {
+      "epoch": 7.629623158017357,
+      "grad_norm": 0.335275262594223,
+      "learning_rate": 0.0005087447552447552,
+      "loss": 3.441,
+      "step": 26200
+    },
+    {
+      "epoch": 7.644184285631079,
+      "grad_norm": 0.3310592770576477,
+      "learning_rate": 0.00050856993006993,
+      "loss": 3.4408,
+      "step": 26250
+    },
+    {
+      "epoch": 7.658745413244802,
+      "grad_norm": 0.33791449666023254,
+      "learning_rate": 0.0005083951048951048,
+      "loss": 3.4483,
+      "step": 26300
+    },
+    {
+      "epoch": 7.673306540858524,
+      "grad_norm": 0.3487813174724579,
+      "learning_rate": 0.0005082202797202797,
+      "loss": 3.4584,
+      "step": 26350
+    },
+    {
+      "epoch": 7.687867668472246,
+      "grad_norm": 0.3040063679218292,
+      "learning_rate": 0.0005080454545454545,
+      "loss": 3.4539,
+      "step": 26400
+    },
+    {
+      "epoch": 7.702428796085969,
+      "grad_norm": 0.31800422072410583,
+      "learning_rate": 0.0005078706293706293,
+      "loss": 3.4514,
+      "step": 26450
+    },
+    {
+      "epoch": 7.716989923699691,
+      "grad_norm": 0.3236735165119171,
+      "learning_rate": 0.0005076958041958042,
+      "loss": 3.4515,
+      "step": 26500
+    },
+    {
+      "epoch": 7.731551051313414,
+      "grad_norm": 0.31341543793678284,
+      "learning_rate": 0.000507520979020979,
+      "loss": 3.4679,
+      "step": 26550
+    },
+    {
+      "epoch": 7.746112178927136,
+      "grad_norm": 0.32171645760536194,
+      "learning_rate": 0.0005073461538461538,
+      "loss": 3.4554,
+      "step": 26600
+    },
+    {
+      "epoch": 7.760673306540858,
+      "grad_norm": 0.31651511788368225,
+      "learning_rate": 0.0005071713286713286,
+      "loss": 3.4561,
+      "step": 26650
+    },
+    {
+      "epoch": 7.775234434154581,
+      "grad_norm": 0.3263193964958191,
+      "learning_rate": 0.0005069965034965035,
+      "loss": 3.4446,
+      "step": 26700
+    },
+    {
+      "epoch": 7.789795561768304,
+      "grad_norm": 0.3318195939064026,
+      "learning_rate": 0.0005068216783216783,
+      "loss": 3.4513,
+      "step": 26750
+    },
+    {
+      "epoch": 7.8043566893820255,
+      "grad_norm": 0.3138175308704376,
+      "learning_rate": 0.0005066468531468531,
+      "loss": 3.4507,
+      "step": 26800
+    },
+    {
+      "epoch": 7.818917816995748,
+      "grad_norm": 0.344470351934433,
+      "learning_rate": 0.0005064720279720279,
+      "loss": 3.4695,
+      "step": 26850
+    },
+    {
+      "epoch": 7.833478944609471,
+      "grad_norm": 0.32541370391845703,
+      "learning_rate": 0.0005062972027972028,
+      "loss": 3.4625,
+      "step": 26900
+    },
+    {
+      "epoch": 7.848040072223193,
+      "grad_norm": 0.33207646012306213,
+      "learning_rate": 0.0005061223776223775,
+      "loss": 3.4471,
+      "step": 26950
+    },
+    {
+      "epoch": 7.862601199836916,
+      "grad_norm": 0.3322581350803375,
+      "learning_rate": 0.0005059475524475524,
+      "loss": 3.4642,
+      "step": 27000
+    },
+    {
+      "epoch": 7.862601199836916,
+      "eval_accuracy": 0.3658501019162465,
+      "eval_loss": 3.5833911895751953,
+      "eval_runtime": 179.4681,
+      "eval_samples_per_second": 92.724,
+      "eval_steps_per_second": 5.8,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8771623274506375,
+      "grad_norm": 0.3180125653743744,
+      "learning_rate": 0.0005057727272727272,
+      "loss": 3.4548,
+      "step": 27050
+    },
+    {
+      "epoch": 7.89172345506436,
+      "grad_norm": 0.32920128107070923,
+      "learning_rate": 0.000505597902097902,
+      "loss": 3.4555,
+      "step": 27100
+    },
+    {
+      "epoch": 7.906284582678083,
+      "grad_norm": 0.345284640789032,
+      "learning_rate": 0.0005054230769230769,
+      "loss": 3.4548,
+      "step": 27150
+    },
+    {
+      "epoch": 7.920845710291805,
+      "grad_norm": 0.29907023906707764,
+      "learning_rate": 0.0005052482517482517,
+      "loss": 3.4601,
+      "step": 27200
+    },
+    {
+      "epoch": 7.935406837905528,
+      "grad_norm": 0.32699817419052124,
+      "learning_rate": 0.0005050734265734265,
+      "loss": 3.4745,
+      "step": 27250
+    },
+    {
+      "epoch": 7.9499679655192494,
+      "grad_norm": 0.3353591859340668,
+      "learning_rate": 0.0005048986013986013,
+      "loss": 3.4594,
+      "step": 27300
+    },
+    {
+      "epoch": 7.964529093132972,
+      "grad_norm": 0.31299489736557007,
+      "learning_rate": 0.0005047237762237762,
+      "loss": 3.4708,
+      "step": 27350
+    },
+    {
+      "epoch": 7.979090220746695,
+      "grad_norm": 0.318194180727005,
+      "learning_rate": 0.000504548951048951,
+      "loss": 3.4673,
+      "step": 27400
+    },
+    {
+      "epoch": 7.993651348360417,
+      "grad_norm": 0.3500754237174988,
+      "learning_rate": 0.0005043741258741258,
+      "loss": 3.4694,
+      "step": 27450
+    },
+    {
+      "epoch": 8.008154231463685,
+      "grad_norm": 0.32221224904060364,
+      "learning_rate": 0.0005041993006993006,
+      "loss": 3.4022,
+      "step": 27500
+    },
+    {
+      "epoch": 8.022715359077408,
+      "grad_norm": 0.3506814241409302,
+      "learning_rate": 0.0005040244755244755,
+      "loss": 3.3639,
+      "step": 27550
+    },
+    {
+      "epoch": 8.037276486691129,
+      "grad_norm": 0.3434743285179138,
+      "learning_rate": 0.0005038496503496503,
+      "loss": 3.3689,
+      "step": 27600
+    },
+    {
+      "epoch": 8.051837614304851,
+      "grad_norm": 0.36473479866981506,
+      "learning_rate": 0.0005036748251748251,
+      "loss": 3.3577,
+      "step": 27650
+    },
+    {
+      "epoch": 8.066398741918574,
+      "grad_norm": 0.3590794801712036,
+      "learning_rate": 0.0005034999999999999,
+      "loss": 3.3581,
+      "step": 27700
+    },
+    {
+      "epoch": 8.080959869532297,
+      "grad_norm": 0.34557044506073,
+      "learning_rate": 0.0005033251748251747,
+      "loss": 3.3431,
+      "step": 27750
+    },
+    {
+      "epoch": 8.09552099714602,
+      "grad_norm": 0.3259578049182892,
+      "learning_rate": 0.0005031503496503496,
+      "loss": 3.3671,
+      "step": 27800
+    },
+    {
+      "epoch": 8.11008212475974,
+      "grad_norm": 0.33602702617645264,
+      "learning_rate": 0.0005029755244755244,
+      "loss": 3.3677,
+      "step": 27850
+    },
+    {
+      "epoch": 8.124643252373463,
+      "grad_norm": 0.3089875280857086,
+      "learning_rate": 0.0005028006993006992,
+      "loss": 3.3709,
+      "step": 27900
+    },
+    {
+      "epoch": 8.139204379987186,
+      "grad_norm": 0.32774367928504944,
+      "learning_rate": 0.000502625874125874,
+      "loss": 3.3677,
+      "step": 27950
+    },
+    {
+      "epoch": 8.153765507600909,
+      "grad_norm": 0.34521356225013733,
+      "learning_rate": 0.000502451048951049,
+      "loss": 3.3812,
+      "step": 28000
+    },
+    {
+      "epoch": 8.153765507600909,
+      "eval_accuracy": 0.365528353978238,
+      "eval_loss": 3.5927321910858154,
+      "eval_runtime": 179.4954,
+      "eval_samples_per_second": 92.71,
+      "eval_steps_per_second": 5.8,
+      "step": 28000
+    },
+    {
+      "epoch": 8.168326635214632,
+      "grad_norm": 0.3378417491912842,
+      "learning_rate": 0.0005022762237762237,
+      "loss": 3.3911,
+      "step": 28050
+    },
+    {
+      "epoch": 8.182887762828354,
+      "grad_norm": 0.3383776843547821,
+      "learning_rate": 0.0005021013986013985,
+      "loss": 3.3869,
+      "step": 28100
+    },
+    {
+      "epoch": 8.197448890442075,
+      "grad_norm": 0.36928725242614746,
+      "learning_rate": 0.0005019265734265733,
+      "loss": 3.3919,
+      "step": 28150
+    },
+    {
+      "epoch": 8.212010018055798,
+      "grad_norm": 0.3346560597419739,
+      "learning_rate": 0.0005017517482517483,
+      "loss": 3.3872,
+      "step": 28200
+    },
+    {
+      "epoch": 8.22657114566952,
+      "grad_norm": 0.33889731764793396,
+      "learning_rate": 0.0005015769230769231,
+      "loss": 3.3853,
+      "step": 28250
+    },
+    {
+      "epoch": 8.241132273283243,
+      "grad_norm": 0.3344823122024536,
+      "learning_rate": 0.0005014020979020979,
+      "loss": 3.3937,
+      "step": 28300
+    },
+    {
+      "epoch": 8.255693400896966,
+      "grad_norm": 0.3183842599391937,
+      "learning_rate": 0.0005012272727272727,
+      "loss": 3.3917,
+      "step": 28350
+    },
+    {
+      "epoch": 8.270254528510687,
+      "grad_norm": 0.3541378974914551,
+      "learning_rate": 0.0005010524475524476,
+      "loss": 3.4027,
+      "step": 28400
+    },
+    {
+      "epoch": 8.28481565612441,
+      "grad_norm": 0.34245550632476807,
+      "learning_rate": 0.0005008776223776223,
+      "loss": 3.4131,
+      "step": 28450
+    },
+    {
+      "epoch": 8.299376783738133,
+      "grad_norm": 0.3223508298397064,
+      "learning_rate": 0.0005007027972027972,
+      "loss": 3.399,
+      "step": 28500
+    },
+    {
+      "epoch": 8.313937911351855,
+      "grad_norm": 0.3582371771335602,
+      "learning_rate": 0.000500527972027972,
+      "loss": 3.4116,
+      "step": 28550
+    },
+    {
+      "epoch": 8.328499038965578,
+      "grad_norm": 0.3282024562358856,
+      "learning_rate": 0.0005003531468531468,
+      "loss": 3.3914,
+      "step": 28600
+    },
+    {
+      "epoch": 8.3430601665793,
+      "grad_norm": 0.32470643520355225,
+      "learning_rate": 0.0005001783216783217,
+      "loss": 3.4074,
+      "step": 28650
+    },
+    {
+      "epoch": 8.357621294193022,
+      "grad_norm": 0.32355979084968567,
+      "learning_rate": 0.0005000034965034965,
+      "loss": 3.4055,
+      "step": 28700
+    },
+    {
+      "epoch": 8.372182421806745,
+      "grad_norm": 0.3411882519721985,
+      "learning_rate": 0.0004998286713286713,
+      "loss": 3.4086,
+      "step": 28750
+    },
+    {
+      "epoch": 8.386743549420467,
+      "grad_norm": 0.3322669267654419,
+      "learning_rate": 0.0004996538461538461,
+      "loss": 3.4094,
+      "step": 28800
+    },
+    {
+      "epoch": 8.40130467703419,
+      "grad_norm": 0.32565972208976746,
+      "learning_rate": 0.000499479020979021,
+      "loss": 3.4171,
+      "step": 28850
+    },
+    {
+      "epoch": 8.415865804647911,
+      "grad_norm": 0.325556218624115,
+      "learning_rate": 0.0004993041958041958,
+      "loss": 3.4111,
+      "step": 28900
+    },
+    {
+      "epoch": 8.430426932261634,
+      "grad_norm": 0.34173521399497986,
+      "learning_rate": 0.0004991293706293706,
+      "loss": 3.4118,
+      "step": 28950
+    },
+    {
+      "epoch": 8.444988059875357,
+      "grad_norm": 0.32827889919281006,
+      "learning_rate": 0.0004989545454545454,
+      "loss": 3.4174,
+      "step": 29000
+    },
+    {
+      "epoch": 8.444988059875357,
+      "eval_accuracy": 0.3661121101055312,
+      "eval_loss": 3.5833680629730225,
+      "eval_runtime": 179.5213,
+      "eval_samples_per_second": 92.697,
+      "eval_steps_per_second": 5.799,
+      "step": 29000
+    },
+    {
+      "epoch": 8.45954918748908,
+      "grad_norm": 0.3157867193222046,
+      "learning_rate": 0.0004987797202797203,
+      "loss": 3.4037,
+      "step": 29050
+    },
+    {
+      "epoch": 8.474110315102802,
+      "grad_norm": 0.3219262659549713,
+      "learning_rate": 0.0004986048951048951,
+      "loss": 3.4059,
+      "step": 29100
+    },
+    {
+      "epoch": 8.488671442716523,
+      "grad_norm": 0.3398604094982147,
+      "learning_rate": 0.0004984300699300699,
+      "loss": 3.4149,
+      "step": 29150
+    },
+    {
+      "epoch": 8.503232570330246,
+      "grad_norm": 0.31037428975105286,
+      "learning_rate": 0.0004982552447552448,
+      "loss": 3.4154,
+      "step": 29200
+    },
+    {
+      "epoch": 8.517793697943969,
+      "grad_norm": 0.3249618113040924,
+      "learning_rate": 0.0004980804195804195,
+      "loss": 3.4058,
+      "step": 29250
+    },
+    {
+      "epoch": 8.532354825557691,
+      "grad_norm": 0.3502727746963501,
+      "learning_rate": 0.0004979055944055944,
+      "loss": 3.423,
+      "step": 29300
+    },
+    {
+      "epoch": 8.546915953171414,
+      "grad_norm": 0.3321726322174072,
+      "learning_rate": 0.0004977307692307692,
+      "loss": 3.4139,
+      "step": 29350
+    },
+    {
+      "epoch": 8.561477080785137,
+      "grad_norm": 0.32756468653678894,
+      "learning_rate": 0.000497555944055944,
+      "loss": 3.4144,
+      "step": 29400
+    },
+    {
+      "epoch": 8.576038208398858,
+      "grad_norm": 0.3343152105808258,
+      "learning_rate": 0.0004973811188811188,
+      "loss": 3.4128,
+      "step": 29450
+    },
+    {
+      "epoch": 8.59059933601258,
+      "grad_norm": 0.3429624140262604,
+      "learning_rate": 0.0004972062937062937,
+      "loss": 3.4153,
+      "step": 29500
+    },
+    {
+      "epoch": 8.605160463626303,
+      "grad_norm": 0.3241840898990631,
+      "learning_rate": 0.0004970314685314685,
+      "loss": 3.4261,
+      "step": 29550
+    },
+    {
+      "epoch": 8.619721591240026,
+      "grad_norm": 0.36211371421813965,
+      "learning_rate": 0.0004968566433566433,
+      "loss": 3.4077,
+      "step": 29600
+    },
+    {
+      "epoch": 8.634282718853749,
+      "grad_norm": 0.35850051045417786,
+      "learning_rate": 0.0004966818181818181,
+      "loss": 3.4132,
+      "step": 29650
+    },
+    {
+      "epoch": 8.64884384646747,
+      "grad_norm": 0.32470637559890747,
+      "learning_rate": 0.000496506993006993,
+      "loss": 3.4301,
+      "step": 29700
+    },
+    {
+      "epoch": 8.663404974081192,
+      "grad_norm": 0.3128025531768799,
+      "learning_rate": 0.0004963321678321678,
+      "loss": 3.4233,
+      "step": 29750
+    },
+    {
+      "epoch": 8.677966101694915,
+      "grad_norm": 0.3735058605670929,
+      "learning_rate": 0.0004961573426573426,
+      "loss": 3.4253,
+      "step": 29800
+    },
+    {
+      "epoch": 8.692527229308638,
+      "grad_norm": 0.3122337758541107,
+      "learning_rate": 0.0004959825174825175,
+      "loss": 3.4234,
+      "step": 29850
+    },
+    {
+      "epoch": 8.70708835692236,
+      "grad_norm": 0.3589786887168884,
+      "learning_rate": 0.0004958076923076923,
+      "loss": 3.4175,
+      "step": 29900
+    },
+    {
+      "epoch": 8.721649484536082,
+      "grad_norm": 0.33352115750312805,
+      "learning_rate": 0.0004956328671328671,
+      "loss": 3.4166,
+      "step": 29950
+    },
+    {
+      "epoch": 8.736210612149804,
+      "grad_norm": 0.3245551288127899,
+      "learning_rate": 0.0004954580419580419,
+      "loss": 3.4296,
+      "step": 30000
+    },
+    {
+      "epoch": 8.736210612149804,
+      "eval_accuracy": 0.367026904407347,
+      "eval_loss": 3.5770156383514404,
+      "eval_runtime": 179.9405,
+      "eval_samples_per_second": 92.481,
+      "eval_steps_per_second": 5.785,
+      "step": 30000
+    },
+    {
+      "epoch": 8.750771739763527,
+      "grad_norm": 0.3421856164932251,
+      "learning_rate": 0.0004952832167832167,
+      "loss": 3.4276,
+      "step": 30050
+    },
+    {
+      "epoch": 8.76533286737725,
+      "grad_norm": 0.3369308114051819,
+      "learning_rate": 0.0004951083916083915,
+      "loss": 3.4324,
+      "step": 30100
+    },
+    {
+      "epoch": 8.779893994990973,
+      "grad_norm": 0.3228960633277893,
+      "learning_rate": 0.0004949335664335664,
+      "loss": 3.4244,
+      "step": 30150
+    },
+    {
+      "epoch": 8.794455122604695,
+      "grad_norm": 0.3328655958175659,
+      "learning_rate": 0.0004947587412587412,
+      "loss": 3.4292,
+      "step": 30200
+    },
+    {
+      "epoch": 8.809016250218416,
+      "grad_norm": 0.33146989345550537,
+      "learning_rate": 0.000494583916083916,
+      "loss": 3.4347,
+      "step": 30250
+    },
+    {
+      "epoch": 8.82357737783214,
+      "grad_norm": 0.3366376459598541,
+      "learning_rate": 0.0004944090909090908,
+      "loss": 3.4403,
+      "step": 30300
+    },
+    {
+      "epoch": 8.838138505445862,
+      "grad_norm": 0.3395143151283264,
+      "learning_rate": 0.0004942342657342657,
+      "loss": 3.4291,
+      "step": 30350
+    },
+    {
+      "epoch": 8.852699633059585,
+      "grad_norm": 0.3265385627746582,
+      "learning_rate": 0.0004940594405594405,
+      "loss": 3.4229,
+      "step": 30400
+    },
+    {
+      "epoch": 8.867260760673307,
+      "grad_norm": 0.3156973123550415,
+      "learning_rate": 0.0004938846153846153,
+      "loss": 3.4274,
+      "step": 30450
+    },
+    {
+      "epoch": 8.881821888287028,
+      "grad_norm": 0.3365817964076996,
+      "learning_rate": 0.0004937097902097901,
+      "loss": 3.4404,
+      "step": 30500
+    },
+    {
+      "epoch": 8.896383015900751,
+      "grad_norm": 0.33978214859962463,
+      "learning_rate": 0.000493534965034965,
+      "loss": 3.4348,
+      "step": 30550
+    },
+    {
+      "epoch": 8.910944143514474,
+      "grad_norm": 0.3471260666847229,
+      "learning_rate": 0.0004933601398601398,
+      "loss": 3.4458,
+      "step": 30600
+    },
+    {
+      "epoch": 8.925505271128197,
+      "grad_norm": 0.33476272225379944,
+      "learning_rate": 0.0004931853146853146,
+      "loss": 3.4331,
+      "step": 30650
+    },
+    {
+      "epoch": 8.94006639874192,
+      "grad_norm": 0.31054040789604187,
+      "learning_rate": 0.0004930104895104895,
+      "loss": 3.4318,
+      "step": 30700
+    },
+    {
+      "epoch": 8.95462752635564,
+      "grad_norm": 0.3388405740261078,
+      "learning_rate": 0.0004928356643356642,
+      "loss": 3.4365,
+      "step": 30750
+    },
+    {
+      "epoch": 8.969188653969363,
+      "grad_norm": 0.3688811659812927,
+      "learning_rate": 0.0004926608391608391,
+      "loss": 3.4345,
+      "step": 30800
+    },
+    {
+      "epoch": 8.983749781583086,
+      "grad_norm": 0.32696250081062317,
+      "learning_rate": 0.0004924860139860139,
+      "loss": 3.4382,
+      "step": 30850
+    },
+    {
+      "epoch": 8.998310909196809,
+      "grad_norm": 0.36053240299224854,
+      "learning_rate": 0.0004923111888111887,
+      "loss": 3.435,
+      "step": 30900
+    },
+    {
+      "epoch": 9.012813792300076,
+      "grad_norm": 0.3175714612007141,
+      "learning_rate": 0.0004921363636363635,
+      "loss": 3.3325,
+      "step": 30950
+    },
+    {
+      "epoch": 9.027374919913798,
+      "grad_norm": 0.3416489362716675,
+      "learning_rate": 0.0004919615384615384,
+      "loss": 3.3249,
+      "step": 31000
+    },
+    {
+      "epoch": 9.027374919913798,
+      "eval_accuracy": 0.36684850834668953,
+      "eval_loss": 3.580686330795288,
+      "eval_runtime": 179.5407,
+      "eval_samples_per_second": 92.686,
+      "eval_steps_per_second": 5.798,
+      "step": 31000
+    },
+    {
+      "epoch": 9.041936047527521,
+      "grad_norm": 0.3503015339374542,
+      "learning_rate": 0.0004917867132867132,
+      "loss": 3.3338,
+      "step": 31050
+    },
+    {
+      "epoch": 9.056497175141242,
+      "grad_norm": 0.3463656008243561,
+      "learning_rate": 0.000491611888111888,
+      "loss": 3.3427,
+      "step": 31100
+    },
+    {
+      "epoch": 9.071058302754965,
+      "grad_norm": 0.3392924964427948,
+      "learning_rate": 0.0004914370629370628,
+      "loss": 3.3455,
+      "step": 31150
+    },
+    {
+      "epoch": 9.085619430368688,
+      "grad_norm": 0.3339923620223999,
+      "learning_rate": 0.0004912622377622378,
+      "loss": 3.3401,
+      "step": 31200
+    },
+    {
+      "epoch": 9.10018055798241,
+      "grad_norm": 0.33536580204963684,
+      "learning_rate": 0.0004910874125874126,
+      "loss": 3.3443,
+      "step": 31250
+    },
+    {
+      "epoch": 9.114741685596133,
+      "grad_norm": 0.325057715177536,
+      "learning_rate": 0.0004909125874125874,
+      "loss": 3.3497,
+      "step": 31300
+    },
+    {
+      "epoch": 9.129302813209854,
+      "grad_norm": 0.3440364897251129,
+      "learning_rate": 0.0004907377622377623,
+      "loss": 3.3465,
+      "step": 31350
+    },
+    {
+      "epoch": 9.143863940823577,
+      "grad_norm": 0.34977987408638,
+      "learning_rate": 0.0004905629370629371,
+      "loss": 3.3461,
+      "step": 31400
+    },
+    {
+      "epoch": 9.1584250684373,
+      "grad_norm": 0.34154027700424194,
+      "learning_rate": 0.0004903881118881119,
+      "loss": 3.3571,
+      "step": 31450
+    },
+    {
+      "epoch": 9.172986196051022,
+      "grad_norm": 0.35453176498413086,
+      "learning_rate": 0.0004902132867132867,
+      "loss": 3.3583,
+      "step": 31500
+    },
+    {
+      "epoch": 9.187547323664745,
+      "grad_norm": 0.34433984756469727,
+      "learning_rate": 0.0004900384615384615,
+      "loss": 3.3614,
+      "step": 31550
+    },
+    {
+      "epoch": 9.202108451278466,
+      "grad_norm": 0.34944620728492737,
+      "learning_rate": 0.0004898636363636363,
+      "loss": 3.3564,
+      "step": 31600
+    },
+    {
+      "epoch": 9.216669578892189,
+      "grad_norm": 0.35863709449768066,
+      "learning_rate": 0.0004896888111888112,
+      "loss": 3.3646,
+      "step": 31650
+    },
+    {
+      "epoch": 9.231230706505912,
+      "grad_norm": 0.34784045815467834,
+      "learning_rate": 0.000489513986013986,
+      "loss": 3.3667,
+      "step": 31700
+    },
+    {
+      "epoch": 9.245791834119634,
+      "grad_norm": 0.3355439603328705,
+      "learning_rate": 0.0004893391608391608,
+      "loss": 3.3814,
+      "step": 31750
+    },
+    {
+      "epoch": 9.260352961733357,
+      "grad_norm": 0.3343488872051239,
+      "learning_rate": 0.0004891643356643356,
+      "loss": 3.366,
+      "step": 31800
+    },
+    {
+      "epoch": 9.27491408934708,
+      "grad_norm": 0.3547097444534302,
+      "learning_rate": 0.0004889895104895105,
+      "loss": 3.378,
+      "step": 31850
+    },
+    {
+      "epoch": 9.2894752169608,
+      "grad_norm": 0.33162373304367065,
+      "learning_rate": 0.0004888146853146853,
+      "loss": 3.3649,
+      "step": 31900
+    },
+    {
+      "epoch": 9.304036344574524,
+      "grad_norm": 0.3784696161746979,
+      "learning_rate": 0.0004886398601398601,
+      "loss": 3.3739,
+      "step": 31950
+    },
+    {
+      "epoch": 9.318597472188246,
+      "grad_norm": 0.3406825363636017,
+      "learning_rate": 0.000488465034965035,
+      "loss": 3.377,
+      "step": 32000
+    },
+    {
+      "epoch": 9.318597472188246,
+      "eval_accuracy": 0.3671976566025182,
+      "eval_loss": 3.579307794570923,
+      "eval_runtime": 179.5993,
+      "eval_samples_per_second": 92.656,
+      "eval_steps_per_second": 5.796,
+      "step": 32000
+    },
+    {
+      "epoch": 9.333158599801969,
+      "grad_norm": 0.34786367416381836,
+      "learning_rate": 0.0004882902097902098,
+      "loss": 3.3731,
+      "step": 32050
+    },
+    {
+      "epoch": 9.347719727415692,
+      "grad_norm": 0.33358603715896606,
+      "learning_rate": 0.0004881153846153846,
+      "loss": 3.3715,
+      "step": 32100
+    },
+    {
+      "epoch": 9.362280855029413,
+      "grad_norm": 0.34001341462135315,
+      "learning_rate": 0.0004879405594405594,
+      "loss": 3.3815,
+      "step": 32150
+    },
+    {
+      "epoch": 9.376841982643136,
+      "grad_norm": 0.343924880027771,
+      "learning_rate": 0.00048776573426573424,
+      "loss": 3.379,
+      "step": 32200
+    },
+    {
+      "epoch": 9.391403110256858,
+      "grad_norm": 0.3322538137435913,
+      "learning_rate": 0.00048759090909090904,
+      "loss": 3.3914,
+      "step": 32250
+    },
+    {
+      "epoch": 9.405964237870581,
+      "grad_norm": 0.3515528440475464,
+      "learning_rate": 0.0004874160839160839,
+      "loss": 3.3813,
+      "step": 32300
+    },
+    {
+      "epoch": 9.420525365484304,
+      "grad_norm": 0.34504127502441406,
+      "learning_rate": 0.0004872412587412587,
+      "loss": 3.3871,
+      "step": 32350
+    },
+    {
+      "epoch": 9.435086493098025,
+      "grad_norm": 0.3422424793243408,
+      "learning_rate": 0.00048706643356643354,
+      "loss": 3.3952,
+      "step": 32400
+    },
+    {
+      "epoch": 9.449647620711747,
+      "grad_norm": 0.3778160512447357,
+      "learning_rate": 0.00048689160839160834,
+      "loss": 3.3871,
+      "step": 32450
+    },
+    {
+      "epoch": 9.46420874832547,
+      "grad_norm": 0.34064722061157227,
+      "learning_rate": 0.0004867167832167832,
+      "loss": 3.3851,
+      "step": 32500
+    },
+    {
+      "epoch": 9.478769875939193,
+      "grad_norm": 0.31927308440208435,
+      "learning_rate": 0.00048654195804195794,
+      "loss": 3.3986,
+      "step": 32550
+    },
+    {
+      "epoch": 9.493331003552916,
+      "grad_norm": 0.3233279883861542,
+      "learning_rate": 0.00048636713286713285,
+      "loss": 3.3917,
+      "step": 32600
+    },
+    {
+      "epoch": 9.507892131166638,
+      "grad_norm": 0.34362176060676575,
+      "learning_rate": 0.0004861923076923077,
+      "loss": 3.3818,
+      "step": 32650
+    },
+    {
+      "epoch": 9.52245325878036,
+      "grad_norm": 0.32272911071777344,
+      "learning_rate": 0.00048601748251748245,
+      "loss": 3.3855,
+      "step": 32700
+    },
+    {
+      "epoch": 9.537014386394082,
+      "grad_norm": 0.3509780466556549,
+      "learning_rate": 0.0004858426573426573,
+      "loss": 3.3869,
+      "step": 32750
+    },
+    {
+      "epoch": 9.551575514007805,
+      "grad_norm": 0.3370649814605713,
+      "learning_rate": 0.0004856678321678321,
+      "loss": 3.3947,
+      "step": 32800
+    },
+    {
+      "epoch": 9.566136641621528,
+      "grad_norm": 0.33514314889907837,
+      "learning_rate": 0.00048549300699300696,
+      "loss": 3.3867,
+      "step": 32850
+    },
+    {
+      "epoch": 9.58069776923525,
+      "grad_norm": 0.3138607144355774,
+      "learning_rate": 0.00048531818181818176,
+      "loss": 3.3909,
+      "step": 32900
+    },
+    {
+      "epoch": 9.595258896848971,
+      "grad_norm": 0.34165963530540466,
+      "learning_rate": 0.0004851433566433566,
+      "loss": 3.3951,
+      "step": 32950
+    },
+    {
+      "epoch": 9.609820024462694,
+      "grad_norm": 0.351672500371933,
+      "learning_rate": 0.0004849685314685314,
+      "loss": 3.3885,
+      "step": 33000
+    },
+    {
+      "epoch": 9.609820024462694,
+      "eval_accuracy": 0.3680022616434005,
+      "eval_loss": 3.5697364807128906,
+      "eval_runtime": 179.5621,
+      "eval_samples_per_second": 92.675,
+      "eval_steps_per_second": 5.797,
+      "step": 33000
+    },
+    {
+      "epoch": 9.624381152076417,
+      "grad_norm": 0.3302818238735199,
+      "learning_rate": 0.00048479370629370627,
+      "loss": 3.3928,
+      "step": 33050
+    },
+    {
+      "epoch": 9.63894227969014,
+      "grad_norm": 0.3303544521331787,
+      "learning_rate": 0.00048461888111888106,
+      "loss": 3.3966,
+      "step": 33100
+    },
+    {
+      "epoch": 9.653503407303862,
+      "grad_norm": 0.37402021884918213,
+      "learning_rate": 0.0004844440559440559,
+      "loss": 3.4061,
+      "step": 33150
+    },
+    {
+      "epoch": 9.668064534917583,
+      "grad_norm": 0.3550156354904175,
+      "learning_rate": 0.0004842692307692307,
+      "loss": 3.3945,
+      "step": 33200
+    },
+    {
+      "epoch": 9.682625662531306,
+      "grad_norm": 0.3375867009162903,
+      "learning_rate": 0.00048409440559440557,
+      "loss": 3.4039,
+      "step": 33250
+    },
+    {
+      "epoch": 9.697186790145029,
+      "grad_norm": 0.3243738114833832,
+      "learning_rate": 0.0004839195804195803,
+      "loss": 3.3875,
+      "step": 33300
+    },
+    {
+      "epoch": 9.711747917758752,
+      "grad_norm": 0.35806065797805786,
+      "learning_rate": 0.0004837447552447552,
+      "loss": 3.3992,
+      "step": 33350
+    },
+    {
+      "epoch": 9.726309045372474,
+      "grad_norm": 0.32793280482292175,
+      "learning_rate": 0.0004835699300699301,
+      "loss": 3.3877,
+      "step": 33400
+    },
+    {
+      "epoch": 9.740870172986195,
+      "grad_norm": 0.33390912413597107,
+      "learning_rate": 0.0004833951048951048,
+      "loss": 3.4176,
+      "step": 33450
+    },
+    {
+      "epoch": 9.755431300599918,
+      "grad_norm": 0.3261817991733551,
+      "learning_rate": 0.0004832202797202797,
+      "loss": 3.397,
+      "step": 33500
+    },
+    {
+      "epoch": 9.76999242821364,
+      "grad_norm": 0.3434211015701294,
+      "learning_rate": 0.0004830454545454545,
+      "loss": 3.4082,
+      "step": 33550
+    },
+    {
+      "epoch": 9.784553555827364,
+      "grad_norm": 0.34745633602142334,
+      "learning_rate": 0.00048287062937062933,
+      "loss": 3.4018,
+      "step": 33600
+    },
+    {
+      "epoch": 9.799114683441086,
+      "grad_norm": 0.3618619441986084,
+      "learning_rate": 0.00048269580419580413,
+      "loss": 3.3959,
+      "step": 33650
+    },
+    {
+      "epoch": 9.813675811054807,
+      "grad_norm": 0.35037025809288025,
+      "learning_rate": 0.000482520979020979,
+      "loss": 3.3963,
+      "step": 33700
+    },
+    {
+      "epoch": 9.82823693866853,
+      "grad_norm": 0.35323193669319153,
+      "learning_rate": 0.0004823461538461538,
+      "loss": 3.4175,
+      "step": 33750
+    },
+    {
+      "epoch": 9.842798066282253,
+      "grad_norm": 0.32245779037475586,
+      "learning_rate": 0.00048217132867132864,
+      "loss": 3.3962,
+      "step": 33800
+    },
+    {
+      "epoch": 9.857359193895975,
+      "grad_norm": 0.3340567648410797,
+      "learning_rate": 0.00048199650349650344,
+      "loss": 3.4052,
+      "step": 33850
+    },
+    {
+      "epoch": 9.871920321509698,
+      "grad_norm": 0.35121092200279236,
+      "learning_rate": 0.0004818216783216783,
+      "loss": 3.4022,
+      "step": 33900
+    },
+    {
+      "epoch": 9.88648144912342,
+      "grad_norm": 0.3410423994064331,
+      "learning_rate": 0.0004816468531468531,
+      "loss": 3.4115,
+      "step": 33950
+    },
+    {
+      "epoch": 9.901042576737142,
+      "grad_norm": 0.32788190245628357,
+      "learning_rate": 0.00048147202797202795,
+      "loss": 3.4055,
+      "step": 34000
+    },
+    {
+      "epoch": 9.901042576737142,
+      "eval_accuracy": 0.3681349121090707,
+      "eval_loss": 3.5651626586914062,
+      "eval_runtime": 179.6407,
+      "eval_samples_per_second": 92.635,
+      "eval_steps_per_second": 5.795,
+      "step": 34000
+    },
+    {
+      "epoch": 9.915603704350865,
+      "grad_norm": 0.3360147774219513,
+      "learning_rate": 0.0004812972027972028,
+      "loss": 3.421,
+      "step": 34050
+    },
+    {
+      "epoch": 9.930164831964587,
+      "grad_norm": 0.3194892108440399,
+      "learning_rate": 0.0004811223776223776,
+      "loss": 3.4199,
+      "step": 34100
+    },
+    {
+      "epoch": 9.94472595957831,
+      "grad_norm": 0.3178540766239166,
+      "learning_rate": 0.00048094755244755245,
+      "loss": 3.414,
+      "step": 34150
+    },
+    {
+      "epoch": 9.959287087192033,
+      "grad_norm": 0.3059634268283844,
+      "learning_rate": 0.0004807727272727272,
+      "loss": 3.4017,
+      "step": 34200
+    },
+    {
+      "epoch": 9.973848214805754,
+      "grad_norm": 0.33229926228523254,
+      "learning_rate": 0.00048059790209790205,
+      "loss": 3.4211,
+      "step": 34250
+    },
+    {
+      "epoch": 9.988409342419477,
+      "grad_norm": 0.3151392638683319,
+      "learning_rate": 0.00048042307692307685,
+      "loss": 3.4128,
+      "step": 34300
+    },
+    {
+      "epoch": 10.002912225522744,
+      "grad_norm": 0.35194942355155945,
+      "learning_rate": 0.0004802482517482517,
+      "loss": 3.3823,
+      "step": 34350
+    },
+    {
+      "epoch": 10.017473353136467,
+      "grad_norm": 0.34302377700805664,
+      "learning_rate": 0.0004800734265734265,
+      "loss": 3.3022,
+      "step": 34400
+    },
+    {
+      "epoch": 10.03203448075019,
+      "grad_norm": 0.35437721014022827,
+      "learning_rate": 0.00047989860139860136,
+      "loss": 3.3014,
+      "step": 34450
+    },
+    {
+      "epoch": 10.046595608363912,
+      "grad_norm": 0.3585539758205414,
+      "learning_rate": 0.00047972377622377616,
+      "loss": 3.2966,
+      "step": 34500
+    },
+    {
+      "epoch": 10.061156735977635,
+      "grad_norm": 0.3557938039302826,
+      "learning_rate": 0.000479548951048951,
+      "loss": 3.31,
+      "step": 34550
+    },
+    {
+      "epoch": 10.075717863591356,
+      "grad_norm": 0.38117077946662903,
+      "learning_rate": 0.0004793741258741258,
+      "loss": 3.3192,
+      "step": 34600
+    },
+    {
+      "epoch": 10.090278991205079,
+      "grad_norm": 0.3344501256942749,
+      "learning_rate": 0.00047919930069930067,
+      "loss": 3.3062,
+      "step": 34650
+    },
+    {
+      "epoch": 10.104840118818801,
+      "grad_norm": 0.3537605106830597,
+      "learning_rate": 0.0004790244755244755,
+      "loss": 3.3134,
+      "step": 34700
+    },
+    {
+      "epoch": 10.119401246432524,
+      "grad_norm": 0.35433295369148254,
+      "learning_rate": 0.0004788496503496503,
+      "loss": 3.3182,
+      "step": 34750
+    },
+    {
+      "epoch": 10.133962374046247,
+      "grad_norm": 0.3392355144023895,
+      "learning_rate": 0.0004786748251748252,
+      "loss": 3.3281,
+      "step": 34800
+    },
+    {
+      "epoch": 10.148523501659968,
+      "grad_norm": 0.34290164709091187,
+      "learning_rate": 0.0004785,
+      "loss": 3.3316,
+      "step": 34850
+    },
+    {
+      "epoch": 10.16308462927369,
+      "grad_norm": 0.35168832540512085,
+      "learning_rate": 0.00047832517482517483,
+      "loss": 3.3223,
+      "step": 34900
+    },
+    {
+      "epoch": 10.177645756887413,
+      "grad_norm": 0.32866406440734863,
+      "learning_rate": 0.0004781503496503496,
+      "loss": 3.3278,
+      "step": 34950
+    },
+    {
+      "epoch": 10.192206884501136,
+      "grad_norm": 0.3799251914024353,
+      "learning_rate": 0.00047797552447552443,
+      "loss": 3.3423,
+      "step": 35000
+    },
+    {
+      "epoch": 10.192206884501136,
+      "eval_accuracy": 0.36767463380886406,
+      "eval_loss": 3.5737509727478027,
+      "eval_runtime": 179.5577,
+      "eval_samples_per_second": 92.678,
+      "eval_steps_per_second": 5.798,
+      "step": 35000
+    },
+    {
+      "epoch": 10.206768012114859,
+      "grad_norm": 0.35849910974502563,
+      "learning_rate": 0.00047780069930069923,
+      "loss": 3.3372,
+      "step": 35050
+    },
+    {
+      "epoch": 10.221329139728581,
+      "grad_norm": 0.33695927262306213,
+      "learning_rate": 0.0004776258741258741,
+      "loss": 3.3448,
+      "step": 35100
+    },
+    {
+      "epoch": 10.235890267342302,
+      "grad_norm": 0.32416245341300964,
+      "learning_rate": 0.0004774510489510489,
+      "loss": 3.3477,
+      "step": 35150
+    },
+    {
+      "epoch": 10.250451394956025,
+      "grad_norm": 0.3458312749862671,
+      "learning_rate": 0.00047727622377622374,
+      "loss": 3.3455,
+      "step": 35200
+    },
+    {
+      "epoch": 10.265012522569748,
+      "grad_norm": 0.36680135130882263,
+      "learning_rate": 0.00047710139860139854,
+      "loss": 3.3607,
+      "step": 35250
+    },
+    {
+      "epoch": 10.27957365018347,
+      "grad_norm": 0.34393811225891113,
+      "learning_rate": 0.0004769265734265734,
+      "loss": 3.3559,
+      "step": 35300
+    },
+    {
+      "epoch": 10.294134777797193,
+      "grad_norm": 0.3232147991657257,
+      "learning_rate": 0.0004767517482517482,
+      "loss": 3.3609,
+      "step": 35350
+    },
+    {
+      "epoch": 10.308695905410914,
+      "grad_norm": 0.36371856927871704,
+      "learning_rate": 0.00047657692307692304,
+      "loss": 3.3565,
+      "step": 35400
+    },
+    {
+      "epoch": 10.323257033024637,
+      "grad_norm": 0.32138916850090027,
+      "learning_rate": 0.0004764020979020979,
+      "loss": 3.3509,
+      "step": 35450
+    },
+    {
+      "epoch": 10.33781816063836,
+      "grad_norm": 0.36574694514274597,
+      "learning_rate": 0.0004762272727272727,
+      "loss": 3.3486,
+      "step": 35500
+    },
+    {
+      "epoch": 10.352379288252083,
+      "grad_norm": 0.36567094922065735,
+      "learning_rate": 0.00047605244755244755,
+      "loss": 3.3576,
+      "step": 35550
+    },
+    {
+      "epoch": 10.366940415865805,
+      "grad_norm": 0.3548871576786041,
+      "learning_rate": 0.00047587762237762235,
+      "loss": 3.3614,
+      "step": 35600
+    },
+    {
+      "epoch": 10.381501543479526,
+      "grad_norm": 0.3599976599216461,
+      "learning_rate": 0.0004757027972027972,
+      "loss": 3.3603,
+      "step": 35650
+    },
+    {
+      "epoch": 10.396062671093249,
+      "grad_norm": 0.34999898076057434,
+      "learning_rate": 0.00047552797202797195,
+      "loss": 3.3498,
+      "step": 35700
+    },
+    {
+      "epoch": 10.410623798706972,
+      "grad_norm": 0.3362889289855957,
+      "learning_rate": 0.0004753531468531468,
+      "loss": 3.3549,
+      "step": 35750
+    },
+    {
+      "epoch": 10.425184926320695,
+      "grad_norm": 0.33971652388572693,
+      "learning_rate": 0.0004751783216783216,
+      "loss": 3.3558,
+      "step": 35800
+    },
+    {
+      "epoch": 10.439746053934417,
+      "grad_norm": 0.32522860169410706,
+      "learning_rate": 0.00047500349650349646,
+      "loss": 3.3734,
+      "step": 35850
+    },
+    {
+      "epoch": 10.454307181548138,
+      "grad_norm": 0.33393293619155884,
+      "learning_rate": 0.00047482867132867126,
+      "loss": 3.3728,
+      "step": 35900
+    },
+    {
+      "epoch": 10.468868309161861,
+      "grad_norm": 0.36121344566345215,
+      "learning_rate": 0.0004746538461538461,
+      "loss": 3.3704,
+      "step": 35950
+    },
+    {
+      "epoch": 10.483429436775584,
+      "grad_norm": 0.33934155106544495,
+      "learning_rate": 0.0004744790209790209,
+      "loss": 3.36,
+      "step": 36000
+    },
+    {
+      "epoch": 10.483429436775584,
+      "eval_accuracy": 0.36847312375735736,
+      "eval_loss": 3.567448616027832,
+      "eval_runtime": 189.5881,
+      "eval_samples_per_second": 87.774,
+      "eval_steps_per_second": 5.491,
+      "step": 36000
+    },
+    {
+      "epoch": 10.497990564389307,
+      "grad_norm": 0.33829599618911743,
+      "learning_rate": 0.00047430419580419576,
+      "loss": 3.3594,
+      "step": 36050
+    },
+    {
+      "epoch": 10.51255169200303,
+      "grad_norm": 0.36573532223701477,
+      "learning_rate": 0.0004741293706293706,
+      "loss": 3.3697,
+      "step": 36100
+    },
+    {
+      "epoch": 10.52711281961675,
+      "grad_norm": 0.3282630145549774,
+      "learning_rate": 0.0004739545454545454,
+      "loss": 3.3601,
+      "step": 36150
+    },
+    {
+      "epoch": 10.541673947230473,
+      "grad_norm": 0.36783677339553833,
+      "learning_rate": 0.00047377972027972027,
+      "loss": 3.3587,
+      "step": 36200
+    },
+    {
+      "epoch": 10.556235074844196,
+      "grad_norm": 0.3381112217903137,
+      "learning_rate": 0.00047360489510489507,
+      "loss": 3.3777,
+      "step": 36250
+    },
+    {
+      "epoch": 10.570796202457919,
+      "grad_norm": 0.3379128575325012,
+      "learning_rate": 0.0004734300699300699,
+      "loss": 3.3733,
+      "step": 36300
+    },
+    {
+      "epoch": 10.585357330071641,
+      "grad_norm": 0.32401373982429504,
+      "learning_rate": 0.0004732552447552447,
+      "loss": 3.3722,
+      "step": 36350
+    },
+    {
+      "epoch": 10.599918457685362,
+      "grad_norm": 0.3448126018047333,
+      "learning_rate": 0.0004730804195804196,
+      "loss": 3.3762,
+      "step": 36400
+    },
+    {
+      "epoch": 10.614479585299085,
+      "grad_norm": 0.36033710837364197,
+      "learning_rate": 0.0004729055944055943,
+      "loss": 3.3721,
+      "step": 36450
+    },
+    {
+      "epoch": 10.629040712912808,
+      "grad_norm": 0.32210254669189453,
+      "learning_rate": 0.0004727307692307692,
+      "loss": 3.3704,
+      "step": 36500
+    },
+    {
+      "epoch": 10.64360184052653,
+      "grad_norm": 0.3496635854244232,
+      "learning_rate": 0.000472555944055944,
+      "loss": 3.3641,
+      "step": 36550
+    },
+    {
+      "epoch": 10.658162968140253,
+      "grad_norm": 0.3081586956977844,
+      "learning_rate": 0.00047238111888111883,
+      "loss": 3.3772,
+      "step": 36600
+    },
+    {
+      "epoch": 10.672724095753976,
+      "grad_norm": 0.3534667491912842,
+      "learning_rate": 0.00047220629370629363,
+      "loss": 3.3856,
+      "step": 36650
+    },
+    {
+      "epoch": 10.687285223367697,
+      "grad_norm": 0.34923726320266724,
+      "learning_rate": 0.0004720314685314685,
+      "loss": 3.378,
+      "step": 36700
+    },
+    {
+      "epoch": 10.70184635098142,
+      "grad_norm": 0.35523879528045654,
+      "learning_rate": 0.0004718566433566433,
+      "loss": 3.3725,
+      "step": 36750
+    },
+    {
+      "epoch": 10.716407478595142,
+      "grad_norm": 0.3368263840675354,
+      "learning_rate": 0.00047168181818181814,
+      "loss": 3.3739,
+      "step": 36800
+    },
+    {
+      "epoch": 10.730968606208865,
+      "grad_norm": 0.32694390416145325,
+      "learning_rate": 0.000471506993006993,
+      "loss": 3.3751,
+      "step": 36850
+    },
+    {
+      "epoch": 10.745529733822588,
+      "grad_norm": 0.32708966732025146,
+      "learning_rate": 0.0004713321678321678,
+      "loss": 3.3844,
+      "step": 36900
+    },
+    {
+      "epoch": 10.760090861436309,
+      "grad_norm": 0.30745530128479004,
+      "learning_rate": 0.00047115734265734265,
+      "loss": 3.3898,
+      "step": 36950
+    },
+    {
+      "epoch": 10.774651989050032,
+      "grad_norm": 0.3373164236545563,
+      "learning_rate": 0.00047098251748251745,
+      "loss": 3.3876,
+      "step": 37000
+    },
+    {
+      "epoch": 10.774651989050032,
+      "eval_accuracy": 0.36911803080854105,
+      "eval_loss": 3.5577805042266846,
+      "eval_runtime": 179.2277,
+      "eval_samples_per_second": 92.848,
+      "eval_steps_per_second": 5.808,
+      "step": 37000
+    },
+    {
+      "epoch": 10.789213116663754,
+      "grad_norm": 0.3574223816394806,
+      "learning_rate": 0.0004708076923076923,
+      "loss": 3.3808,
+      "step": 37050
+    },
+    {
+      "epoch": 10.803774244277477,
+      "grad_norm": 0.3373048007488251,
+      "learning_rate": 0.0004706328671328671,
+      "loss": 3.3903,
+      "step": 37100
+    },
+    {
+      "epoch": 10.8183353718912,
+      "grad_norm": 0.34644612669944763,
+      "learning_rate": 0.00047045804195804195,
+      "loss": 3.3885,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83289649950492,
+      "grad_norm": 0.34422236680984497,
+      "learning_rate": 0.0004702832167832167,
+      "loss": 3.3818,
+      "step": 37200
+    },
+    {
+      "epoch": 10.847457627118644,
+      "grad_norm": 0.3302682638168335,
+      "learning_rate": 0.00047010839160839155,
+      "loss": 3.3797,
+      "step": 37250
+    },
+    {
+      "epoch": 10.862018754732366,
+      "grad_norm": 0.3459398150444031,
+      "learning_rate": 0.00046993356643356635,
+      "loss": 3.3998,
+      "step": 37300
+    },
+    {
+      "epoch": 10.876579882346089,
+      "grad_norm": 0.3879469037055969,
+      "learning_rate": 0.0004697587412587412,
+      "loss": 3.3815,
+      "step": 37350
+    },
+    {
+      "epoch": 10.891141009959812,
+      "grad_norm": 0.33385929465293884,
+      "learning_rate": 0.000469583916083916,
+      "loss": 3.379,
+      "step": 37400
+    },
+    {
+      "epoch": 10.905702137573535,
+      "grad_norm": 0.3342723250389099,
+      "learning_rate": 0.00046940909090909086,
+      "loss": 3.3929,
+      "step": 37450
+    },
+    {
+      "epoch": 10.920263265187256,
+      "grad_norm": 0.35342901945114136,
+      "learning_rate": 0.0004692342657342657,
+      "loss": 3.3927,
+      "step": 37500
+    },
+    {
+      "epoch": 10.934824392800978,
+      "grad_norm": 0.33476606011390686,
+      "learning_rate": 0.0004690594405594405,
+      "loss": 3.3997,
+      "step": 37550
+    },
+    {
+      "epoch": 10.949385520414701,
+      "grad_norm": 0.34175390005111694,
+      "learning_rate": 0.00046888461538461537,
+      "loss": 3.3931,
+      "step": 37600
+    },
+    {
+      "epoch": 10.963946648028424,
+      "grad_norm": 0.3353242576122284,
+      "learning_rate": 0.00046870979020979017,
+      "loss": 3.393,
+      "step": 37650
+    },
+    {
+      "epoch": 10.978507775642147,
+      "grad_norm": 0.34950190782546997,
+      "learning_rate": 0.000468534965034965,
+      "loss": 3.3864,
+      "step": 37700
+    },
+    {
+      "epoch": 10.993068903255867,
+      "grad_norm": 0.3388817310333252,
+      "learning_rate": 0.0004683601398601398,
+      "loss": 3.3917,
+      "step": 37750
+    },
+    {
+      "epoch": 11.007571786359136,
+      "grad_norm": 0.3355076313018799,
+      "learning_rate": 0.0004681853146853147,
+      "loss": 3.3182,
+      "step": 37800
+    },
+    {
+      "epoch": 11.022132913972857,
+      "grad_norm": 0.34891900420188904,
+      "learning_rate": 0.0004680104895104895,
+      "loss": 3.2813,
+      "step": 37850
+    },
+    {
+      "epoch": 11.03669404158658,
+      "grad_norm": 0.3277644217014313,
+      "learning_rate": 0.00046783566433566433,
+      "loss": 3.2816,
+      "step": 37900
+    },
+    {
+      "epoch": 11.051255169200303,
+      "grad_norm": 0.3547028601169586,
+      "learning_rate": 0.0004676608391608391,
+      "loss": 3.2908,
+      "step": 37950
+    },
+    {
+      "epoch": 11.065816296814026,
+      "grad_norm": 0.3213656544685364,
+      "learning_rate": 0.00046748601398601393,
+      "loss": 3.2956,
+      "step": 38000
+    },
+    {
+      "epoch": 11.065816296814026,
+      "eval_accuracy": 0.3687849934691989,
+      "eval_loss": 3.5677566528320312,
+      "eval_runtime": 179.2703,
+      "eval_samples_per_second": 92.826,
+      "eval_steps_per_second": 5.807,
+      "step": 38000
+    },
+    {
+      "epoch": 11.080377424427748,
+      "grad_norm": 0.35302484035491943,
+      "learning_rate": 0.00046731118881118873,
+      "loss": 3.3003,
+      "step": 38050
+    },
+    {
+      "epoch": 11.09493855204147,
+      "grad_norm": 0.33304354548454285,
+      "learning_rate": 0.0004671363636363636,
+      "loss": 3.2988,
+      "step": 38100
+    },
+    {
+      "epoch": 11.109499679655192,
+      "grad_norm": 0.35924726724624634,
+      "learning_rate": 0.00046696153846153844,
+      "loss": 3.3095,
+      "step": 38150
+    },
+    {
+      "epoch": 11.124060807268915,
+      "grad_norm": 0.340108722448349,
+      "learning_rate": 0.00046678671328671324,
+      "loss": 3.304,
+      "step": 38200
+    },
+    {
+      "epoch": 11.138621934882638,
+      "grad_norm": 0.3212912678718567,
+      "learning_rate": 0.0004666118881118881,
+      "loss": 3.3098,
+      "step": 38250
+    },
+    {
+      "epoch": 11.15318306249636,
+      "grad_norm": 0.34773996472358704,
+      "learning_rate": 0.0004664370629370629,
+      "loss": 3.3144,
+      "step": 38300
+    },
+    {
+      "epoch": 11.167744190110081,
+      "grad_norm": 0.3532446026802063,
+      "learning_rate": 0.00046626223776223774,
+      "loss": 3.3145,
+      "step": 38350
+    },
+    {
+      "epoch": 11.182305317723804,
+      "grad_norm": 0.3510436415672302,
+      "learning_rate": 0.00046608741258741254,
+      "loss": 3.3075,
+      "step": 38400
+    },
+    {
+      "epoch": 11.196866445337527,
+      "grad_norm": 0.3559911549091339,
+      "learning_rate": 0.0004659125874125874,
+      "loss": 3.3147,
+      "step": 38450
+    },
+    {
+      "epoch": 11.21142757295125,
+      "grad_norm": 0.35517627000808716,
+      "learning_rate": 0.0004657377622377622,
+      "loss": 3.3198,
+      "step": 38500
+    },
+    {
+      "epoch": 11.225988700564972,
+      "grad_norm": 0.34963780641555786,
+      "learning_rate": 0.00046556293706293705,
+      "loss": 3.3041,
+      "step": 38550
+    },
+    {
+      "epoch": 11.240549828178693,
+      "grad_norm": 0.37004563212394714,
+      "learning_rate": 0.00046538811188811185,
+      "loss": 3.3175,
+      "step": 38600
+    },
+    {
+      "epoch": 11.255110955792416,
+      "grad_norm": 0.3559558689594269,
+      "learning_rate": 0.0004652132867132867,
+      "loss": 3.331,
+      "step": 38650
+    },
+    {
+      "epoch": 11.269672083406139,
+      "grad_norm": 0.3516577184200287,
+      "learning_rate": 0.00046503846153846145,
+      "loss": 3.3216,
+      "step": 38700
+    },
+    {
+      "epoch": 11.284233211019862,
+      "grad_norm": 0.3305668532848358,
+      "learning_rate": 0.0004648636363636363,
+      "loss": 3.3288,
+      "step": 38750
+    },
+    {
+      "epoch": 11.298794338633584,
+      "grad_norm": 0.3463844358921051,
+      "learning_rate": 0.0004646888111888111,
+      "loss": 3.3294,
+      "step": 38800
+    },
+    {
+      "epoch": 11.313355466247307,
+      "grad_norm": 0.3470117151737213,
+      "learning_rate": 0.00046451398601398596,
+      "loss": 3.334,
+      "step": 38850
+    },
+    {
+      "epoch": 11.327916593861028,
+      "grad_norm": 0.3878748118877411,
+      "learning_rate": 0.0004643391608391608,
+      "loss": 3.3207,
+      "step": 38900
+    },
+    {
+      "epoch": 11.34247772147475,
+      "grad_norm": 0.3300091028213501,
+      "learning_rate": 0.0004641643356643356,
+      "loss": 3.3571,
+      "step": 38950
+    },
+    {
+      "epoch": 11.357038849088473,
+      "grad_norm": 0.3411715626716614,
+      "learning_rate": 0.00046398951048951046,
+      "loss": 3.3316,
+      "step": 39000
+    },
+    {
+      "epoch": 11.357038849088473,
+      "eval_accuracy": 0.36932288640357425,
+      "eval_loss": 3.56606125831604,
+      "eval_runtime": 179.4464,
+      "eval_samples_per_second": 92.735,
+      "eval_steps_per_second": 5.801,
+      "step": 39000
+    },
+    {
+      "epoch": 11.371599976702196,
+      "grad_norm": 0.34825968742370605,
+      "learning_rate": 0.00046381468531468526,
+      "loss": 3.3289,
+      "step": 39050
+    },
+    {
+      "epoch": 11.386161104315919,
+      "grad_norm": 0.3493784964084625,
+      "learning_rate": 0.0004636398601398601,
+      "loss": 3.3427,
+      "step": 39100
+    },
+    {
+      "epoch": 11.40072223192964,
+      "grad_norm": 0.38270437717437744,
+      "learning_rate": 0.0004634650349650349,
+      "loss": 3.3439,
+      "step": 39150
+    },
+    {
+      "epoch": 11.415283359543363,
+      "grad_norm": 0.373262882232666,
+      "learning_rate": 0.00046329020979020977,
+      "loss": 3.3412,
+      "step": 39200
+    },
+    {
+      "epoch": 11.429844487157085,
+      "grad_norm": 0.3348482847213745,
+      "learning_rate": 0.00046311538461538457,
+      "loss": 3.3427,
+      "step": 39250
+    },
+    {
+      "epoch": 11.444405614770808,
+      "grad_norm": 0.3629266917705536,
+      "learning_rate": 0.0004629405594405594,
+      "loss": 3.3431,
+      "step": 39300
+    },
+    {
+      "epoch": 11.458966742384531,
+      "grad_norm": 0.3323516845703125,
+      "learning_rate": 0.0004627657342657342,
+      "loss": 3.3372,
+      "step": 39350
+    },
+    {
+      "epoch": 11.473527869998252,
+      "grad_norm": 0.3465496301651001,
+      "learning_rate": 0.0004625909090909091,
+      "loss": 3.3334,
+      "step": 39400
+    },
+    {
+      "epoch": 11.488088997611975,
+      "grad_norm": 0.38713112473487854,
+      "learning_rate": 0.0004624160839160838,
+      "loss": 3.3428,
+      "step": 39450
+    },
+    {
+      "epoch": 11.502650125225697,
+      "grad_norm": 0.33216097950935364,
+      "learning_rate": 0.0004622412587412587,
+      "loss": 3.3379,
+      "step": 39500
+    },
+    {
+      "epoch": 11.51721125283942,
+      "grad_norm": 0.35208389163017273,
+      "learning_rate": 0.00046206643356643353,
+      "loss": 3.3628,
+      "step": 39550
+    },
+    {
+      "epoch": 11.531772380453143,
+      "grad_norm": 0.36084526777267456,
+      "learning_rate": 0.00046189160839160833,
+      "loss": 3.3487,
+      "step": 39600
+    },
+    {
+      "epoch": 11.546333508066864,
+      "grad_norm": 0.3580910265445709,
+      "learning_rate": 0.0004617167832167832,
+      "loss": 3.3446,
+      "step": 39650
+    },
+    {
+      "epoch": 11.560894635680587,
+      "grad_norm": 0.3954737186431885,
+      "learning_rate": 0.000461541958041958,
+      "loss": 3.3511,
+      "step": 39700
+    },
+    {
+      "epoch": 11.57545576329431,
+      "grad_norm": 0.33455291390419006,
+      "learning_rate": 0.00046136713286713284,
+      "loss": 3.3654,
+      "step": 39750
+    },
+    {
+      "epoch": 11.590016890908032,
+      "grad_norm": 0.364233136177063,
+      "learning_rate": 0.00046119230769230764,
+      "loss": 3.3632,
+      "step": 39800
+    },
+    {
+      "epoch": 11.604578018521755,
+      "grad_norm": 0.36201661825180054,
+      "learning_rate": 0.0004610174825174825,
+      "loss": 3.3553,
+      "step": 39850
+    },
+    {
+      "epoch": 11.619139146135478,
+      "grad_norm": 0.3511942923069,
+      "learning_rate": 0.0004608426573426573,
+      "loss": 3.3432,
+      "step": 39900
+    },
+    {
+      "epoch": 11.633700273749199,
+      "grad_norm": 0.35589101910591125,
+      "learning_rate": 0.00046066783216783215,
+      "loss": 3.3694,
+      "step": 39950
+    },
+    {
+      "epoch": 11.648261401362921,
+      "grad_norm": 0.33582714200019836,
+      "learning_rate": 0.00046049300699300695,
+      "loss": 3.3497,
+      "step": 40000
+    },
+    {
+      "epoch": 11.648261401362921,
+      "eval_accuracy": 0.3697849286727392,
+      "eval_loss": 3.5575385093688965,
+      "eval_runtime": 179.3781,
+      "eval_samples_per_second": 92.771,
+      "eval_steps_per_second": 5.803,
+      "step": 40000
+    },
+    {
+      "epoch": 11.662822528976644,
+      "grad_norm": 0.3475906550884247,
+      "learning_rate": 0.0004603181818181818,
+      "loss": 3.3493,
+      "step": 40050
+    },
+    {
+      "epoch": 11.677383656590367,
+      "grad_norm": 0.3488917052745819,
+      "learning_rate": 0.0004601433566433566,
+      "loss": 3.3608,
+      "step": 40100
+    },
+    {
+      "epoch": 11.69194478420409,
+      "grad_norm": 0.379167765378952,
+      "learning_rate": 0.00045996853146853145,
+      "loss": 3.3625,
+      "step": 40150
+    },
+    {
+      "epoch": 11.70650591181781,
+      "grad_norm": 0.34200170636177063,
+      "learning_rate": 0.0004597937062937062,
+      "loss": 3.3571,
+      "step": 40200
+    },
+    {
+      "epoch": 11.721067039431533,
+      "grad_norm": 0.36554938554763794,
+      "learning_rate": 0.00045961888111888105,
+      "loss": 3.3614,
+      "step": 40250
+    },
+    {
+      "epoch": 11.735628167045256,
+      "grad_norm": 0.3781202435493469,
+      "learning_rate": 0.0004594440559440559,
+      "loss": 3.3741,
+      "step": 40300
+    },
+    {
+      "epoch": 11.750189294658979,
+      "grad_norm": 0.3247515559196472,
+      "learning_rate": 0.0004592692307692307,
+      "loss": 3.3653,
+      "step": 40350
+    },
+    {
+      "epoch": 11.764750422272702,
+      "grad_norm": 0.3390739858150482,
+      "learning_rate": 0.00045909440559440556,
+      "loss": 3.3551,
+      "step": 40400
+    },
+    {
+      "epoch": 11.779311549886422,
+      "grad_norm": 0.34482771158218384,
+      "learning_rate": 0.00045891958041958036,
+      "loss": 3.3688,
+      "step": 40450
+    },
+    {
+      "epoch": 11.793872677500145,
+      "grad_norm": 0.35239699482917786,
+      "learning_rate": 0.0004587447552447552,
+      "loss": 3.3604,
+      "step": 40500
+    },
+    {
+      "epoch": 11.808433805113868,
+      "grad_norm": 0.3418464958667755,
+      "learning_rate": 0.00045856993006993,
+      "loss": 3.3696,
+      "step": 40550
+    },
+    {
+      "epoch": 11.82299493272759,
+      "grad_norm": 0.3679628372192383,
+      "learning_rate": 0.00045839510489510487,
+      "loss": 3.3641,
+      "step": 40600
+    },
+    {
+      "epoch": 11.837556060341313,
+      "grad_norm": 0.33282944560050964,
+      "learning_rate": 0.00045822027972027967,
+      "loss": 3.3546,
+      "step": 40650
+    },
+    {
+      "epoch": 11.852117187955034,
+      "grad_norm": 0.36056581139564514,
+      "learning_rate": 0.0004580454545454545,
+      "loss": 3.3632,
+      "step": 40700
+    },
+    {
+      "epoch": 11.866678315568757,
+      "grad_norm": 0.3631812036037445,
+      "learning_rate": 0.0004578706293706293,
+      "loss": 3.3794,
+      "step": 40750
+    },
+    {
+      "epoch": 11.88123944318248,
+      "grad_norm": 0.33666253089904785,
+      "learning_rate": 0.0004576958041958042,
+      "loss": 3.3578,
+      "step": 40800
+    },
+    {
+      "epoch": 11.895800570796203,
+      "grad_norm": 0.3388082981109619,
+      "learning_rate": 0.000457520979020979,
+      "loss": 3.3795,
+      "step": 40850
+    },
+    {
+      "epoch": 11.910361698409925,
+      "grad_norm": 0.3440251648426056,
+      "learning_rate": 0.00045734615384615383,
+      "loss": 3.3674,
+      "step": 40900
+    },
+    {
+      "epoch": 11.924922826023646,
+      "grad_norm": 0.3502148687839508,
+      "learning_rate": 0.0004571713286713287,
+      "loss": 3.3791,
+      "step": 40950
+    },
+    {
+      "epoch": 11.93948395363737,
+      "grad_norm": 0.33843347430229187,
+      "learning_rate": 0.00045699650349650343,
+      "loss": 3.3652,
+      "step": 41000
+    },
+    {
+      "epoch": 11.93948395363737,
+      "eval_accuracy": 0.37016665155533257,
+      "eval_loss": 3.5524773597717285,
+      "eval_runtime": 179.4931,
+      "eval_samples_per_second": 92.711,
+      "eval_steps_per_second": 5.8,
+      "step": 41000
+    },
+    {
+      "epoch": 11.954045081251092,
+      "grad_norm": 0.3571944534778595,
+      "learning_rate": 0.0004568216783216783,
+      "loss": 3.378,
+      "step": 41050
+    },
+    {
+      "epoch": 11.968606208864815,
+      "grad_norm": 0.35896360874176025,
+      "learning_rate": 0.0004566468531468531,
+      "loss": 3.3745,
+      "step": 41100
+    },
+    {
+      "epoch": 11.983167336478537,
+      "grad_norm": 0.36639687418937683,
+      "learning_rate": 0.00045647202797202794,
+      "loss": 3.3723,
+      "step": 41150
+    },
+    {
+      "epoch": 11.99772846409226,
+      "grad_norm": 0.34861519932746887,
+      "learning_rate": 0.00045629720279720274,
+      "loss": 3.3665,
+      "step": 41200
+    },
+    {
+      "epoch": 12.012231347195527,
+      "grad_norm": 0.3419104218482971,
+      "learning_rate": 0.0004561223776223776,
+      "loss": 3.2789,
+      "step": 41250
+    },
+    {
+      "epoch": 12.02679247480925,
+      "grad_norm": 0.3540184795856476,
+      "learning_rate": 0.0004559475524475524,
+      "loss": 3.2586,
+      "step": 41300
+    },
+    {
+      "epoch": 12.041353602422971,
+      "grad_norm": 0.3587668836116791,
+      "learning_rate": 0.00045577272727272724,
+      "loss": 3.261,
+      "step": 41350
+    },
+    {
+      "epoch": 12.055914730036694,
+      "grad_norm": 0.33976301550865173,
+      "learning_rate": 0.00045559790209790204,
+      "loss": 3.2696,
+      "step": 41400
+    },
+    {
+      "epoch": 12.070475857650417,
+      "grad_norm": 0.3481811583042145,
+      "learning_rate": 0.0004554230769230769,
+      "loss": 3.2758,
+      "step": 41450
+    },
+    {
+      "epoch": 12.08503698526414,
+      "grad_norm": 0.36990079283714294,
+      "learning_rate": 0.0004552482517482517,
+      "loss": 3.2813,
+      "step": 41500
+    },
+    {
+      "epoch": 12.099598112877862,
+      "grad_norm": 0.33966222405433655,
+      "learning_rate": 0.00045507342657342655,
+      "loss": 3.2727,
+      "step": 41550
+    },
+    {
+      "epoch": 12.114159240491583,
+      "grad_norm": 0.3467234969139099,
+      "learning_rate": 0.00045489860139860135,
+      "loss": 3.2746,
+      "step": 41600
+    },
+    {
+      "epoch": 12.128720368105306,
+      "grad_norm": 0.35601502656936646,
+      "learning_rate": 0.0004547237762237762,
+      "loss": 3.2741,
+      "step": 41650
+    },
+    {
+      "epoch": 12.143281495719028,
+      "grad_norm": 0.3513096868991852,
+      "learning_rate": 0.00045454895104895106,
+      "loss": 3.2947,
+      "step": 41700
+    },
+    {
+      "epoch": 12.157842623332751,
+      "grad_norm": 0.3616795241832733,
+      "learning_rate": 0.0004543741258741258,
+      "loss": 3.2756,
+      "step": 41750
+    },
+    {
+      "epoch": 12.172403750946474,
+      "grad_norm": 0.36047980189323425,
+      "learning_rate": 0.00045419930069930066,
+      "loss": 3.2892,
+      "step": 41800
+    },
+    {
+      "epoch": 12.186964878560195,
+      "grad_norm": 0.3357391357421875,
+      "learning_rate": 0.00045402447552447546,
+      "loss": 3.2952,
+      "step": 41850
+    },
+    {
+      "epoch": 12.201526006173918,
+      "grad_norm": 0.3331086337566376,
+      "learning_rate": 0.0004538496503496503,
+      "loss": 3.298,
+      "step": 41900
+    },
+    {
+      "epoch": 12.21608713378764,
+      "grad_norm": 0.3594485819339752,
+      "learning_rate": 0.0004536748251748251,
+      "loss": 3.2939,
+      "step": 41950
+    },
+    {
+      "epoch": 12.230648261401363,
+      "grad_norm": 0.333735853433609,
+      "learning_rate": 0.00045349999999999996,
+      "loss": 3.2998,
+      "step": 42000
+    },
+    {
+      "epoch": 12.230648261401363,
+      "eval_accuracy": 0.36975270683976613,
+      "eval_loss": 3.5625216960906982,
+      "eval_runtime": 179.4692,
+      "eval_samples_per_second": 92.723,
+      "eval_steps_per_second": 5.8,
+      "step": 42000
+    },
+    {
+      "epoch": 12.245209389015086,
+      "grad_norm": 0.38054659962654114,
+      "learning_rate": 0.00045332517482517476,
+      "loss": 3.3154,
+      "step": 42050
+    },
+    {
+      "epoch": 12.259770516628807,
+      "grad_norm": 0.35925668478012085,
+      "learning_rate": 0.0004531503496503496,
+      "loss": 3.301,
+      "step": 42100
+    },
+    {
+      "epoch": 12.27433164424253,
+      "grad_norm": 0.3805968165397644,
+      "learning_rate": 0.0004529755244755244,
+      "loss": 3.3095,
+      "step": 42150
+    },
+    {
+      "epoch": 12.288892771856252,
+      "grad_norm": 0.3440055251121521,
+      "learning_rate": 0.00045280069930069927,
+      "loss": 3.3106,
+      "step": 42200
+    },
+    {
+      "epoch": 12.303453899469975,
+      "grad_norm": 0.3645689785480499,
+      "learning_rate": 0.00045262587412587407,
+      "loss": 3.3168,
+      "step": 42250
+    },
+    {
+      "epoch": 12.318015027083698,
+      "grad_norm": 0.3602798581123352,
+      "learning_rate": 0.0004524510489510489,
+      "loss": 3.3174,
+      "step": 42300
+    },
+    {
+      "epoch": 12.33257615469742,
+      "grad_norm": 0.36583825945854187,
+      "learning_rate": 0.0004522762237762238,
+      "loss": 3.3139,
+      "step": 42350
+    },
+    {
+      "epoch": 12.347137282311142,
+      "grad_norm": 0.34254148602485657,
+      "learning_rate": 0.0004521013986013986,
+      "loss": 3.3131,
+      "step": 42400
+    },
+    {
+      "epoch": 12.361698409924864,
+      "grad_norm": 0.3659822344779968,
+      "learning_rate": 0.00045192657342657343,
+      "loss": 3.3231,
+      "step": 42450
+    },
+    {
+      "epoch": 12.376259537538587,
+      "grad_norm": 0.3749573528766632,
+      "learning_rate": 0.0004517517482517482,
+      "loss": 3.322,
+      "step": 42500
+    },
+    {
+      "epoch": 12.39082066515231,
+      "grad_norm": 0.34579816460609436,
+      "learning_rate": 0.00045157692307692303,
+      "loss": 3.3321,
+      "step": 42550
+    },
+    {
+      "epoch": 12.405381792766033,
+      "grad_norm": 0.34676283597946167,
+      "learning_rate": 0.00045140209790209783,
+      "loss": 3.3237,
+      "step": 42600
+    },
+    {
+      "epoch": 12.419942920379754,
+      "grad_norm": 0.37196454405784607,
+      "learning_rate": 0.0004512272727272727,
+      "loss": 3.3271,
+      "step": 42650
+    },
+    {
+      "epoch": 12.434504047993476,
+      "grad_norm": 0.33962497115135193,
+      "learning_rate": 0.0004510524475524475,
+      "loss": 3.3228,
+      "step": 42700
+    },
+    {
+      "epoch": 12.449065175607199,
+      "grad_norm": 0.3634340167045593,
+      "learning_rate": 0.00045087762237762234,
+      "loss": 3.3355,
+      "step": 42750
+    },
+    {
+      "epoch": 12.463626303220922,
+      "grad_norm": 0.3375799357891083,
+      "learning_rate": 0.00045070279720279714,
+      "loss": 3.3264,
+      "step": 42800
+    },
+    {
+      "epoch": 12.478187430834645,
+      "grad_norm": 0.34822535514831543,
+      "learning_rate": 0.000450527972027972,
+      "loss": 3.3143,
+      "step": 42850
+    },
+    {
+      "epoch": 12.492748558448366,
+      "grad_norm": 0.3571722209453583,
+      "learning_rate": 0.0004503531468531468,
+      "loss": 3.3301,
+      "step": 42900
+    },
+    {
+      "epoch": 12.507309686062088,
+      "grad_norm": 0.34919336438179016,
+      "learning_rate": 0.00045017832167832165,
+      "loss": 3.329,
+      "step": 42950
+    },
+    {
+      "epoch": 12.521870813675811,
+      "grad_norm": 0.35883447527885437,
+      "learning_rate": 0.0004500034965034965,
+      "loss": 3.3222,
+      "step": 43000
+    },
+    {
+      "epoch": 12.521870813675811,
+      "eval_accuracy": 0.37045594246450686,
+      "eval_loss": 3.555795192718506,
+      "eval_runtime": 179.3484,
+      "eval_samples_per_second": 92.786,
+      "eval_steps_per_second": 5.804,
+      "step": 43000
+    },
+    {
+      "epoch": 12.536431941289534,
+      "grad_norm": 0.3635919988155365,
+      "learning_rate": 0.0004498286713286713,
+      "loss": 3.3463,
+      "step": 43050
+    },
+    {
+      "epoch": 12.550993068903256,
+      "grad_norm": 0.3739362359046936,
+      "learning_rate": 0.00044965384615384615,
+      "loss": 3.3449,
+      "step": 43100
+    },
+    {
+      "epoch": 12.565554196516977,
+      "grad_norm": 0.34652945399284363,
+      "learning_rate": 0.00044947902097902095,
+      "loss": 3.3283,
+      "step": 43150
+    },
+    {
+      "epoch": 12.5801153241307,
+      "grad_norm": 0.37062638998031616,
+      "learning_rate": 0.0004493041958041958,
+      "loss": 3.3339,
+      "step": 43200
+    },
+    {
+      "epoch": 12.594676451744423,
+      "grad_norm": 0.3567717373371124,
+      "learning_rate": 0.00044912937062937055,
+      "loss": 3.3448,
+      "step": 43250
+    },
+    {
+      "epoch": 12.609237579358146,
+      "grad_norm": 0.3564370274543762,
+      "learning_rate": 0.0004489545454545454,
+      "loss": 3.3355,
+      "step": 43300
+    },
+    {
+      "epoch": 12.623798706971868,
+      "grad_norm": 0.3377971351146698,
+      "learning_rate": 0.0004487797202797202,
+      "loss": 3.3314,
+      "step": 43350
+    },
+    {
+      "epoch": 12.63835983458559,
+      "grad_norm": 0.35002943873405457,
+      "learning_rate": 0.00044860489510489506,
+      "loss": 3.3442,
+      "step": 43400
+    },
+    {
+      "epoch": 12.652920962199312,
+      "grad_norm": 0.35957759618759155,
+      "learning_rate": 0.00044843006993006986,
+      "loss": 3.3305,
+      "step": 43450
+    },
+    {
+      "epoch": 12.667482089813035,
+      "grad_norm": 0.3890550136566162,
+      "learning_rate": 0.0004482552447552447,
+      "loss": 3.3383,
+      "step": 43500
+    },
+    {
+      "epoch": 12.682043217426758,
+      "grad_norm": 0.3480173349380493,
+      "learning_rate": 0.0004480804195804195,
+      "loss": 3.3468,
+      "step": 43550
+    },
+    {
+      "epoch": 12.69660434504048,
+      "grad_norm": 0.35312148928642273,
+      "learning_rate": 0.00044790559440559437,
+      "loss": 3.3424,
+      "step": 43600
+    },
+    {
+      "epoch": 12.711165472654203,
+      "grad_norm": 0.3422091007232666,
+      "learning_rate": 0.00044773076923076917,
+      "loss": 3.3358,
+      "step": 43650
+    },
+    {
+      "epoch": 12.725726600267924,
+      "grad_norm": 0.3429749011993408,
+      "learning_rate": 0.000447555944055944,
+      "loss": 3.3494,
+      "step": 43700
+    },
+    {
+      "epoch": 12.740287727881647,
+      "grad_norm": 0.3457909822463989,
+      "learning_rate": 0.0004473811188811189,
+      "loss": 3.3504,
+      "step": 43750
+    },
+    {
+      "epoch": 12.75484885549537,
+      "grad_norm": 0.3556594252586365,
+      "learning_rate": 0.0004472062937062937,
+      "loss": 3.327,
+      "step": 43800
+    },
+    {
+      "epoch": 12.769409983109092,
+      "grad_norm": 0.3568826913833618,
+      "learning_rate": 0.00044703146853146853,
+      "loss": 3.3423,
+      "step": 43850
+    },
+    {
+      "epoch": 12.783971110722815,
+      "grad_norm": 0.3365405797958374,
+      "learning_rate": 0.00044685664335664333,
+      "loss": 3.3448,
+      "step": 43900
+    },
+    {
+      "epoch": 12.798532238336536,
+      "grad_norm": 0.32082322239875793,
+      "learning_rate": 0.0004466818181818182,
+      "loss": 3.3455,
+      "step": 43950
+    },
+    {
+      "epoch": 12.813093365950259,
+      "grad_norm": 0.3770608603954315,
+      "learning_rate": 0.00044650699300699293,
+      "loss": 3.3512,
+      "step": 44000
+    },
+    {
+      "epoch": 12.813093365950259,
+      "eval_accuracy": 0.370579537889524,
+      "eval_loss": 3.5477168560028076,
+      "eval_runtime": 179.3881,
+      "eval_samples_per_second": 92.765,
+      "eval_steps_per_second": 5.803,
+      "step": 44000
+    },
+    {
+      "epoch": 12.827654493563982,
+      "grad_norm": 0.35191744565963745,
+      "learning_rate": 0.0004463321678321678,
+      "loss": 3.3434,
+      "step": 44050
+    },
+    {
+      "epoch": 12.842215621177704,
+      "grad_norm": 0.35282081365585327,
+      "learning_rate": 0.0004461573426573426,
+      "loss": 3.363,
+      "step": 44100
+    },
+    {
+      "epoch": 12.856776748791427,
+      "grad_norm": 0.3475187420845032,
+      "learning_rate": 0.00044598251748251744,
+      "loss": 3.3419,
+      "step": 44150
+    },
+    {
+      "epoch": 12.871337876405148,
+      "grad_norm": 0.37921270728111267,
+      "learning_rate": 0.00044580769230769224,
+      "loss": 3.3643,
+      "step": 44200
+    },
+    {
+      "epoch": 12.88589900401887,
+      "grad_norm": 0.35705578327178955,
+      "learning_rate": 0.0004456328671328671,
+      "loss": 3.3566,
+      "step": 44250
+    },
+    {
+      "epoch": 12.900460131632594,
+      "grad_norm": 0.3738381266593933,
+      "learning_rate": 0.0004454580419580419,
+      "loss": 3.3578,
+      "step": 44300
+    },
+    {
+      "epoch": 12.915021259246316,
+      "grad_norm": 0.37262797355651855,
+      "learning_rate": 0.00044528321678321674,
+      "loss": 3.3576,
+      "step": 44350
+    },
+    {
+      "epoch": 12.929582386860039,
+      "grad_norm": 0.36386188864707947,
+      "learning_rate": 0.0004451083916083916,
+      "loss": 3.3621,
+      "step": 44400
+    },
+    {
+      "epoch": 12.944143514473762,
+      "grad_norm": 0.35035935044288635,
+      "learning_rate": 0.0004449335664335664,
+      "loss": 3.3512,
+      "step": 44450
+    },
+    {
+      "epoch": 12.958704642087483,
+      "grad_norm": 0.34800392389297485,
+      "learning_rate": 0.00044475874125874125,
+      "loss": 3.3548,
+      "step": 44500
+    },
+    {
+      "epoch": 12.973265769701205,
+      "grad_norm": 0.3844282031059265,
+      "learning_rate": 0.00044458391608391605,
+      "loss": 3.3601,
+      "step": 44550
+    },
+    {
+      "epoch": 12.987826897314928,
+      "grad_norm": 0.3487918972969055,
+      "learning_rate": 0.0004444090909090909,
+      "loss": 3.3575,
+      "step": 44600
+    },
+    {
+      "epoch": 13.002329780418195,
+      "grad_norm": 0.3930342197418213,
+      "learning_rate": 0.0004442342657342657,
+      "loss": 3.3273,
+      "step": 44650
+    },
+    {
+      "epoch": 13.016890908031918,
+      "grad_norm": 0.3483104109764099,
+      "learning_rate": 0.00044405944055944056,
+      "loss": 3.2386,
+      "step": 44700
+    },
+    {
+      "epoch": 13.031452035645641,
+      "grad_norm": 0.40511345863342285,
+      "learning_rate": 0.0004438846153846153,
+      "loss": 3.2456,
+      "step": 44750
+    },
+    {
+      "epoch": 13.046013163259364,
+      "grad_norm": 0.34650883078575134,
+      "learning_rate": 0.00044370979020979016,
+      "loss": 3.2567,
+      "step": 44800
+    },
+    {
+      "epoch": 13.060574290873085,
+      "grad_norm": 0.34063106775283813,
+      "learning_rate": 0.00044353496503496496,
+      "loss": 3.264,
+      "step": 44850
+    },
+    {
+      "epoch": 13.075135418486807,
+      "grad_norm": 0.3684180974960327,
+      "learning_rate": 0.0004433601398601398,
+      "loss": 3.2597,
+      "step": 44900
+    },
+    {
+      "epoch": 13.08969654610053,
+      "grad_norm": 0.3664800822734833,
+      "learning_rate": 0.0004431853146853146,
+      "loss": 3.2519,
+      "step": 44950
+    },
+    {
+      "epoch": 13.104257673714253,
+      "grad_norm": 0.326003760099411,
+      "learning_rate": 0.00044301048951048946,
+      "loss": 3.2655,
+      "step": 45000
+    },
+    {
+      "epoch": 13.104257673714253,
+      "eval_accuracy": 0.37012114115620637,
+      "eval_loss": 3.5636181831359863,
+      "eval_runtime": 179.2598,
+      "eval_samples_per_second": 92.832,
+      "eval_steps_per_second": 5.807,
+      "step": 45000
+    },
+    {
+      "epoch": 13.118818801327976,
+      "grad_norm": 0.34620916843414307,
+      "learning_rate": 0.00044283566433566426,
+      "loss": 3.2733,
+      "step": 45050
+    },
+    {
+      "epoch": 13.133379928941697,
+      "grad_norm": 0.3420342803001404,
+      "learning_rate": 0.0004426608391608391,
+      "loss": 3.2653,
+      "step": 45100
+    },
+    {
+      "epoch": 13.14794105655542,
+      "grad_norm": 0.37536609172821045,
+      "learning_rate": 0.00044248601398601397,
+      "loss": 3.2691,
+      "step": 45150
+    },
+    {
+      "epoch": 13.162502184169142,
+      "grad_norm": 0.3489169478416443,
+      "learning_rate": 0.00044231118881118877,
+      "loss": 3.2838,
+      "step": 45200
+    },
+    {
+      "epoch": 13.177063311782865,
+      "grad_norm": 0.3672625720500946,
+      "learning_rate": 0.0004421363636363636,
+      "loss": 3.271,
+      "step": 45250
+    },
+    {
+      "epoch": 13.191624439396588,
+      "grad_norm": 0.33298152685165405,
+      "learning_rate": 0.0004419615384615384,
+      "loss": 3.2791,
+      "step": 45300
+    },
+    {
+      "epoch": 13.206185567010309,
+      "grad_norm": 0.35840123891830444,
+      "learning_rate": 0.0004417867132867133,
+      "loss": 3.2835,
+      "step": 45350
+    },
+    {
+      "epoch": 13.220746694624031,
+      "grad_norm": 0.34123364090919495,
+      "learning_rate": 0.0004416118881118881,
+      "loss": 3.2848,
+      "step": 45400
+    },
+    {
+      "epoch": 13.235307822237754,
+      "grad_norm": 0.35084405541419983,
+      "learning_rate": 0.00044143706293706293,
+      "loss": 3.2859,
+      "step": 45450
+    },
+    {
+      "epoch": 13.249868949851477,
+      "grad_norm": 0.38360145688056946,
+      "learning_rate": 0.0004412622377622377,
+      "loss": 3.2838,
+      "step": 45500
+    },
+    {
+      "epoch": 13.2644300774652,
+      "grad_norm": 0.3318229019641876,
+      "learning_rate": 0.00044108741258741253,
+      "loss": 3.2892,
+      "step": 45550
+    },
+    {
+      "epoch": 13.27899120507892,
+      "grad_norm": 0.3659403622150421,
+      "learning_rate": 0.00044091258741258733,
+      "loss": 3.3001,
+      "step": 45600
+    },
+    {
+      "epoch": 13.293552332692643,
+      "grad_norm": 0.35524776577949524,
+      "learning_rate": 0.0004407377622377622,
+      "loss": 3.2927,
+      "step": 45650
+    },
+    {
+      "epoch": 13.308113460306366,
+      "grad_norm": 0.3889663517475128,
+      "learning_rate": 0.000440562937062937,
+      "loss": 3.2983,
+      "step": 45700
+    },
+    {
+      "epoch": 13.322674587920089,
+      "grad_norm": 0.34109199047088623,
+      "learning_rate": 0.00044038811188811184,
+      "loss": 3.2808,
+      "step": 45750
+    },
+    {
+      "epoch": 13.337235715533811,
+      "grad_norm": 0.3871344029903412,
+      "learning_rate": 0.0004402132867132867,
+      "loss": 3.3107,
+      "step": 45800
+    },
+    {
+      "epoch": 13.351796843147532,
+      "grad_norm": 0.3498539328575134,
+      "learning_rate": 0.0004400384615384615,
+      "loss": 3.2938,
+      "step": 45850
+    },
+    {
+      "epoch": 13.366357970761255,
+      "grad_norm": 0.3516894280910492,
+      "learning_rate": 0.00043986363636363635,
+      "loss": 3.305,
+      "step": 45900
+    },
+    {
+      "epoch": 13.380919098374978,
+      "grad_norm": 0.35592004656791687,
+      "learning_rate": 0.00043968881118881115,
+      "loss": 3.2998,
+      "step": 45950
+    },
+    {
+      "epoch": 13.3954802259887,
+      "grad_norm": 0.32546910643577576,
+      "learning_rate": 0.000439513986013986,
+      "loss": 3.3094,
+      "step": 46000
+    },
+    {
+      "epoch": 13.3954802259887,
+      "eval_accuracy": 0.37056460295234306,
+      "eval_loss": 3.5570590496063232,
+      "eval_runtime": 179.3634,
+      "eval_samples_per_second": 92.778,
+      "eval_steps_per_second": 5.804,
+      "step": 46000
+    },
+    {
+      "epoch": 13.410041353602423,
+      "grad_norm": 0.37634313106536865,
+      "learning_rate": 0.0004393391608391608,
+      "loss": 3.3115,
+      "step": 46050
+    },
+    {
+      "epoch": 13.424602481216146,
+      "grad_norm": 0.3550618886947632,
+      "learning_rate": 0.00043916433566433565,
+      "loss": 3.3024,
+      "step": 46100
+    },
+    {
+      "epoch": 13.439163608829867,
+      "grad_norm": 0.3748590052127838,
+      "learning_rate": 0.00043898951048951045,
+      "loss": 3.289,
+      "step": 46150
+    },
+    {
+      "epoch": 13.45372473644359,
+      "grad_norm": 0.35100436210632324,
+      "learning_rate": 0.0004388146853146853,
+      "loss": 3.3117,
+      "step": 46200
+    },
+    {
+      "epoch": 13.468285864057313,
+      "grad_norm": 0.36626309156417847,
+      "learning_rate": 0.00043863986013986005,
+      "loss": 3.3111,
+      "step": 46250
+    },
+    {
+      "epoch": 13.482846991671035,
+      "grad_norm": 0.3601759672164917,
+      "learning_rate": 0.0004384650349650349,
+      "loss": 3.3107,
+      "step": 46300
+    },
+    {
+      "epoch": 13.497408119284758,
+      "grad_norm": 0.33450236916542053,
+      "learning_rate": 0.0004382902097902097,
+      "loss": 3.3077,
+      "step": 46350
+    },
+    {
+      "epoch": 13.51196924689848,
+      "grad_norm": 0.34406334161758423,
+      "learning_rate": 0.00043811538461538456,
+      "loss": 3.3194,
+      "step": 46400
+    },
+    {
+      "epoch": 13.526530374512202,
+      "grad_norm": 0.34490951895713806,
+      "learning_rate": 0.0004379405594405594,
+      "loss": 3.3069,
+      "step": 46450
+    },
+    {
+      "epoch": 13.541091502125925,
+      "grad_norm": 0.3848843574523926,
+      "learning_rate": 0.0004377657342657342,
+      "loss": 3.3134,
+      "step": 46500
+    },
+    {
+      "epoch": 13.555652629739647,
+      "grad_norm": 0.3632429838180542,
+      "learning_rate": 0.00043759090909090907,
+      "loss": 3.3208,
+      "step": 46550
+    },
+    {
+      "epoch": 13.57021375735337,
+      "grad_norm": 0.3939577341079712,
+      "learning_rate": 0.00043741608391608387,
+      "loss": 3.324,
+      "step": 46600
+    },
+    {
+      "epoch": 13.584774884967091,
+      "grad_norm": 0.3596148192882538,
+      "learning_rate": 0.0004372412587412587,
+      "loss": 3.3322,
+      "step": 46650
+    },
+    {
+      "epoch": 13.599336012580814,
+      "grad_norm": 0.35220861434936523,
+      "learning_rate": 0.0004370664335664335,
+      "loss": 3.3069,
+      "step": 46700
+    },
+    {
+      "epoch": 13.613897140194537,
+      "grad_norm": 0.3723880350589752,
+      "learning_rate": 0.0004368916083916084,
+      "loss": 3.327,
+      "step": 46750
+    },
+    {
+      "epoch": 13.62845826780826,
+      "grad_norm": 0.3383399546146393,
+      "learning_rate": 0.0004367167832167832,
+      "loss": 3.3256,
+      "step": 46800
+    },
+    {
+      "epoch": 13.643019395421982,
+      "grad_norm": 0.3382317125797272,
+      "learning_rate": 0.00043654195804195803,
+      "loss": 3.3374,
+      "step": 46850
+    },
+    {
+      "epoch": 13.657580523035705,
+      "grad_norm": 0.34736913442611694,
+      "learning_rate": 0.00043636713286713283,
+      "loss": 3.329,
+      "step": 46900
+    },
+    {
+      "epoch": 13.672141650649426,
+      "grad_norm": 0.33893242478370667,
+      "learning_rate": 0.0004361923076923077,
+      "loss": 3.3123,
+      "step": 46950
+    },
+    {
+      "epoch": 13.686702778263149,
+      "grad_norm": 0.35469168424606323,
+      "learning_rate": 0.00043601748251748243,
+      "loss": 3.3239,
+      "step": 47000
+    },
+    {
+      "epoch": 13.686702778263149,
+      "eval_accuracy": 0.37072630010686125,
+      "eval_loss": 3.550806999206543,
+      "eval_runtime": 179.4918,
+      "eval_samples_per_second": 92.712,
+      "eval_steps_per_second": 5.8,
+      "step": 47000
+    },
+    {
+      "epoch": 13.701263905876871,
+      "grad_norm": 0.3704715073108673,
+      "learning_rate": 0.00043584265734265734,
+      "loss": 3.3243,
+      "step": 47050
+    },
+    {
+      "epoch": 13.715825033490594,
+      "grad_norm": 0.3501978814601898,
+      "learning_rate": 0.0004356678321678321,
+      "loss": 3.3377,
+      "step": 47100
+    },
+    {
+      "epoch": 13.730386161104317,
+      "grad_norm": 0.3527981638908386,
+      "learning_rate": 0.00043549300699300694,
+      "loss": 3.3309,
+      "step": 47150
+    },
+    {
+      "epoch": 13.744947288718038,
+      "grad_norm": 0.36017870903015137,
+      "learning_rate": 0.0004353181818181818,
+      "loss": 3.3214,
+      "step": 47200
+    },
+    {
+      "epoch": 13.75950841633176,
+      "grad_norm": 0.3922679126262665,
+      "learning_rate": 0.0004351433566433566,
+      "loss": 3.3286,
+      "step": 47250
+    },
+    {
+      "epoch": 13.774069543945483,
+      "grad_norm": 0.3513588011264801,
+      "learning_rate": 0.00043496853146853144,
+      "loss": 3.3451,
+      "step": 47300
+    },
+    {
+      "epoch": 13.788630671559206,
+      "grad_norm": 0.3663727045059204,
+      "learning_rate": 0.00043479370629370624,
+      "loss": 3.3265,
+      "step": 47350
+    },
+    {
+      "epoch": 13.803191799172929,
+      "grad_norm": 0.35833120346069336,
+      "learning_rate": 0.0004346188811188811,
+      "loss": 3.3246,
+      "step": 47400
+    },
+    {
+      "epoch": 13.81775292678665,
+      "grad_norm": 0.39127853512763977,
+      "learning_rate": 0.0004344440559440559,
+      "loss": 3.3396,
+      "step": 47450
+    },
+    {
+      "epoch": 13.832314054400372,
+      "grad_norm": 0.34158089756965637,
+      "learning_rate": 0.00043426923076923075,
+      "loss": 3.3327,
+      "step": 47500
+    },
+    {
+      "epoch": 13.846875182014095,
+      "grad_norm": 0.34215644001960754,
+      "learning_rate": 0.00043409440559440555,
+      "loss": 3.3317,
+      "step": 47550
+    },
+    {
+      "epoch": 13.861436309627818,
+      "grad_norm": 0.3922632038593292,
+      "learning_rate": 0.0004339195804195804,
+      "loss": 3.3378,
+      "step": 47600
+    },
+    {
+      "epoch": 13.87599743724154,
+      "grad_norm": 0.35244500637054443,
+      "learning_rate": 0.0004337447552447552,
+      "loss": 3.3386,
+      "step": 47650
+    },
+    {
+      "epoch": 13.890558564855262,
+      "grad_norm": 0.3905071020126343,
+      "learning_rate": 0.00043356993006993006,
+      "loss": 3.3239,
+      "step": 47700
+    },
+    {
+      "epoch": 13.905119692468984,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004333951048951048,
+      "loss": 3.3461,
+      "step": 47750
+    },
+    {
+      "epoch": 13.919680820082707,
+      "grad_norm": 0.3413317799568176,
+      "learning_rate": 0.0004332202797202797,
+      "loss": 3.3401,
+      "step": 47800
+    },
+    {
+      "epoch": 13.93424194769643,
+      "grad_norm": 0.3655047118663788,
+      "learning_rate": 0.00043304545454545456,
+      "loss": 3.3375,
+      "step": 47850
+    },
+    {
+      "epoch": 13.948803075310153,
+      "grad_norm": 0.35636481642723083,
+      "learning_rate": 0.0004328706293706293,
+      "loss": 3.3318,
+      "step": 47900
+    },
+    {
+      "epoch": 13.963364202923874,
+      "grad_norm": 0.35816138982772827,
+      "learning_rate": 0.00043269580419580416,
+      "loss": 3.3291,
+      "step": 47950
+    },
+    {
+      "epoch": 13.977925330537596,
+      "grad_norm": 0.3533070683479309,
+      "learning_rate": 0.00043252097902097896,
+      "loss": 3.3384,
+      "step": 48000
+    },
+    {
+      "epoch": 13.977925330537596,
+      "eval_accuracy": 0.3715532487545497,
+      "eval_loss": 3.543606996536255,
+      "eval_runtime": 179.3377,
+      "eval_samples_per_second": 92.791,
+      "eval_steps_per_second": 5.805,
+      "step": 48000
+    },
+    {
+      "epoch": 13.992486458151319,
+      "grad_norm": 0.3505331873893738,
+      "learning_rate": 0.0004323461538461538,
+      "loss": 3.3509,
+      "step": 48050
+    },
+    {
+      "epoch": 14.006989341254586,
+      "grad_norm": 0.33162781596183777,
+      "learning_rate": 0.0004321713286713286,
+      "loss": 3.2666,
+      "step": 48100
+    },
+    {
+      "epoch": 14.021550468868309,
+      "grad_norm": 0.3567025661468506,
+      "learning_rate": 0.00043199650349650347,
+      "loss": 3.2307,
+      "step": 48150
+    },
+    {
+      "epoch": 14.036111596482032,
+      "grad_norm": 0.39268216490745544,
+      "learning_rate": 0.00043182167832167827,
+      "loss": 3.236,
+      "step": 48200
+    },
+    {
+      "epoch": 14.050672724095755,
+      "grad_norm": 0.3755398392677307,
+      "learning_rate": 0.0004316468531468531,
+      "loss": 3.2447,
+      "step": 48250
+    },
+    {
+      "epoch": 14.065233851709475,
+      "grad_norm": 0.37311825156211853,
+      "learning_rate": 0.0004314720279720279,
+      "loss": 3.2517,
+      "step": 48300
+    },
+    {
+      "epoch": 14.079794979323198,
+      "grad_norm": 0.3706851601600647,
+      "learning_rate": 0.0004312972027972028,
+      "loss": 3.2327,
+      "step": 48350
+    },
+    {
+      "epoch": 14.094356106936921,
+      "grad_norm": 0.3393601179122925,
+      "learning_rate": 0.0004311223776223776,
+      "loss": 3.2466,
+      "step": 48400
+    },
+    {
+      "epoch": 14.108917234550644,
+      "grad_norm": 0.33590853214263916,
+      "learning_rate": 0.00043094755244755243,
+      "loss": 3.2537,
+      "step": 48450
+    },
+    {
+      "epoch": 14.123478362164366,
+      "grad_norm": 0.370746910572052,
+      "learning_rate": 0.0004307727272727272,
+      "loss": 3.2524,
+      "step": 48500
+    },
+    {
+      "epoch": 14.13803948977809,
+      "grad_norm": 0.37777411937713623,
+      "learning_rate": 0.0004305979020979021,
+      "loss": 3.2438,
+      "step": 48550
+    },
+    {
+      "epoch": 14.15260061739181,
+      "grad_norm": 0.3572547435760498,
+      "learning_rate": 0.00043042307692307694,
+      "loss": 3.2555,
+      "step": 48600
+    },
+    {
+      "epoch": 14.167161745005533,
+      "grad_norm": 0.39759549498558044,
+      "learning_rate": 0.0004302482517482517,
+      "loss": 3.2567,
+      "step": 48650
+    },
+    {
+      "epoch": 14.181722872619256,
+      "grad_norm": 0.37063178420066833,
+      "learning_rate": 0.00043007342657342654,
+      "loss": 3.2589,
+      "step": 48700
+    },
+    {
+      "epoch": 14.196284000232978,
+      "grad_norm": 0.365633487701416,
+      "learning_rate": 0.00042989860139860134,
+      "loss": 3.2801,
+      "step": 48750
+    },
+    {
+      "epoch": 14.210845127846701,
+      "grad_norm": 0.35019052028656006,
+      "learning_rate": 0.0004297237762237762,
+      "loss": 3.2774,
+      "step": 48800
+    },
+    {
+      "epoch": 14.225406255460422,
+      "grad_norm": 0.35043588280677795,
+      "learning_rate": 0.000429548951048951,
+      "loss": 3.2659,
+      "step": 48850
+    },
+    {
+      "epoch": 14.239967383074145,
+      "grad_norm": 0.3760192394256592,
+      "learning_rate": 0.00042937412587412585,
+      "loss": 3.2845,
+      "step": 48900
+    },
+    {
+      "epoch": 14.254528510687868,
+      "grad_norm": 0.35163772106170654,
+      "learning_rate": 0.00042919930069930065,
+      "loss": 3.2683,
+      "step": 48950
+    },
+    {
+      "epoch": 14.26908963830159,
+      "grad_norm": 0.3676861524581909,
+      "learning_rate": 0.0004290244755244755,
+      "loss": 3.2693,
+      "step": 49000
+    },
+    {
+      "epoch": 14.26908963830159,
+      "eval_accuracy": 0.37074393979644504,
+      "eval_loss": 3.561136245727539,
+      "eval_runtime": 179.3898,
+      "eval_samples_per_second": 92.764,
+      "eval_steps_per_second": 5.803,
+      "step": 49000
+    },
+    {
+      "epoch": 14.283650765915313,
+      "grad_norm": 0.3473881185054779,
+      "learning_rate": 0.0004288496503496503,
+      "loss": 3.2762,
+      "step": 49050
+    },
+    {
+      "epoch": 14.298211893529034,
+      "grad_norm": 0.3722080886363983,
+      "learning_rate": 0.00042867482517482515,
+      "loss": 3.2836,
+      "step": 49100
+    },
+    {
+      "epoch": 14.312773021142757,
+      "grad_norm": 0.35850104689598083,
+      "learning_rate": 0.00042849999999999995,
+      "loss": 3.285,
+      "step": 49150
+    },
+    {
+      "epoch": 14.32733414875648,
+      "grad_norm": 0.3875146210193634,
+      "learning_rate": 0.0004283251748251748,
+      "loss": 3.2743,
+      "step": 49200
+    },
+    {
+      "epoch": 14.341895276370202,
+      "grad_norm": 0.3412521779537201,
+      "learning_rate": 0.00042815034965034966,
+      "loss": 3.2891,
+      "step": 49250
+    },
+    {
+      "epoch": 14.356456403983925,
+      "grad_norm": 0.36994293332099915,
+      "learning_rate": 0.00042797552447552446,
+      "loss": 3.2841,
+      "step": 49300
+    },
+    {
+      "epoch": 14.371017531597648,
+      "grad_norm": 0.36702197790145874,
+      "learning_rate": 0.0004278006993006993,
+      "loss": 3.2903,
+      "step": 49350
+    },
+    {
+      "epoch": 14.385578659211369,
+      "grad_norm": 0.4124605357646942,
+      "learning_rate": 0.00042762587412587406,
+      "loss": 3.2874,
+      "step": 49400
+    },
+    {
+      "epoch": 14.400139786825092,
+      "grad_norm": 0.39137279987335205,
+      "learning_rate": 0.0004274510489510489,
+      "loss": 3.2784,
+      "step": 49450
+    },
+    {
+      "epoch": 14.414700914438814,
+      "grad_norm": 0.3900337219238281,
+      "learning_rate": 0.0004272762237762237,
+      "loss": 3.2899,
+      "step": 49500
+    },
+    {
+      "epoch": 14.429262042052537,
+      "grad_norm": 0.3700048625469208,
+      "learning_rate": 0.00042710139860139857,
+      "loss": 3.2876,
+      "step": 49550
+    },
+    {
+      "epoch": 14.44382316966626,
+      "grad_norm": 0.3632020056247711,
+      "learning_rate": 0.00042692657342657337,
+      "loss": 3.2918,
+      "step": 49600
+    },
+    {
+      "epoch": 14.45838429727998,
+      "grad_norm": 0.37253108620643616,
+      "learning_rate": 0.0004267517482517482,
+      "loss": 3.3046,
+      "step": 49650
+    },
+    {
+      "epoch": 14.472945424893704,
+      "grad_norm": 0.37529903650283813,
+      "learning_rate": 0.000426576923076923,
+      "loss": 3.2981,
+      "step": 49700
+    },
+    {
+      "epoch": 14.487506552507426,
+      "grad_norm": 0.3445450961589813,
+      "learning_rate": 0.0004264020979020979,
+      "loss": 3.2962,
+      "step": 49750
+    },
+    {
+      "epoch": 14.502067680121149,
+      "grad_norm": 0.37300825119018555,
+      "learning_rate": 0.0004262272727272727,
+      "loss": 3.2913,
+      "step": 49800
+    },
+    {
+      "epoch": 14.516628807734872,
+      "grad_norm": 0.3445644676685333,
+      "learning_rate": 0.00042605244755244753,
+      "loss": 3.3022,
+      "step": 49850
+    },
+    {
+      "epoch": 14.531189935348593,
+      "grad_norm": 0.4042005240917206,
+      "learning_rate": 0.00042587762237762233,
+      "loss": 3.2982,
+      "step": 49900
+    },
+    {
+      "epoch": 14.545751062962315,
+      "grad_norm": 0.3575955331325531,
+      "learning_rate": 0.0004257027972027972,
+      "loss": 3.305,
+      "step": 49950
+    },
+    {
+      "epoch": 14.560312190576038,
+      "grad_norm": 0.3821354806423187,
+      "learning_rate": 0.00042552797202797204,
+      "loss": 3.3071,
+      "step": 50000
+    },
+    {
+      "epoch": 14.560312190576038,
+      "eval_accuracy": 0.3708429572539754,
+      "eval_loss": 3.5507326126098633,
+      "eval_runtime": 179.5347,
+      "eval_samples_per_second": 92.69,
+      "eval_steps_per_second": 5.798,
+      "step": 50000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171700,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 2
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.045069359611904e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}