diff --git "a/cost_to_push_frequency_2128/checkpoint-110000/trainer_state.json" "b/cost_to_push_frequency_2128/checkpoint-110000/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/cost_to_push_frequency_2128/checkpoint-110000/trainer_state.json"
@@ -0,0 +1,16433 @@
+{
+  "best_global_step": 96000,
+  "best_metric": 3.529139995574951,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/cost_to_push_frequency_2128/checkpoint-40000",
+  "epoch": 32.03290814840701,
+  "eval_steps": 1000,
+  "global_step": 110000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.014561127613722406,
+      "grad_norm": 1.0827219486236572,
+      "learning_rate": 0.000294,
+      "loss": 8.4429,
+      "step": 50
+    },
+    {
+      "epoch": 0.029122255227444813,
+      "grad_norm": 0.9950307011604309,
+      "learning_rate": 0.0005939999999999999,
+      "loss": 6.7538,
+      "step": 100
+    },
+    {
+      "epoch": 0.04368338284116722,
+      "grad_norm": 0.40593206882476807,
+      "learning_rate": 0.0005998286713286713,
+      "loss": 6.3529,
+      "step": 150
+    },
+    {
+      "epoch": 0.058244510454889625,
+      "grad_norm": 0.5188880562782288,
+      "learning_rate": 0.0005996538461538461,
+      "loss": 6.1387,
+      "step": 200
+    },
+    {
+      "epoch": 0.07280563806861204,
+      "grad_norm": 0.5219882726669312,
+      "learning_rate": 0.0005994790209790209,
+      "loss": 5.9936,
+      "step": 250
+    },
+    {
+      "epoch": 0.08736676568233444,
+      "grad_norm": 0.4981943368911743,
+      "learning_rate": 0.0005993041958041958,
+      "loss": 5.8475,
+      "step": 300
+    },
+    {
+      "epoch": 0.10192789329605685,
+      "grad_norm": 0.419317364692688,
+      "learning_rate": 0.0005991293706293705,
+      "loss": 5.731,
+      "step": 350
+    },
+    {
+      "epoch": 0.11648902090977925,
+      "grad_norm": 0.4203638732433319,
+      "learning_rate": 0.0005989545454545454,
+      "loss": 5.6254,
+      "step": 400
+    },
+    {
+      "epoch": 0.13105014852350166,
+      "grad_norm": 0.5592066645622253,
+      "learning_rate": 0.0005987797202797202,
+      "loss": 5.5068,
+      "step": 450
+    },
+    {
+      "epoch": 0.14561127613722408,
+      "grad_norm": 0.465763658285141,
+      "learning_rate": 0.000598604895104895,
+      "loss": 5.4002,
+      "step": 500
+    },
+    {
+      "epoch": 0.16017240375094646,
+      "grad_norm": 0.4648076295852661,
+      "learning_rate": 0.0005984300699300698,
+      "loss": 5.3308,
+      "step": 550
+    },
+    {
+      "epoch": 0.17473353136466888,
+      "grad_norm": 0.49379315972328186,
+      "learning_rate": 0.0005982552447552447,
+      "loss": 5.2609,
+      "step": 600
+    },
+    {
+      "epoch": 0.1892946589783913,
+      "grad_norm": 0.4596584439277649,
+      "learning_rate": 0.0005980804195804195,
+      "loss": 5.1905,
+      "step": 650
+    },
+    {
+      "epoch": 0.2038557865921137,
+      "grad_norm": 0.40508726239204407,
+      "learning_rate": 0.0005979055944055943,
+      "loss": 5.1331,
+      "step": 700
+    },
+    {
+      "epoch": 0.2184169142058361,
+      "grad_norm": 0.3763967752456665,
+      "learning_rate": 0.0005977307692307691,
+      "loss": 5.0753,
+      "step": 750
+    },
+    {
+      "epoch": 0.2329780418195585,
+      "grad_norm": 0.4820829927921295,
+      "learning_rate": 0.000597555944055944,
+      "loss": 5.0201,
+      "step": 800
+    },
+    {
+      "epoch": 0.24753916943328091,
+      "grad_norm": 0.4278320074081421,
+      "learning_rate": 0.0005973811188811188,
+      "loss": 4.9577,
+      "step": 850
+    },
+    {
+      "epoch": 0.2621002970470033,
+      "grad_norm": 0.4293597340583801,
+      "learning_rate": 0.0005972062937062936,
+      "loss": 4.9213,
+      "step": 900
+    },
+    {
+      "epoch": 0.27666142466072574,
+      "grad_norm": 0.43348240852355957,
+      "learning_rate": 0.0005970314685314685,
+      "loss": 4.8786,
+      "step": 950
+    },
+    {
+      "epoch": 0.29122255227444815,
+      "grad_norm": 0.4520808160305023,
+      "learning_rate": 0.0005968566433566433,
+      "loss": 4.8181,
+      "step": 1000
+    },
+    {
+      "epoch": 0.29122255227444815,
+      "eval_accuracy": 0.2556966724844482,
+      "eval_loss": 4.745121955871582,
+      "eval_runtime": 179.4217,
+      "eval_samples_per_second": 92.776,
+      "eval_steps_per_second": 5.802,
+      "step": 1000
+    },
+    {
+      "epoch": 0.30578367988817057,
+      "grad_norm": 0.45122841000556946,
+      "learning_rate": 0.0005966818181818181,
+      "loss": 4.7877,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3203448075018929,
+      "grad_norm": 0.47879835963249207,
+      "learning_rate": 0.0005965069930069929,
+      "loss": 4.7428,
+      "step": 1100
+    },
+    {
+      "epoch": 0.33490593511561534,
+      "grad_norm": 0.4831642210483551,
+      "learning_rate": 0.0005963321678321677,
+      "loss": 4.6996,
+      "step": 1150
+    },
+    {
+      "epoch": 0.34946706272933775,
+      "grad_norm": 0.4561481177806854,
+      "learning_rate": 0.0005961573426573425,
+      "loss": 4.6659,
+      "step": 1200
+    },
+    {
+      "epoch": 0.36402819034306017,
+      "grad_norm": 0.4561339020729065,
+      "learning_rate": 0.0005959825174825174,
+      "loss": 4.6367,
+      "step": 1250
+    },
+    {
+      "epoch": 0.3785893179567826,
+      "grad_norm": 0.4436923861503601,
+      "learning_rate": 0.0005958076923076922,
+      "loss": 4.6064,
+      "step": 1300
+    },
+    {
+      "epoch": 0.393150445570505,
+      "grad_norm": 0.46087032556533813,
+      "learning_rate": 0.000595632867132867,
+      "loss": 4.5797,
+      "step": 1350
+    },
+    {
+      "epoch": 0.4077115731842274,
+      "grad_norm": 0.47251585125923157,
+      "learning_rate": 0.0005954580419580418,
+      "loss": 4.547,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4222727007979498,
+      "grad_norm": 0.43149644136428833,
+      "learning_rate": 0.0005952832167832168,
+      "loss": 4.5216,
+      "step": 1450
+    },
+    {
+      "epoch": 0.4368338284116722,
+      "grad_norm": 0.3600349724292755,
+      "learning_rate": 0.0005951083916083916,
+      "loss": 4.5129,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4513949560253946,
+      "grad_norm": 0.42545634508132935,
+      "learning_rate": 0.0005949335664335664,
+      "loss": 4.478,
+      "step": 1550
+    },
+    {
+      "epoch": 0.465956083639117,
+      "grad_norm": 0.4261489808559418,
+      "learning_rate": 0.0005947587412587413,
+      "loss": 4.466,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4805172112528394,
+      "grad_norm": 0.382684588432312,
+      "learning_rate": 0.0005945839160839161,
+      "loss": 4.4463,
+      "step": 1650
+    },
+    {
+      "epoch": 0.49507833886656183,
+      "grad_norm": 0.4798526465892792,
+      "learning_rate": 0.0005944090909090909,
+      "loss": 4.419,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5096394664802842,
+      "grad_norm": 0.4271828830242157,
+      "learning_rate": 0.0005942342657342657,
+      "loss": 4.4065,
+      "step": 1750
+    },
+    {
+      "epoch": 0.5242005940940067,
+      "grad_norm": 0.4648028016090393,
+      "learning_rate": 0.0005940594405594406,
+      "loss": 4.389,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5387617217077291,
+      "grad_norm": 0.46727654337882996,
+      "learning_rate": 0.0005938846153846153,
+      "loss": 4.3739,
+      "step": 1850
+    },
+    {
+      "epoch": 0.5533228493214515,
+      "grad_norm": 0.4359632432460785,
+      "learning_rate": 0.0005937097902097902,
+      "loss": 4.3727,
+      "step": 1900
+    },
+    {
+      "epoch": 0.5678839769351739,
+      "grad_norm": 0.39883190393447876,
+      "learning_rate": 0.000593534965034965,
+      "loss": 4.3559,
+      "step": 1950
+    },
+    {
+      "epoch": 0.5824451045488963,
+      "grad_norm": 0.4254516661167145,
+      "learning_rate": 0.0005933601398601398,
+      "loss": 4.3438,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5824451045488963,
+      "eval_accuracy": 0.29953294727340574,
+      "eval_loss": 4.282804489135742,
+      "eval_runtime": 179.6292,
+      "eval_samples_per_second": 92.669,
+      "eval_steps_per_second": 5.795,
+      "step": 2000
+    },
+    {
+      "epoch": 0.5970062321626187,
+      "grad_norm": 0.39681392908096313,
+      "learning_rate": 0.0005931853146853146,
+      "loss": 4.3252,
+      "step": 2050
+    },
+    {
+      "epoch": 0.6115673597763411,
+      "grad_norm": 0.36488792300224304,
+      "learning_rate": 0.0005930104895104895,
+      "loss": 4.3158,
+      "step": 2100
+    },
+    {
+      "epoch": 0.6261284873900634,
+      "grad_norm": 0.4375183582305908,
+      "learning_rate": 0.0005928356643356643,
+      "loss": 4.299,
+      "step": 2150
+    },
+    {
+      "epoch": 0.6406896150037859,
+      "grad_norm": 0.38287097215652466,
+      "learning_rate": 0.0005926608391608391,
+      "loss": 4.2941,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6552507426175083,
+      "grad_norm": 0.3945271968841553,
+      "learning_rate": 0.000592486013986014,
+      "loss": 4.2685,
+      "step": 2250
+    },
+    {
+      "epoch": 0.6698118702312307,
+      "grad_norm": 0.3807995617389679,
+      "learning_rate": 0.0005923111888111888,
+      "loss": 4.2773,
+      "step": 2300
+    },
+    {
+      "epoch": 0.6843729978449531,
+      "grad_norm": 0.3736141324043274,
+      "learning_rate": 0.0005921363636363636,
+      "loss": 4.2439,
+      "step": 2350
+    },
+    {
+      "epoch": 0.6989341254586755,
+      "grad_norm": 0.37925609946250916,
+      "learning_rate": 0.0005919615384615384,
+      "loss": 4.2377,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7134952530723979,
+      "grad_norm": 0.40228238701820374,
+      "learning_rate": 0.0005917867132867133,
+      "loss": 4.2397,
+      "step": 2450
+    },
+    {
+      "epoch": 0.7280563806861203,
+      "grad_norm": 0.3505542278289795,
+      "learning_rate": 0.0005916118881118881,
+      "loss": 4.2359,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7426175082998427,
+      "grad_norm": 0.40058302879333496,
+      "learning_rate": 0.0005914370629370629,
+      "loss": 4.2241,
+      "step": 2550
+    },
+    {
+      "epoch": 0.7571786359135652,
+      "grad_norm": 0.3788367509841919,
+      "learning_rate": 0.0005912622377622377,
+      "loss": 4.2107,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7717397635272876,
+      "grad_norm": 0.3747999668121338,
+      "learning_rate": 0.0005910874125874125,
+      "loss": 4.2,
+      "step": 2650
+    },
+    {
+      "epoch": 0.78630089114101,
+      "grad_norm": 0.40086600184440613,
+      "learning_rate": 0.0005909125874125873,
+      "loss": 4.1915,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8008620187547324,
+      "grad_norm": 0.36495792865753174,
+      "learning_rate": 0.0005907377622377622,
+      "loss": 4.1941,
+      "step": 2750
+    },
+    {
+      "epoch": 0.8154231463684548,
+      "grad_norm": 0.3766659200191498,
+      "learning_rate": 0.000590562937062937,
+      "loss": 4.1739,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8299842739821772,
+      "grad_norm": 0.3640320301055908,
+      "learning_rate": 0.0005903881118881118,
+      "loss": 4.1626,
+      "step": 2850
+    },
+    {
+      "epoch": 0.8445454015958996,
+      "grad_norm": 0.3703969717025757,
+      "learning_rate": 0.0005902132867132867,
+      "loss": 4.1557,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8591065292096219,
+      "grad_norm": 0.3352505564689636,
+      "learning_rate": 0.0005900384615384615,
+      "loss": 4.1426,
+      "step": 2950
+    },
+    {
+      "epoch": 0.8736676568233444,
+      "grad_norm": 0.3644249141216278,
+      "learning_rate": 0.0005898636363636363,
+      "loss": 4.1483,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8736676568233444,
+      "eval_accuracy": 0.31578797630784283,
+      "eval_loss": 4.0948615074157715,
+      "eval_runtime": 179.7252,
+      "eval_samples_per_second": 92.619,
+      "eval_steps_per_second": 5.792,
+      "step": 3000
+    },
+    {
+      "epoch": 0.8882287844370668,
+      "grad_norm": 0.3400101065635681,
+      "learning_rate": 0.0005896888111888111,
+      "loss": 4.1436,
+      "step": 3050
+    },
+    {
+      "epoch": 0.9027899120507892,
+      "grad_norm": 0.3571796417236328,
+      "learning_rate": 0.000589513986013986,
+      "loss": 4.1302,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9173510396645116,
+      "grad_norm": 0.34732452034950256,
+      "learning_rate": 0.0005893391608391608,
+      "loss": 4.1203,
+      "step": 3150
+    },
+    {
+      "epoch": 0.931912167278234,
+      "grad_norm": 0.36288565397262573,
+      "learning_rate": 0.0005891643356643356,
+      "loss": 4.1241,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9464732948919564,
+      "grad_norm": 0.34131136536598206,
+      "learning_rate": 0.0005889895104895104,
+      "loss": 4.1136,
+      "step": 3250
+    },
+    {
+      "epoch": 0.9610344225056788,
+      "grad_norm": 0.35798367857933044,
+      "learning_rate": 0.0005888146853146853,
+      "loss": 4.1029,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9755955501194012,
+      "grad_norm": 0.3709186613559723,
+      "learning_rate": 0.00058863986013986,
+      "loss": 4.0891,
+      "step": 3350
+    },
+    {
+      "epoch": 0.9901566777331237,
+      "grad_norm": 0.3378744423389435,
+      "learning_rate": 0.0005884650349650349,
+      "loss": 4.0959,
+      "step": 3400
+    },
+    {
+      "epoch": 1.004659560836391,
+      "grad_norm": 0.3469085097312927,
+      "learning_rate": 0.0005882902097902097,
+      "loss": 4.0733,
+      "step": 3450
+    },
+    {
+      "epoch": 1.0192206884501136,
+      "grad_norm": 0.3355250954627991,
+      "learning_rate": 0.0005881153846153845,
+      "loss": 4.0135,
+      "step": 3500
+    },
+    {
+      "epoch": 1.033781816063836,
+      "grad_norm": 0.34765860438346863,
+      "learning_rate": 0.0005879405594405594,
+      "loss": 4.0131,
+      "step": 3550
+    },
+    {
+      "epoch": 1.0483429436775584,
+      "grad_norm": 0.3484998941421509,
+      "learning_rate": 0.0005877657342657342,
+      "loss": 4.0352,
+      "step": 3600
+    },
+    {
+      "epoch": 1.0629040712912807,
+      "grad_norm": 0.34341979026794434,
+      "learning_rate": 0.000587590909090909,
+      "loss": 4.0047,
+      "step": 3650
+    },
+    {
+      "epoch": 1.0774651989050033,
+      "grad_norm": 0.36538752913475037,
+      "learning_rate": 0.0005874160839160838,
+      "loss": 4.0016,
+      "step": 3700
+    },
+    {
+      "epoch": 1.0920263265187256,
+      "grad_norm": 0.3458220064640045,
+      "learning_rate": 0.0005872412587412587,
+      "loss": 4.0163,
+      "step": 3750
+    },
+    {
+      "epoch": 1.106587454132448,
+      "grad_norm": 0.3493204414844513,
+      "learning_rate": 0.0005870664335664335,
+      "loss": 4.0035,
+      "step": 3800
+    },
+    {
+      "epoch": 1.1211485817461704,
+      "grad_norm": 0.3274590075016022,
+      "learning_rate": 0.0005868916083916083,
+      "loss": 4.0167,
+      "step": 3850
+    },
+    {
+      "epoch": 1.135709709359893,
+      "grad_norm": 0.3461831510066986,
+      "learning_rate": 0.0005867167832167831,
+      "loss": 3.99,
+      "step": 3900
+    },
+    {
+      "epoch": 1.1502708369736152,
+      "grad_norm": 0.3442121148109436,
+      "learning_rate": 0.000586541958041958,
+      "loss": 3.9825,
+      "step": 3950
+    },
+    {
+      "epoch": 1.1648319645873377,
+      "grad_norm": 0.3337996006011963,
+      "learning_rate": 0.0005863671328671328,
+      "loss": 3.9794,
+      "step": 4000
+    },
+    {
+      "epoch": 1.1648319645873377,
+      "eval_accuracy": 0.32491283320475906,
+      "eval_loss": 3.989677906036377,
+      "eval_runtime": 179.7843,
+      "eval_samples_per_second": 92.589,
+      "eval_steps_per_second": 5.79,
+      "step": 4000
+    },
+    {
+      "epoch": 1.17939309220106,
+      "grad_norm": 0.33036714792251587,
+      "learning_rate": 0.0005861923076923076,
+      "loss": 3.9821,
+      "step": 4050
+    },
+    {
+      "epoch": 1.1939542198147826,
+      "grad_norm": 0.33033114671707153,
+      "learning_rate": 0.0005860174825174824,
+      "loss": 3.9925,
+      "step": 4100
+    },
+    {
+      "epoch": 1.2085153474285049,
+      "grad_norm": 0.3445809781551361,
+      "learning_rate": 0.0005858426573426573,
+      "loss": 3.9873,
+      "step": 4150
+    },
+    {
+      "epoch": 1.2230764750422272,
+      "grad_norm": 0.32692384719848633,
+      "learning_rate": 0.000585667832167832,
+      "loss": 3.9814,
+      "step": 4200
+    },
+    {
+      "epoch": 1.2376376026559497,
+      "grad_norm": 0.3487424850463867,
+      "learning_rate": 0.000585493006993007,
+      "loss": 3.9712,
+      "step": 4250
+    },
+    {
+      "epoch": 1.2521987302696722,
+      "grad_norm": 0.345749169588089,
+      "learning_rate": 0.0005853181818181817,
+      "loss": 3.9784,
+      "step": 4300
+    },
+    {
+      "epoch": 1.2667598578833945,
+      "grad_norm": 0.36335498094558716,
+      "learning_rate": 0.0005851433566433565,
+      "loss": 3.9808,
+      "step": 4350
+    },
+    {
+      "epoch": 1.2813209854971168,
+      "grad_norm": 0.31872642040252686,
+      "learning_rate": 0.0005849685314685315,
+      "loss": 3.9746,
+      "step": 4400
+    },
+    {
+      "epoch": 1.2958821131108393,
+      "grad_norm": 0.357146680355072,
+      "learning_rate": 0.0005847937062937063,
+      "loss": 3.9645,
+      "step": 4450
+    },
+    {
+      "epoch": 1.3104432407245616,
+      "grad_norm": 0.325870543718338,
+      "learning_rate": 0.0005846188811188811,
+      "loss": 3.9639,
+      "step": 4500
+    },
+    {
+      "epoch": 1.3250043683382842,
+      "grad_norm": 0.3136429488658905,
+      "learning_rate": 0.0005844440559440559,
+      "loss": 3.9582,
+      "step": 4550
+    },
+    {
+      "epoch": 1.3395654959520065,
+      "grad_norm": 0.35432639718055725,
+      "learning_rate": 0.0005842692307692308,
+      "loss": 3.9456,
+      "step": 4600
+    },
+    {
+      "epoch": 1.354126623565729,
+      "grad_norm": 0.3514183759689331,
+      "learning_rate": 0.0005840944055944056,
+      "loss": 3.9475,
+      "step": 4650
+    },
+    {
+      "epoch": 1.3686877511794513,
+      "grad_norm": 0.33868497610092163,
+      "learning_rate": 0.0005839195804195804,
+      "loss": 3.9486,
+      "step": 4700
+    },
+    {
+      "epoch": 1.3832488787931738,
+      "grad_norm": 0.3391216993331909,
+      "learning_rate": 0.0005837447552447552,
+      "loss": 3.9525,
+      "step": 4750
+    },
+    {
+      "epoch": 1.3978100064068961,
+      "grad_norm": 0.34010815620422363,
+      "learning_rate": 0.0005835699300699301,
+      "loss": 3.947,
+      "step": 4800
+    },
+    {
+      "epoch": 1.4123711340206184,
+      "grad_norm": 0.3243875205516815,
+      "learning_rate": 0.0005833951048951048,
+      "loss": 3.9515,
+      "step": 4850
+    },
+    {
+      "epoch": 1.426932261634341,
+      "grad_norm": 0.35085731744766235,
+      "learning_rate": 0.0005832202797202797,
+      "loss": 3.9402,
+      "step": 4900
+    },
+    {
+      "epoch": 1.4414933892480635,
+      "grad_norm": 0.34375637769699097,
+      "learning_rate": 0.0005830454545454546,
+      "loss": 3.9424,
+      "step": 4950
+    },
+    {
+      "epoch": 1.4560545168617858,
+      "grad_norm": 0.3360918164253235,
+      "learning_rate": 0.0005828706293706293,
+      "loss": 3.946,
+      "step": 5000
+    },
+    {
+      "epoch": 1.4560545168617858,
+      "eval_accuracy": 0.3315629972163526,
+      "eval_loss": 3.9168527126312256,
+      "eval_runtime": 179.6234,
+      "eval_samples_per_second": 92.672,
+      "eval_steps_per_second": 5.795,
+      "step": 5000
+    },
+    {
+      "epoch": 1.470615644475508,
+      "grad_norm": 0.3384229838848114,
+      "learning_rate": 0.0005826958041958042,
+      "loss": 3.9282,
+      "step": 5050
+    },
+    {
+      "epoch": 1.4851767720892306,
+      "grad_norm": 0.3160015642642975,
+      "learning_rate": 0.000582520979020979,
+      "loss": 3.9223,
+      "step": 5100
+    },
+    {
+      "epoch": 1.4997378997029531,
+      "grad_norm": 0.31337279081344604,
+      "learning_rate": 0.0005823461538461538,
+      "loss": 3.9139,
+      "step": 5150
+    },
+    {
+      "epoch": 1.5142990273166754,
+      "grad_norm": 0.3430108428001404,
+      "learning_rate": 0.0005821713286713286,
+      "loss": 3.9192,
+      "step": 5200
+    },
+    {
+      "epoch": 1.5288601549303977,
+      "grad_norm": 0.32244783639907837,
+      "learning_rate": 0.0005819965034965035,
+      "loss": 3.9181,
+      "step": 5250
+    },
+    {
+      "epoch": 1.5434212825441203,
+      "grad_norm": 0.32754674553871155,
+      "learning_rate": 0.0005818216783216783,
+      "loss": 3.9076,
+      "step": 5300
+    },
+    {
+      "epoch": 1.5579824101578428,
+      "grad_norm": 0.3257962167263031,
+      "learning_rate": 0.0005816468531468531,
+      "loss": 3.9091,
+      "step": 5350
+    },
+    {
+      "epoch": 1.572543537771565,
+      "grad_norm": 0.319021999835968,
+      "learning_rate": 0.0005814720279720279,
+      "loss": 3.8997,
+      "step": 5400
+    },
+    {
+      "epoch": 1.5871046653852874,
+      "grad_norm": 0.34583571553230286,
+      "learning_rate": 0.0005812972027972028,
+      "loss": 3.9082,
+      "step": 5450
+    },
+    {
+      "epoch": 1.6016657929990097,
+      "grad_norm": 0.31768912076950073,
+      "learning_rate": 0.0005811223776223776,
+      "loss": 3.9119,
+      "step": 5500
+    },
+    {
+      "epoch": 1.6162269206127322,
+      "grad_norm": 0.30981358885765076,
+      "learning_rate": 0.0005809475524475524,
+      "loss": 3.8991,
+      "step": 5550
+    },
+    {
+      "epoch": 1.6307880482264547,
+      "grad_norm": 0.3583605885505676,
+      "learning_rate": 0.0005807727272727272,
+      "loss": 3.8898,
+      "step": 5600
+    },
+    {
+      "epoch": 1.645349175840177,
+      "grad_norm": 0.35432425141334534,
+      "learning_rate": 0.0005805979020979021,
+      "loss": 3.9067,
+      "step": 5650
+    },
+    {
+      "epoch": 1.6599103034538993,
+      "grad_norm": 0.32656440138816833,
+      "learning_rate": 0.0005804230769230769,
+      "loss": 3.8878,
+      "step": 5700
+    },
+    {
+      "epoch": 1.6744714310676219,
+      "grad_norm": 0.32895249128341675,
+      "learning_rate": 0.0005802482517482517,
+      "loss": 3.8858,
+      "step": 5750
+    },
+    {
+      "epoch": 1.6890325586813444,
+      "grad_norm": 0.3573879897594452,
+      "learning_rate": 0.0005800734265734265,
+      "loss": 3.8995,
+      "step": 5800
+    },
+    {
+      "epoch": 1.7035936862950667,
+      "grad_norm": 0.3116515278816223,
+      "learning_rate": 0.0005798986013986013,
+      "loss": 3.8855,
+      "step": 5850
+    },
+    {
+      "epoch": 1.718154813908789,
+      "grad_norm": 0.32921165227890015,
+      "learning_rate": 0.0005797237762237762,
+      "loss": 3.8858,
+      "step": 5900
+    },
+    {
+      "epoch": 1.7327159415225115,
+      "grad_norm": 0.32322996854782104,
+      "learning_rate": 0.000579548951048951,
+      "loss": 3.8747,
+      "step": 5950
+    },
+    {
+      "epoch": 1.747277069136234,
+      "grad_norm": 0.3198484778404236,
+      "learning_rate": 0.0005793741258741258,
+      "loss": 3.8796,
+      "step": 6000
+    },
+    {
+      "epoch": 1.747277069136234,
+      "eval_accuracy": 0.33665428105410394,
+      "eval_loss": 3.859868049621582,
+      "eval_runtime": 179.7598,
+      "eval_samples_per_second": 92.601,
+      "eval_steps_per_second": 5.791,
+      "step": 6000
+    },
+    {
+      "epoch": 1.7618381967499563,
+      "grad_norm": 0.32858818769454956,
+      "learning_rate": 0.0005791993006993006,
+      "loss": 3.8737,
+      "step": 6050
+    },
+    {
+      "epoch": 1.7763993243636786,
+      "grad_norm": 0.31307506561279297,
+      "learning_rate": 0.0005790244755244755,
+      "loss": 3.8731,
+      "step": 6100
+    },
+    {
+      "epoch": 1.7909604519774012,
+      "grad_norm": 0.32378000020980835,
+      "learning_rate": 0.0005788496503496503,
+      "loss": 3.8751,
+      "step": 6150
+    },
+    {
+      "epoch": 1.8055215795911237,
+      "grad_norm": 0.3218482434749603,
+      "learning_rate": 0.0005786748251748251,
+      "loss": 3.8731,
+      "step": 6200
+    },
+    {
+      "epoch": 1.820082707204846,
+      "grad_norm": 0.3510587215423584,
+      "learning_rate": 0.0005784999999999999,
+      "loss": 3.8621,
+      "step": 6250
+    },
+    {
+      "epoch": 1.8346438348185683,
+      "grad_norm": 0.32646113634109497,
+      "learning_rate": 0.0005783251748251748,
+      "loss": 3.8652,
+      "step": 6300
+    },
+    {
+      "epoch": 1.8492049624322906,
+      "grad_norm": 0.34067031741142273,
+      "learning_rate": 0.0005781503496503496,
+      "loss": 3.8638,
+      "step": 6350
+    },
+    {
+      "epoch": 1.8637660900460131,
+      "grad_norm": 0.327680766582489,
+      "learning_rate": 0.0005779755244755244,
+      "loss": 3.8617,
+      "step": 6400
+    },
+    {
+      "epoch": 1.8783272176597356,
+      "grad_norm": 0.31625163555145264,
+      "learning_rate": 0.0005778006993006993,
+      "loss": 3.8561,
+      "step": 6450
+    },
+    {
+      "epoch": 1.892888345273458,
+      "grad_norm": 0.312741219997406,
+      "learning_rate": 0.000577625874125874,
+      "loss": 3.842,
+      "step": 6500
+    },
+    {
+      "epoch": 1.9074494728871803,
+      "grad_norm": 0.32632362842559814,
+      "learning_rate": 0.0005774510489510489,
+      "loss": 3.8528,
+      "step": 6550
+    },
+    {
+      "epoch": 1.9220106005009028,
+      "grad_norm": 0.32156306505203247,
+      "learning_rate": 0.0005772762237762237,
+      "loss": 3.8587,
+      "step": 6600
+    },
+    {
+      "epoch": 1.9365717281146253,
+      "grad_norm": 0.3177630305290222,
+      "learning_rate": 0.0005771013986013985,
+      "loss": 3.8592,
+      "step": 6650
+    },
+    {
+      "epoch": 1.9511328557283476,
+      "grad_norm": 0.3381432890892029,
+      "learning_rate": 0.0005769265734265733,
+      "loss": 3.8487,
+      "step": 6700
+    },
+    {
+      "epoch": 1.96569398334207,
+      "grad_norm": 0.31193795800209045,
+      "learning_rate": 0.0005767517482517482,
+      "loss": 3.8599,
+      "step": 6750
+    },
+    {
+      "epoch": 1.9802551109557924,
+      "grad_norm": 0.33586713671684265,
+      "learning_rate": 0.000576576923076923,
+      "loss": 3.8439,
+      "step": 6800
+    },
+    {
+      "epoch": 1.994816238569515,
+      "grad_norm": 0.3259575068950653,
+      "learning_rate": 0.0005764020979020978,
+      "loss": 3.8482,
+      "step": 6850
+    },
+    {
+      "epoch": 2.009319121672782,
+      "grad_norm": 0.3125501275062561,
+      "learning_rate": 0.0005762272727272726,
+      "loss": 3.7815,
+      "step": 6900
+    },
+    {
+      "epoch": 2.023880249286505,
+      "grad_norm": 0.3336809575557709,
+      "learning_rate": 0.0005760524475524475,
+      "loss": 3.7473,
+      "step": 6950
+    },
+    {
+      "epoch": 2.038441376900227,
+      "grad_norm": 0.3166639804840088,
+      "learning_rate": 0.0005758776223776223,
+      "loss": 3.7474,
+      "step": 7000
+    },
+    {
+      "epoch": 2.038441376900227,
+      "eval_accuracy": 0.34135384628406934,
+      "eval_loss": 3.8145618438720703,
+      "eval_runtime": 179.8334,
+      "eval_samples_per_second": 92.563,
+      "eval_steps_per_second": 5.789,
+      "step": 7000
+    },
+    {
+      "epoch": 2.0530025045139495,
+      "grad_norm": 0.3304164409637451,
+      "learning_rate": 0.0005757027972027971,
+      "loss": 3.7559,
+      "step": 7050
+    },
+    {
+      "epoch": 2.067563632127672,
+      "grad_norm": 0.35328182578086853,
+      "learning_rate": 0.000575527972027972,
+      "loss": 3.741,
+      "step": 7100
+    },
+    {
+      "epoch": 2.0821247597413945,
+      "grad_norm": 0.3486672043800354,
+      "learning_rate": 0.0005753531468531468,
+      "loss": 3.751,
+      "step": 7150
+    },
+    {
+      "epoch": 2.096685887355117,
+      "grad_norm": 0.32075631618499756,
+      "learning_rate": 0.0005751783216783216,
+      "loss": 3.7516,
+      "step": 7200
+    },
+    {
+      "epoch": 2.111247014968839,
+      "grad_norm": 0.3235573172569275,
+      "learning_rate": 0.0005750034965034964,
+      "loss": 3.7561,
+      "step": 7250
+    },
+    {
+      "epoch": 2.1258081425825615,
+      "grad_norm": 0.32960283756256104,
+      "learning_rate": 0.0005748286713286712,
+      "loss": 3.7471,
+      "step": 7300
+    },
+    {
+      "epoch": 2.140369270196284,
+      "grad_norm": 0.3249431848526001,
+      "learning_rate": 0.000574653846153846,
+      "loss": 3.7479,
+      "step": 7350
+    },
+    {
+      "epoch": 2.1549303978100065,
+      "grad_norm": 0.32068416476249695,
+      "learning_rate": 0.000574479020979021,
+      "loss": 3.7515,
+      "step": 7400
+    },
+    {
+      "epoch": 2.169491525423729,
+      "grad_norm": 0.35874906182289124,
+      "learning_rate": 0.0005743041958041958,
+      "loss": 3.7665,
+      "step": 7450
+    },
+    {
+      "epoch": 2.184052653037451,
+      "grad_norm": 0.34327706694602966,
+      "learning_rate": 0.0005741293706293706,
+      "loss": 3.7511,
+      "step": 7500
+    },
+    {
+      "epoch": 2.198613780651174,
+      "grad_norm": 0.3151525855064392,
+      "learning_rate": 0.0005739545454545454,
+      "loss": 3.7454,
+      "step": 7550
+    },
+    {
+      "epoch": 2.213174908264896,
+      "grad_norm": 0.3023368716239929,
+      "learning_rate": 0.0005737797202797203,
+      "loss": 3.7624,
+      "step": 7600
+    },
+    {
+      "epoch": 2.2277360358786185,
+      "grad_norm": 0.3228301703929901,
+      "learning_rate": 0.0005736048951048951,
+      "loss": 3.7529,
+      "step": 7650
+    },
+    {
+      "epoch": 2.2422971634923408,
+      "grad_norm": 0.33145347237586975,
+      "learning_rate": 0.0005734300699300699,
+      "loss": 3.76,
+      "step": 7700
+    },
+    {
+      "epoch": 2.256858291106063,
+      "grad_norm": 0.31790366768836975,
+      "learning_rate": 0.0005732552447552448,
+      "loss": 3.7657,
+      "step": 7750
+    },
+    {
+      "epoch": 2.271419418719786,
+      "grad_norm": 0.32009178400039673,
+      "learning_rate": 0.0005730804195804196,
+      "loss": 3.7592,
+      "step": 7800
+    },
+    {
+      "epoch": 2.285980546333508,
+      "grad_norm": 0.31966885924339294,
+      "learning_rate": 0.0005729055944055944,
+      "loss": 3.7606,
+      "step": 7850
+    },
+    {
+      "epoch": 2.3005416739472304,
+      "grad_norm": 0.3291054368019104,
+      "learning_rate": 0.0005727307692307692,
+      "loss": 3.7479,
+      "step": 7900
+    },
+    {
+      "epoch": 2.3151028015609527,
+      "grad_norm": 0.33194002509117126,
+      "learning_rate": 0.0005725559440559441,
+      "loss": 3.757,
+      "step": 7950
+    },
+    {
+      "epoch": 2.3296639291746755,
+      "grad_norm": 0.30678218603134155,
+      "learning_rate": 0.0005723811188811188,
+      "loss": 3.7545,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3296639291746755,
+      "eval_accuracy": 0.34440224469340025,
+      "eval_loss": 3.782811164855957,
+      "eval_runtime": 179.7459,
+      "eval_samples_per_second": 92.609,
+      "eval_steps_per_second": 5.792,
+      "step": 8000
+    },
+    {
+      "epoch": 2.3442250567883978,
+      "grad_norm": 0.31450313329696655,
+      "learning_rate": 0.0005722062937062937,
+      "loss": 3.7648,
+      "step": 8050
+    },
+    {
+      "epoch": 2.35878618440212,
+      "grad_norm": 0.3125315308570862,
+      "learning_rate": 0.0005720314685314685,
+      "loss": 3.7461,
+      "step": 8100
+    },
+    {
+      "epoch": 2.3733473120158424,
+      "grad_norm": 0.3463304936885834,
+      "learning_rate": 0.0005718566433566433,
+      "loss": 3.7542,
+      "step": 8150
+    },
+    {
+      "epoch": 2.387908439629565,
+      "grad_norm": 0.3375414311885834,
+      "learning_rate": 0.0005716818181818181,
+      "loss": 3.7424,
+      "step": 8200
+    },
+    {
+      "epoch": 2.4024695672432874,
+      "grad_norm": 0.3216915428638458,
+      "learning_rate": 0.000571506993006993,
+      "loss": 3.7559,
+      "step": 8250
+    },
+    {
+      "epoch": 2.4170306948570097,
+      "grad_norm": 0.37400275468826294,
+      "learning_rate": 0.0005713321678321678,
+      "loss": 3.7556,
+      "step": 8300
+    },
+    {
+      "epoch": 2.431591822470732,
+      "grad_norm": 0.3273051977157593,
+      "learning_rate": 0.0005711573426573426,
+      "loss": 3.7541,
+      "step": 8350
+    },
+    {
+      "epoch": 2.4461529500844543,
+      "grad_norm": 0.31118476390838623,
+      "learning_rate": 0.0005709825174825175,
+      "loss": 3.7479,
+      "step": 8400
+    },
+    {
+      "epoch": 2.460714077698177,
+      "grad_norm": 0.33436667919158936,
+      "learning_rate": 0.0005708076923076923,
+      "loss": 3.7398,
+      "step": 8450
+    },
+    {
+      "epoch": 2.4752752053118994,
+      "grad_norm": 0.32443201541900635,
+      "learning_rate": 0.0005706328671328671,
+      "loss": 3.7483,
+      "step": 8500
+    },
+    {
+      "epoch": 2.4898363329256217,
+      "grad_norm": 0.3430940806865692,
+      "learning_rate": 0.0005704580419580419,
+      "loss": 3.75,
+      "step": 8550
+    },
+    {
+      "epoch": 2.5043974605393444,
+      "grad_norm": 0.31686174869537354,
+      "learning_rate": 0.0005702832167832168,
+      "loss": 3.7418,
+      "step": 8600
+    },
+    {
+      "epoch": 2.5189585881530667,
+      "grad_norm": 0.3173408508300781,
+      "learning_rate": 0.0005701083916083916,
+      "loss": 3.7437,
+      "step": 8650
+    },
+    {
+      "epoch": 2.533519715766789,
+      "grad_norm": 0.3175743818283081,
+      "learning_rate": 0.0005699335664335664,
+      "loss": 3.7417,
+      "step": 8700
+    },
+    {
+      "epoch": 2.5480808433805113,
+      "grad_norm": 0.3153781592845917,
+      "learning_rate": 0.0005697587412587412,
+      "loss": 3.7459,
+      "step": 8750
+    },
+    {
+      "epoch": 2.5626419709942336,
+      "grad_norm": 0.3198295831680298,
+      "learning_rate": 0.000569583916083916,
+      "loss": 3.7524,
+      "step": 8800
+    },
+    {
+      "epoch": 2.5772030986079564,
+      "grad_norm": 0.31497374176979065,
+      "learning_rate": 0.0005694090909090908,
+      "loss": 3.7366,
+      "step": 8850
+    },
+    {
+      "epoch": 2.5917642262216787,
+      "grad_norm": 0.3190245032310486,
+      "learning_rate": 0.0005692342657342657,
+      "loss": 3.7408,
+      "step": 8900
+    },
+    {
+      "epoch": 2.606325353835401,
+      "grad_norm": 0.3084900975227356,
+      "learning_rate": 0.0005690594405594405,
+      "loss": 3.7355,
+      "step": 8950
+    },
+    {
+      "epoch": 2.6208864814491233,
+      "grad_norm": 0.3053756356239319,
+      "learning_rate": 0.0005688846153846153,
+      "loss": 3.7487,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6208864814491233,
+      "eval_accuracy": 0.34699638118781967,
+      "eval_loss": 3.7568321228027344,
+      "eval_runtime": 179.8658,
+      "eval_samples_per_second": 92.547,
+      "eval_steps_per_second": 5.788,
+      "step": 9000
+    },
+    {
+      "epoch": 2.6354476090628456,
+      "grad_norm": 0.3176893889904022,
+      "learning_rate": 0.0005687097902097901,
+      "loss": 3.7455,
+      "step": 9050
+    },
+    {
+      "epoch": 2.6500087366765683,
+      "grad_norm": 0.3208650052547455,
+      "learning_rate": 0.000568534965034965,
+      "loss": 3.7457,
+      "step": 9100
+    },
+    {
+      "epoch": 2.6645698642902906,
+      "grad_norm": 0.3182576596736908,
+      "learning_rate": 0.0005683601398601398,
+      "loss": 3.7312,
+      "step": 9150
+    },
+    {
+      "epoch": 2.679130991904013,
+      "grad_norm": 0.31629255414009094,
+      "learning_rate": 0.0005681853146853146,
+      "loss": 3.7155,
+      "step": 9200
+    },
+    {
+      "epoch": 2.6936921195177357,
+      "grad_norm": 0.33148428797721863,
+      "learning_rate": 0.0005680104895104895,
+      "loss": 3.7379,
+      "step": 9250
+    },
+    {
+      "epoch": 2.708253247131458,
+      "grad_norm": 0.3020288646221161,
+      "learning_rate": 0.0005678356643356643,
+      "loss": 3.7264,
+      "step": 9300
+    },
+    {
+      "epoch": 2.7228143747451803,
+      "grad_norm": 0.34346917271614075,
+      "learning_rate": 0.0005676608391608391,
+      "loss": 3.7374,
+      "step": 9350
+    },
+    {
+      "epoch": 2.7373755023589026,
+      "grad_norm": 0.31063133478164673,
+      "learning_rate": 0.0005674860139860139,
+      "loss": 3.7298,
+      "step": 9400
+    },
+    {
+      "epoch": 2.751936629972625,
+      "grad_norm": 0.31841859221458435,
+      "learning_rate": 0.0005673111888111888,
+      "loss": 3.7237,
+      "step": 9450
+    },
+    {
+      "epoch": 2.7664977575863476,
+      "grad_norm": 0.3212113082408905,
+      "learning_rate": 0.0005671363636363635,
+      "loss": 3.7389,
+      "step": 9500
+    },
+    {
+      "epoch": 2.78105888520007,
+      "grad_norm": 0.319784551858902,
+      "learning_rate": 0.0005669615384615384,
+      "loss": 3.7401,
+      "step": 9550
+    },
+    {
+      "epoch": 2.7956200128137922,
+      "grad_norm": 0.31253302097320557,
+      "learning_rate": 0.0005667867132867132,
+      "loss": 3.7299,
+      "step": 9600
+    },
+    {
+      "epoch": 2.8101811404275145,
+      "grad_norm": 0.3241884708404541,
+      "learning_rate": 0.000566611888111888,
+      "loss": 3.7281,
+      "step": 9650
+    },
+    {
+      "epoch": 2.824742268041237,
+      "grad_norm": 0.3327905833721161,
+      "learning_rate": 0.0005664370629370628,
+      "loss": 3.7403,
+      "step": 9700
+    },
+    {
+      "epoch": 2.8393033956549596,
+      "grad_norm": 0.33363252878189087,
+      "learning_rate": 0.0005662622377622377,
+      "loss": 3.7429,
+      "step": 9750
+    },
+    {
+      "epoch": 2.853864523268682,
+      "grad_norm": 0.3250058591365814,
+      "learning_rate": 0.0005660874125874125,
+      "loss": 3.7313,
+      "step": 9800
+    },
+    {
+      "epoch": 2.868425650882404,
+      "grad_norm": 0.3366358280181885,
+      "learning_rate": 0.0005659125874125873,
+      "loss": 3.732,
+      "step": 9850
+    },
+    {
+      "epoch": 2.882986778496127,
+      "grad_norm": 0.3395000100135803,
+      "learning_rate": 0.0005657377622377622,
+      "loss": 3.7283,
+      "step": 9900
+    },
+    {
+      "epoch": 2.8975479061098492,
+      "grad_norm": 0.30396348237991333,
+      "learning_rate": 0.000565562937062937,
+      "loss": 3.7282,
+      "step": 9950
+    },
+    {
+      "epoch": 2.9121090337235715,
+      "grad_norm": 0.310280442237854,
+      "learning_rate": 0.0005653881118881118,
+      "loss": 3.7222,
+      "step": 10000
+    },
+    {
+      "epoch": 2.9121090337235715,
+      "eval_accuracy": 0.3495220962447447,
+      "eval_loss": 3.729001045227051,
+      "eval_runtime": 179.9042,
+      "eval_samples_per_second": 92.527,
+      "eval_steps_per_second": 5.786,
+      "step": 10000
+    },
+    {
+      "epoch": 2.926670161337294,
+      "grad_norm": 0.3250355124473572,
+      "learning_rate": 0.0005652132867132866,
+      "loss": 3.7166,
+      "step": 10050
+    },
+    {
+      "epoch": 2.941231288951016,
+      "grad_norm": 0.30567246675491333,
+      "learning_rate": 0.0005650384615384615,
+      "loss": 3.7325,
+      "step": 10100
+    },
+    {
+      "epoch": 2.955792416564739,
+      "grad_norm": 0.34791237115859985,
+      "learning_rate": 0.0005648636363636363,
+      "loss": 3.7056,
+      "step": 10150
+    },
+    {
+      "epoch": 2.970353544178461,
+      "grad_norm": 0.31332409381866455,
+      "learning_rate": 0.0005646888111888111,
+      "loss": 3.7251,
+      "step": 10200
+    },
+    {
+      "epoch": 2.9849146717921835,
+      "grad_norm": 0.2971247136592865,
+      "learning_rate": 0.000564513986013986,
+      "loss": 3.7126,
+      "step": 10250
+    },
+    {
+      "epoch": 2.9994757994059063,
+      "grad_norm": 0.32203900814056396,
+      "learning_rate": 0.0005643391608391607,
+      "loss": 3.7198,
+      "step": 10300
+    },
+    {
+      "epoch": 3.0139786825091734,
+      "grad_norm": 0.3143203854560852,
+      "learning_rate": 0.0005641643356643355,
+      "loss": 3.6176,
+      "step": 10350
+    },
+    {
+      "epoch": 3.0285398101228957,
+      "grad_norm": 0.33899393677711487,
+      "learning_rate": 0.0005639895104895105,
+      "loss": 3.6184,
+      "step": 10400
+    },
+    {
+      "epoch": 3.0431009377366185,
+      "grad_norm": 0.33629149198532104,
+      "learning_rate": 0.0005638146853146853,
+      "loss": 3.6218,
+      "step": 10450
+    },
+    {
+      "epoch": 3.057662065350341,
+      "grad_norm": 0.33977800607681274,
+      "learning_rate": 0.0005636398601398601,
+      "loss": 3.6169,
+      "step": 10500
+    },
+    {
+      "epoch": 3.072223192964063,
+      "grad_norm": 0.3242505192756653,
+      "learning_rate": 0.000563465034965035,
+      "loss": 3.6248,
+      "step": 10550
+    },
+    {
+      "epoch": 3.0867843205777854,
+      "grad_norm": 0.33569052815437317,
+      "learning_rate": 0.0005632902097902098,
+      "loss": 3.6438,
+      "step": 10600
+    },
+    {
+      "epoch": 3.101345448191508,
+      "grad_norm": 0.3249237835407257,
+      "learning_rate": 0.0005631153846153846,
+      "loss": 3.6286,
+      "step": 10650
+    },
+    {
+      "epoch": 3.1159065758052304,
+      "grad_norm": 0.3126699924468994,
+      "learning_rate": 0.0005629405594405594,
+      "loss": 3.6282,
+      "step": 10700
+    },
+    {
+      "epoch": 3.1304677034189528,
+      "grad_norm": 0.3072546720504761,
+      "learning_rate": 0.0005627657342657343,
+      "loss": 3.6303,
+      "step": 10750
+    },
+    {
+      "epoch": 3.145028831032675,
+      "grad_norm": 0.30215486884117126,
+      "learning_rate": 0.0005625909090909091,
+      "loss": 3.6246,
+      "step": 10800
+    },
+    {
+      "epoch": 3.1595899586463974,
+      "grad_norm": 0.30103379487991333,
+      "learning_rate": 0.0005624160839160839,
+      "loss": 3.632,
+      "step": 10850
+    },
+    {
+      "epoch": 3.17415108626012,
+      "grad_norm": 0.40593844652175903,
+      "learning_rate": 0.0005622412587412587,
+      "loss": 3.6411,
+      "step": 10900
+    },
+    {
+      "epoch": 3.1887122138738424,
+      "grad_norm": 0.30845344066619873,
+      "learning_rate": 0.0005620664335664336,
+      "loss": 3.6405,
+      "step": 10950
+    },
+    {
+      "epoch": 3.2032733414875647,
+      "grad_norm": 0.31571993231773376,
+      "learning_rate": 0.0005618916083916083,
+      "loss": 3.6434,
+      "step": 11000
+    },
+    {
+      "epoch": 3.2032733414875647,
+      "eval_accuracy": 0.3514291968616427,
+      "eval_loss": 3.715721607208252,
+      "eval_runtime": 179.7371,
+      "eval_samples_per_second": 92.613,
+      "eval_steps_per_second": 5.792,
+      "step": 11000
+    },
+    {
+      "epoch": 3.217834469101287,
+      "grad_norm": 0.3318782448768616,
+      "learning_rate": 0.0005617167832167832,
+      "loss": 3.6317,
+      "step": 11050
+    },
+    {
+      "epoch": 3.2323955967150098,
+      "grad_norm": 0.33287033438682556,
+      "learning_rate": 0.000561541958041958,
+      "loss": 3.6503,
+      "step": 11100
+    },
+    {
+      "epoch": 3.246956724328732,
+      "grad_norm": 0.3447157144546509,
+      "learning_rate": 0.0005613671328671328,
+      "loss": 3.6464,
+      "step": 11150
+    },
+    {
+      "epoch": 3.2615178519424544,
+      "grad_norm": 0.31866371631622314,
+      "learning_rate": 0.0005611923076923077,
+      "loss": 3.6459,
+      "step": 11200
+    },
+    {
+      "epoch": 3.2760789795561767,
+      "grad_norm": 0.3190111517906189,
+      "learning_rate": 0.0005610174825174825,
+      "loss": 3.6375,
+      "step": 11250
+    },
+    {
+      "epoch": 3.2906401071698994,
+      "grad_norm": 0.3384534418582916,
+      "learning_rate": 0.0005608426573426573,
+      "loss": 3.6297,
+      "step": 11300
+    },
+    {
+      "epoch": 3.3052012347836217,
+      "grad_norm": 0.3122884929180145,
+      "learning_rate": 0.0005606678321678321,
+      "loss": 3.6488,
+      "step": 11350
+    },
+    {
+      "epoch": 3.319762362397344,
+      "grad_norm": 0.3280264139175415,
+      "learning_rate": 0.000560493006993007,
+      "loss": 3.6409,
+      "step": 11400
+    },
+    {
+      "epoch": 3.3343234900110663,
+      "grad_norm": 0.3291660249233246,
+      "learning_rate": 0.0005603181818181818,
+      "loss": 3.6371,
+      "step": 11450
+    },
+    {
+      "epoch": 3.3488846176247886,
+      "grad_norm": 0.3122524619102478,
+      "learning_rate": 0.0005601433566433566,
+      "loss": 3.6437,
+      "step": 11500
+    },
+    {
+      "epoch": 3.3634457452385114,
+      "grad_norm": 0.3195066452026367,
+      "learning_rate": 0.0005599685314685314,
+      "loss": 3.654,
+      "step": 11550
+    },
+    {
+      "epoch": 3.3780068728522337,
+      "grad_norm": 0.32396697998046875,
+      "learning_rate": 0.0005597937062937063,
+      "loss": 3.6451,
+      "step": 11600
+    },
+    {
+      "epoch": 3.392568000465956,
+      "grad_norm": 0.31407713890075684,
+      "learning_rate": 0.0005596188811188811,
+      "loss": 3.6336,
+      "step": 11650
+    },
+    {
+      "epoch": 3.4071291280796787,
+      "grad_norm": 0.31519898772239685,
+      "learning_rate": 0.0005594440559440559,
+      "loss": 3.6432,
+      "step": 11700
+    },
+    {
+      "epoch": 3.421690255693401,
+      "grad_norm": 0.33295854926109314,
+      "learning_rate": 0.0005592692307692307,
+      "loss": 3.6271,
+      "step": 11750
+    },
+    {
+      "epoch": 3.4362513833071233,
+      "grad_norm": 0.3175846338272095,
+      "learning_rate": 0.0005590944055944055,
+      "loss": 3.6419,
+      "step": 11800
+    },
+    {
+      "epoch": 3.4508125109208456,
+      "grad_norm": 0.3179056942462921,
+      "learning_rate": 0.0005589195804195803,
+      "loss": 3.649,
+      "step": 11850
+    },
+    {
+      "epoch": 3.465373638534568,
+      "grad_norm": 0.31343457102775574,
+      "learning_rate": 0.0005587447552447552,
+      "loss": 3.6439,
+      "step": 11900
+    },
+    {
+      "epoch": 3.4799347661482907,
+      "grad_norm": 0.3348383903503418,
+      "learning_rate": 0.00055856993006993,
+      "loss": 3.6389,
+      "step": 11950
+    },
+    {
+      "epoch": 3.494495893762013,
+      "grad_norm": 0.33012107014656067,
+      "learning_rate": 0.0005583951048951048,
+      "loss": 3.6493,
+      "step": 12000
+    },
+    {
+      "epoch": 3.494495893762013,
+      "eval_accuracy": 0.3528794491862669,
+      "eval_loss": 3.7000977993011475,
+      "eval_runtime": 179.7672,
+      "eval_samples_per_second": 92.598,
+      "eval_steps_per_second": 5.791,
+      "step": 12000
+    },
+    {
+      "epoch": 3.5090570213757353,
+      "grad_norm": 0.30933046340942383,
+      "learning_rate": 0.0005582202797202797,
+      "loss": 3.6413,
+      "step": 12050
+    },
+    {
+      "epoch": 3.523618148989458,
+      "grad_norm": 0.3057238757610321,
+      "learning_rate": 0.0005580454545454545,
+      "loss": 3.6378,
+      "step": 12100
+    },
+    {
+      "epoch": 3.53817927660318,
+      "grad_norm": 0.3380361497402191,
+      "learning_rate": 0.0005578706293706293,
+      "loss": 3.6462,
+      "step": 12150
+    },
+    {
+      "epoch": 3.5527404042169026,
+      "grad_norm": 0.32907187938690186,
+      "learning_rate": 0.0005576958041958041,
+      "loss": 3.6464,
+      "step": 12200
+    },
+    {
+      "epoch": 3.567301531830625,
+      "grad_norm": 0.3162597417831421,
+      "learning_rate": 0.000557520979020979,
+      "loss": 3.656,
+      "step": 12250
+    },
+    {
+      "epoch": 3.5818626594443472,
+      "grad_norm": 0.3106593191623688,
+      "learning_rate": 0.0005573461538461538,
+      "loss": 3.6515,
+      "step": 12300
+    },
+    {
+      "epoch": 3.59642378705807,
+      "grad_norm": 0.29408252239227295,
+      "learning_rate": 0.0005571713286713286,
+      "loss": 3.6489,
+      "step": 12350
+    },
+    {
+      "epoch": 3.6109849146717923,
+      "grad_norm": 0.3639216721057892,
+      "learning_rate": 0.0005569965034965034,
+      "loss": 3.6424,
+      "step": 12400
+    },
+    {
+      "epoch": 3.6255460422855146,
+      "grad_norm": 0.31863993406295776,
+      "learning_rate": 0.0005568216783216783,
+      "loss": 3.6446,
+      "step": 12450
+    },
+    {
+      "epoch": 3.640107169899237,
+      "grad_norm": 0.3066108822822571,
+      "learning_rate": 0.000556646853146853,
+      "loss": 3.6432,
+      "step": 12500
+    },
+    {
+      "epoch": 3.654668297512959,
+      "grad_norm": 0.30826711654663086,
+      "learning_rate": 0.0005564720279720279,
+      "loss": 3.6457,
+      "step": 12550
+    },
+    {
+      "epoch": 3.669229425126682,
+      "grad_norm": 0.3210170567035675,
+      "learning_rate": 0.0005562972027972027,
+      "loss": 3.6411,
+      "step": 12600
+    },
+    {
+      "epoch": 3.6837905527404042,
+      "grad_norm": 0.31402987241744995,
+      "learning_rate": 0.0005561223776223775,
+      "loss": 3.6542,
+      "step": 12650
+    },
+    {
+      "epoch": 3.6983516803541265,
+      "grad_norm": 0.33224406838417053,
+      "learning_rate": 0.0005559475524475524,
+      "loss": 3.6385,
+      "step": 12700
+    },
+    {
+      "epoch": 3.7129128079678493,
+      "grad_norm": 0.3081912398338318,
+      "learning_rate": 0.0005557727272727272,
+      "loss": 3.6361,
+      "step": 12750
+    },
+    {
+      "epoch": 3.7274739355815716,
+      "grad_norm": 0.31198635697364807,
+      "learning_rate": 0.000555597902097902,
+      "loss": 3.6456,
+      "step": 12800
+    },
+    {
+      "epoch": 3.742035063195294,
+      "grad_norm": 0.31249940395355225,
+      "learning_rate": 0.0005554230769230768,
+      "loss": 3.6301,
+      "step": 12850
+    },
+    {
+      "epoch": 3.756596190809016,
+      "grad_norm": 0.29419270157814026,
+      "learning_rate": 0.0005552482517482517,
+      "loss": 3.6319,
+      "step": 12900
+    },
+    {
+      "epoch": 3.7711573184227385,
+      "grad_norm": 0.3123679459095001,
+      "learning_rate": 0.0005550734265734265,
+      "loss": 3.6439,
+      "step": 12950
+    },
+    {
+      "epoch": 3.7857184460364612,
+      "grad_norm": 0.3085649013519287,
+      "learning_rate": 0.0005548986013986013,
+      "loss": 3.6429,
+      "step": 13000
+    },
+    {
+      "epoch": 3.7857184460364612,
+      "eval_accuracy": 0.3545970388800704,
+      "eval_loss": 3.6809747219085693,
+      "eval_runtime": 179.6902,
+      "eval_samples_per_second": 92.637,
+      "eval_steps_per_second": 5.793,
+      "step": 13000
+    },
+    {
+      "epoch": 3.8002795736501835,
+      "grad_norm": 0.3226883113384247,
+      "learning_rate": 0.0005547237762237761,
+      "loss": 3.644,
+      "step": 13050
+    },
+    {
+      "epoch": 3.814840701263906,
+      "grad_norm": 0.32543593645095825,
+      "learning_rate": 0.000554548951048951,
+      "loss": 3.6455,
+      "step": 13100
+    },
+    {
+      "epoch": 3.829401828877628,
+      "grad_norm": 0.313363254070282,
+      "learning_rate": 0.0005543741258741258,
+      "loss": 3.647,
+      "step": 13150
+    },
+    {
+      "epoch": 3.8439629564913504,
+      "grad_norm": 0.3085945248603821,
+      "learning_rate": 0.0005541993006993006,
+      "loss": 3.6409,
+      "step": 13200
+    },
+    {
+      "epoch": 3.858524084105073,
+      "grad_norm": 0.32422712445259094,
+      "learning_rate": 0.0005540244755244756,
+      "loss": 3.6415,
+      "step": 13250
+    },
+    {
+      "epoch": 3.8730852117187955,
+      "grad_norm": 0.31334224343299866,
+      "learning_rate": 0.0005538496503496502,
+      "loss": 3.6376,
+      "step": 13300
+    },
+    {
+      "epoch": 3.887646339332518,
+      "grad_norm": 0.3215864598751068,
+      "learning_rate": 0.0005536748251748252,
+      "loss": 3.6382,
+      "step": 13350
+    },
+    {
+      "epoch": 3.9022074669462405,
+      "grad_norm": 0.32258346676826477,
+      "learning_rate": 0.0005535,
+      "loss": 3.6334,
+      "step": 13400
+    },
+    {
+      "epoch": 3.916768594559963,
+      "grad_norm": 0.32085853815078735,
+      "learning_rate": 0.0005533251748251748,
+      "loss": 3.6264,
+      "step": 13450
+    },
+    {
+      "epoch": 3.931329722173685,
+      "grad_norm": 0.30639684200286865,
+      "learning_rate": 0.0005531503496503496,
+      "loss": 3.6552,
+      "step": 13500
+    },
+    {
+      "epoch": 3.9458908497874075,
+      "grad_norm": 0.31769323348999023,
+      "learning_rate": 0.0005529755244755245,
+      "loss": 3.6226,
+      "step": 13550
+    },
+    {
+      "epoch": 3.9604519774011298,
+      "grad_norm": 0.31194061040878296,
+      "learning_rate": 0.0005528006993006993,
+      "loss": 3.6418,
+      "step": 13600
+    },
+    {
+      "epoch": 3.9750131050148525,
+      "grad_norm": 0.326402485370636,
+      "learning_rate": 0.0005526258741258741,
+      "loss": 3.6404,
+      "step": 13650
+    },
+    {
+      "epoch": 3.989574232628575,
+      "grad_norm": 0.3246409595012665,
+      "learning_rate": 0.0005524510489510489,
+      "loss": 3.6294,
+      "step": 13700
+    },
+    {
+      "epoch": 4.004077115731842,
+      "grad_norm": 0.32423749566078186,
+      "learning_rate": 0.0005522762237762238,
+      "loss": 3.6146,
+      "step": 13750
+    },
+    {
+      "epoch": 4.018638243345564,
+      "grad_norm": 0.311954140663147,
+      "learning_rate": 0.0005521013986013986,
+      "loss": 3.5222,
+      "step": 13800
+    },
+    {
+      "epoch": 4.033199370959287,
+      "grad_norm": 0.31119635701179504,
+      "learning_rate": 0.0005519265734265734,
+      "loss": 3.5373,
+      "step": 13850
+    },
+    {
+      "epoch": 4.04776049857301,
+      "grad_norm": 0.31895068287849426,
+      "learning_rate": 0.0005517517482517482,
+      "loss": 3.5214,
+      "step": 13900
+    },
+    {
+      "epoch": 4.062321626186732,
+      "grad_norm": 0.34818094968795776,
+      "learning_rate": 0.0005515769230769231,
+      "loss": 3.5465,
+      "step": 13950
+    },
+    {
+      "epoch": 4.076882753800454,
+      "grad_norm": 0.33164742588996887,
+      "learning_rate": 0.0005514020979020979,
+      "loss": 3.5427,
+      "step": 14000
+    },
+    {
+      "epoch": 4.076882753800454,
+      "eval_accuracy": 0.3561751993215227,
+      "eval_loss": 3.6737630367279053,
+      "eval_runtime": 179.6974,
+      "eval_samples_per_second": 92.633,
+      "eval_steps_per_second": 5.793,
+      "step": 14000
+    },
+    {
+      "epoch": 4.091443881414177,
+      "grad_norm": 0.3311789333820343,
+      "learning_rate": 0.0005512272727272727,
+      "loss": 3.5457,
+      "step": 14050
+    },
+    {
+      "epoch": 4.106005009027899,
+      "grad_norm": 0.3225516378879547,
+      "learning_rate": 0.0005510524475524475,
+      "loss": 3.5393,
+      "step": 14100
+    },
+    {
+      "epoch": 4.120566136641622,
+      "grad_norm": 0.3110713064670563,
+      "learning_rate": 0.0005508776223776223,
+      "loss": 3.5559,
+      "step": 14150
+    },
+    {
+      "epoch": 4.135127264255344,
+      "grad_norm": 0.32352516055107117,
+      "learning_rate": 0.0005507027972027972,
+      "loss": 3.557,
+      "step": 14200
+    },
+    {
+      "epoch": 4.149688391869066,
+      "grad_norm": 0.32771018147468567,
+      "learning_rate": 0.000550527972027972,
+      "loss": 3.5614,
+      "step": 14250
+    },
+    {
+      "epoch": 4.164249519482789,
+      "grad_norm": 0.3170819580554962,
+      "learning_rate": 0.0005503531468531468,
+      "loss": 3.5519,
+      "step": 14300
+    },
+    {
+      "epoch": 4.178810647096511,
+      "grad_norm": 0.3334265947341919,
+      "learning_rate": 0.0005501783216783216,
+      "loss": 3.5502,
+      "step": 14350
+    },
+    {
+      "epoch": 4.193371774710234,
+      "grad_norm": 0.30677902698516846,
+      "learning_rate": 0.0005500034965034965,
+      "loss": 3.574,
+      "step": 14400
+    },
+    {
+      "epoch": 4.207932902323956,
+      "grad_norm": 0.33088985085487366,
+      "learning_rate": 0.0005498286713286713,
+      "loss": 3.5655,
+      "step": 14450
+    },
+    {
+      "epoch": 4.222494029937678,
+      "grad_norm": 0.31959256529808044,
+      "learning_rate": 0.0005496538461538461,
+      "loss": 3.5559,
+      "step": 14500
+    },
+    {
+      "epoch": 4.237055157551401,
+      "grad_norm": 0.31475120782852173,
+      "learning_rate": 0.0005494790209790209,
+      "loss": 3.559,
+      "step": 14550
+    },
+    {
+      "epoch": 4.251616285165123,
+      "grad_norm": 0.3372187912464142,
+      "learning_rate": 0.0005493041958041958,
+      "loss": 3.568,
+      "step": 14600
+    },
+    {
+      "epoch": 4.266177412778846,
+      "grad_norm": 0.3159469962120056,
+      "learning_rate": 0.0005491293706293706,
+      "loss": 3.5742,
+      "step": 14650
+    },
+    {
+      "epoch": 4.280738540392568,
+      "grad_norm": 0.34496167302131653,
+      "learning_rate": 0.0005489545454545454,
+      "loss": 3.569,
+      "step": 14700
+    },
+    {
+      "epoch": 4.29529966800629,
+      "grad_norm": 0.3201475441455841,
+      "learning_rate": 0.0005487797202797203,
+      "loss": 3.573,
+      "step": 14750
+    },
+    {
+      "epoch": 4.309860795620013,
+      "grad_norm": 0.3239315450191498,
+      "learning_rate": 0.000548604895104895,
+      "loss": 3.577,
+      "step": 14800
+    },
+    {
+      "epoch": 4.324421923233735,
+      "grad_norm": 0.30931442975997925,
+      "learning_rate": 0.0005484300699300699,
+      "loss": 3.5692,
+      "step": 14850
+    },
+    {
+      "epoch": 4.338983050847458,
+      "grad_norm": 0.3285701870918274,
+      "learning_rate": 0.0005482552447552447,
+      "loss": 3.566,
+      "step": 14900
+    },
+    {
+      "epoch": 4.35354417846118,
+      "grad_norm": 0.325842022895813,
+      "learning_rate": 0.0005480804195804195,
+      "loss": 3.5647,
+      "step": 14950
+    },
+    {
+      "epoch": 4.368105306074902,
+      "grad_norm": 0.3167710304260254,
+      "learning_rate": 0.0005479055944055943,
+      "loss": 3.5735,
+      "step": 15000
+    },
+    {
+      "epoch": 4.368105306074902,
+      "eval_accuracy": 0.3571037087946,
+      "eval_loss": 3.659233570098877,
+      "eval_runtime": 179.796,
+      "eval_samples_per_second": 92.583,
+      "eval_steps_per_second": 5.79,
+      "step": 15000
+    },
+    {
+      "epoch": 4.382666433688625,
+      "grad_norm": 0.3091343939304352,
+      "learning_rate": 0.0005477307692307692,
+      "loss": 3.5822,
+      "step": 15050
+    },
+    {
+      "epoch": 4.397227561302348,
+      "grad_norm": 0.33039334416389465,
+      "learning_rate": 0.000547555944055944,
+      "loss": 3.586,
+      "step": 15100
+    },
+    {
+      "epoch": 4.41178868891607,
+      "grad_norm": 0.30892929434776306,
+      "learning_rate": 0.0005473811188811188,
+      "loss": 3.5716,
+      "step": 15150
+    },
+    {
+      "epoch": 4.426349816529792,
+      "grad_norm": 0.3354114592075348,
+      "learning_rate": 0.0005472062937062936,
+      "loss": 3.5646,
+      "step": 15200
+    },
+    {
+      "epoch": 4.440910944143514,
+      "grad_norm": 0.3432832360267639,
+      "learning_rate": 0.0005470314685314685,
+      "loss": 3.5779,
+      "step": 15250
+    },
+    {
+      "epoch": 4.455472071757237,
+      "grad_norm": 0.3167623281478882,
+      "learning_rate": 0.0005468566433566433,
+      "loss": 3.5657,
+      "step": 15300
+    },
+    {
+      "epoch": 4.47003319937096,
+      "grad_norm": 0.3280886113643646,
+      "learning_rate": 0.0005466818181818181,
+      "loss": 3.5732,
+      "step": 15350
+    },
+    {
+      "epoch": 4.4845943269846815,
+      "grad_norm": 0.3291832208633423,
+      "learning_rate": 0.000546506993006993,
+      "loss": 3.5683,
+      "step": 15400
+    },
+    {
+      "epoch": 4.499155454598404,
+      "grad_norm": 0.31101885437965393,
+      "learning_rate": 0.0005463321678321678,
+      "loss": 3.5722,
+      "step": 15450
+    },
+    {
+      "epoch": 4.513716582212126,
+      "grad_norm": 0.3118363320827484,
+      "learning_rate": 0.0005461573426573426,
+      "loss": 3.5855,
+      "step": 15500
+    },
+    {
+      "epoch": 4.528277709825849,
+      "grad_norm": 0.31627270579338074,
+      "learning_rate": 0.0005459825174825174,
+      "loss": 3.5871,
+      "step": 15550
+    },
+    {
+      "epoch": 4.542838837439572,
+      "grad_norm": 0.32284530997276306,
+      "learning_rate": 0.0005458076923076922,
+      "loss": 3.5754,
+      "step": 15600
+    },
+    {
+      "epoch": 4.5573999650532935,
+      "grad_norm": 0.32503610849380493,
+      "learning_rate": 0.000545632867132867,
+      "loss": 3.569,
+      "step": 15650
+    },
+    {
+      "epoch": 4.571961092667016,
+      "grad_norm": 0.3345843553543091,
+      "learning_rate": 0.0005454580419580419,
+      "loss": 3.566,
+      "step": 15700
+    },
+    {
+      "epoch": 4.586522220280738,
+      "grad_norm": 0.31699925661087036,
+      "learning_rate": 0.0005452832167832167,
+      "loss": 3.5757,
+      "step": 15750
+    },
+    {
+      "epoch": 4.601083347894461,
+      "grad_norm": 0.3411146402359009,
+      "learning_rate": 0.0005451083916083915,
+      "loss": 3.5894,
+      "step": 15800
+    },
+    {
+      "epoch": 4.615644475508184,
+      "grad_norm": 0.31675615906715393,
+      "learning_rate": 0.0005449335664335663,
+      "loss": 3.5752,
+      "step": 15850
+    },
+    {
+      "epoch": 4.630205603121905,
+      "grad_norm": 0.3413219153881073,
+      "learning_rate": 0.0005447587412587412,
+      "loss": 3.5711,
+      "step": 15900
+    },
+    {
+      "epoch": 4.644766730735628,
+      "grad_norm": 0.3177620470523834,
+      "learning_rate": 0.000544583916083916,
+      "loss": 3.5798,
+      "step": 15950
+    },
+    {
+      "epoch": 4.659327858349351,
+      "grad_norm": 0.31724312901496887,
+      "learning_rate": 0.0005444090909090908,
+      "loss": 3.5796,
+      "step": 16000
+    },
+    {
+      "epoch": 4.659327858349351,
+      "eval_accuracy": 0.35869268499593115,
+      "eval_loss": 3.648486375808716,
+      "eval_runtime": 179.9066,
+      "eval_samples_per_second": 92.526,
+      "eval_steps_per_second": 5.786,
+      "step": 16000
+    },
+    {
+      "epoch": 4.673888985963073,
+      "grad_norm": 0.32944586873054504,
+      "learning_rate": 0.0005442342657342657,
+      "loss": 3.5742,
+      "step": 16050
+    },
+    {
+      "epoch": 4.6884501135767955,
+      "grad_norm": 0.320095956325531,
+      "learning_rate": 0.0005440594405594405,
+      "loss": 3.5843,
+      "step": 16100
+    },
+    {
+      "epoch": 4.703011241190518,
+      "grad_norm": 0.3284047245979309,
+      "learning_rate": 0.0005438846153846153,
+      "loss": 3.566,
+      "step": 16150
+    },
+    {
+      "epoch": 4.71757236880424,
+      "grad_norm": 0.338379830121994,
+      "learning_rate": 0.0005437097902097901,
+      "loss": 3.5667,
+      "step": 16200
+    },
+    {
+      "epoch": 4.732133496417963,
+      "grad_norm": 0.3109598159790039,
+      "learning_rate": 0.0005435349650349651,
+      "loss": 3.5742,
+      "step": 16250
+    },
+    {
+      "epoch": 4.746694624031685,
+      "grad_norm": 0.30519962310791016,
+      "learning_rate": 0.0005433601398601397,
+      "loss": 3.5789,
+      "step": 16300
+    },
+    {
+      "epoch": 4.7612557516454075,
+      "grad_norm": 0.3150230944156647,
+      "learning_rate": 0.0005431853146853147,
+      "loss": 3.5769,
+      "step": 16350
+    },
+    {
+      "epoch": 4.77581687925913,
+      "grad_norm": 0.29910922050476074,
+      "learning_rate": 0.0005430104895104895,
+      "loss": 3.5761,
+      "step": 16400
+    },
+    {
+      "epoch": 4.790378006872852,
+      "grad_norm": 0.3157634437084198,
+      "learning_rate": 0.0005428356643356643,
+      "loss": 3.5709,
+      "step": 16450
+    },
+    {
+      "epoch": 4.804939134486575,
+      "grad_norm": 0.3214448094367981,
+      "learning_rate": 0.0005426608391608391,
+      "loss": 3.5804,
+      "step": 16500
+    },
+    {
+      "epoch": 4.819500262100297,
+      "grad_norm": 0.31892773509025574,
+      "learning_rate": 0.000542486013986014,
+      "loss": 3.5899,
+      "step": 16550
+    },
+    {
+      "epoch": 4.834061389714019,
+      "grad_norm": 0.3179968595504761,
+      "learning_rate": 0.0005423111888111888,
+      "loss": 3.5709,
+      "step": 16600
+    },
+    {
+      "epoch": 4.848622517327742,
+      "grad_norm": 0.33231818675994873,
+      "learning_rate": 0.0005421363636363636,
+      "loss": 3.5737,
+      "step": 16650
+    },
+    {
+      "epoch": 4.863183644941464,
+      "grad_norm": 0.30390241742134094,
+      "learning_rate": 0.0005419615384615385,
+      "loss": 3.572,
+      "step": 16700
+    },
+    {
+      "epoch": 4.877744772555187,
+      "grad_norm": 0.3263714909553528,
+      "learning_rate": 0.0005417867132867133,
+      "loss": 3.5714,
+      "step": 16750
+    },
+    {
+      "epoch": 4.892305900168909,
+      "grad_norm": 0.31608420610427856,
+      "learning_rate": 0.0005416118881118881,
+      "loss": 3.573,
+      "step": 16800
+    },
+    {
+      "epoch": 4.906867027782631,
+      "grad_norm": 0.3054676353931427,
+      "learning_rate": 0.0005414370629370629,
+      "loss": 3.5793,
+      "step": 16850
+    },
+    {
+      "epoch": 4.921428155396354,
+      "grad_norm": 0.3099980354309082,
+      "learning_rate": 0.0005412622377622378,
+      "loss": 3.5697,
+      "step": 16900
+    },
+    {
+      "epoch": 4.935989283010076,
+      "grad_norm": 0.29981857538223267,
+      "learning_rate": 0.0005410874125874126,
+      "loss": 3.5735,
+      "step": 16950
+    },
+    {
+      "epoch": 4.950550410623799,
+      "grad_norm": 0.3208276033401489,
+      "learning_rate": 0.0005409125874125874,
+      "loss": 3.5819,
+      "step": 17000
+    },
+    {
+      "epoch": 4.950550410623799,
+      "eval_accuracy": 0.3599148658622406,
+      "eval_loss": 3.634756326675415,
+      "eval_runtime": 179.7751,
+      "eval_samples_per_second": 92.593,
+      "eval_steps_per_second": 5.791,
+      "step": 17000
+    },
+    {
+      "epoch": 4.9651115382375215,
+      "grad_norm": 0.310529500246048,
+      "learning_rate": 0.0005407377622377622,
+      "loss": 3.5832,
+      "step": 17050
+    },
+    {
+      "epoch": 4.979672665851243,
+      "grad_norm": 0.32999780774116516,
+      "learning_rate": 0.000540562937062937,
+      "loss": 3.5711,
+      "step": 17100
+    },
+    {
+      "epoch": 4.994233793464966,
+      "grad_norm": 0.3354627192020416,
+      "learning_rate": 0.0005403881118881118,
+      "loss": 3.5734,
+      "step": 17150
+    },
+    {
+      "epoch": 5.008736676568233,
+      "grad_norm": 0.35508137941360474,
+      "learning_rate": 0.0005402132867132867,
+      "loss": 3.5155,
+      "step": 17200
+    },
+    {
+      "epoch": 5.023297804181956,
+      "grad_norm": 0.31227484345436096,
+      "learning_rate": 0.0005400384615384615,
+      "loss": 3.4713,
+      "step": 17250
+    },
+    {
+      "epoch": 5.037858931795678,
+      "grad_norm": 0.31459367275238037,
+      "learning_rate": 0.0005398636363636363,
+      "loss": 3.48,
+      "step": 17300
+    },
+    {
+      "epoch": 5.052420059409401,
+      "grad_norm": 0.31045621633529663,
+      "learning_rate": 0.0005396888111888111,
+      "loss": 3.4772,
+      "step": 17350
+    },
+    {
+      "epoch": 5.066981187023123,
+      "grad_norm": 0.3227365016937256,
+      "learning_rate": 0.000539513986013986,
+      "loss": 3.4702,
+      "step": 17400
+    },
+    {
+      "epoch": 5.081542314636845,
+      "grad_norm": 0.30600887537002563,
+      "learning_rate": 0.0005393391608391608,
+      "loss": 3.4776,
+      "step": 17450
+    },
+    {
+      "epoch": 5.096103442250568,
+      "grad_norm": 0.3312874138355255,
+      "learning_rate": 0.0005391643356643356,
+      "loss": 3.4876,
+      "step": 17500
+    },
+    {
+      "epoch": 5.110664569864291,
+      "grad_norm": 0.3330562114715576,
+      "learning_rate": 0.0005389895104895105,
+      "loss": 3.4802,
+      "step": 17550
+    },
+    {
+      "epoch": 5.125225697478013,
+      "grad_norm": 0.32655513286590576,
+      "learning_rate": 0.0005388146853146853,
+      "loss": 3.4899,
+      "step": 17600
+    },
+    {
+      "epoch": 5.139786825091735,
+      "grad_norm": 0.34551799297332764,
+      "learning_rate": 0.0005386398601398601,
+      "loss": 3.493,
+      "step": 17650
+    },
+    {
+      "epoch": 5.154347952705457,
+      "grad_norm": 0.3142414093017578,
+      "learning_rate": 0.0005384650349650349,
+      "loss": 3.5019,
+      "step": 17700
+    },
+    {
+      "epoch": 5.16890908031918,
+      "grad_norm": 0.3235276937484741,
+      "learning_rate": 0.0005382902097902098,
+      "loss": 3.4889,
+      "step": 17750
+    },
+    {
+      "epoch": 5.183470207932903,
+      "grad_norm": 0.3249594569206238,
+      "learning_rate": 0.0005381153846153845,
+      "loss": 3.4947,
+      "step": 17800
+    },
+    {
+      "epoch": 5.1980313355466246,
+      "grad_norm": 0.32166171073913574,
+      "learning_rate": 0.0005379405594405594,
+      "loss": 3.5064,
+      "step": 17850
+    },
+    {
+      "epoch": 5.212592463160347,
+      "grad_norm": 0.3284703195095062,
+      "learning_rate": 0.0005377657342657342,
+      "loss": 3.5105,
+      "step": 17900
+    },
+    {
+      "epoch": 5.227153590774069,
+      "grad_norm": 0.32744383811950684,
+      "learning_rate": 0.000537590909090909,
+      "loss": 3.5143,
+      "step": 17950
+    },
+    {
+      "epoch": 5.241714718387792,
+      "grad_norm": 0.312739759683609,
+      "learning_rate": 0.0005374160839160838,
+      "loss": 3.5007,
+      "step": 18000
+    },
+    {
+      "epoch": 5.241714718387792,
+      "eval_accuracy": 0.36018232079402723,
+      "eval_loss": 3.6365652084350586,
+      "eval_runtime": 179.8304,
+      "eval_samples_per_second": 92.565,
+      "eval_steps_per_second": 5.789,
+      "step": 18000
+    },
+    {
+      "epoch": 5.256275846001515,
+      "grad_norm": 0.31837671995162964,
+      "learning_rate": 0.0005372412587412587,
+      "loss": 3.5128,
+      "step": 18050
+    },
+    {
+      "epoch": 5.2708369736152365,
+      "grad_norm": 0.33519458770751953,
+      "learning_rate": 0.0005370664335664335,
+      "loss": 3.5119,
+      "step": 18100
+    },
+    {
+      "epoch": 5.285398101228959,
+      "grad_norm": 0.34740373492240906,
+      "learning_rate": 0.0005368916083916083,
+      "loss": 3.5228,
+      "step": 18150
+    },
+    {
+      "epoch": 5.299959228842681,
+      "grad_norm": 0.34328994154930115,
+      "learning_rate": 0.0005367167832167832,
+      "loss": 3.5142,
+      "step": 18200
+    },
+    {
+      "epoch": 5.314520356456404,
+      "grad_norm": 0.3207642436027527,
+      "learning_rate": 0.000536541958041958,
+      "loss": 3.5114,
+      "step": 18250
+    },
+    {
+      "epoch": 5.329081484070127,
+      "grad_norm": 0.335101455450058,
+      "learning_rate": 0.0005363671328671328,
+      "loss": 3.5175,
+      "step": 18300
+    },
+    {
+      "epoch": 5.3436426116838485,
+      "grad_norm": 0.34362977743148804,
+      "learning_rate": 0.0005361923076923076,
+      "loss": 3.519,
+      "step": 18350
+    },
+    {
+      "epoch": 5.358203739297571,
+      "grad_norm": 0.3147866725921631,
+      "learning_rate": 0.0005360174825174825,
+      "loss": 3.5155,
+      "step": 18400
+    },
+    {
+      "epoch": 5.372764866911294,
+      "grad_norm": 0.33346375823020935,
+      "learning_rate": 0.0005358426573426573,
+      "loss": 3.5163,
+      "step": 18450
+    },
+    {
+      "epoch": 5.387325994525016,
+      "grad_norm": 0.3331373631954193,
+      "learning_rate": 0.0005356678321678321,
+      "loss": 3.5133,
+      "step": 18500
+    },
+    {
+      "epoch": 5.401887122138739,
+      "grad_norm": 0.3066289722919464,
+      "learning_rate": 0.0005354930069930069,
+      "loss": 3.5202,
+      "step": 18550
+    },
+    {
+      "epoch": 5.41644824975246,
+      "grad_norm": 0.32293954491615295,
+      "learning_rate": 0.0005353181818181817,
+      "loss": 3.5251,
+      "step": 18600
+    },
+    {
+      "epoch": 5.431009377366183,
+      "grad_norm": 0.33153200149536133,
+      "learning_rate": 0.0005351433566433565,
+      "loss": 3.5089,
+      "step": 18650
+    },
+    {
+      "epoch": 5.445570504979906,
+      "grad_norm": 0.32844340801239014,
+      "learning_rate": 0.0005349685314685314,
+      "loss": 3.5175,
+      "step": 18700
+    },
+    {
+      "epoch": 5.460131632593628,
+      "grad_norm": 0.33013710379600525,
+      "learning_rate": 0.0005347937062937062,
+      "loss": 3.5155,
+      "step": 18750
+    },
+    {
+      "epoch": 5.4746927602073505,
+      "grad_norm": 0.318752259016037,
+      "learning_rate": 0.000534618881118881,
+      "loss": 3.5226,
+      "step": 18800
+    },
+    {
+      "epoch": 5.489253887821073,
+      "grad_norm": 0.3632429242134094,
+      "learning_rate": 0.0005344440559440559,
+      "loss": 3.5256,
+      "step": 18850
+    },
+    {
+      "epoch": 5.503815015434795,
+      "grad_norm": 0.31200987100601196,
+      "learning_rate": 0.0005342692307692307,
+      "loss": 3.5277,
+      "step": 18900
+    },
+    {
+      "epoch": 5.518376143048518,
+      "grad_norm": 0.35066500306129456,
+      "learning_rate": 0.0005340944055944055,
+      "loss": 3.5224,
+      "step": 18950
+    },
+    {
+      "epoch": 5.53293727066224,
+      "grad_norm": 0.3067936301231384,
+      "learning_rate": 0.0005339195804195803,
+      "loss": 3.5156,
+      "step": 19000
+    },
+    {
+      "epoch": 5.53293727066224,
+      "eval_accuracy": 0.3610321808827682,
+      "eval_loss": 3.6285228729248047,
+      "eval_runtime": 180.0932,
+      "eval_samples_per_second": 92.43,
+      "eval_steps_per_second": 5.78,
+      "step": 19000
+    },
+    {
+      "epoch": 5.5474983982759625,
+      "grad_norm": 0.3141394853591919,
+      "learning_rate": 0.0005337447552447552,
+      "loss": 3.5173,
+      "step": 19050
+    },
+    {
+      "epoch": 5.562059525889685,
+      "grad_norm": 0.334416925907135,
+      "learning_rate": 0.00053356993006993,
+      "loss": 3.5189,
+      "step": 19100
+    },
+    {
+      "epoch": 5.576620653503407,
+      "grad_norm": 0.3050374686717987,
+      "learning_rate": 0.0005333951048951048,
+      "loss": 3.5142,
+      "step": 19150
+    },
+    {
+      "epoch": 5.59118178111713,
+      "grad_norm": 0.33711856603622437,
+      "learning_rate": 0.0005332202797202796,
+      "loss": 3.5282,
+      "step": 19200
+    },
+    {
+      "epoch": 5.605742908730852,
+      "grad_norm": 0.34378382563591003,
+      "learning_rate": 0.0005330454545454546,
+      "loss": 3.5195,
+      "step": 19250
+    },
+    {
+      "epoch": 5.620304036344574,
+      "grad_norm": 0.3297707736492157,
+      "learning_rate": 0.0005328706293706292,
+      "loss": 3.532,
+      "step": 19300
+    },
+    {
+      "epoch": 5.634865163958297,
+      "grad_norm": 0.33016687631607056,
+      "learning_rate": 0.0005326958041958042,
+      "loss": 3.5425,
+      "step": 19350
+    },
+    {
+      "epoch": 5.649426291572019,
+      "grad_norm": 0.34170061349868774,
+      "learning_rate": 0.000532520979020979,
+      "loss": 3.5282,
+      "step": 19400
+    },
+    {
+      "epoch": 5.663987419185742,
+      "grad_norm": 0.3264179825782776,
+      "learning_rate": 0.0005323461538461538,
+      "loss": 3.5302,
+      "step": 19450
+    },
+    {
+      "epoch": 5.6785485467994645,
+      "grad_norm": 0.3002929091453552,
+      "learning_rate": 0.0005321713286713287,
+      "loss": 3.5267,
+      "step": 19500
+    },
+    {
+      "epoch": 5.693109674413186,
+      "grad_norm": 0.35670411586761475,
+      "learning_rate": 0.0005319965034965035,
+      "loss": 3.5173,
+      "step": 19550
+    },
+    {
+      "epoch": 5.707670802026909,
+      "grad_norm": 0.3164016902446747,
+      "learning_rate": 0.0005318216783216783,
+      "loss": 3.5437,
+      "step": 19600
+    },
+    {
+      "epoch": 5.722231929640631,
+      "grad_norm": 0.3452078700065613,
+      "learning_rate": 0.0005316468531468531,
+      "loss": 3.5239,
+      "step": 19650
+    },
+    {
+      "epoch": 5.736793057254354,
+      "grad_norm": 0.3179798424243927,
+      "learning_rate": 0.000531472027972028,
+      "loss": 3.5254,
+      "step": 19700
+    },
+    {
+      "epoch": 5.7513541848680765,
+      "grad_norm": 0.32574138045310974,
+      "learning_rate": 0.0005312972027972028,
+      "loss": 3.5335,
+      "step": 19750
+    },
+    {
+      "epoch": 5.765915312481798,
+      "grad_norm": 0.32392826676368713,
+      "learning_rate": 0.0005311223776223776,
+      "loss": 3.5268,
+      "step": 19800
+    },
+    {
+      "epoch": 5.780476440095521,
+      "grad_norm": 0.34594979882240295,
+      "learning_rate": 0.0005309475524475524,
+      "loss": 3.534,
+      "step": 19850
+    },
+    {
+      "epoch": 5.795037567709244,
+      "grad_norm": 0.31376367807388306,
+      "learning_rate": 0.0005307727272727273,
+      "loss": 3.5306,
+      "step": 19900
+    },
+    {
+      "epoch": 5.809598695322966,
+      "grad_norm": 0.32450011372566223,
+      "learning_rate": 0.0005305979020979021,
+      "loss": 3.5337,
+      "step": 19950
+    },
+    {
+      "epoch": 5.824159822936688,
+      "grad_norm": 0.30886128544807434,
+      "learning_rate": 0.0005304230769230769,
+      "loss": 3.5239,
+      "step": 20000
+    },
+    {
+      "epoch": 5.824159822936688,
+      "eval_accuracy": 0.3622778742705534,
+      "eval_loss": 3.6140716075897217,
+      "eval_runtime": 180.1478,
+      "eval_samples_per_second": 92.402,
+      "eval_steps_per_second": 5.779,
+      "step": 20000
+    },
+    {
+      "epoch": 5.83872095055041,
+      "grad_norm": 0.32404589653015137,
+      "learning_rate": 0.0005302482517482517,
+      "loss": 3.5409,
+      "step": 20050
+    },
+    {
+      "epoch": 5.853282078164133,
+      "grad_norm": 0.30877238512039185,
+      "learning_rate": 0.0005300734265734265,
+      "loss": 3.5373,
+      "step": 20100
+    },
+    {
+      "epoch": 5.867843205777856,
+      "grad_norm": 0.31356489658355713,
+      "learning_rate": 0.0005298986013986013,
+      "loss": 3.5219,
+      "step": 20150
+    },
+    {
+      "epoch": 5.882404333391578,
+      "grad_norm": 0.30876606702804565,
+      "learning_rate": 0.0005297237762237762,
+      "loss": 3.529,
+      "step": 20200
+    },
+    {
+      "epoch": 5.8969654610053,
+      "grad_norm": 0.3364260494709015,
+      "learning_rate": 0.000529548951048951,
+      "loss": 3.5252,
+      "step": 20250
+    },
+    {
+      "epoch": 5.911526588619022,
+      "grad_norm": 0.3011105954647064,
+      "learning_rate": 0.0005293741258741258,
+      "loss": 3.5244,
+      "step": 20300
+    },
+    {
+      "epoch": 5.926087716232745,
+      "grad_norm": 0.31753775477409363,
+      "learning_rate": 0.0005291993006993007,
+      "loss": 3.5309,
+      "step": 20350
+    },
+    {
+      "epoch": 5.940648843846468,
+      "grad_norm": 0.3421807289123535,
+      "learning_rate": 0.0005290244755244755,
+      "loss": 3.537,
+      "step": 20400
+    },
+    {
+      "epoch": 5.95520997146019,
+      "grad_norm": 0.3219417631626129,
+      "learning_rate": 0.0005288496503496503,
+      "loss": 3.5311,
+      "step": 20450
+    },
+    {
+      "epoch": 5.969771099073912,
+      "grad_norm": 0.3096925616264343,
+      "learning_rate": 0.0005286748251748251,
+      "loss": 3.5334,
+      "step": 20500
+    },
+    {
+      "epoch": 5.984332226687634,
+      "grad_norm": 0.3308550715446472,
+      "learning_rate": 0.0005285,
+      "loss": 3.5199,
+      "step": 20550
+    },
+    {
+      "epoch": 5.998893354301357,
+      "grad_norm": 0.31948336958885193,
+      "learning_rate": 0.0005283251748251748,
+      "loss": 3.5393,
+      "step": 20600
+    },
+    {
+      "epoch": 6.013396237404625,
+      "grad_norm": 0.31365492939949036,
+      "learning_rate": 0.0005281503496503496,
+      "loss": 3.432,
+      "step": 20650
+    },
+    {
+      "epoch": 6.027957365018347,
+      "grad_norm": 0.32687506079673767,
+      "learning_rate": 0.0005279755244755244,
+      "loss": 3.4276,
+      "step": 20700
+    },
+    {
+      "epoch": 6.04251849263207,
+      "grad_norm": 0.32380980253219604,
+      "learning_rate": 0.0005278006993006993,
+      "loss": 3.4312,
+      "step": 20750
+    },
+    {
+      "epoch": 6.0570796202457915,
+      "grad_norm": 0.3151368498802185,
+      "learning_rate": 0.000527625874125874,
+      "loss": 3.4158,
+      "step": 20800
+    },
+    {
+      "epoch": 6.071640747859514,
+      "grad_norm": 0.315514475107193,
+      "learning_rate": 0.0005274510489510489,
+      "loss": 3.4395,
+      "step": 20850
+    },
+    {
+      "epoch": 6.086201875473237,
+      "grad_norm": 0.32791003584861755,
+      "learning_rate": 0.0005272762237762238,
+      "loss": 3.4373,
+      "step": 20900
+    },
+    {
+      "epoch": 6.100763003086959,
+      "grad_norm": 0.3153580129146576,
+      "learning_rate": 0.0005271013986013985,
+      "loss": 3.4479,
+      "step": 20950
+    },
+    {
+      "epoch": 6.115324130700682,
+      "grad_norm": 0.34948551654815674,
+      "learning_rate": 0.0005269265734265734,
+      "loss": 3.4463,
+      "step": 21000
+    },
+    {
+      "epoch": 6.115324130700682,
+      "eval_accuracy": 0.3622545968742924,
+      "eval_loss": 3.6173741817474365,
+      "eval_runtime": 179.8785,
+      "eval_samples_per_second": 92.54,
+      "eval_steps_per_second": 5.787,
+      "step": 21000
+    },
+    {
+      "epoch": 6.1298852583144035,
+      "grad_norm": 0.3471393883228302,
+      "learning_rate": 0.0005267517482517482,
+      "loss": 3.4418,
+      "step": 21050
+    },
+    {
+      "epoch": 6.144446385928126,
+      "grad_norm": 0.32299190759658813,
+      "learning_rate": 0.000526576923076923,
+      "loss": 3.462,
+      "step": 21100
+    },
+    {
+      "epoch": 6.159007513541849,
+      "grad_norm": 0.3276447355747223,
+      "learning_rate": 0.0005264020979020978,
+      "loss": 3.4441,
+      "step": 21150
+    },
+    {
+      "epoch": 6.173568641155571,
+      "grad_norm": 0.3275761604309082,
+      "learning_rate": 0.0005262272727272727,
+      "loss": 3.4414,
+      "step": 21200
+    },
+    {
+      "epoch": 6.1881297687692935,
+      "grad_norm": 0.32831233739852905,
+      "learning_rate": 0.0005260524475524475,
+      "loss": 3.4573,
+      "step": 21250
+    },
+    {
+      "epoch": 6.202690896383016,
+      "grad_norm": 0.32581037282943726,
+      "learning_rate": 0.0005258776223776223,
+      "loss": 3.4413,
+      "step": 21300
+    },
+    {
+      "epoch": 6.217252023996738,
+      "grad_norm": 0.3218664228916168,
+      "learning_rate": 0.0005257027972027971,
+      "loss": 3.4494,
+      "step": 21350
+    },
+    {
+      "epoch": 6.231813151610461,
+      "grad_norm": 0.34039339423179626,
+      "learning_rate": 0.000525527972027972,
+      "loss": 3.458,
+      "step": 21400
+    },
+    {
+      "epoch": 6.246374279224183,
+      "grad_norm": 0.3327193260192871,
+      "learning_rate": 0.0005253531468531468,
+      "loss": 3.4557,
+      "step": 21450
+    },
+    {
+      "epoch": 6.2609354068379055,
+      "grad_norm": 0.3233095705509186,
+      "learning_rate": 0.0005251783216783216,
+      "loss": 3.4511,
+      "step": 21500
+    },
+    {
+      "epoch": 6.275496534451628,
+      "grad_norm": 0.3496866822242737,
+      "learning_rate": 0.0005250034965034965,
+      "loss": 3.4622,
+      "step": 21550
+    },
+    {
+      "epoch": 6.29005766206535,
+      "grad_norm": 0.3645714521408081,
+      "learning_rate": 0.0005248286713286712,
+      "loss": 3.4517,
+      "step": 21600
+    },
+    {
+      "epoch": 6.304618789679073,
+      "grad_norm": 0.3256557881832123,
+      "learning_rate": 0.0005246538461538461,
+      "loss": 3.4739,
+      "step": 21650
+    },
+    {
+      "epoch": 6.319179917292795,
+      "grad_norm": 0.3235686719417572,
+      "learning_rate": 0.0005244790209790209,
+      "loss": 3.4725,
+      "step": 21700
+    },
+    {
+      "epoch": 6.3337410449065175,
+      "grad_norm": 0.3351970911026001,
+      "learning_rate": 0.0005243041958041957,
+      "loss": 3.4857,
+      "step": 21750
+    },
+    {
+      "epoch": 6.34830217252024,
+      "grad_norm": 0.3423496186733246,
+      "learning_rate": 0.0005241293706293705,
+      "loss": 3.4746,
+      "step": 21800
+    },
+    {
+      "epoch": 6.362863300133962,
+      "grad_norm": 0.3310966491699219,
+      "learning_rate": 0.0005239545454545454,
+      "loss": 3.4763,
+      "step": 21850
+    },
+    {
+      "epoch": 6.377424427747685,
+      "grad_norm": 0.31002819538116455,
+      "learning_rate": 0.0005237797202797202,
+      "loss": 3.4714,
+      "step": 21900
+    },
+    {
+      "epoch": 6.391985555361408,
+      "grad_norm": 0.3289186358451843,
+      "learning_rate": 0.000523604895104895,
+      "loss": 3.4637,
+      "step": 21950
+    },
+    {
+      "epoch": 6.406546682975129,
+      "grad_norm": 0.3141127824783325,
+      "learning_rate": 0.0005234300699300698,
+      "loss": 3.4779,
+      "step": 22000
+    },
+    {
+      "epoch": 6.406546682975129,
+      "eval_accuracy": 0.36285122710673956,
+      "eval_loss": 3.610785722732544,
+      "eval_runtime": 179.7092,
+      "eval_samples_per_second": 92.627,
+      "eval_steps_per_second": 5.793,
+      "step": 22000
+    },
+    {
+      "epoch": 6.421107810588852,
+      "grad_norm": 0.3150128722190857,
+      "learning_rate": 0.0005232552447552447,
+      "loss": 3.4842,
+      "step": 22050
+    },
+    {
+      "epoch": 6.435668938202574,
+      "grad_norm": 0.3259349465370178,
+      "learning_rate": 0.0005230804195804195,
+      "loss": 3.4848,
+      "step": 22100
+    },
+    {
+      "epoch": 6.450230065816297,
+      "grad_norm": 0.32301968336105347,
+      "learning_rate": 0.0005229055944055943,
+      "loss": 3.4818,
+      "step": 22150
+    },
+    {
+      "epoch": 6.4647911934300195,
+      "grad_norm": 0.3123028874397278,
+      "learning_rate": 0.0005227307692307691,
+      "loss": 3.4914,
+      "step": 22200
+    },
+    {
+      "epoch": 6.479352321043741,
+      "grad_norm": 0.3286699652671814,
+      "learning_rate": 0.0005225559440559441,
+      "loss": 3.4875,
+      "step": 22250
+    },
+    {
+      "epoch": 6.493913448657464,
+      "grad_norm": 0.3313329517841339,
+      "learning_rate": 0.0005223811188811189,
+      "loss": 3.4791,
+      "step": 22300
+    },
+    {
+      "epoch": 6.508474576271187,
+      "grad_norm": 0.31018057465553284,
+      "learning_rate": 0.0005222062937062937,
+      "loss": 3.4807,
+      "step": 22350
+    },
+    {
+      "epoch": 6.523035703884909,
+      "grad_norm": 0.32716143131256104,
+      "learning_rate": 0.0005220314685314686,
+      "loss": 3.4846,
+      "step": 22400
+    },
+    {
+      "epoch": 6.5375968314986315,
+      "grad_norm": 0.3213047981262207,
+      "learning_rate": 0.0005218566433566433,
+      "loss": 3.4959,
+      "step": 22450
+    },
+    {
+      "epoch": 6.552157959112353,
+      "grad_norm": 0.3478303849697113,
+      "learning_rate": 0.0005216818181818182,
+      "loss": 3.4794,
+      "step": 22500
+    },
+    {
+      "epoch": 6.566719086726076,
+      "grad_norm": 0.334625780582428,
+      "learning_rate": 0.000521506993006993,
+      "loss": 3.4993,
+      "step": 22550
+    },
+    {
+      "epoch": 6.581280214339799,
+      "grad_norm": 0.3324287235736847,
+      "learning_rate": 0.0005213321678321678,
+      "loss": 3.4947,
+      "step": 22600
+    },
+    {
+      "epoch": 6.595841341953521,
+      "grad_norm": 0.3208302855491638,
+      "learning_rate": 0.0005211573426573426,
+      "loss": 3.4816,
+      "step": 22650
+    },
+    {
+      "epoch": 6.610402469567243,
+      "grad_norm": 0.3206283152103424,
+      "learning_rate": 0.0005209825174825175,
+      "loss": 3.4811,
+      "step": 22700
+    },
+    {
+      "epoch": 6.624963597180965,
+      "grad_norm": 0.3405255377292633,
+      "learning_rate": 0.0005208076923076923,
+      "loss": 3.4839,
+      "step": 22750
+    },
+    {
+      "epoch": 6.639524724794688,
+      "grad_norm": 0.33559542894363403,
+      "learning_rate": 0.0005206328671328671,
+      "loss": 3.4962,
+      "step": 22800
+    },
+    {
+      "epoch": 6.654085852408411,
+      "grad_norm": 0.3277864456176758,
+      "learning_rate": 0.0005204580419580419,
+      "loss": 3.4831,
+      "step": 22850
+    },
+    {
+      "epoch": 6.668646980022133,
+      "grad_norm": 0.3352718949317932,
+      "learning_rate": 0.0005202832167832168,
+      "loss": 3.4782,
+      "step": 22900
+    },
+    {
+      "epoch": 6.683208107635855,
+      "grad_norm": 0.31568098068237305,
+      "learning_rate": 0.0005201083916083916,
+      "loss": 3.4802,
+      "step": 22950
+    },
+    {
+      "epoch": 6.697769235249577,
+      "grad_norm": 0.3398934602737427,
+      "learning_rate": 0.0005199335664335664,
+      "loss": 3.4888,
+      "step": 23000
+    },
+    {
+      "epoch": 6.697769235249577,
+      "eval_accuracy": 0.3639860589557666,
+      "eval_loss": 3.598484992980957,
+      "eval_runtime": 179.7416,
+      "eval_samples_per_second": 92.611,
+      "eval_steps_per_second": 5.792,
+      "step": 23000
+    },
+    {
+      "epoch": 6.7123303628633,
+      "grad_norm": 0.30721819400787354,
+      "learning_rate": 0.0005197587412587413,
+      "loss": 3.4866,
+      "step": 23050
+    },
+    {
+      "epoch": 6.726891490477023,
+      "grad_norm": 0.3224666714668274,
+      "learning_rate": 0.0005195839160839161,
+      "loss": 3.4968,
+      "step": 23100
+    },
+    {
+      "epoch": 6.741452618090745,
+      "grad_norm": 0.32522931694984436,
+      "learning_rate": 0.0005194090909090909,
+      "loss": 3.4878,
+      "step": 23150
+    },
+    {
+      "epoch": 6.756013745704467,
+      "grad_norm": 0.31341007351875305,
+      "learning_rate": 0.0005192342657342657,
+      "loss": 3.4833,
+      "step": 23200
+    },
+    {
+      "epoch": 6.77057487331819,
+      "grad_norm": 0.3186572790145874,
+      "learning_rate": 0.0005190594405594405,
+      "loss": 3.4975,
+      "step": 23250
+    },
+    {
+      "epoch": 6.785136000931912,
+      "grad_norm": 0.33995872735977173,
+      "learning_rate": 0.0005188846153846153,
+      "loss": 3.4878,
+      "step": 23300
+    },
+    {
+      "epoch": 6.799697128545635,
+      "grad_norm": 0.3231462836265564,
+      "learning_rate": 0.0005187097902097902,
+      "loss": 3.4897,
+      "step": 23350
+    },
+    {
+      "epoch": 6.814258256159357,
+      "grad_norm": 0.31064069271087646,
+      "learning_rate": 0.000518534965034965,
+      "loss": 3.4984,
+      "step": 23400
+    },
+    {
+      "epoch": 6.828819383773079,
+      "grad_norm": 0.31749048829078674,
+      "learning_rate": 0.0005183601398601398,
+      "loss": 3.4867,
+      "step": 23450
+    },
+    {
+      "epoch": 6.843380511386802,
+      "grad_norm": 0.31053680181503296,
+      "learning_rate": 0.0005181853146853146,
+      "loss": 3.4937,
+      "step": 23500
+    },
+    {
+      "epoch": 6.857941639000524,
+      "grad_norm": 0.3226015269756317,
+      "learning_rate": 0.0005180104895104895,
+      "loss": 3.4918,
+      "step": 23550
+    },
+    {
+      "epoch": 6.872502766614247,
+      "grad_norm": 0.3255876302719116,
+      "learning_rate": 0.0005178356643356643,
+      "loss": 3.4998,
+      "step": 23600
+    },
+    {
+      "epoch": 6.887063894227969,
+      "grad_norm": 0.32611915469169617,
+      "learning_rate": 0.0005176608391608391,
+      "loss": 3.4874,
+      "step": 23650
+    },
+    {
+      "epoch": 6.901625021841691,
+      "grad_norm": 0.3349880874156952,
+      "learning_rate": 0.000517486013986014,
+      "loss": 3.4898,
+      "step": 23700
+    },
+    {
+      "epoch": 6.916186149455414,
+      "grad_norm": 0.32357269525527954,
+      "learning_rate": 0.0005173111888111888,
+      "loss": 3.4919,
+      "step": 23750
+    },
+    {
+      "epoch": 6.930747277069136,
+      "grad_norm": 0.30893370509147644,
+      "learning_rate": 0.0005171363636363636,
+      "loss": 3.488,
+      "step": 23800
+    },
+    {
+      "epoch": 6.945308404682859,
+      "grad_norm": 0.34728315472602844,
+      "learning_rate": 0.0005169615384615384,
+      "loss": 3.4851,
+      "step": 23850
+    },
+    {
+      "epoch": 6.959869532296581,
+      "grad_norm": 0.34141796827316284,
+      "learning_rate": 0.0005167867132867133,
+      "loss": 3.4924,
+      "step": 23900
+    },
+    {
+      "epoch": 6.974430659910303,
+      "grad_norm": 0.33731377124786377,
+      "learning_rate": 0.000516611888111888,
+      "loss": 3.4936,
+      "step": 23950
+    },
+    {
+      "epoch": 6.988991787524026,
+      "grad_norm": 0.330599308013916,
+      "learning_rate": 0.0005164370629370629,
+      "loss": 3.4999,
+      "step": 24000
+    },
+    {
+      "epoch": 6.988991787524026,
+      "eval_accuracy": 0.36480441226573007,
+      "eval_loss": 3.590554714202881,
+      "eval_runtime": 179.6543,
+      "eval_samples_per_second": 92.656,
+      "eval_steps_per_second": 5.794,
+      "step": 24000
+    },
+    {
+      "epoch": 7.003494670627293,
+      "grad_norm": 0.3449358642101288,
+      "learning_rate": 0.0005162622377622377,
+      "loss": 3.471,
+      "step": 24050
+    },
+    {
+      "epoch": 7.018055798241016,
+      "grad_norm": 0.35293149948120117,
+      "learning_rate": 0.0005160874125874125,
+      "loss": 3.3888,
+      "step": 24100
+    },
+    {
+      "epoch": 7.032616925854738,
+      "grad_norm": 0.3265637755393982,
+      "learning_rate": 0.0005159125874125873,
+      "loss": 3.3925,
+      "step": 24150
+    },
+    {
+      "epoch": 7.0471780534684605,
+      "grad_norm": 0.32121822237968445,
+      "learning_rate": 0.0005157377622377622,
+      "loss": 3.3897,
+      "step": 24200
+    },
+    {
+      "epoch": 7.061739181082183,
+      "grad_norm": 0.3485367000102997,
+      "learning_rate": 0.000515562937062937,
+      "loss": 3.3965,
+      "step": 24250
+    },
+    {
+      "epoch": 7.076300308695905,
+      "grad_norm": 0.32369834184646606,
+      "learning_rate": 0.0005153881118881118,
+      "loss": 3.4083,
+      "step": 24300
+    },
+    {
+      "epoch": 7.090861436309628,
+      "grad_norm": 0.3367840349674225,
+      "learning_rate": 0.0005152132867132867,
+      "loss": 3.4072,
+      "step": 24350
+    },
+    {
+      "epoch": 7.105422563923351,
+      "grad_norm": 0.3350302278995514,
+      "learning_rate": 0.0005150384615384615,
+      "loss": 3.403,
+      "step": 24400
+    },
+    {
+      "epoch": 7.1199836915370724,
+      "grad_norm": 0.3556578755378723,
+      "learning_rate": 0.0005148636363636363,
+      "loss": 3.3993,
+      "step": 24450
+    },
+    {
+      "epoch": 7.134544819150795,
+      "grad_norm": 0.33493995666503906,
+      "learning_rate": 0.0005146888111888111,
+      "loss": 3.4033,
+      "step": 24500
+    },
+    {
+      "epoch": 7.149105946764517,
+      "grad_norm": 0.3266991674900055,
+      "learning_rate": 0.000514513986013986,
+      "loss": 3.4133,
+      "step": 24550
+    },
+    {
+      "epoch": 7.16366707437824,
+      "grad_norm": 0.33190712332725525,
+      "learning_rate": 0.0005143391608391608,
+      "loss": 3.4191,
+      "step": 24600
+    },
+    {
+      "epoch": 7.1782282019919625,
+      "grad_norm": 0.33754125237464905,
+      "learning_rate": 0.0005141643356643356,
+      "loss": 3.4116,
+      "step": 24650
+    },
+    {
+      "epoch": 7.192789329605684,
+      "grad_norm": 0.3015083074569702,
+      "learning_rate": 0.0005139895104895104,
+      "loss": 3.4225,
+      "step": 24700
+    },
+    {
+      "epoch": 7.207350457219407,
+      "grad_norm": 0.3270661532878876,
+      "learning_rate": 0.0005138146853146852,
+      "loss": 3.4205,
+      "step": 24750
+    },
+    {
+      "epoch": 7.22191158483313,
+      "grad_norm": 0.3491705656051636,
+      "learning_rate": 0.00051363986013986,
+      "loss": 3.4145,
+      "step": 24800
+    },
+    {
+      "epoch": 7.236472712446852,
+      "grad_norm": 0.3363984525203705,
+      "learning_rate": 0.0005134650349650349,
+      "loss": 3.4213,
+      "step": 24850
+    },
+    {
+      "epoch": 7.2510338400605745,
+      "grad_norm": 0.33105769753456116,
+      "learning_rate": 0.0005132902097902097,
+      "loss": 3.4229,
+      "step": 24900
+    },
+    {
+      "epoch": 7.265594967674296,
+      "grad_norm": 0.3505908250808716,
+      "learning_rate": 0.0005131153846153845,
+      "loss": 3.4278,
+      "step": 24950
+    },
+    {
+      "epoch": 7.280156095288019,
+      "grad_norm": 0.3380582332611084,
+      "learning_rate": 0.0005129405594405594,
+      "loss": 3.428,
+      "step": 25000
+    },
+    {
+      "epoch": 7.280156095288019,
+      "eval_accuracy": 0.36480793914395143,
+      "eval_loss": 3.5974769592285156,
+      "eval_runtime": 179.8567,
+      "eval_samples_per_second": 92.551,
+      "eval_steps_per_second": 5.788,
+      "step": 25000
+    },
+    {
+      "epoch": 7.294717222901742,
+      "grad_norm": 0.31806254386901855,
+      "learning_rate": 0.0005127657342657342,
+      "loss": 3.4173,
+      "step": 25050
+    },
+    {
+      "epoch": 7.309278350515464,
+      "grad_norm": 0.3278155028820038,
+      "learning_rate": 0.000512590909090909,
+      "loss": 3.4293,
+      "step": 25100
+    },
+    {
+      "epoch": 7.3238394781291865,
+      "grad_norm": 0.31251752376556396,
+      "learning_rate": 0.0005124160839160838,
+      "loss": 3.4386,
+      "step": 25150
+    },
+    {
+      "epoch": 7.338400605742908,
+      "grad_norm": 0.3372874855995178,
+      "learning_rate": 0.0005122412587412588,
+      "loss": 3.4216,
+      "step": 25200
+    },
+    {
+      "epoch": 7.352961733356631,
+      "grad_norm": 0.32962003350257874,
+      "learning_rate": 0.0005120664335664336,
+      "loss": 3.4316,
+      "step": 25250
+    },
+    {
+      "epoch": 7.367522860970354,
+      "grad_norm": 0.3354533612728119,
+      "learning_rate": 0.0005118916083916084,
+      "loss": 3.4331,
+      "step": 25300
+    },
+    {
+      "epoch": 7.382083988584076,
+      "grad_norm": 0.32760855555534363,
+      "learning_rate": 0.0005117167832167832,
+      "loss": 3.4298,
+      "step": 25350
+    },
+    {
+      "epoch": 7.396645116197798,
+      "grad_norm": 0.323398232460022,
+      "learning_rate": 0.0005115419580419581,
+      "loss": 3.4329,
+      "step": 25400
+    },
+    {
+      "epoch": 7.411206243811521,
+      "grad_norm": 0.3129633665084839,
+      "learning_rate": 0.0005113671328671328,
+      "loss": 3.4451,
+      "step": 25450
+    },
+    {
+      "epoch": 7.425767371425243,
+      "grad_norm": 0.308672159910202,
+      "learning_rate": 0.0005111923076923077,
+      "loss": 3.4402,
+      "step": 25500
+    },
+    {
+      "epoch": 7.440328499038966,
+      "grad_norm": 0.3408229649066925,
+      "learning_rate": 0.0005110174825174825,
+      "loss": 3.4397,
+      "step": 25550
+    },
+    {
+      "epoch": 7.454889626652688,
+      "grad_norm": 0.320758581161499,
+      "learning_rate": 0.0005108426573426573,
+      "loss": 3.4447,
+      "step": 25600
+    },
+    {
+      "epoch": 7.46945075426641,
+      "grad_norm": 0.33821046352386475,
+      "learning_rate": 0.0005106678321678321,
+      "loss": 3.4399,
+      "step": 25650
+    },
+    {
+      "epoch": 7.484011881880133,
+      "grad_norm": 0.32798120379447937,
+      "learning_rate": 0.000510493006993007,
+      "loss": 3.4441,
+      "step": 25700
+    },
+    {
+      "epoch": 7.498573009493855,
+      "grad_norm": 0.36191534996032715,
+      "learning_rate": 0.0005103181818181818,
+      "loss": 3.4465,
+      "step": 25750
+    },
+    {
+      "epoch": 7.513134137107578,
+      "grad_norm": 0.333870530128479,
+      "learning_rate": 0.0005101433566433566,
+      "loss": 3.4483,
+      "step": 25800
+    },
+    {
+      "epoch": 7.5276952647213005,
+      "grad_norm": 0.3584294319152832,
+      "learning_rate": 0.0005099685314685315,
+      "loss": 3.4472,
+      "step": 25850
+    },
+    {
+      "epoch": 7.542256392335022,
+      "grad_norm": 0.3232259750366211,
+      "learning_rate": 0.0005097937062937063,
+      "loss": 3.4551,
+      "step": 25900
+    },
+    {
+      "epoch": 7.556817519948745,
+      "grad_norm": 0.34521010518074036,
+      "learning_rate": 0.0005096188811188811,
+      "loss": 3.4492,
+      "step": 25950
+    },
+    {
+      "epoch": 7.571378647562467,
+      "grad_norm": 0.3537822365760803,
+      "learning_rate": 0.0005094440559440559,
+      "loss": 3.4552,
+      "step": 26000
+    },
+    {
+      "epoch": 7.571378647562467,
+      "eval_accuracy": 0.36535131351525596,
+      "eval_loss": 3.590632200241089,
+      "eval_runtime": 179.7252,
+      "eval_samples_per_second": 92.619,
+      "eval_steps_per_second": 5.792,
+      "step": 26000
+    },
+    {
+      "epoch": 7.58593977517619,
+      "grad_norm": 0.3477293848991394,
+      "learning_rate": 0.0005092692307692308,
+      "loss": 3.4485,
+      "step": 26050
+    },
+    {
+      "epoch": 7.600500902789912,
+      "grad_norm": 0.3164335787296295,
+      "learning_rate": 0.0005090944055944056,
+      "loss": 3.4537,
+      "step": 26100
+    },
+    {
+      "epoch": 7.615062030403634,
+      "grad_norm": 0.31365999579429626,
+      "learning_rate": 0.0005089195804195804,
+      "loss": 3.4563,
+      "step": 26150
+    },
+    {
+      "epoch": 7.629623158017357,
+      "grad_norm": 0.33597031235694885,
+      "learning_rate": 0.0005087447552447552,
+      "loss": 3.4469,
+      "step": 26200
+    },
+    {
+      "epoch": 7.644184285631079,
+      "grad_norm": 0.33030572533607483,
+      "learning_rate": 0.00050856993006993,
+      "loss": 3.4471,
+      "step": 26250
+    },
+    {
+      "epoch": 7.658745413244802,
+      "grad_norm": 0.34268873929977417,
+      "learning_rate": 0.0005083951048951048,
+      "loss": 3.4566,
+      "step": 26300
+    },
+    {
+      "epoch": 7.673306540858524,
+      "grad_norm": 0.34644824266433716,
+      "learning_rate": 0.0005082202797202797,
+      "loss": 3.4572,
+      "step": 26350
+    },
+    {
+      "epoch": 7.687867668472246,
+      "grad_norm": 0.3286401331424713,
+      "learning_rate": 0.0005080454545454545,
+      "loss": 3.4614,
+      "step": 26400
+    },
+    {
+      "epoch": 7.702428796085969,
+      "grad_norm": 0.3406911790370941,
+      "learning_rate": 0.0005078706293706293,
+      "loss": 3.4613,
+      "step": 26450
+    },
+    {
+      "epoch": 7.716989923699691,
+      "grad_norm": 0.32939502596855164,
+      "learning_rate": 0.0005076958041958042,
+      "loss": 3.4677,
+      "step": 26500
+    },
+    {
+      "epoch": 7.731551051313414,
+      "grad_norm": 0.33044230937957764,
+      "learning_rate": 0.000507520979020979,
+      "loss": 3.4601,
+      "step": 26550
+    },
+    {
+      "epoch": 7.746112178927136,
+      "grad_norm": 0.315995454788208,
+      "learning_rate": 0.0005073461538461538,
+      "loss": 3.4459,
+      "step": 26600
+    },
+    {
+      "epoch": 7.760673306540858,
+      "grad_norm": 0.35745933651924133,
+      "learning_rate": 0.0005071713286713286,
+      "loss": 3.4574,
+      "step": 26650
+    },
+    {
+      "epoch": 7.775234434154581,
+      "grad_norm": 0.3426244258880615,
+      "learning_rate": 0.0005069965034965035,
+      "loss": 3.4537,
+      "step": 26700
+    },
+    {
+      "epoch": 7.789795561768304,
+      "grad_norm": 0.3141034245491028,
+      "learning_rate": 0.0005068216783216783,
+      "loss": 3.4541,
+      "step": 26750
+    },
+    {
+      "epoch": 7.8043566893820255,
+      "grad_norm": 0.34187954664230347,
+      "learning_rate": 0.0005066468531468531,
+      "loss": 3.4703,
+      "step": 26800
+    },
+    {
+      "epoch": 7.818917816995748,
+      "grad_norm": 0.32608917355537415,
+      "learning_rate": 0.0005064720279720279,
+      "loss": 3.4433,
+      "step": 26850
+    },
+    {
+      "epoch": 7.833478944609471,
+      "grad_norm": 0.30253276228904724,
+      "learning_rate": 0.0005062972027972028,
+      "loss": 3.4582,
+      "step": 26900
+    },
+    {
+      "epoch": 7.848040072223193,
+      "grad_norm": 0.3292168378829956,
+      "learning_rate": 0.0005061223776223775,
+      "loss": 3.4609,
+      "step": 26950
+    },
+    {
+      "epoch": 7.862601199836916,
+      "grad_norm": 0.3352425992488861,
+      "learning_rate": 0.0005059475524475524,
+      "loss": 3.4646,
+      "step": 27000
+    },
+    {
+      "epoch": 7.862601199836916,
+      "eval_accuracy": 0.3661527378097569,
+      "eval_loss": 3.5805842876434326,
+      "eval_runtime": 179.8829,
+      "eval_samples_per_second": 92.538,
+      "eval_steps_per_second": 5.787,
+      "step": 27000
+    },
+    {
+      "epoch": 7.8771623274506375,
+      "grad_norm": 0.33013811707496643,
+      "learning_rate": 0.0005057727272727272,
+      "loss": 3.4593,
+      "step": 27050
+    },
+    {
+      "epoch": 7.89172345506436,
+      "grad_norm": 0.3326588273048401,
+      "learning_rate": 0.000505597902097902,
+      "loss": 3.4569,
+      "step": 27100
+    },
+    {
+      "epoch": 7.906284582678083,
+      "grad_norm": 0.3302481472492218,
+      "learning_rate": 0.0005054230769230769,
+      "loss": 3.4571,
+      "step": 27150
+    },
+    {
+      "epoch": 7.920845710291805,
+      "grad_norm": 0.3329846262931824,
+      "learning_rate": 0.0005052482517482517,
+      "loss": 3.471,
+      "step": 27200
+    },
+    {
+      "epoch": 7.935406837905528,
+      "grad_norm": 0.3458568751811981,
+      "learning_rate": 0.0005050734265734265,
+      "loss": 3.4765,
+      "step": 27250
+    },
+    {
+      "epoch": 7.9499679655192494,
+      "grad_norm": 0.3226156532764435,
+      "learning_rate": 0.0005048986013986013,
+      "loss": 3.4703,
+      "step": 27300
+    },
+    {
+      "epoch": 7.964529093132972,
+      "grad_norm": 0.34230631589889526,
+      "learning_rate": 0.0005047237762237762,
+      "loss": 3.4688,
+      "step": 27350
+    },
+    {
+      "epoch": 7.979090220746695,
+      "grad_norm": 0.31827977299690247,
+      "learning_rate": 0.000504548951048951,
+      "loss": 3.4583,
+      "step": 27400
+    },
+    {
+      "epoch": 7.993651348360417,
+      "grad_norm": 0.32115548849105835,
+      "learning_rate": 0.0005043741258741258,
+      "loss": 3.4671,
+      "step": 27450
+    },
+    {
+      "epoch": 8.008154231463685,
+      "grad_norm": 0.3317052721977234,
+      "learning_rate": 0.0005041993006993006,
+      "loss": 3.4071,
+      "step": 27500
+    },
+    {
+      "epoch": 8.022715359077408,
+      "grad_norm": 0.3432307541370392,
+      "learning_rate": 0.0005040244755244755,
+      "loss": 3.3524,
+      "step": 27550
+    },
+    {
+      "epoch": 8.037276486691129,
+      "grad_norm": 0.3754727244377136,
+      "learning_rate": 0.0005038496503496503,
+      "loss": 3.3459,
+      "step": 27600
+    },
+    {
+      "epoch": 8.051837614304851,
+      "grad_norm": 0.35126733779907227,
+      "learning_rate": 0.0005036748251748251,
+      "loss": 3.3583,
+      "step": 27650
+    },
+    {
+      "epoch": 8.066398741918574,
+      "grad_norm": 0.3542656898498535,
+      "learning_rate": 0.0005034999999999999,
+      "loss": 3.3705,
+      "step": 27700
+    },
+    {
+      "epoch": 8.080959869532297,
+      "grad_norm": 0.34104204177856445,
+      "learning_rate": 0.0005033251748251747,
+      "loss": 3.3645,
+      "step": 27750
+    },
+    {
+      "epoch": 8.09552099714602,
+      "grad_norm": 0.34891462326049805,
+      "learning_rate": 0.0005031503496503496,
+      "loss": 3.3766,
+      "step": 27800
+    },
+    {
+      "epoch": 8.11008212475974,
+      "grad_norm": 0.3348483145236969,
+      "learning_rate": 0.0005029755244755244,
+      "loss": 3.3758,
+      "step": 27850
+    },
+    {
+      "epoch": 8.124643252373463,
+      "grad_norm": 0.35943523049354553,
+      "learning_rate": 0.0005028006993006992,
+      "loss": 3.3722,
+      "step": 27900
+    },
+    {
+      "epoch": 8.139204379987186,
+      "grad_norm": 0.360538125038147,
+      "learning_rate": 0.000502625874125874,
+      "loss": 3.3764,
+      "step": 27950
+    },
+    {
+      "epoch": 8.153765507600909,
+      "grad_norm": 0.34357813000679016,
+      "learning_rate": 0.000502451048951049,
+      "loss": 3.3921,
+      "step": 28000
+    },
+    {
+      "epoch": 8.153765507600909,
+      "eval_accuracy": 0.3661058303294128,
+      "eval_loss": 3.58715558052063,
+      "eval_runtime": 180.3137,
+      "eval_samples_per_second": 92.317,
+      "eval_steps_per_second": 5.773,
+      "step": 28000
+    },
+    {
+      "epoch": 8.168326635214632,
+      "grad_norm": 0.3614567220211029,
+      "learning_rate": 0.0005022762237762237,
+      "loss": 3.3796,
+      "step": 28050
+    },
+    {
+      "epoch": 8.182887762828354,
+      "grad_norm": 0.33387571573257446,
+      "learning_rate": 0.0005021013986013985,
+      "loss": 3.383,
+      "step": 28100
+    },
+    {
+      "epoch": 8.197448890442075,
+      "grad_norm": 0.3599357008934021,
+      "learning_rate": 0.0005019265734265733,
+      "loss": 3.3902,
+      "step": 28150
+    },
+    {
+      "epoch": 8.212010018055798,
+      "grad_norm": 0.3254016041755676,
+      "learning_rate": 0.0005017517482517483,
+      "loss": 3.3856,
+      "step": 28200
+    },
+    {
+      "epoch": 8.22657114566952,
+      "grad_norm": 0.3269076347351074,
+      "learning_rate": 0.0005015769230769231,
+      "loss": 3.3963,
+      "step": 28250
+    },
+    {
+      "epoch": 8.241132273283243,
+      "grad_norm": 0.3196601867675781,
+      "learning_rate": 0.0005014020979020979,
+      "loss": 3.3973,
+      "step": 28300
+    },
+    {
+      "epoch": 8.255693400896966,
+      "grad_norm": 0.3544836640357971,
+      "learning_rate": 0.0005012272727272727,
+      "loss": 3.3985,
+      "step": 28350
+    },
+    {
+      "epoch": 8.270254528510687,
+      "grad_norm": 0.33133646845817566,
+      "learning_rate": 0.0005010524475524476,
+      "loss": 3.3971,
+      "step": 28400
+    },
+    {
+      "epoch": 8.28481565612441,
+      "grad_norm": 0.365125834941864,
+      "learning_rate": 0.0005008776223776223,
+      "loss": 3.3979,
+      "step": 28450
+    },
+    {
+      "epoch": 8.299376783738133,
+      "grad_norm": 0.3482271730899811,
+      "learning_rate": 0.0005007027972027972,
+      "loss": 3.4087,
+      "step": 28500
+    },
+    {
+      "epoch": 8.313937911351855,
+      "grad_norm": 0.3457016050815582,
+      "learning_rate": 0.000500527972027972,
+      "loss": 3.4072,
+      "step": 28550
+    },
+    {
+      "epoch": 8.328499038965578,
+      "grad_norm": 0.3350307047367096,
+      "learning_rate": 0.0005003531468531468,
+      "loss": 3.3984,
+      "step": 28600
+    },
+    {
+      "epoch": 8.3430601665793,
+      "grad_norm": 0.33122938871383667,
+      "learning_rate": 0.0005001783216783217,
+      "loss": 3.4036,
+      "step": 28650
+    },
+    {
+      "epoch": 8.357621294193022,
+      "grad_norm": 0.3646140992641449,
+      "learning_rate": 0.0005000034965034965,
+      "loss": 3.4019,
+      "step": 28700
+    },
+    {
+      "epoch": 8.372182421806745,
+      "grad_norm": 0.339650422334671,
+      "learning_rate": 0.0004998286713286713,
+      "loss": 3.3992,
+      "step": 28750
+    },
+    {
+      "epoch": 8.386743549420467,
+      "grad_norm": 0.31742623448371887,
+      "learning_rate": 0.0004996538461538461,
+      "loss": 3.404,
+      "step": 28800
+    },
+    {
+      "epoch": 8.40130467703419,
+      "grad_norm": 0.3145395815372467,
+      "learning_rate": 0.000499479020979021,
+      "loss": 3.3995,
+      "step": 28850
+    },
+    {
+      "epoch": 8.415865804647911,
+      "grad_norm": 0.34881776571273804,
+      "learning_rate": 0.0004993041958041958,
+      "loss": 3.4046,
+      "step": 28900
+    },
+    {
+      "epoch": 8.430426932261634,
+      "grad_norm": 0.3403722941875458,
+      "learning_rate": 0.0004991293706293706,
+      "loss": 3.4167,
+      "step": 28950
+    },
+    {
+      "epoch": 8.444988059875357,
+      "grad_norm": 0.3250523507595062,
+      "learning_rate": 0.0004989545454545454,
+      "loss": 3.4086,
+      "step": 29000
+    },
+    {
+      "epoch": 8.444988059875357,
+      "eval_accuracy": 0.3662244510002579,
+      "eval_loss": 3.5857577323913574,
+      "eval_runtime": 179.7516,
+      "eval_samples_per_second": 92.606,
+      "eval_steps_per_second": 5.791,
+      "step": 29000
+    },
+    {
+      "epoch": 8.45954918748908,
+      "grad_norm": 0.3275490403175354,
+      "learning_rate": 0.0004987797202797203,
+      "loss": 3.4166,
+      "step": 29050
+    },
+    {
+      "epoch": 8.474110315102802,
+      "grad_norm": 0.31486833095550537,
+      "learning_rate": 0.0004986048951048951,
+      "loss": 3.4041,
+      "step": 29100
+    },
+    {
+      "epoch": 8.488671442716523,
+      "grad_norm": 0.3729318082332611,
+      "learning_rate": 0.0004984300699300699,
+      "loss": 3.4167,
+      "step": 29150
+    },
+    {
+      "epoch": 8.503232570330246,
+      "grad_norm": 0.3305770456790924,
+      "learning_rate": 0.0004982552447552448,
+      "loss": 3.4228,
+      "step": 29200
+    },
+    {
+      "epoch": 8.517793697943969,
+      "grad_norm": 0.3442740738391876,
+      "learning_rate": 0.0004980804195804195,
+      "loss": 3.406,
+      "step": 29250
+    },
+    {
+      "epoch": 8.532354825557691,
+      "grad_norm": 0.32196056842803955,
+      "learning_rate": 0.0004979055944055944,
+      "loss": 3.4296,
+      "step": 29300
+    },
+    {
+      "epoch": 8.546915953171414,
+      "grad_norm": 0.3387078642845154,
+      "learning_rate": 0.0004977307692307692,
+      "loss": 3.4227,
+      "step": 29350
+    },
+    {
+      "epoch": 8.561477080785137,
+      "grad_norm": 0.32302534580230713,
+      "learning_rate": 0.000497555944055944,
+      "loss": 3.414,
+      "step": 29400
+    },
+    {
+      "epoch": 8.576038208398858,
+      "grad_norm": 0.3491160571575165,
+      "learning_rate": 0.0004973811188811188,
+      "loss": 3.4214,
+      "step": 29450
+    },
+    {
+      "epoch": 8.59059933601258,
+      "grad_norm": 0.32889190316200256,
+      "learning_rate": 0.0004972062937062937,
+      "loss": 3.4281,
+      "step": 29500
+    },
+    {
+      "epoch": 8.605160463626303,
+      "grad_norm": 0.32402417063713074,
+      "learning_rate": 0.0004970314685314685,
+      "loss": 3.4171,
+      "step": 29550
+    },
+    {
+      "epoch": 8.619721591240026,
+      "grad_norm": 0.3430418074131012,
+      "learning_rate": 0.0004968566433566433,
+      "loss": 3.4293,
+      "step": 29600
+    },
+    {
+      "epoch": 8.634282718853749,
+      "grad_norm": 0.34214910864830017,
+      "learning_rate": 0.0004966818181818181,
+      "loss": 3.4223,
+      "step": 29650
+    },
+    {
+      "epoch": 8.64884384646747,
+      "grad_norm": 0.3425740897655487,
+      "learning_rate": 0.000496506993006993,
+      "loss": 3.4194,
+      "step": 29700
+    },
+    {
+      "epoch": 8.663404974081192,
+      "grad_norm": 0.34497156739234924,
+      "learning_rate": 0.0004963321678321678,
+      "loss": 3.4311,
+      "step": 29750
+    },
+    {
+      "epoch": 8.677966101694915,
+      "grad_norm": 0.35663503408432007,
+      "learning_rate": 0.0004961573426573426,
+      "loss": 3.431,
+      "step": 29800
+    },
+    {
+      "epoch": 8.692527229308638,
+      "grad_norm": 0.34114986658096313,
+      "learning_rate": 0.0004959825174825175,
+      "loss": 3.4287,
+      "step": 29850
+    },
+    {
+      "epoch": 8.70708835692236,
+      "grad_norm": 0.3398053050041199,
+      "learning_rate": 0.0004958076923076923,
+      "loss": 3.4288,
+      "step": 29900
+    },
+    {
+      "epoch": 8.721649484536082,
+      "grad_norm": 0.34339818358421326,
+      "learning_rate": 0.0004956328671328671,
+      "loss": 3.4414,
+      "step": 29950
+    },
+    {
+      "epoch": 8.736210612149804,
+      "grad_norm": 0.3127419352531433,
+      "learning_rate": 0.0004954580419580419,
+      "loss": 3.4342,
+      "step": 30000
+    },
+    {
+      "epoch": 8.736210612149804,
+      "eval_accuracy": 0.36726194101037535,
+      "eval_loss": 3.5715725421905518,
+      "eval_runtime": 179.7611,
+      "eval_samples_per_second": 92.601,
+      "eval_steps_per_second": 5.791,
+      "step": 30000
+    },
+    {
+      "epoch": 8.750771739763527,
+      "grad_norm": 0.3249102532863617,
+      "learning_rate": 0.0004952832167832167,
+      "loss": 3.428,
+      "step": 30050
+    },
+    {
+      "epoch": 8.76533286737725,
+      "grad_norm": 0.34389105439186096,
+      "learning_rate": 0.0004951083916083915,
+      "loss": 3.4286,
+      "step": 30100
+    },
+    {
+      "epoch": 8.779893994990973,
+      "grad_norm": 0.34607070684432983,
+      "learning_rate": 0.0004949335664335664,
+      "loss": 3.4325,
+      "step": 30150
+    },
+    {
+      "epoch": 8.794455122604695,
+      "grad_norm": 0.33967599272727966,
+      "learning_rate": 0.0004947587412587412,
+      "loss": 3.4188,
+      "step": 30200
+    },
+    {
+      "epoch": 8.809016250218416,
+      "grad_norm": 0.34365391731262207,
+      "learning_rate": 0.000494583916083916,
+      "loss": 3.4258,
+      "step": 30250
+    },
+    {
+      "epoch": 8.82357737783214,
+      "grad_norm": 0.31158357858657837,
+      "learning_rate": 0.0004944090909090908,
+      "loss": 3.4302,
+      "step": 30300
+    },
+    {
+      "epoch": 8.838138505445862,
+      "grad_norm": 0.3425881564617157,
+      "learning_rate": 0.0004942342657342657,
+      "loss": 3.4471,
+      "step": 30350
+    },
+    {
+      "epoch": 8.852699633059585,
+      "grad_norm": 0.33694136142730713,
+      "learning_rate": 0.0004940594405594405,
+      "loss": 3.4363,
+      "step": 30400
+    },
+    {
+      "epoch": 8.867260760673307,
+      "grad_norm": 0.33916687965393066,
+      "learning_rate": 0.0004938846153846153,
+      "loss": 3.4398,
+      "step": 30450
+    },
+    {
+      "epoch": 8.881821888287028,
+      "grad_norm": 0.3424004912376404,
+      "learning_rate": 0.0004937097902097901,
+      "loss": 3.4373,
+      "step": 30500
+    },
+    {
+      "epoch": 8.896383015900751,
+      "grad_norm": 0.3579810857772827,
+      "learning_rate": 0.000493534965034965,
+      "loss": 3.4423,
+      "step": 30550
+    },
+    {
+      "epoch": 8.910944143514474,
+      "grad_norm": 0.35978007316589355,
+      "learning_rate": 0.0004933601398601398,
+      "loss": 3.4223,
+      "step": 30600
+    },
+    {
+      "epoch": 8.925505271128197,
+      "grad_norm": 0.34889093041419983,
+      "learning_rate": 0.0004931853146853146,
+      "loss": 3.4384,
+      "step": 30650
+    },
+    {
+      "epoch": 8.94006639874192,
+      "grad_norm": 0.3178730010986328,
+      "learning_rate": 0.0004930104895104895,
+      "loss": 3.4316,
+      "step": 30700
+    },
+    {
+      "epoch": 8.95462752635564,
+      "grad_norm": 0.3225439190864563,
+      "learning_rate": 0.0004928356643356642,
+      "loss": 3.4376,
+      "step": 30750
+    },
+    {
+      "epoch": 8.969188653969363,
+      "grad_norm": 0.32753077149391174,
+      "learning_rate": 0.0004926608391608391,
+      "loss": 3.4457,
+      "step": 30800
+    },
+    {
+      "epoch": 8.983749781583086,
+      "grad_norm": 0.3687169551849365,
+      "learning_rate": 0.0004924860139860139,
+      "loss": 3.4323,
+      "step": 30850
+    },
+    {
+      "epoch": 8.998310909196809,
+      "grad_norm": 0.3431978225708008,
+      "learning_rate": 0.0004923111888111887,
+      "loss": 3.4443,
+      "step": 30900
+    },
+    {
+      "epoch": 9.012813792300076,
+      "grad_norm": 0.3386369049549103,
+      "learning_rate": 0.0004921363636363635,
+      "loss": 3.3368,
+      "step": 30950
+    },
+    {
+      "epoch": 9.027374919913798,
+      "grad_norm": 0.35466766357421875,
+      "learning_rate": 0.0004919615384615384,
+      "loss": 3.3258,
+      "step": 31000
+    },
+    {
+      "epoch": 9.027374919913798,
+      "eval_accuracy": 0.36759758225444167,
+      "eval_loss": 3.57794451713562,
+      "eval_runtime": 184.3404,
+      "eval_samples_per_second": 90.3,
+      "eval_steps_per_second": 5.647,
+      "step": 31000
+    },
+    {
+      "epoch": 9.041936047527521,
+      "grad_norm": 0.3356529176235199,
+      "learning_rate": 0.0004917867132867132,
+      "loss": 3.3184,
+      "step": 31050
+    },
+    {
+      "epoch": 9.056497175141242,
+      "grad_norm": 0.3212270140647888,
+      "learning_rate": 0.000491611888111888,
+      "loss": 3.3383,
+      "step": 31100
+    },
+    {
+      "epoch": 9.071058302754965,
+      "grad_norm": 0.3324335813522339,
+      "learning_rate": 0.0004914370629370628,
+      "loss": 3.3392,
+      "step": 31150
+    },
+    {
+      "epoch": 9.085619430368688,
+      "grad_norm": 0.32331228256225586,
+      "learning_rate": 0.0004912622377622378,
+      "loss": 3.3521,
+      "step": 31200
+    },
+    {
+      "epoch": 9.10018055798241,
+      "grad_norm": 0.31954678893089294,
+      "learning_rate": 0.0004910874125874126,
+      "loss": 3.3435,
+      "step": 31250
+    },
+    {
+      "epoch": 9.114741685596133,
+      "grad_norm": 0.32974445819854736,
+      "learning_rate": 0.0004909125874125874,
+      "loss": 3.3501,
+      "step": 31300
+    },
+    {
+      "epoch": 9.129302813209854,
+      "grad_norm": 0.35506731271743774,
+      "learning_rate": 0.0004907377622377623,
+      "loss": 3.3467,
+      "step": 31350
+    },
+    {
+      "epoch": 9.143863940823577,
+      "grad_norm": 0.32969748973846436,
+      "learning_rate": 0.0004905629370629371,
+      "loss": 3.357,
+      "step": 31400
+    },
+    {
+      "epoch": 9.1584250684373,
+      "grad_norm": 0.3305834233760834,
+      "learning_rate": 0.0004903881118881119,
+      "loss": 3.3573,
+      "step": 31450
+    },
+    {
+      "epoch": 9.172986196051022,
+      "grad_norm": 0.33574923872947693,
+      "learning_rate": 0.0004902132867132867,
+      "loss": 3.3572,
+      "step": 31500
+    },
+    {
+      "epoch": 9.187547323664745,
+      "grad_norm": 0.32476624846458435,
+      "learning_rate": 0.0004900384615384615,
+      "loss": 3.3454,
+      "step": 31550
+    },
+    {
+      "epoch": 9.202108451278466,
+      "grad_norm": 0.36604878306388855,
+      "learning_rate": 0.0004898636363636363,
+      "loss": 3.3598,
+      "step": 31600
+    },
+    {
+      "epoch": 9.216669578892189,
+      "grad_norm": 0.3407774567604065,
+      "learning_rate": 0.0004896888111888112,
+      "loss": 3.3655,
+      "step": 31650
+    },
+    {
+      "epoch": 9.231230706505912,
+      "grad_norm": 0.3136043846607208,
+      "learning_rate": 0.000489513986013986,
+      "loss": 3.3658,
+      "step": 31700
+    },
+    {
+      "epoch": 9.245791834119634,
+      "grad_norm": 0.34752407670021057,
+      "learning_rate": 0.0004893391608391608,
+      "loss": 3.374,
+      "step": 31750
+    },
+    {
+      "epoch": 9.260352961733357,
+      "grad_norm": 0.33697524666786194,
+      "learning_rate": 0.0004891643356643356,
+      "loss": 3.3715,
+      "step": 31800
+    },
+    {
+      "epoch": 9.27491408934708,
+      "grad_norm": 0.3399849832057953,
+      "learning_rate": 0.0004889895104895105,
+      "loss": 3.3781,
+      "step": 31850
+    },
+    {
+      "epoch": 9.2894752169608,
+      "grad_norm": 0.32320427894592285,
+      "learning_rate": 0.0004888146853146853,
+      "loss": 3.3606,
+      "step": 31900
+    },
+    {
+      "epoch": 9.304036344574524,
+      "grad_norm": 0.3273387849330902,
+      "learning_rate": 0.0004886398601398601,
+      "loss": 3.3726,
+      "step": 31950
+    },
+    {
+      "epoch": 9.318597472188246,
+      "grad_norm": 0.33997225761413574,
+      "learning_rate": 0.000488465034965035,
+      "loss": 3.3831,
+      "step": 32000
+    },
+    {
+      "epoch": 9.318597472188246,
+      "eval_accuracy": 0.3672809861527707,
+      "eval_loss": 3.577822685241699,
+      "eval_runtime": 186.0219,
+      "eval_samples_per_second": 89.484,
+      "eval_steps_per_second": 5.596,
+      "step": 32000
+    },
+    {
+      "epoch": 9.333158599801969,
+      "grad_norm": 0.3397623896598816,
+      "learning_rate": 0.0004882902097902098,
+      "loss": 3.3802,
+      "step": 32050
+    },
+    {
+      "epoch": 9.347719727415692,
+      "grad_norm": 0.3965780735015869,
+      "learning_rate": 0.0004881153846153846,
+      "loss": 3.3961,
+      "step": 32100
+    },
+    {
+      "epoch": 9.362280855029413,
+      "grad_norm": 0.32509127259254456,
+      "learning_rate": 0.0004879405594405594,
+      "loss": 3.392,
+      "step": 32150
+    },
+    {
+      "epoch": 9.376841982643136,
+      "grad_norm": 0.3580123484134674,
+      "learning_rate": 0.00048776573426573424,
+      "loss": 3.3685,
+      "step": 32200
+    },
+    {
+      "epoch": 9.391403110256858,
+      "grad_norm": 0.33572641015052795,
+      "learning_rate": 0.00048759090909090904,
+      "loss": 3.3738,
+      "step": 32250
+    },
+    {
+      "epoch": 9.405964237870581,
+      "grad_norm": 0.34592849016189575,
+      "learning_rate": 0.0004874160839160839,
+      "loss": 3.3792,
+      "step": 32300
+    },
+    {
+      "epoch": 9.420525365484304,
+      "grad_norm": 0.39023056626319885,
+      "learning_rate": 0.0004872412587412587,
+      "loss": 3.3712,
+      "step": 32350
+    },
+    {
+      "epoch": 9.435086493098025,
+      "grad_norm": 0.3557857871055603,
+      "learning_rate": 0.00048706643356643354,
+      "loss": 3.3959,
+      "step": 32400
+    },
+    {
+      "epoch": 9.449647620711747,
+      "grad_norm": 0.3627590537071228,
+      "learning_rate": 0.00048689160839160834,
+      "loss": 3.3798,
+      "step": 32450
+    },
+    {
+      "epoch": 9.46420874832547,
+      "grad_norm": 0.34032562375068665,
+      "learning_rate": 0.0004867167832167832,
+      "loss": 3.3949,
+      "step": 32500
+    },
+    {
+      "epoch": 9.478769875939193,
+      "grad_norm": 0.32405319809913635,
+      "learning_rate": 0.00048654195804195794,
+      "loss": 3.382,
+      "step": 32550
+    },
+    {
+      "epoch": 9.493331003552916,
+      "grad_norm": 0.34905362129211426,
+      "learning_rate": 0.00048636713286713285,
+      "loss": 3.3953,
+      "step": 32600
+    },
+    {
+      "epoch": 9.507892131166638,
+      "grad_norm": 0.3418472409248352,
+      "learning_rate": 0.0004861923076923077,
+      "loss": 3.3972,
+      "step": 32650
+    },
+    {
+      "epoch": 9.52245325878036,
+      "grad_norm": 0.3480176031589508,
+      "learning_rate": 0.00048601748251748245,
+      "loss": 3.3983,
+      "step": 32700
+    },
+    {
+      "epoch": 9.537014386394082,
+      "grad_norm": 0.3377174139022827,
+      "learning_rate": 0.0004858426573426573,
+      "loss": 3.3821,
+      "step": 32750
+    },
+    {
+      "epoch": 9.551575514007805,
+      "grad_norm": 0.3357522487640381,
+      "learning_rate": 0.0004856678321678321,
+      "loss": 3.3975,
+      "step": 32800
+    },
+    {
+      "epoch": 9.566136641621528,
+      "grad_norm": 0.32841238379478455,
+      "learning_rate": 0.00048549300699300696,
+      "loss": 3.3976,
+      "step": 32850
+    },
+    {
+      "epoch": 9.58069776923525,
+      "grad_norm": 0.33749887347221375,
+      "learning_rate": 0.00048531818181818176,
+      "loss": 3.4136,
+      "step": 32900
+    },
+    {
+      "epoch": 9.595258896848971,
+      "grad_norm": 0.3626416325569153,
+      "learning_rate": 0.0004851433566433566,
+      "loss": 3.3985,
+      "step": 32950
+    },
+    {
+      "epoch": 9.609820024462694,
+      "grad_norm": 0.36860400438308716,
+      "learning_rate": 0.0004849685314685314,
+      "loss": 3.3965,
+      "step": 33000
+    },
+    {
+      "epoch": 9.609820024462694,
+      "eval_accuracy": 0.36782729958925975,
+      "eval_loss": 3.5715696811676025,
+      "eval_runtime": 183.923,
+      "eval_samples_per_second": 90.505,
+      "eval_steps_per_second": 5.66,
+      "step": 33000
+    },
+    {
+      "epoch": 9.624381152076417,
+      "grad_norm": 0.3426574468612671,
+      "learning_rate": 0.00048479370629370627,
+      "loss": 3.4014,
+      "step": 33050
+    },
+    {
+      "epoch": 9.63894227969014,
+      "grad_norm": 0.3384750783443451,
+      "learning_rate": 0.00048461888111888106,
+      "loss": 3.4112,
+      "step": 33100
+    },
+    {
+      "epoch": 9.653503407303862,
+      "grad_norm": 0.35202690958976746,
+      "learning_rate": 0.0004844440559440559,
+      "loss": 3.4129,
+      "step": 33150
+    },
+    {
+      "epoch": 9.668064534917583,
+      "grad_norm": 0.355497270822525,
+      "learning_rate": 0.0004842692307692307,
+      "loss": 3.3892,
+      "step": 33200
+    },
+    {
+      "epoch": 9.682625662531306,
+      "grad_norm": 0.32850146293640137,
+      "learning_rate": 0.00048409440559440557,
+      "loss": 3.4,
+      "step": 33250
+    },
+    {
+      "epoch": 9.697186790145029,
+      "grad_norm": 0.3368713855743408,
+      "learning_rate": 0.0004839195804195803,
+      "loss": 3.4127,
+      "step": 33300
+    },
+    {
+      "epoch": 9.711747917758752,
+      "grad_norm": 0.3568696677684784,
+      "learning_rate": 0.0004837447552447552,
+      "loss": 3.3944,
+      "step": 33350
+    },
+    {
+      "epoch": 9.726309045372474,
+      "grad_norm": 0.32732048630714417,
+      "learning_rate": 0.0004835699300699301,
+      "loss": 3.4031,
+      "step": 33400
+    },
+    {
+      "epoch": 9.740870172986195,
+      "grad_norm": 0.3446010649204254,
+      "learning_rate": 0.0004833951048951048,
+      "loss": 3.4169,
+      "step": 33450
+    },
+    {
+      "epoch": 9.755431300599918,
+      "grad_norm": 0.32168522477149963,
+      "learning_rate": 0.0004832202797202797,
+      "loss": 3.4106,
+      "step": 33500
+    },
+    {
+      "epoch": 9.76999242821364,
+      "grad_norm": 0.35548439621925354,
+      "learning_rate": 0.0004830454545454545,
+      "loss": 3.3974,
+      "step": 33550
+    },
+    {
+      "epoch": 9.784553555827364,
+      "grad_norm": 0.3315522074699402,
+      "learning_rate": 0.00048287062937062933,
+      "loss": 3.4138,
+      "step": 33600
+    },
+    {
+      "epoch": 9.799114683441086,
+      "grad_norm": 0.33013713359832764,
+      "learning_rate": 0.00048269580419580413,
+      "loss": 3.4092,
+      "step": 33650
+    },
+    {
+      "epoch": 9.813675811054807,
+      "grad_norm": 0.34848856925964355,
+      "learning_rate": 0.000482520979020979,
+      "loss": 3.4104,
+      "step": 33700
+    },
+    {
+      "epoch": 9.82823693866853,
+      "grad_norm": 0.32687628269195557,
+      "learning_rate": 0.0004823461538461538,
+      "loss": 3.4018,
+      "step": 33750
+    },
+    {
+      "epoch": 9.842798066282253,
+      "grad_norm": 0.36140188574790955,
+      "learning_rate": 0.00048217132867132864,
+      "loss": 3.4006,
+      "step": 33800
+    },
+    {
+      "epoch": 9.857359193895975,
+      "grad_norm": 0.32018741965293884,
+      "learning_rate": 0.00048199650349650344,
+      "loss": 3.4013,
+      "step": 33850
+    },
+    {
+      "epoch": 9.871920321509698,
+      "grad_norm": 0.34114909172058105,
+      "learning_rate": 0.0004818216783216783,
+      "loss": 3.3959,
+      "step": 33900
+    },
+    {
+      "epoch": 9.88648144912342,
+      "grad_norm": 0.32743898034095764,
+      "learning_rate": 0.0004816468531468531,
+      "loss": 3.4191,
+      "step": 33950
+    },
+    {
+      "epoch": 9.901042576737142,
+      "grad_norm": 0.37873852252960205,
+      "learning_rate": 0.00048147202797202795,
+      "loss": 3.4187,
+      "step": 34000
+    },
+    {
+      "epoch": 9.901042576737142,
+      "eval_accuracy": 0.36834492774954836,
+      "eval_loss": 3.561105728149414,
+      "eval_runtime": 179.7864,
+      "eval_samples_per_second": 92.588,
+      "eval_steps_per_second": 5.79,
+      "step": 34000
+    },
+    {
+      "epoch": 9.915603704350865,
+      "grad_norm": 0.33194535970687866,
+      "learning_rate": 0.0004812972027972028,
+      "loss": 3.4104,
+      "step": 34050
+    },
+    {
+      "epoch": 9.930164831964587,
+      "grad_norm": 0.3524761497974396,
+      "learning_rate": 0.0004811223776223776,
+      "loss": 3.4132,
+      "step": 34100
+    },
+    {
+      "epoch": 9.94472595957831,
+      "grad_norm": 0.3482424020767212,
+      "learning_rate": 0.00048094755244755245,
+      "loss": 3.4059,
+      "step": 34150
+    },
+    {
+      "epoch": 9.959287087192033,
+      "grad_norm": 0.350705087184906,
+      "learning_rate": 0.0004807727272727272,
+      "loss": 3.4047,
+      "step": 34200
+    },
+    {
+      "epoch": 9.973848214805754,
+      "grad_norm": 0.33391574025154114,
+      "learning_rate": 0.00048059790209790205,
+      "loss": 3.4213,
+      "step": 34250
+    },
+    {
+      "epoch": 9.988409342419477,
+      "grad_norm": 0.3514692187309265,
+      "learning_rate": 0.00048042307692307685,
+      "loss": 3.4138,
+      "step": 34300
+    },
+    {
+      "epoch": 10.002912225522744,
+      "grad_norm": 0.3422977030277252,
+      "learning_rate": 0.0004802482517482517,
+      "loss": 3.3925,
+      "step": 34350
+    },
+    {
+      "epoch": 10.017473353136467,
+      "grad_norm": 0.33694586157798767,
+      "learning_rate": 0.0004800734265734265,
+      "loss": 3.296,
+      "step": 34400
+    },
+    {
+      "epoch": 10.03203448075019,
+      "grad_norm": 0.3611753284931183,
+      "learning_rate": 0.00047989860139860136,
+      "loss": 3.2954,
+      "step": 34450
+    },
+    {
+      "epoch": 10.046595608363912,
+      "grad_norm": 0.3551093637943268,
+      "learning_rate": 0.00047972377622377616,
+      "loss": 3.2935,
+      "step": 34500
+    },
+    {
+      "epoch": 10.061156735977635,
+      "grad_norm": 0.37375402450561523,
+      "learning_rate": 0.000479548951048951,
+      "loss": 3.3175,
+      "step": 34550
+    },
+    {
+      "epoch": 10.075717863591356,
+      "grad_norm": 0.365528404712677,
+      "learning_rate": 0.0004793741258741258,
+      "loss": 3.3185,
+      "step": 34600
+    },
+    {
+      "epoch": 10.090278991205079,
+      "grad_norm": 0.35895606875419617,
+      "learning_rate": 0.00047919930069930067,
+      "loss": 3.3174,
+      "step": 34650
+    },
+    {
+      "epoch": 10.104840118818801,
+      "grad_norm": 0.32946503162384033,
+      "learning_rate": 0.0004790244755244755,
+      "loss": 3.32,
+      "step": 34700
+    },
+    {
+      "epoch": 10.119401246432524,
+      "grad_norm": 0.33243829011917114,
+      "learning_rate": 0.0004788496503496503,
+      "loss": 3.3142,
+      "step": 34750
+    },
+    {
+      "epoch": 10.133962374046247,
+      "grad_norm": 0.3511507511138916,
+      "learning_rate": 0.0004786748251748252,
+      "loss": 3.3314,
+      "step": 34800
+    },
+    {
+      "epoch": 10.148523501659968,
+      "grad_norm": 0.3584575653076172,
+      "learning_rate": 0.0004785,
+      "loss": 3.3345,
+      "step": 34850
+    },
+    {
+      "epoch": 10.16308462927369,
+      "grad_norm": 0.3298545479774475,
+      "learning_rate": 0.00047832517482517483,
+      "loss": 3.339,
+      "step": 34900
+    },
+    {
+      "epoch": 10.177645756887413,
+      "grad_norm": 0.3483952581882477,
+      "learning_rate": 0.0004781503496503496,
+      "loss": 3.324,
+      "step": 34950
+    },
+    {
+      "epoch": 10.192206884501136,
+      "grad_norm": 0.3524647057056427,
+      "learning_rate": 0.00047797552447552443,
+      "loss": 3.344,
+      "step": 35000
+    },
+    {
+      "epoch": 10.192206884501136,
+      "eval_accuracy": 0.3683687929588463,
+      "eval_loss": 3.5726046562194824,
+      "eval_runtime": 180.0575,
+      "eval_samples_per_second": 92.448,
+      "eval_steps_per_second": 5.781,
+      "step": 35000
+    },
+    {
+      "epoch": 10.206768012114859,
+      "grad_norm": 0.3376518189907074,
+      "learning_rate": 0.00047780069930069923,
+      "loss": 3.3347,
+      "step": 35050
+    },
+    {
+      "epoch": 10.221329139728581,
+      "grad_norm": 0.3457695543766022,
+      "learning_rate": 0.0004776258741258741,
+      "loss": 3.3483,
+      "step": 35100
+    },
+    {
+      "epoch": 10.235890267342302,
+      "grad_norm": 0.37430188059806824,
+      "learning_rate": 0.0004774510489510489,
+      "loss": 3.3552,
+      "step": 35150
+    },
+    {
+      "epoch": 10.250451394956025,
+      "grad_norm": 0.3510351777076721,
+      "learning_rate": 0.00047727622377622374,
+      "loss": 3.3549,
+      "step": 35200
+    },
+    {
+      "epoch": 10.265012522569748,
+      "grad_norm": 0.37889590859413147,
+      "learning_rate": 0.00047710139860139854,
+      "loss": 3.3416,
+      "step": 35250
+    },
+    {
+      "epoch": 10.27957365018347,
+      "grad_norm": 0.3422775864601135,
+      "learning_rate": 0.0004769265734265734,
+      "loss": 3.3383,
+      "step": 35300
+    },
+    {
+      "epoch": 10.294134777797193,
+      "grad_norm": 0.38626229763031006,
+      "learning_rate": 0.0004767517482517482,
+      "loss": 3.3461,
+      "step": 35350
+    },
+    {
+      "epoch": 10.308695905410914,
+      "grad_norm": 0.3493908643722534,
+      "learning_rate": 0.00047657692307692304,
+      "loss": 3.3535,
+      "step": 35400
+    },
+    {
+      "epoch": 10.323257033024637,
+      "grad_norm": 0.35432669520378113,
+      "learning_rate": 0.0004764020979020979,
+      "loss": 3.3555,
+      "step": 35450
+    },
+    {
+      "epoch": 10.33781816063836,
+      "grad_norm": 0.3410918116569519,
+      "learning_rate": 0.0004762272727272727,
+      "loss": 3.3469,
+      "step": 35500
+    },
+    {
+      "epoch": 10.352379288252083,
+      "grad_norm": 0.36023515462875366,
+      "learning_rate": 0.00047605244755244755,
+      "loss": 3.3584,
+      "step": 35550
+    },
+    {
+      "epoch": 10.366940415865805,
+      "grad_norm": 0.3287743330001831,
+      "learning_rate": 0.00047587762237762235,
+      "loss": 3.3674,
+      "step": 35600
+    },
+    {
+      "epoch": 10.381501543479526,
+      "grad_norm": 0.3435341715812683,
+      "learning_rate": 0.0004757027972027972,
+      "loss": 3.3646,
+      "step": 35650
+    },
+    {
+      "epoch": 10.396062671093249,
+      "grad_norm": 0.3478064239025116,
+      "learning_rate": 0.00047552797202797195,
+      "loss": 3.3675,
+      "step": 35700
+    },
+    {
+      "epoch": 10.410623798706972,
+      "grad_norm": 0.3641142249107361,
+      "learning_rate": 0.0004753531468531468,
+      "loss": 3.3591,
+      "step": 35750
+    },
+    {
+      "epoch": 10.425184926320695,
+      "grad_norm": 0.3393605649471283,
+      "learning_rate": 0.0004751783216783216,
+      "loss": 3.3615,
+      "step": 35800
+    },
+    {
+      "epoch": 10.439746053934417,
+      "grad_norm": 0.38742467761039734,
+      "learning_rate": 0.00047500349650349646,
+      "loss": 3.3701,
+      "step": 35850
+    },
+    {
+      "epoch": 10.454307181548138,
+      "grad_norm": 0.37009376287460327,
+      "learning_rate": 0.00047482867132867126,
+      "loss": 3.3576,
+      "step": 35900
+    },
+    {
+      "epoch": 10.468868309161861,
+      "grad_norm": 0.36964377760887146,
+      "learning_rate": 0.0004746538461538461,
+      "loss": 3.3546,
+      "step": 35950
+    },
+    {
+      "epoch": 10.483429436775584,
+      "grad_norm": 0.3347964882850647,
+      "learning_rate": 0.0004744790209790209,
+      "loss": 3.3789,
+      "step": 36000
+    },
+    {
+      "epoch": 10.483429436775584,
+      "eval_accuracy": 0.3682949636414124,
+      "eval_loss": 3.5694682598114014,
+      "eval_runtime": 179.8767,
+      "eval_samples_per_second": 92.541,
+      "eval_steps_per_second": 5.787,
+      "step": 36000
+    },
+    {
+      "epoch": 10.497990564389307,
+      "grad_norm": 0.3568975329399109,
+      "learning_rate": 0.00047430419580419576,
+      "loss": 3.3692,
+      "step": 36050
+    },
+    {
+      "epoch": 10.51255169200303,
+      "grad_norm": 0.3243386447429657,
+      "learning_rate": 0.0004741293706293706,
+      "loss": 3.3693,
+      "step": 36100
+    },
+    {
+      "epoch": 10.52711281961675,
+      "grad_norm": 0.3336549997329712,
+      "learning_rate": 0.0004739545454545454,
+      "loss": 3.3785,
+      "step": 36150
+    },
+    {
+      "epoch": 10.541673947230473,
+      "grad_norm": 0.3561848998069763,
+      "learning_rate": 0.00047377972027972027,
+      "loss": 3.3691,
+      "step": 36200
+    },
+    {
+      "epoch": 10.556235074844196,
+      "grad_norm": 0.356851726770401,
+      "learning_rate": 0.00047360489510489507,
+      "loss": 3.3643,
+      "step": 36250
+    },
+    {
+      "epoch": 10.570796202457919,
+      "grad_norm": 0.33825376629829407,
+      "learning_rate": 0.0004734300699300699,
+      "loss": 3.3913,
+      "step": 36300
+    },
+    {
+      "epoch": 10.585357330071641,
+      "grad_norm": 0.3185909390449524,
+      "learning_rate": 0.0004732552447552447,
+      "loss": 3.3784,
+      "step": 36350
+    },
+    {
+      "epoch": 10.599918457685362,
+      "grad_norm": 0.3499145805835724,
+      "learning_rate": 0.0004730804195804196,
+      "loss": 3.379,
+      "step": 36400
+    },
+    {
+      "epoch": 10.614479585299085,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004729055944055943,
+      "loss": 3.3796,
+      "step": 36450
+    },
+    {
+      "epoch": 10.629040712912808,
+      "grad_norm": 0.34612196683883667,
+      "learning_rate": 0.0004727307692307692,
+      "loss": 3.3845,
+      "step": 36500
+    },
+    {
+      "epoch": 10.64360184052653,
+      "grad_norm": 0.3680227994918823,
+      "learning_rate": 0.000472555944055944,
+      "loss": 3.3761,
+      "step": 36550
+    },
+    {
+      "epoch": 10.658162968140253,
+      "grad_norm": 0.36743855476379395,
+      "learning_rate": 0.00047238111888111883,
+      "loss": 3.3684,
+      "step": 36600
+    },
+    {
+      "epoch": 10.672724095753976,
+      "grad_norm": 0.33822494745254517,
+      "learning_rate": 0.00047220629370629363,
+      "loss": 3.3854,
+      "step": 36650
+    },
+    {
+      "epoch": 10.687285223367697,
+      "grad_norm": 0.3840745687484741,
+      "learning_rate": 0.0004720314685314685,
+      "loss": 3.3676,
+      "step": 36700
+    },
+    {
+      "epoch": 10.70184635098142,
+      "grad_norm": 0.3411411941051483,
+      "learning_rate": 0.0004718566433566433,
+      "loss": 3.3732,
+      "step": 36750
+    },
+    {
+      "epoch": 10.716407478595142,
+      "grad_norm": 0.3389338552951813,
+      "learning_rate": 0.00047168181818181814,
+      "loss": 3.3848,
+      "step": 36800
+    },
+    {
+      "epoch": 10.730968606208865,
+      "grad_norm": 0.33619245886802673,
+      "learning_rate": 0.000471506993006993,
+      "loss": 3.3761,
+      "step": 36850
+    },
+    {
+      "epoch": 10.745529733822588,
+      "grad_norm": 0.35158175230026245,
+      "learning_rate": 0.0004713321678321678,
+      "loss": 3.3823,
+      "step": 36900
+    },
+    {
+      "epoch": 10.760090861436309,
+      "grad_norm": 0.3377283811569214,
+      "learning_rate": 0.00047115734265734265,
+      "loss": 3.4041,
+      "step": 36950
+    },
+    {
+      "epoch": 10.774651989050032,
+      "grad_norm": 0.3622257113456726,
+      "learning_rate": 0.00047098251748251745,
+      "loss": 3.4005,
+      "step": 37000
+    },
+    {
+      "epoch": 10.774651989050032,
+      "eval_accuracy": 0.36925674333237796,
+      "eval_loss": 3.5602827072143555,
+      "eval_runtime": 179.7742,
+      "eval_samples_per_second": 92.594,
+      "eval_steps_per_second": 5.791,
+      "step": 37000
+    },
+    {
+      "epoch": 10.789213116663754,
+      "grad_norm": 0.3160146474838257,
+      "learning_rate": 0.0004708076923076923,
+      "loss": 3.3912,
+      "step": 37050
+    },
+    {
+      "epoch": 10.803774244277477,
+      "grad_norm": 0.3281710147857666,
+      "learning_rate": 0.0004706328671328671,
+      "loss": 3.3967,
+      "step": 37100
+    },
+    {
+      "epoch": 10.8183353718912,
+      "grad_norm": 0.3841243386268616,
+      "learning_rate": 0.00047045804195804195,
+      "loss": 3.3979,
+      "step": 37150
+    },
+    {
+      "epoch": 10.83289649950492,
+      "grad_norm": 0.33870750665664673,
+      "learning_rate": 0.0004702832167832167,
+      "loss": 3.3879,
+      "step": 37200
+    },
+    {
+      "epoch": 10.847457627118644,
+      "grad_norm": 0.34300288558006287,
+      "learning_rate": 0.00047010839160839155,
+      "loss": 3.3959,
+      "step": 37250
+    },
+    {
+      "epoch": 10.862018754732366,
+      "grad_norm": 0.35645779967308044,
+      "learning_rate": 0.00046993356643356635,
+      "loss": 3.3974,
+      "step": 37300
+    },
+    {
+      "epoch": 10.876579882346089,
+      "grad_norm": 0.35175466537475586,
+      "learning_rate": 0.0004697587412587412,
+      "loss": 3.3888,
+      "step": 37350
+    },
+    {
+      "epoch": 10.891141009959812,
+      "grad_norm": 0.3222729563713074,
+      "learning_rate": 0.000469583916083916,
+      "loss": 3.3981,
+      "step": 37400
+    },
+    {
+      "epoch": 10.905702137573535,
+      "grad_norm": 0.32414838671684265,
+      "learning_rate": 0.00046940909090909086,
+      "loss": 3.391,
+      "step": 37450
+    },
+    {
+      "epoch": 10.920263265187256,
+      "grad_norm": 0.34362274408340454,
+      "learning_rate": 0.0004692342657342657,
+      "loss": 3.3832,
+      "step": 37500
+    },
+    {
+      "epoch": 10.934824392800978,
+      "grad_norm": 0.3682989776134491,
+      "learning_rate": 0.0004690594405594405,
+      "loss": 3.3894,
+      "step": 37550
+    },
+    {
+      "epoch": 10.949385520414701,
+      "grad_norm": 0.3218347728252411,
+      "learning_rate": 0.00046888461538461537,
+      "loss": 3.3901,
+      "step": 37600
+    },
+    {
+      "epoch": 10.963946648028424,
+      "grad_norm": 0.37173983454704285,
+      "learning_rate": 0.00046870979020979017,
+      "loss": 3.3884,
+      "step": 37650
+    },
+    {
+      "epoch": 10.978507775642147,
+      "grad_norm": 0.37107351422309875,
+      "learning_rate": 0.000468534965034965,
+      "loss": 3.3922,
+      "step": 37700
+    },
+    {
+      "epoch": 10.993068903255867,
+      "grad_norm": 0.35204780101776123,
+      "learning_rate": 0.0004683601398601398,
+      "loss": 3.3753,
+      "step": 37750
+    },
+    {
+      "epoch": 11.007571786359136,
+      "grad_norm": 0.33582913875579834,
+      "learning_rate": 0.0004681853146853147,
+      "loss": 3.3251,
+      "step": 37800
+    },
+    {
+      "epoch": 11.022132913972857,
+      "grad_norm": 0.33512166142463684,
+      "learning_rate": 0.0004680104895104895,
+      "loss": 3.2653,
+      "step": 37850
+    },
+    {
+      "epoch": 11.03669404158658,
+      "grad_norm": 0.3530137240886688,
+      "learning_rate": 0.00046783566433566433,
+      "loss": 3.2888,
+      "step": 37900
+    },
+    {
+      "epoch": 11.051255169200303,
+      "grad_norm": 0.3322924077510834,
+      "learning_rate": 0.0004676608391608391,
+      "loss": 3.304,
+      "step": 37950
+    },
+    {
+      "epoch": 11.065816296814026,
+      "grad_norm": 0.34434235095977783,
+      "learning_rate": 0.00046748601398601393,
+      "loss": 3.2886,
+      "step": 38000
+    },
+    {
+      "epoch": 11.065816296814026,
+      "eval_accuracy": 0.36925674333237796,
+      "eval_loss": 3.5678048133850098,
+      "eval_runtime": 179.7265,
+      "eval_samples_per_second": 92.618,
+      "eval_steps_per_second": 5.792,
+      "step": 38000
+    },
+    {
+      "epoch": 11.080377424427748,
+      "grad_norm": 0.3417208790779114,
+      "learning_rate": 0.00046731118881118873,
+      "loss": 3.2852,
+      "step": 38050
+    },
+    {
+      "epoch": 11.09493855204147,
+      "grad_norm": 0.36699753999710083,
+      "learning_rate": 0.0004671363636363636,
+      "loss": 3.2885,
+      "step": 38100
+    },
+    {
+      "epoch": 11.109499679655192,
+      "grad_norm": 0.336487740278244,
+      "learning_rate": 0.00046696153846153844,
+      "loss": 3.309,
+      "step": 38150
+    },
+    {
+      "epoch": 11.124060807268915,
+      "grad_norm": 0.35883960127830505,
+      "learning_rate": 0.00046678671328671324,
+      "loss": 3.3019,
+      "step": 38200
+    },
+    {
+      "epoch": 11.138621934882638,
+      "grad_norm": 0.3807854950428009,
+      "learning_rate": 0.0004666118881118881,
+      "loss": 3.3034,
+      "step": 38250
+    },
+    {
+      "epoch": 11.15318306249636,
+      "grad_norm": 0.3594328463077545,
+      "learning_rate": 0.0004664370629370629,
+      "loss": 3.2984,
+      "step": 38300
+    },
+    {
+      "epoch": 11.167744190110081,
+      "grad_norm": 0.344275563955307,
+      "learning_rate": 0.00046626223776223774,
+      "loss": 3.3229,
+      "step": 38350
+    },
+    {
+      "epoch": 11.182305317723804,
+      "grad_norm": 0.3437211811542511,
+      "learning_rate": 0.00046608741258741254,
+      "loss": 3.3241,
+      "step": 38400
+    },
+    {
+      "epoch": 11.196866445337527,
+      "grad_norm": 0.3443619906902313,
+      "learning_rate": 0.0004659125874125874,
+      "loss": 3.319,
+      "step": 38450
+    },
+    {
+      "epoch": 11.21142757295125,
+      "grad_norm": 0.3490242063999176,
+      "learning_rate": 0.0004657377622377622,
+      "loss": 3.3212,
+      "step": 38500
+    },
+    {
+      "epoch": 11.225988700564972,
+      "grad_norm": 0.33340850472450256,
+      "learning_rate": 0.00046556293706293705,
+      "loss": 3.3279,
+      "step": 38550
+    },
+    {
+      "epoch": 11.240549828178693,
+      "grad_norm": 0.33329835534095764,
+      "learning_rate": 0.00046538811188811185,
+      "loss": 3.3262,
+      "step": 38600
+    },
+    {
+      "epoch": 11.255110955792416,
+      "grad_norm": 0.3586530387401581,
+      "learning_rate": 0.0004652132867132867,
+      "loss": 3.3209,
+      "step": 38650
+    },
+    {
+      "epoch": 11.269672083406139,
+      "grad_norm": 0.3424071669578552,
+      "learning_rate": 0.00046503846153846145,
+      "loss": 3.3249,
+      "step": 38700
+    },
+    {
+      "epoch": 11.284233211019862,
+      "grad_norm": 0.36820656061172485,
+      "learning_rate": 0.0004648636363636363,
+      "loss": 3.3411,
+      "step": 38750
+    },
+    {
+      "epoch": 11.298794338633584,
+      "grad_norm": 0.33335718512535095,
+      "learning_rate": 0.0004646888111888111,
+      "loss": 3.3367,
+      "step": 38800
+    },
+    {
+      "epoch": 11.313355466247307,
+      "grad_norm": 0.3624469041824341,
+      "learning_rate": 0.00046451398601398596,
+      "loss": 3.3382,
+      "step": 38850
+    },
+    {
+      "epoch": 11.327916593861028,
+      "grad_norm": 0.3541378974914551,
+      "learning_rate": 0.0004643391608391608,
+      "loss": 3.335,
+      "step": 38900
+    },
+    {
+      "epoch": 11.34247772147475,
+      "grad_norm": 0.3461047112941742,
+      "learning_rate": 0.0004641643356643356,
+      "loss": 3.3381,
+      "step": 38950
+    },
+    {
+      "epoch": 11.357038849088473,
+      "grad_norm": 0.33990010619163513,
+      "learning_rate": 0.00046398951048951046,
+      "loss": 3.339,
+      "step": 39000
+    },
+    {
+      "epoch": 11.357038849088473,
+      "eval_accuracy": 0.36928096122949794,
+      "eval_loss": 3.5659894943237305,
+      "eval_runtime": 179.756,
+      "eval_samples_per_second": 92.603,
+      "eval_steps_per_second": 5.791,
+      "step": 39000
+    },
+    {
+      "epoch": 11.371599976702196,
+      "grad_norm": 0.32082730531692505,
+      "learning_rate": 0.00046381468531468526,
+      "loss": 3.3427,
+      "step": 39050
+    },
+    {
+      "epoch": 11.386161104315919,
+      "grad_norm": 0.34358900785446167,
+      "learning_rate": 0.0004636398601398601,
+      "loss": 3.3326,
+      "step": 39100
+    },
+    {
+      "epoch": 11.40072223192964,
+      "grad_norm": 0.3438430428504944,
+      "learning_rate": 0.0004634650349650349,
+      "loss": 3.3397,
+      "step": 39150
+    },
+    {
+      "epoch": 11.415283359543363,
+      "grad_norm": 0.33695173263549805,
+      "learning_rate": 0.00046329020979020977,
+      "loss": 3.3419,
+      "step": 39200
+    },
+    {
+      "epoch": 11.429844487157085,
+      "grad_norm": 0.35443180799484253,
+      "learning_rate": 0.00046311538461538457,
+      "loss": 3.3384,
+      "step": 39250
+    },
+    {
+      "epoch": 11.444405614770808,
+      "grad_norm": 0.3469848930835724,
+      "learning_rate": 0.0004629405594405594,
+      "loss": 3.3428,
+      "step": 39300
+    },
+    {
+      "epoch": 11.458966742384531,
+      "grad_norm": 0.3489019274711609,
+      "learning_rate": 0.0004627657342657342,
+      "loss": 3.3367,
+      "step": 39350
+    },
+    {
+      "epoch": 11.473527869998252,
+      "grad_norm": 0.34575167298316956,
+      "learning_rate": 0.0004625909090909091,
+      "loss": 3.3421,
+      "step": 39400
+    },
+    {
+      "epoch": 11.488088997611975,
+      "grad_norm": 0.36594027280807495,
+      "learning_rate": 0.0004624160839160838,
+      "loss": 3.3443,
+      "step": 39450
+    },
+    {
+      "epoch": 11.502650125225697,
+      "grad_norm": 0.37035486102104187,
+      "learning_rate": 0.0004622412587412587,
+      "loss": 3.3556,
+      "step": 39500
+    },
+    {
+      "epoch": 11.51721125283942,
+      "grad_norm": 0.378604918718338,
+      "learning_rate": 0.00046206643356643353,
+      "loss": 3.3449,
+      "step": 39550
+    },
+    {
+      "epoch": 11.531772380453143,
+      "grad_norm": 0.37700214982032776,
+      "learning_rate": 0.00046189160839160833,
+      "loss": 3.3588,
+      "step": 39600
+    },
+    {
+      "epoch": 11.546333508066864,
+      "grad_norm": 0.3387846350669861,
+      "learning_rate": 0.0004617167832167832,
+      "loss": 3.3551,
+      "step": 39650
+    },
+    {
+      "epoch": 11.560894635680587,
+      "grad_norm": 0.35091421008110046,
+      "learning_rate": 0.000461541958041958,
+      "loss": 3.3484,
+      "step": 39700
+    },
+    {
+      "epoch": 11.57545576329431,
+      "grad_norm": 0.36825278401374817,
+      "learning_rate": 0.00046136713286713284,
+      "loss": 3.3578,
+      "step": 39750
+    },
+    {
+      "epoch": 11.590016890908032,
+      "grad_norm": 0.3793783485889435,
+      "learning_rate": 0.00046119230769230764,
+      "loss": 3.3567,
+      "step": 39800
+    },
+    {
+      "epoch": 11.604578018521755,
+      "grad_norm": 0.35660460591316223,
+      "learning_rate": 0.0004610174825174825,
+      "loss": 3.3665,
+      "step": 39850
+    },
+    {
+      "epoch": 11.619139146135478,
+      "grad_norm": 0.3568241000175476,
+      "learning_rate": 0.0004608426573426573,
+      "loss": 3.3687,
+      "step": 39900
+    },
+    {
+      "epoch": 11.633700273749199,
+      "grad_norm": 0.3692575991153717,
+      "learning_rate": 0.00046066783216783215,
+      "loss": 3.3489,
+      "step": 39950
+    },
+    {
+      "epoch": 11.648261401362921,
+      "grad_norm": 0.33610156178474426,
+      "learning_rate": 0.00046049300699300695,
+      "loss": 3.3622,
+      "step": 40000
+    },
+    {
+      "epoch": 11.648261401362921,
+      "eval_accuracy": 0.36969924898655154,
+      "eval_loss": 3.5569655895233154,
+      "eval_runtime": 179.7612,
+      "eval_samples_per_second": 92.601,
+      "eval_steps_per_second": 5.791,
+      "step": 40000
+    },
+    {
+      "epoch": 11.662822528976644,
+      "grad_norm": 0.35730060935020447,
+      "learning_rate": 0.0004603181818181818,
+      "loss": 3.3747,
+      "step": 40050
+    },
+    {
+      "epoch": 11.677383656590367,
+      "grad_norm": 0.36511674523353577,
+      "learning_rate": 0.0004601433566433566,
+      "loss": 3.3653,
+      "step": 40100
+    },
+    {
+      "epoch": 11.69194478420409,
+      "grad_norm": 0.3742559254169464,
+      "learning_rate": 0.00045996853146853145,
+      "loss": 3.3673,
+      "step": 40150
+    },
+    {
+      "epoch": 11.70650591181781,
+      "grad_norm": 0.3337925970554352,
+      "learning_rate": 0.0004597937062937062,
+      "loss": 3.353,
+      "step": 40200
+    },
+    {
+      "epoch": 11.721067039431533,
+      "grad_norm": 0.3480316996574402,
+      "learning_rate": 0.00045961888111888105,
+      "loss": 3.3657,
+      "step": 40250
+    },
+    {
+      "epoch": 11.735628167045256,
+      "grad_norm": 0.34601667523384094,
+      "learning_rate": 0.0004594440559440559,
+      "loss": 3.3716,
+      "step": 40300
+    },
+    {
+      "epoch": 11.750189294658979,
+      "grad_norm": 0.34064266085624695,
+      "learning_rate": 0.0004592692307692307,
+      "loss": 3.3759,
+      "step": 40350
+    },
+    {
+      "epoch": 11.764750422272702,
+      "grad_norm": 0.34361889958381653,
+      "learning_rate": 0.00045909440559440556,
+      "loss": 3.3829,
+      "step": 40400
+    },
+    {
+      "epoch": 11.779311549886422,
+      "grad_norm": 0.36990559101104736,
+      "learning_rate": 0.00045891958041958036,
+      "loss": 3.3704,
+      "step": 40450
+    },
+    {
+      "epoch": 11.793872677500145,
+      "grad_norm": 0.3608870804309845,
+      "learning_rate": 0.0004587447552447552,
+      "loss": 3.3708,
+      "step": 40500
+    },
+    {
+      "epoch": 11.808433805113868,
+      "grad_norm": 0.3519209027290344,
+      "learning_rate": 0.00045856993006993,
+      "loss": 3.3659,
+      "step": 40550
+    },
+    {
+      "epoch": 11.82299493272759,
+      "grad_norm": 0.353716641664505,
+      "learning_rate": 0.00045839510489510487,
+      "loss": 3.3706,
+      "step": 40600
+    },
+    {
+      "epoch": 11.837556060341313,
+      "grad_norm": 0.3455187976360321,
+      "learning_rate": 0.00045822027972027967,
+      "loss": 3.3705,
+      "step": 40650
+    },
+    {
+      "epoch": 11.852117187955034,
+      "grad_norm": 0.373958945274353,
+      "learning_rate": 0.0004580454545454545,
+      "loss": 3.348,
+      "step": 40700
+    },
+    {
+      "epoch": 11.866678315568757,
+      "grad_norm": 0.3595150411128998,
+      "learning_rate": 0.0004578706293706293,
+      "loss": 3.3672,
+      "step": 40750
+    },
+    {
+      "epoch": 11.88123944318248,
+      "grad_norm": 0.36767902970314026,
+      "learning_rate": 0.0004576958041958042,
+      "loss": 3.3728,
+      "step": 40800
+    },
+    {
+      "epoch": 11.895800570796203,
+      "grad_norm": 0.35420218110084534,
+      "learning_rate": 0.000457520979020979,
+      "loss": 3.3674,
+      "step": 40850
+    },
+    {
+      "epoch": 11.910361698409925,
+      "grad_norm": 0.35077446699142456,
+      "learning_rate": 0.00045734615384615383,
+      "loss": 3.3652,
+      "step": 40900
+    },
+    {
+      "epoch": 11.924922826023646,
+      "grad_norm": 0.3696039021015167,
+      "learning_rate": 0.0004571713286713287,
+      "loss": 3.3689,
+      "step": 40950
+    },
+    {
+      "epoch": 11.93948395363737,
+      "grad_norm": 0.34140545129776,
+      "learning_rate": 0.00045699650349650343,
+      "loss": 3.3645,
+      "step": 41000
+    },
+    {
+      "epoch": 11.93948395363737,
+      "eval_accuracy": 0.3705252438659946,
+      "eval_loss": 3.5469377040863037,
+      "eval_runtime": 179.8742,
+      "eval_samples_per_second": 92.542,
+      "eval_steps_per_second": 5.787,
+      "step": 41000
+    },
+    {
+      "epoch": 11.954045081251092,
+      "grad_norm": 0.35556402802467346,
+      "learning_rate": 0.0004568216783216783,
+      "loss": 3.3857,
+      "step": 41050
+    },
+    {
+      "epoch": 11.968606208864815,
+      "grad_norm": 0.3346519470214844,
+      "learning_rate": 0.0004566468531468531,
+      "loss": 3.3626,
+      "step": 41100
+    },
+    {
+      "epoch": 11.983167336478537,
+      "grad_norm": 0.34058094024658203,
+      "learning_rate": 0.00045647202797202794,
+      "loss": 3.3772,
+      "step": 41150
+    },
+    {
+      "epoch": 11.99772846409226,
+      "grad_norm": 0.38102081418037415,
+      "learning_rate": 0.00045629720279720274,
+      "loss": 3.3671,
+      "step": 41200
+    },
+    {
+      "epoch": 12.012231347195527,
+      "grad_norm": 0.3588341772556305,
+      "learning_rate": 0.0004561223776223776,
+      "loss": 3.2736,
+      "step": 41250
+    },
+    {
+      "epoch": 12.02679247480925,
+      "grad_norm": 0.3291144073009491,
+      "learning_rate": 0.0004559475524475524,
+      "loss": 3.2687,
+      "step": 41300
+    },
+    {
+      "epoch": 12.041353602422971,
+      "grad_norm": 0.35584986209869385,
+      "learning_rate": 0.00045577272727272724,
+      "loss": 3.271,
+      "step": 41350
+    },
+    {
+      "epoch": 12.055914730036694,
+      "grad_norm": 0.3618343770503998,
+      "learning_rate": 0.00045559790209790204,
+      "loss": 3.2689,
+      "step": 41400
+    },
+    {
+      "epoch": 12.070475857650417,
+      "grad_norm": 0.3511015772819519,
+      "learning_rate": 0.0004554230769230769,
+      "loss": 3.2771,
+      "step": 41450
+    },
+    {
+      "epoch": 12.08503698526414,
+      "grad_norm": 0.3445127308368683,
+      "learning_rate": 0.0004552482517482517,
+      "loss": 3.2766,
+      "step": 41500
+    },
+    {
+      "epoch": 12.099598112877862,
+      "grad_norm": 0.34342440962791443,
+      "learning_rate": 0.00045507342657342655,
+      "loss": 3.2763,
+      "step": 41550
+    },
+    {
+      "epoch": 12.114159240491583,
+      "grad_norm": 0.3772423565387726,
+      "learning_rate": 0.00045489860139860135,
+      "loss": 3.2767,
+      "step": 41600
+    },
+    {
+      "epoch": 12.128720368105306,
+      "grad_norm": 0.3567655086517334,
+      "learning_rate": 0.0004547237762237762,
+      "loss": 3.2699,
+      "step": 41650
+    },
+    {
+      "epoch": 12.143281495719028,
+      "grad_norm": 0.3559315800666809,
+      "learning_rate": 0.00045454895104895106,
+      "loss": 3.2895,
+      "step": 41700
+    },
+    {
+      "epoch": 12.157842623332751,
+      "grad_norm": 0.3517071604728699,
+      "learning_rate": 0.0004543741258741258,
+      "loss": 3.2963,
+      "step": 41750
+    },
+    {
+      "epoch": 12.172403750946474,
+      "grad_norm": 0.3621581792831421,
+      "learning_rate": 0.00045419930069930066,
+      "loss": 3.2909,
+      "step": 41800
+    },
+    {
+      "epoch": 12.186964878560195,
+      "grad_norm": 0.34336772561073303,
+      "learning_rate": 0.00045402447552447546,
+      "loss": 3.3055,
+      "step": 41850
+    },
+    {
+      "epoch": 12.201526006173918,
+      "grad_norm": 0.36337751150131226,
+      "learning_rate": 0.0004538496503496503,
+      "loss": 3.303,
+      "step": 41900
+    },
+    {
+      "epoch": 12.21608713378764,
+      "grad_norm": 0.38408976793289185,
+      "learning_rate": 0.0004536748251748251,
+      "loss": 3.32,
+      "step": 41950
+    },
+    {
+      "epoch": 12.230648261401363,
+      "grad_norm": 0.3528490960597992,
+      "learning_rate": 0.00045349999999999996,
+      "loss": 3.3026,
+      "step": 42000
+    },
+    {
+      "epoch": 12.230648261401363,
+      "eval_accuracy": 0.3697736661170223,
+      "eval_loss": 3.5663251876831055,
+      "eval_runtime": 179.7473,
+      "eval_samples_per_second": 92.608,
+      "eval_steps_per_second": 5.791,
+      "step": 42000
+    },
+    {
+      "epoch": 12.245209389015086,
+      "grad_norm": 0.36504805088043213,
+      "learning_rate": 0.00045332517482517476,
+      "loss": 3.3043,
+      "step": 42050
+    },
+    {
+      "epoch": 12.259770516628807,
+      "grad_norm": 0.35969120264053345,
+      "learning_rate": 0.0004531503496503496,
+      "loss": 3.3094,
+      "step": 42100
+    },
+    {
+      "epoch": 12.27433164424253,
+      "grad_norm": 0.3558993339538574,
+      "learning_rate": 0.0004529755244755244,
+      "loss": 3.3273,
+      "step": 42150
+    },
+    {
+      "epoch": 12.288892771856252,
+      "grad_norm": 0.3570861220359802,
+      "learning_rate": 0.00045280069930069927,
+      "loss": 3.3146,
+      "step": 42200
+    },
+    {
+      "epoch": 12.303453899469975,
+      "grad_norm": 0.3580907881259918,
+      "learning_rate": 0.00045262587412587407,
+      "loss": 3.3042,
+      "step": 42250
+    },
+    {
+      "epoch": 12.318015027083698,
+      "grad_norm": 0.36477571725845337,
+      "learning_rate": 0.0004524510489510489,
+      "loss": 3.315,
+      "step": 42300
+    },
+    {
+      "epoch": 12.33257615469742,
+      "grad_norm": 0.33544719219207764,
+      "learning_rate": 0.0004522762237762238,
+      "loss": 3.3091,
+      "step": 42350
+    },
+    {
+      "epoch": 12.347137282311142,
+      "grad_norm": 0.34085631370544434,
+      "learning_rate": 0.0004521013986013986,
+      "loss": 3.3134,
+      "step": 42400
+    },
+    {
+      "epoch": 12.361698409924864,
+      "grad_norm": 0.3568131625652313,
+      "learning_rate": 0.00045192657342657343,
+      "loss": 3.324,
+      "step": 42450
+    },
+    {
+      "epoch": 12.376259537538587,
+      "grad_norm": 0.33206743001937866,
+      "learning_rate": 0.0004517517482517482,
+      "loss": 3.3225,
+      "step": 42500
+    },
+    {
+      "epoch": 12.39082066515231,
+      "grad_norm": 0.37920984625816345,
+      "learning_rate": 0.00045157692307692303,
+      "loss": 3.3209,
+      "step": 42550
+    },
+    {
+      "epoch": 12.405381792766033,
+      "grad_norm": 0.3489953577518463,
+      "learning_rate": 0.00045140209790209783,
+      "loss": 3.3243,
+      "step": 42600
+    },
+    {
+      "epoch": 12.419942920379754,
+      "grad_norm": 0.340498149394989,
+      "learning_rate": 0.0004512272727272727,
+      "loss": 3.3187,
+      "step": 42650
+    },
+    {
+      "epoch": 12.434504047993476,
+      "grad_norm": 0.3989890515804291,
+      "learning_rate": 0.0004510524475524475,
+      "loss": 3.3103,
+      "step": 42700
+    },
+    {
+      "epoch": 12.449065175607199,
+      "grad_norm": 0.3404858112335205,
+      "learning_rate": 0.00045087762237762234,
+      "loss": 3.3273,
+      "step": 42750
+    },
+    {
+      "epoch": 12.463626303220922,
+      "grad_norm": 0.3558301329612732,
+      "learning_rate": 0.00045070279720279714,
+      "loss": 3.3207,
+      "step": 42800
+    },
+    {
+      "epoch": 12.478187430834645,
+      "grad_norm": 0.34800827503204346,
+      "learning_rate": 0.000450527972027972,
+      "loss": 3.3293,
+      "step": 42850
+    },
+    {
+      "epoch": 12.492748558448366,
+      "grad_norm": 0.3653114438056946,
+      "learning_rate": 0.0004503531468531468,
+      "loss": 3.3484,
+      "step": 42900
+    },
+    {
+      "epoch": 12.507309686062088,
+      "grad_norm": 0.3844183087348938,
+      "learning_rate": 0.00045017832167832165,
+      "loss": 3.3311,
+      "step": 42950
+    },
+    {
+      "epoch": 12.521870813675811,
+      "grad_norm": 0.3569987118244171,
+      "learning_rate": 0.0004500034965034965,
+      "loss": 3.3243,
+      "step": 43000
+    },
+    {
+      "epoch": 12.521870813675811,
+      "eval_accuracy": 0.37050173134451886,
+      "eval_loss": 3.554635524749756,
+      "eval_runtime": 179.7072,
+      "eval_samples_per_second": 92.628,
+      "eval_steps_per_second": 5.793,
+      "step": 43000
+    },
+    {
+      "epoch": 12.536431941289534,
+      "grad_norm": 0.348994642496109,
+      "learning_rate": 0.0004498286713286713,
+      "loss": 3.3379,
+      "step": 43050
+    },
+    {
+      "epoch": 12.550993068903256,
+      "grad_norm": 0.3396040201187134,
+      "learning_rate": 0.00044965384615384615,
+      "loss": 3.3401,
+      "step": 43100
+    },
+    {
+      "epoch": 12.565554196516977,
+      "grad_norm": 0.3896959722042084,
+      "learning_rate": 0.00044947902097902095,
+      "loss": 3.3389,
+      "step": 43150
+    },
+    {
+      "epoch": 12.5801153241307,
+      "grad_norm": 0.3507692813873291,
+      "learning_rate": 0.0004493041958041958,
+      "loss": 3.3419,
+      "step": 43200
+    },
+    {
+      "epoch": 12.594676451744423,
+      "grad_norm": 0.38180142641067505,
+      "learning_rate": 0.00044912937062937055,
+      "loss": 3.3378,
+      "step": 43250
+    },
+    {
+      "epoch": 12.609237579358146,
+      "grad_norm": 0.3205597996711731,
+      "learning_rate": 0.0004489545454545454,
+      "loss": 3.3532,
+      "step": 43300
+    },
+    {
+      "epoch": 12.623798706971868,
+      "grad_norm": 0.3707713782787323,
+      "learning_rate": 0.0004487797202797202,
+      "loss": 3.335,
+      "step": 43350
+    },
+    {
+      "epoch": 12.63835983458559,
+      "grad_norm": 0.3585401773452759,
+      "learning_rate": 0.00044860489510489506,
+      "loss": 3.3376,
+      "step": 43400
+    },
+    {
+      "epoch": 12.652920962199312,
+      "grad_norm": 0.36372795701026917,
+      "learning_rate": 0.00044843006993006986,
+      "loss": 3.3392,
+      "step": 43450
+    },
+    {
+      "epoch": 12.667482089813035,
+      "grad_norm": 0.36102354526519775,
+      "learning_rate": 0.0004482552447552447,
+      "loss": 3.3409,
+      "step": 43500
+    },
+    {
+      "epoch": 12.682043217426758,
+      "grad_norm": 0.36197811365127563,
+      "learning_rate": 0.0004480804195804195,
+      "loss": 3.3425,
+      "step": 43550
+    },
+    {
+      "epoch": 12.69660434504048,
+      "grad_norm": 0.3554995357990265,
+      "learning_rate": 0.00044790559440559437,
+      "loss": 3.3442,
+      "step": 43600
+    },
+    {
+      "epoch": 12.711165472654203,
+      "grad_norm": 0.35034361481666565,
+      "learning_rate": 0.00044773076923076917,
+      "loss": 3.3537,
+      "step": 43650
+    },
+    {
+      "epoch": 12.725726600267924,
+      "grad_norm": 0.3263697326183319,
+      "learning_rate": 0.000447555944055944,
+      "loss": 3.3484,
+      "step": 43700
+    },
+    {
+      "epoch": 12.740287727881647,
+      "grad_norm": 0.333683043718338,
+      "learning_rate": 0.0004473811188811189,
+      "loss": 3.3595,
+      "step": 43750
+    },
+    {
+      "epoch": 12.75484885549537,
+      "grad_norm": 0.3857356607913971,
+      "learning_rate": 0.0004472062937062937,
+      "loss": 3.3512,
+      "step": 43800
+    },
+    {
+      "epoch": 12.769409983109092,
+      "grad_norm": 0.36390504240989685,
+      "learning_rate": 0.00044703146853146853,
+      "loss": 3.3423,
+      "step": 43850
+    },
+    {
+      "epoch": 12.783971110722815,
+      "grad_norm": 0.361526221036911,
+      "learning_rate": 0.00044685664335664333,
+      "loss": 3.3576,
+      "step": 43900
+    },
+    {
+      "epoch": 12.798532238336536,
+      "grad_norm": 0.3464201092720032,
+      "learning_rate": 0.0004466818181818182,
+      "loss": 3.3507,
+      "step": 43950
+    },
+    {
+      "epoch": 12.813093365950259,
+      "grad_norm": 0.3635178506374359,
+      "learning_rate": 0.00044650699300699293,
+      "loss": 3.3648,
+      "step": 44000
+    },
+    {
+      "epoch": 12.813093365950259,
+      "eval_accuracy": 0.37106509135907784,
+      "eval_loss": 3.5451130867004395,
+      "eval_runtime": 179.8666,
+      "eval_samples_per_second": 92.546,
+      "eval_steps_per_second": 5.788,
+      "step": 44000
+    },
+    {
+      "epoch": 12.827654493563982,
+      "grad_norm": 0.35989975929260254,
+      "learning_rate": 0.0004463321678321678,
+      "loss": 3.3498,
+      "step": 44050
+    },
+    {
+      "epoch": 12.842215621177704,
+      "grad_norm": 0.36528387665748596,
+      "learning_rate": 0.0004461573426573426,
+      "loss": 3.3446,
+      "step": 44100
+    },
+    {
+      "epoch": 12.856776748791427,
+      "grad_norm": 0.3401133716106415,
+      "learning_rate": 0.00044598251748251744,
+      "loss": 3.3641,
+      "step": 44150
+    },
+    {
+      "epoch": 12.871337876405148,
+      "grad_norm": 0.34729644656181335,
+      "learning_rate": 0.00044580769230769224,
+      "loss": 3.3605,
+      "step": 44200
+    },
+    {
+      "epoch": 12.88589900401887,
+      "grad_norm": 0.3324040174484253,
+      "learning_rate": 0.0004456328671328671,
+      "loss": 3.3516,
+      "step": 44250
+    },
+    {
+      "epoch": 12.900460131632594,
+      "grad_norm": 0.3577946126461029,
+      "learning_rate": 0.0004454580419580419,
+      "loss": 3.3671,
+      "step": 44300
+    },
+    {
+      "epoch": 12.915021259246316,
+      "grad_norm": 0.3214498460292816,
+      "learning_rate": 0.00044528321678321674,
+      "loss": 3.3511,
+      "step": 44350
+    },
+    {
+      "epoch": 12.929582386860039,
+      "grad_norm": 0.35906174778938293,
+      "learning_rate": 0.0004451083916083916,
+      "loss": 3.3554,
+      "step": 44400
+    },
+    {
+      "epoch": 12.944143514473762,
+      "grad_norm": 0.3261829912662506,
+      "learning_rate": 0.0004449335664335664,
+      "loss": 3.3469,
+      "step": 44450
+    },
+    {
+      "epoch": 12.958704642087483,
+      "grad_norm": 0.340356707572937,
+      "learning_rate": 0.00044475874125874125,
+      "loss": 3.3583,
+      "step": 44500
+    },
+    {
+      "epoch": 12.973265769701205,
+      "grad_norm": 0.33837756514549255,
+      "learning_rate": 0.00044458391608391605,
+      "loss": 3.3421,
+      "step": 44550
+    },
+    {
+      "epoch": 12.987826897314928,
+      "grad_norm": 0.3522256910800934,
+      "learning_rate": 0.0004444090909090909,
+      "loss": 3.364,
+      "step": 44600
+    },
+    {
+      "epoch": 13.002329780418195,
+      "grad_norm": 0.3823665678501129,
+      "learning_rate": 0.0004442342657342657,
+      "loss": 3.3249,
+      "step": 44650
+    },
+    {
+      "epoch": 13.016890908031918,
+      "grad_norm": 0.337575763463974,
+      "learning_rate": 0.00044405944055944056,
+      "loss": 3.237,
+      "step": 44700
+    },
+    {
+      "epoch": 13.031452035645641,
+      "grad_norm": 0.37781283259391785,
+      "learning_rate": 0.0004438846153846153,
+      "loss": 3.2572,
+      "step": 44750
+    },
+    {
+      "epoch": 13.046013163259364,
+      "grad_norm": 0.3561868667602539,
+      "learning_rate": 0.00044370979020979016,
+      "loss": 3.2568,
+      "step": 44800
+    },
+    {
+      "epoch": 13.060574290873085,
+      "grad_norm": 0.34880560636520386,
+      "learning_rate": 0.00044353496503496496,
+      "loss": 3.2554,
+      "step": 44850
+    },
+    {
+      "epoch": 13.075135418486807,
+      "grad_norm": 0.35558071732521057,
+      "learning_rate": 0.0004433601398601398,
+      "loss": 3.2505,
+      "step": 44900
+    },
+    {
+      "epoch": 13.08969654610053,
+      "grad_norm": 0.3940425217151642,
+      "learning_rate": 0.0004431853146853146,
+      "loss": 3.2628,
+      "step": 44950
+    },
+    {
+      "epoch": 13.104257673714253,
+      "grad_norm": 0.37770476937294006,
+      "learning_rate": 0.00044301048951048946,
+      "loss": 3.275,
+      "step": 45000
+    },
+    {
+      "epoch": 13.104257673714253,
+      "eval_accuracy": 0.3701498664606343,
+      "eval_loss": 3.5611326694488525,
+      "eval_runtime": 179.8018,
+      "eval_samples_per_second": 92.58,
+      "eval_steps_per_second": 5.79,
+      "step": 45000
+    },
+    {
+      "epoch": 13.118818801327976,
+      "grad_norm": 0.4152877926826477,
+      "learning_rate": 0.00044283566433566426,
+      "loss": 3.2714,
+      "step": 45050
+    },
+    {
+      "epoch": 13.133379928941697,
+      "grad_norm": 0.381516695022583,
+      "learning_rate": 0.0004426608391608391,
+      "loss": 3.2652,
+      "step": 45100
+    },
+    {
+      "epoch": 13.14794105655542,
+      "grad_norm": 0.3559730350971222,
+      "learning_rate": 0.00044248601398601397,
+      "loss": 3.2787,
+      "step": 45150
+    },
+    {
+      "epoch": 13.162502184169142,
+      "grad_norm": 0.36056405305862427,
+      "learning_rate": 0.00044231118881118877,
+      "loss": 3.2815,
+      "step": 45200
+    },
+    {
+      "epoch": 13.177063311782865,
+      "grad_norm": 0.36023563146591187,
+      "learning_rate": 0.0004421363636363636,
+      "loss": 3.2925,
+      "step": 45250
+    },
+    {
+      "epoch": 13.191624439396588,
+      "grad_norm": 0.3763374388217926,
+      "learning_rate": 0.0004419615384615384,
+      "loss": 3.2771,
+      "step": 45300
+    },
+    {
+      "epoch": 13.206185567010309,
+      "grad_norm": 0.36706557869911194,
+      "learning_rate": 0.0004417867132867133,
+      "loss": 3.2913,
+      "step": 45350
+    },
+    {
+      "epoch": 13.220746694624031,
+      "grad_norm": 0.36076489090919495,
+      "learning_rate": 0.0004416118881118881,
+      "loss": 3.2803,
+      "step": 45400
+    },
+    {
+      "epoch": 13.235307822237754,
+      "grad_norm": 0.34233537316322327,
+      "learning_rate": 0.00044143706293706293,
+      "loss": 3.2837,
+      "step": 45450
+    },
+    {
+      "epoch": 13.249868949851477,
+      "grad_norm": 0.37531542778015137,
+      "learning_rate": 0.0004412622377622377,
+      "loss": 3.3033,
+      "step": 45500
+    },
+    {
+      "epoch": 13.2644300774652,
+      "grad_norm": 0.3751412034034729,
+      "learning_rate": 0.00044108741258741253,
+      "loss": 3.2912,
+      "step": 45550
+    },
+    {
+      "epoch": 13.27899120507892,
+      "grad_norm": 0.3610355854034424,
+      "learning_rate": 0.00044091258741258733,
+      "loss": 3.299,
+      "step": 45600
+    },
+    {
+      "epoch": 13.293552332692643,
+      "grad_norm": 0.3749397397041321,
+      "learning_rate": 0.0004407377622377622,
+      "loss": 3.294,
+      "step": 45650
+    },
+    {
+      "epoch": 13.308113460306366,
+      "grad_norm": 0.3670322895050049,
+      "learning_rate": 0.000440562937062937,
+      "loss": 3.2975,
+      "step": 45700
+    },
+    {
+      "epoch": 13.322674587920089,
+      "grad_norm": 0.3655760884284973,
+      "learning_rate": 0.00044038811188811184,
+      "loss": 3.2943,
+      "step": 45750
+    },
+    {
+      "epoch": 13.337235715533811,
+      "grad_norm": 0.3786638379096985,
+      "learning_rate": 0.0004402132867132867,
+      "loss": 3.2848,
+      "step": 45800
+    },
+    {
+      "epoch": 13.351796843147532,
+      "grad_norm": 0.34307000041007996,
+      "learning_rate": 0.0004400384615384615,
+      "loss": 3.2926,
+      "step": 45850
+    },
+    {
+      "epoch": 13.366357970761255,
+      "grad_norm": 0.3485225439071655,
+      "learning_rate": 0.00043986363636363635,
+      "loss": 3.2964,
+      "step": 45900
+    },
+    {
+      "epoch": 13.380919098374978,
+      "grad_norm": 0.34821778535842896,
+      "learning_rate": 0.00043968881118881115,
+      "loss": 3.3167,
+      "step": 45950
+    },
+    {
+      "epoch": 13.3954802259887,
+      "grad_norm": 0.3911685347557068,
+      "learning_rate": 0.000439513986013986,
+      "loss": 3.3082,
+      "step": 46000
+    },
+    {
+      "epoch": 13.3954802259887,
+      "eval_accuracy": 0.3704263737131891,
+      "eval_loss": 3.556065082550049,
+      "eval_runtime": 179.915,
+      "eval_samples_per_second": 92.521,
+      "eval_steps_per_second": 5.786,
+      "step": 46000
+    },
+    {
+      "epoch": 13.410041353602423,
+      "grad_norm": 0.3521997928619385,
+      "learning_rate": 0.0004393391608391608,
+      "loss": 3.3085,
+      "step": 46050
+    },
+    {
+      "epoch": 13.424602481216146,
+      "grad_norm": 0.3501444160938263,
+      "learning_rate": 0.00043916433566433565,
+      "loss": 3.3154,
+      "step": 46100
+    },
+    {
+      "epoch": 13.439163608829867,
+      "grad_norm": 0.3678217828273773,
+      "learning_rate": 0.00043898951048951045,
+      "loss": 3.3044,
+      "step": 46150
+    },
+    {
+      "epoch": 13.45372473644359,
+      "grad_norm": 0.397447407245636,
+      "learning_rate": 0.0004388146853146853,
+      "loss": 3.3181,
+      "step": 46200
+    },
+    {
+      "epoch": 13.468285864057313,
+      "grad_norm": 0.3521028757095337,
+      "learning_rate": 0.00043863986013986005,
+      "loss": 3.3053,
+      "step": 46250
+    },
+    {
+      "epoch": 13.482846991671035,
+      "grad_norm": 0.3721345365047455,
+      "learning_rate": 0.0004384650349650349,
+      "loss": 3.3111,
+      "step": 46300
+    },
+    {
+      "epoch": 13.497408119284758,
+      "grad_norm": 0.3641456365585327,
+      "learning_rate": 0.0004382902097902097,
+      "loss": 3.3039,
+      "step": 46350
+    },
+    {
+      "epoch": 13.51196924689848,
+      "grad_norm": 0.3499159812927246,
+      "learning_rate": 0.00043811538461538456,
+      "loss": 3.3111,
+      "step": 46400
+    },
+    {
+      "epoch": 13.526530374512202,
+      "grad_norm": 0.35478129982948303,
+      "learning_rate": 0.0004379405594405594,
+      "loss": 3.3228,
+      "step": 46450
+    },
+    {
+      "epoch": 13.541091502125925,
+      "grad_norm": 0.3691585958003998,
+      "learning_rate": 0.0004377657342657342,
+      "loss": 3.3195,
+      "step": 46500
+    },
+    {
+      "epoch": 13.555652629739647,
+      "grad_norm": 0.37163329124450684,
+      "learning_rate": 0.00043759090909090907,
+      "loss": 3.3317,
+      "step": 46550
+    },
+    {
+      "epoch": 13.57021375735337,
+      "grad_norm": 0.3679807782173157,
+      "learning_rate": 0.00043741608391608387,
+      "loss": 3.3153,
+      "step": 46600
+    },
+    {
+      "epoch": 13.584774884967091,
+      "grad_norm": 0.34902745485305786,
+      "learning_rate": 0.0004372412587412587,
+      "loss": 3.3289,
+      "step": 46650
+    },
+    {
+      "epoch": 13.599336012580814,
+      "grad_norm": 0.4036466181278229,
+      "learning_rate": 0.0004370664335664335,
+      "loss": 3.3076,
+      "step": 46700
+    },
+    {
+      "epoch": 13.613897140194537,
+      "grad_norm": 0.35974839329719543,
+      "learning_rate": 0.0004368916083916084,
+      "loss": 3.3236,
+      "step": 46750
+    },
+    {
+      "epoch": 13.62845826780826,
+      "grad_norm": 0.35530760884284973,
+      "learning_rate": 0.0004367167832167832,
+      "loss": 3.3195,
+      "step": 46800
+    },
+    {
+      "epoch": 13.643019395421982,
+      "grad_norm": 0.3408248722553253,
+      "learning_rate": 0.00043654195804195803,
+      "loss": 3.3229,
+      "step": 46850
+    },
+    {
+      "epoch": 13.657580523035705,
+      "grad_norm": 0.3587759733200073,
+      "learning_rate": 0.00043636713286713283,
+      "loss": 3.3369,
+      "step": 46900
+    },
+    {
+      "epoch": 13.672141650649426,
+      "grad_norm": 0.37035682797431946,
+      "learning_rate": 0.0004361923076923077,
+      "loss": 3.3287,
+      "step": 46950
+    },
+    {
+      "epoch": 13.686702778263149,
+      "grad_norm": 0.3677375316619873,
+      "learning_rate": 0.00043601748251748243,
+      "loss": 3.3173,
+      "step": 47000
+    },
+    {
+      "epoch": 13.686702778263149,
+      "eval_accuracy": 0.3709606957637255,
+      "eval_loss": 3.548353672027588,
+      "eval_runtime": 179.8113,
+      "eval_samples_per_second": 92.575,
+      "eval_steps_per_second": 5.789,
+      "step": 47000
+    },
+    {
+      "epoch": 13.701263905876871,
+      "grad_norm": 0.3480958640575409,
+      "learning_rate": 0.00043584265734265734,
+      "loss": 3.3378,
+      "step": 47050
+    },
+    {
+      "epoch": 13.715825033490594,
+      "grad_norm": 0.3933262825012207,
+      "learning_rate": 0.0004356678321678321,
+      "loss": 3.3263,
+      "step": 47100
+    },
+    {
+      "epoch": 13.730386161104317,
+      "grad_norm": 0.3582457900047302,
+      "learning_rate": 0.00043549300699300694,
+      "loss": 3.3258,
+      "step": 47150
+    },
+    {
+      "epoch": 13.744947288718038,
+      "grad_norm": 0.3504882752895355,
+      "learning_rate": 0.0004353181818181818,
+      "loss": 3.3322,
+      "step": 47200
+    },
+    {
+      "epoch": 13.75950841633176,
+      "grad_norm": 0.38114842772483826,
+      "learning_rate": 0.0004351433566433566,
+      "loss": 3.3381,
+      "step": 47250
+    },
+    {
+      "epoch": 13.774069543945483,
+      "grad_norm": 0.3709891736507416,
+      "learning_rate": 0.00043496853146853144,
+      "loss": 3.328,
+      "step": 47300
+    },
+    {
+      "epoch": 13.788630671559206,
+      "grad_norm": 0.3773655295372009,
+      "learning_rate": 0.00043479370629370624,
+      "loss": 3.3324,
+      "step": 47350
+    },
+    {
+      "epoch": 13.803191799172929,
+      "grad_norm": 0.34999653697013855,
+      "learning_rate": 0.0004346188811188811,
+      "loss": 3.3365,
+      "step": 47400
+    },
+    {
+      "epoch": 13.81775292678665,
+      "grad_norm": 0.3433198630809784,
+      "learning_rate": 0.0004344440559440559,
+      "loss": 3.328,
+      "step": 47450
+    },
+    {
+      "epoch": 13.832314054400372,
+      "grad_norm": 0.3613443970680237,
+      "learning_rate": 0.00043426923076923075,
+      "loss": 3.3452,
+      "step": 47500
+    },
+    {
+      "epoch": 13.846875182014095,
+      "grad_norm": 0.3868599832057953,
+      "learning_rate": 0.00043409440559440555,
+      "loss": 3.3437,
+      "step": 47550
+    },
+    {
+      "epoch": 13.861436309627818,
+      "grad_norm": 0.3586479127407074,
+      "learning_rate": 0.0004339195804195804,
+      "loss": 3.3467,
+      "step": 47600
+    },
+    {
+      "epoch": 13.87599743724154,
+      "grad_norm": 0.38852712512016296,
+      "learning_rate": 0.0004337447552447552,
+      "loss": 3.3367,
+      "step": 47650
+    },
+    {
+      "epoch": 13.890558564855262,
+      "grad_norm": 0.3898317515850067,
+      "learning_rate": 0.00043356993006993006,
+      "loss": 3.3341,
+      "step": 47700
+    },
+    {
+      "epoch": 13.905119692468984,
+      "grad_norm": 0.3417160212993622,
+      "learning_rate": 0.0004333951048951048,
+      "loss": 3.342,
+      "step": 47750
+    },
+    {
+      "epoch": 13.919680820082707,
+      "grad_norm": 0.36018186807632446,
+      "learning_rate": 0.0004332202797202797,
+      "loss": 3.3444,
+      "step": 47800
+    },
+    {
+      "epoch": 13.93424194769643,
+      "grad_norm": 0.3359622061252594,
+      "learning_rate": 0.00043304545454545456,
+      "loss": 3.3351,
+      "step": 47850
+    },
+    {
+      "epoch": 13.948803075310153,
+      "grad_norm": 0.35396841168403625,
+      "learning_rate": 0.0004328706293706293,
+      "loss": 3.3447,
+      "step": 47900
+    },
+    {
+      "epoch": 13.963364202923874,
+      "grad_norm": 0.3624133765697479,
+      "learning_rate": 0.00043269580419580416,
+      "loss": 3.3484,
+      "step": 47950
+    },
+    {
+      "epoch": 13.977925330537596,
+      "grad_norm": 0.36250928044319153,
+      "learning_rate": 0.00043252097902097896,
+      "loss": 3.3449,
+      "step": 48000
+    },
+    {
+      "epoch": 13.977925330537596,
+      "eval_accuracy": 0.37173649140981785,
+      "eval_loss": 3.539720058441162,
+      "eval_runtime": 179.6484,
+      "eval_samples_per_second": 92.659,
+      "eval_steps_per_second": 5.795,
+      "step": 48000
+    },
+    {
+      "epoch": 13.992486458151319,
+      "grad_norm": 0.3764212727546692,
+      "learning_rate": 0.0004323461538461538,
+      "loss": 3.3403,
+      "step": 48050
+    },
+    {
+      "epoch": 14.006989341254586,
+      "grad_norm": 0.353367418050766,
+      "learning_rate": 0.0004321713286713286,
+      "loss": 3.2937,
+      "step": 48100
+    },
+    {
+      "epoch": 14.021550468868309,
+      "grad_norm": 0.371685266494751,
+      "learning_rate": 0.00043199650349650347,
+      "loss": 3.2138,
+      "step": 48150
+    },
+    {
+      "epoch": 14.036111596482032,
+      "grad_norm": 0.3745143711566925,
+      "learning_rate": 0.00043182167832167827,
+      "loss": 3.2321,
+      "step": 48200
+    },
+    {
+      "epoch": 14.050672724095755,
+      "grad_norm": 0.3856935203075409,
+      "learning_rate": 0.0004316468531468531,
+      "loss": 3.2316,
+      "step": 48250
+    },
+    {
+      "epoch": 14.065233851709475,
+      "grad_norm": 0.36421141028404236,
+      "learning_rate": 0.0004314720279720279,
+      "loss": 3.2413,
+      "step": 48300
+    },
+    {
+      "epoch": 14.079794979323198,
+      "grad_norm": 0.3954959809780121,
+      "learning_rate": 0.0004312972027972028,
+      "loss": 3.2449,
+      "step": 48350
+    },
+    {
+      "epoch": 14.094356106936921,
+      "grad_norm": 0.36783069372177124,
+      "learning_rate": 0.0004311223776223776,
+      "loss": 3.2469,
+      "step": 48400
+    },
+    {
+      "epoch": 14.108917234550644,
+      "grad_norm": 0.3712175786495209,
+      "learning_rate": 0.00043094755244755243,
+      "loss": 3.2541,
+      "step": 48450
+    },
+    {
+      "epoch": 14.123478362164366,
+      "grad_norm": 0.3833962678909302,
+      "learning_rate": 0.0004307727272727272,
+      "loss": 3.2665,
+      "step": 48500
+    },
+    {
+      "epoch": 14.13803948977809,
+      "grad_norm": 0.3755820095539093,
+      "learning_rate": 0.0004305979020979021,
+      "loss": 3.2583,
+      "step": 48550
+    },
+    {
+      "epoch": 14.15260061739181,
+      "grad_norm": 0.3721693158149719,
+      "learning_rate": 0.00043042307692307694,
+      "loss": 3.2461,
+      "step": 48600
+    },
+    {
+      "epoch": 14.167161745005533,
+      "grad_norm": 0.36444807052612305,
+      "learning_rate": 0.0004302482517482517,
+      "loss": 3.2672,
+      "step": 48650
+    },
+    {
+      "epoch": 14.181722872619256,
+      "grad_norm": 0.3308829367160797,
+      "learning_rate": 0.00043007342657342654,
+      "loss": 3.2678,
+      "step": 48700
+    },
+    {
+      "epoch": 14.196284000232978,
+      "grad_norm": 0.3922617435455322,
+      "learning_rate": 0.00042989860139860134,
+      "loss": 3.2674,
+      "step": 48750
+    },
+    {
+      "epoch": 14.210845127846701,
+      "grad_norm": 0.3621092140674591,
+      "learning_rate": 0.0004297237762237762,
+      "loss": 3.2708,
+      "step": 48800
+    },
+    {
+      "epoch": 14.225406255460422,
+      "grad_norm": 0.36468708515167236,
+      "learning_rate": 0.000429548951048951,
+      "loss": 3.2829,
+      "step": 48850
+    },
+    {
+      "epoch": 14.239967383074145,
+      "grad_norm": 0.37252795696258545,
+      "learning_rate": 0.00042937412587412585,
+      "loss": 3.2624,
+      "step": 48900
+    },
+    {
+      "epoch": 14.254528510687868,
+      "grad_norm": 0.38541239500045776,
+      "learning_rate": 0.00042919930069930065,
+      "loss": 3.2773,
+      "step": 48950
+    },
+    {
+      "epoch": 14.26908963830159,
+      "grad_norm": 0.35144972801208496,
+      "learning_rate": 0.0004290244755244755,
+      "loss": 3.2796,
+      "step": 49000
+    },
+    {
+      "epoch": 14.26908963830159,
+      "eval_accuracy": 0.3708721711203693,
+      "eval_loss": 3.5575828552246094,
+      "eval_runtime": 179.8637,
+      "eval_samples_per_second": 92.548,
+      "eval_steps_per_second": 5.788,
+      "step": 49000
+    },
+    {
+      "epoch": 14.283650765915313,
+      "grad_norm": 0.36003535985946655,
+      "learning_rate": 0.0004288496503496503,
+      "loss": 3.2822,
+      "step": 49050
+    },
+    {
+      "epoch": 14.298211893529034,
+      "grad_norm": 0.3763592839241028,
+      "learning_rate": 0.00042867482517482515,
+      "loss": 3.2788,
+      "step": 49100
+    },
+    {
+      "epoch": 14.312773021142757,
+      "grad_norm": 0.35325881838798523,
+      "learning_rate": 0.00042849999999999995,
+      "loss": 3.2867,
+      "step": 49150
+    },
+    {
+      "epoch": 14.32733414875648,
+      "grad_norm": 0.36146456003189087,
+      "learning_rate": 0.0004283251748251748,
+      "loss": 3.2838,
+      "step": 49200
+    },
+    {
+      "epoch": 14.341895276370202,
+      "grad_norm": 0.35914602875709534,
+      "learning_rate": 0.00042815034965034966,
+      "loss": 3.2862,
+      "step": 49250
+    },
+    {
+      "epoch": 14.356456403983925,
+      "grad_norm": 0.35315200686454773,
+      "learning_rate": 0.00042797552447552446,
+      "loss": 3.2912,
+      "step": 49300
+    },
+    {
+      "epoch": 14.371017531597648,
+      "grad_norm": 0.355522483587265,
+      "learning_rate": 0.0004278006993006993,
+      "loss": 3.2925,
+      "step": 49350
+    },
+    {
+      "epoch": 14.385578659211369,
+      "grad_norm": 0.3386734426021576,
+      "learning_rate": 0.00042762587412587406,
+      "loss": 3.2745,
+      "step": 49400
+    },
+    {
+      "epoch": 14.400139786825092,
+      "grad_norm": 0.37271982431411743,
+      "learning_rate": 0.0004274510489510489,
+      "loss": 3.2914,
+      "step": 49450
+    },
+    {
+      "epoch": 14.414700914438814,
+      "grad_norm": 0.3474613130092621,
+      "learning_rate": 0.0004272762237762237,
+      "loss": 3.2917,
+      "step": 49500
+    },
+    {
+      "epoch": 14.429262042052537,
+      "grad_norm": 0.37980303168296814,
+      "learning_rate": 0.00042710139860139857,
+      "loss": 3.2942,
+      "step": 49550
+    },
+    {
+      "epoch": 14.44382316966626,
+      "grad_norm": 0.3631215989589691,
+      "learning_rate": 0.00042692657342657337,
+      "loss": 3.292,
+      "step": 49600
+    },
+    {
+      "epoch": 14.45838429727998,
+      "grad_norm": 0.3490867614746094,
+      "learning_rate": 0.0004267517482517482,
+      "loss": 3.2975,
+      "step": 49650
+    },
+    {
+      "epoch": 14.472945424893704,
+      "grad_norm": 0.3449762761592865,
+      "learning_rate": 0.000426576923076923,
+      "loss": 3.2993,
+      "step": 49700
+    },
+    {
+      "epoch": 14.487506552507426,
+      "grad_norm": 0.3923116624355316,
+      "learning_rate": 0.0004264020979020979,
+      "loss": 3.3112,
+      "step": 49750
+    },
+    {
+      "epoch": 14.502067680121149,
+      "grad_norm": 0.35419827699661255,
+      "learning_rate": 0.0004262272727272727,
+      "loss": 3.2972,
+      "step": 49800
+    },
+    {
+      "epoch": 14.516628807734872,
+      "grad_norm": 0.3563985824584961,
+      "learning_rate": 0.00042605244755244753,
+      "loss": 3.3037,
+      "step": 49850
+    },
+    {
+      "epoch": 14.531189935348593,
+      "grad_norm": 0.37263786792755127,
+      "learning_rate": 0.00042587762237762233,
+      "loss": 3.3085,
+      "step": 49900
+    },
+    {
+      "epoch": 14.545751062962315,
+      "grad_norm": 0.3686966896057129,
+      "learning_rate": 0.0004257027972027972,
+      "loss": 3.3032,
+      "step": 49950
+    },
+    {
+      "epoch": 14.560312190576038,
+      "grad_norm": 0.35801100730895996,
+      "learning_rate": 0.00042552797202797204,
+      "loss": 3.3156,
+      "step": 50000
+    },
+    {
+      "epoch": 14.560312190576038,
+      "eval_accuracy": 0.3712233306286096,
+      "eval_loss": 3.5494096279144287,
+      "eval_runtime": 179.8895,
+      "eval_samples_per_second": 92.535,
+      "eval_steps_per_second": 5.787,
+      "step": 50000
+    },
+    {
+      "epoch": 14.574873318189761,
+      "grad_norm": 0.36634013056755066,
+      "learning_rate": 0.00042535314685314684,
+      "loss": 3.3055,
+      "step": 50050
+    },
+    {
+      "epoch": 14.589434445803484,
+      "grad_norm": 0.38294774293899536,
+      "learning_rate": 0.0004251783216783217,
+      "loss": 3.2938,
+      "step": 50100
+    },
+    {
+      "epoch": 14.603995573417205,
+      "grad_norm": 0.36177265644073486,
+      "learning_rate": 0.00042500349650349643,
+      "loss": 3.3063,
+      "step": 50150
+    },
+    {
+      "epoch": 14.618556701030927,
+      "grad_norm": 0.38831570744514465,
+      "learning_rate": 0.0004248286713286713,
+      "loss": 3.3124,
+      "step": 50200
+    },
+    {
+      "epoch": 14.63311782864465,
+      "grad_norm": 0.3788811266422272,
+      "learning_rate": 0.0004246538461538461,
+      "loss": 3.3095,
+      "step": 50250
+    },
+    {
+      "epoch": 14.647678956258373,
+      "grad_norm": 0.37422165274620056,
+      "learning_rate": 0.00042447902097902094,
+      "loss": 3.3222,
+      "step": 50300
+    },
+    {
+      "epoch": 14.662240083872096,
+      "grad_norm": 0.34806740283966064,
+      "learning_rate": 0.00042430419580419574,
+      "loss": 3.3083,
+      "step": 50350
+    },
+    {
+      "epoch": 14.676801211485817,
+      "grad_norm": 0.38060638308525085,
+      "learning_rate": 0.0004241293706293706,
+      "loss": 3.3136,
+      "step": 50400
+    },
+    {
+      "epoch": 14.69136233909954,
+      "grad_norm": 0.35740432143211365,
+      "learning_rate": 0.0004239545454545454,
+      "loss": 3.318,
+      "step": 50450
+    },
+    {
+      "epoch": 14.705923466713262,
+      "grad_norm": 0.3618071377277374,
+      "learning_rate": 0.00042377972027972025,
+      "loss": 3.3257,
+      "step": 50500
+    },
+    {
+      "epoch": 14.720484594326985,
+      "grad_norm": 0.5806776285171509,
+      "learning_rate": 0.00042360489510489505,
+      "loss": 3.3143,
+      "step": 50550
+    },
+    {
+      "epoch": 14.735045721940708,
+      "grad_norm": 0.38723915815353394,
+      "learning_rate": 0.0004234300699300699,
+      "loss": 3.3116,
+      "step": 50600
+    },
+    {
+      "epoch": 14.749606849554429,
+      "grad_norm": 0.3388756215572357,
+      "learning_rate": 0.00042325524475524476,
+      "loss": 3.3198,
+      "step": 50650
+    },
+    {
+      "epoch": 14.764167977168151,
+      "grad_norm": 0.3440285325050354,
+      "learning_rate": 0.00042308041958041956,
+      "loss": 3.3098,
+      "step": 50700
+    },
+    {
+      "epoch": 14.778729104781874,
+      "grad_norm": 0.37004444003105164,
+      "learning_rate": 0.0004229055944055944,
+      "loss": 3.329,
+      "step": 50750
+    },
+    {
+      "epoch": 14.793290232395597,
+      "grad_norm": 0.34428054094314575,
+      "learning_rate": 0.0004227307692307692,
+      "loss": 3.3177,
+      "step": 50800
+    },
+    {
+      "epoch": 14.80785136000932,
+      "grad_norm": 0.36497217416763306,
+      "learning_rate": 0.00042255594405594406,
+      "loss": 3.3178,
+      "step": 50850
+    },
+    {
+      "epoch": 14.822412487623042,
+      "grad_norm": 0.3842722475528717,
+      "learning_rate": 0.0004223811188811188,
+      "loss": 3.309,
+      "step": 50900
+    },
+    {
+      "epoch": 14.836973615236763,
+      "grad_norm": 0.35773956775665283,
+      "learning_rate": 0.00042220629370629366,
+      "loss": 3.3148,
+      "step": 50950
+    },
+    {
+      "epoch": 14.851534742850486,
+      "grad_norm": 0.3422044813632965,
+      "learning_rate": 0.00042203146853146846,
+      "loss": 3.3248,
+      "step": 51000
+    },
+    {
+      "epoch": 14.851534742850486,
+      "eval_accuracy": 0.37198043382012874,
+      "eval_loss": 3.540163516998291,
+      "eval_runtime": 179.888,
+      "eval_samples_per_second": 92.535,
+      "eval_steps_per_second": 5.787,
+      "step": 51000
+    },
+    {
+      "epoch": 14.866095870464209,
+      "grad_norm": 0.34656912088394165,
+      "learning_rate": 0.0004218566433566433,
+      "loss": 3.3221,
+      "step": 51050
+    },
+    {
+      "epoch": 14.880656998077932,
+      "grad_norm": 0.3491133749485016,
+      "learning_rate": 0.0004216818181818181,
+      "loss": 3.3193,
+      "step": 51100
+    },
+    {
+      "epoch": 14.895218125691654,
+      "grad_norm": 0.3695593774318695,
+      "learning_rate": 0.00042150699300699297,
+      "loss": 3.3278,
+      "step": 51150
+    },
+    {
+      "epoch": 14.909779253305375,
+      "grad_norm": 0.3540794253349304,
+      "learning_rate": 0.00042133216783216777,
+      "loss": 3.3186,
+      "step": 51200
+    },
+    {
+      "epoch": 14.924340380919098,
+      "grad_norm": 0.3513336479663849,
+      "learning_rate": 0.0004211573426573426,
+      "loss": 3.329,
+      "step": 51250
+    },
+    {
+      "epoch": 14.93890150853282,
+      "grad_norm": 0.3728017807006836,
+      "learning_rate": 0.0004209825174825175,
+      "loss": 3.337,
+      "step": 51300
+    },
+    {
+      "epoch": 14.953462636146543,
+      "grad_norm": 0.3794797956943512,
+      "learning_rate": 0.0004208076923076923,
+      "loss": 3.3271,
+      "step": 51350
+    },
+    {
+      "epoch": 14.968023763760266,
+      "grad_norm": 0.3826712667942047,
+      "learning_rate": 0.00042063286713286713,
+      "loss": 3.3207,
+      "step": 51400
+    },
+    {
+      "epoch": 14.982584891373987,
+      "grad_norm": 0.3683113753795624,
+      "learning_rate": 0.00042045804195804193,
+      "loss": 3.3222,
+      "step": 51450
+    },
+    {
+      "epoch": 14.99714601898771,
+      "grad_norm": 0.3468119204044342,
+      "learning_rate": 0.0004202832167832168,
+      "loss": 3.3369,
+      "step": 51500
+    },
+    {
+      "epoch": 15.011648902090977,
+      "grad_norm": 0.3682543635368347,
+      "learning_rate": 0.0004201083916083916,
+      "loss": 3.2329,
+      "step": 51550
+    },
+    {
+      "epoch": 15.0262100297047,
+      "grad_norm": 0.38179928064346313,
+      "learning_rate": 0.00041993356643356644,
+      "loss": 3.2202,
+      "step": 51600
+    },
+    {
+      "epoch": 15.040771157318423,
+      "grad_norm": 0.3646683692932129,
+      "learning_rate": 0.0004197587412587412,
+      "loss": 3.2306,
+      "step": 51650
+    },
+    {
+      "epoch": 15.055332284932145,
+      "grad_norm": 0.37061500549316406,
+      "learning_rate": 0.00041958391608391604,
+      "loss": 3.2339,
+      "step": 51700
+    },
+    {
+      "epoch": 15.069893412545868,
+      "grad_norm": 0.4042431116104126,
+      "learning_rate": 0.00041940909090909084,
+      "loss": 3.242,
+      "step": 51750
+    },
+    {
+      "epoch": 15.084454540159589,
+      "grad_norm": 0.3703981339931488,
+      "learning_rate": 0.0004192342657342657,
+      "loss": 3.2247,
+      "step": 51800
+    },
+    {
+      "epoch": 15.099015667773312,
+      "grad_norm": 0.38997241854667664,
+      "learning_rate": 0.0004190594405594405,
+      "loss": 3.2418,
+      "step": 51850
+    },
+    {
+      "epoch": 15.113576795387035,
+      "grad_norm": 0.3963125944137573,
+      "learning_rate": 0.00041888461538461535,
+      "loss": 3.2503,
+      "step": 51900
+    },
+    {
+      "epoch": 15.128137923000757,
+      "grad_norm": 0.3787480294704437,
+      "learning_rate": 0.00041870979020979015,
+      "loss": 3.2543,
+      "step": 51950
+    },
+    {
+      "epoch": 15.14269905061448,
+      "grad_norm": 0.3614787459373474,
+      "learning_rate": 0.000418534965034965,
+      "loss": 3.2411,
+      "step": 52000
+    },
+    {
+      "epoch": 15.14269905061448,
+      "eval_accuracy": 0.3713345448551899,
+      "eval_loss": 3.558533191680908,
+      "eval_runtime": 179.6973,
+      "eval_samples_per_second": 92.634,
+      "eval_steps_per_second": 5.793,
+      "step": 52000
+    },
+    {
+      "epoch": 15.157260178228203,
+      "grad_norm": 0.37162527441978455,
+      "learning_rate": 0.00041836013986013985,
+      "loss": 3.2484,
+      "step": 52050
+    },
+    {
+      "epoch": 15.171821305841924,
+      "grad_norm": 0.37970441579818726,
+      "learning_rate": 0.00041818531468531465,
+      "loss": 3.251,
+      "step": 52100
+    },
+    {
+      "epoch": 15.186382433455647,
+      "grad_norm": 0.39577314257621765,
+      "learning_rate": 0.0004180104895104895,
+      "loss": 3.2534,
+      "step": 52150
+    },
+    {
+      "epoch": 15.20094356106937,
+      "grad_norm": 0.39917322993278503,
+      "learning_rate": 0.0004178356643356643,
+      "loss": 3.2581,
+      "step": 52200
+    },
+    {
+      "epoch": 15.215504688683092,
+      "grad_norm": 0.3855136036872864,
+      "learning_rate": 0.00041766083916083916,
+      "loss": 3.2488,
+      "step": 52250
+    },
+    {
+      "epoch": 15.230065816296815,
+      "grad_norm": 0.3883339762687683,
+      "learning_rate": 0.00041748601398601396,
+      "loss": 3.2616,
+      "step": 52300
+    },
+    {
+      "epoch": 15.244626943910536,
+      "grad_norm": 0.3921905755996704,
+      "learning_rate": 0.0004173111888111888,
+      "loss": 3.2664,
+      "step": 52350
+    },
+    {
+      "epoch": 15.259188071524258,
+      "grad_norm": 0.3932233154773712,
+      "learning_rate": 0.00041713636363636356,
+      "loss": 3.2652,
+      "step": 52400
+    },
+    {
+      "epoch": 15.273749199137981,
+      "grad_norm": 0.35905158519744873,
+      "learning_rate": 0.0004169615384615384,
+      "loss": 3.2578,
+      "step": 52450
+    },
+    {
+      "epoch": 15.288310326751704,
+      "grad_norm": 0.3927863538265228,
+      "learning_rate": 0.0004167867132867132,
+      "loss": 3.2658,
+      "step": 52500
+    },
+    {
+      "epoch": 15.302871454365427,
+      "grad_norm": 0.3803752660751343,
+      "learning_rate": 0.00041661188811188807,
+      "loss": 3.2791,
+      "step": 52550
+    },
+    {
+      "epoch": 15.317432581979148,
+      "grad_norm": 0.37040677666664124,
+      "learning_rate": 0.00041643706293706287,
+      "loss": 3.2675,
+      "step": 52600
+    },
+    {
+      "epoch": 15.33199370959287,
+      "grad_norm": 0.3753259778022766,
+      "learning_rate": 0.0004162622377622377,
+      "loss": 3.2588,
+      "step": 52650
+    },
+    {
+      "epoch": 15.346554837206593,
+      "grad_norm": 0.333647757768631,
+      "learning_rate": 0.0004160874125874126,
+      "loss": 3.2638,
+      "step": 52700
+    },
+    {
+      "epoch": 15.361115964820316,
+      "grad_norm": 0.4069964587688446,
+      "learning_rate": 0.0004159125874125874,
+      "loss": 3.2788,
+      "step": 52750
+    },
+    {
+      "epoch": 15.375677092434039,
+      "grad_norm": 0.3561253547668457,
+      "learning_rate": 0.00041573776223776223,
+      "loss": 3.269,
+      "step": 52800
+    },
+    {
+      "epoch": 15.39023822004776,
+      "grad_norm": 0.3721391558647156,
+      "learning_rate": 0.00041556293706293703,
+      "loss": 3.2708,
+      "step": 52850
+    },
+    {
+      "epoch": 15.404799347661482,
+      "grad_norm": 0.3592582941055298,
+      "learning_rate": 0.0004153881118881119,
+      "loss": 3.2836,
+      "step": 52900
+    },
+    {
+      "epoch": 15.419360475275205,
+      "grad_norm": 0.3861062824726105,
+      "learning_rate": 0.0004152132867132867,
+      "loss": 3.2757,
+      "step": 52950
+    },
+    {
+      "epoch": 15.433921602888928,
+      "grad_norm": 0.35728558897972107,
+      "learning_rate": 0.00041503846153846154,
+      "loss": 3.2784,
+      "step": 53000
+    },
+    {
+      "epoch": 15.433921602888928,
+      "eval_accuracy": 0.3720729555921358,
+      "eval_loss": 3.5504865646362305,
+      "eval_runtime": 179.8078,
+      "eval_samples_per_second": 92.577,
+      "eval_steps_per_second": 5.79,
+      "step": 53000
+    },
+    {
+      "epoch": 15.44848273050265,
+      "grad_norm": 0.36359742283821106,
+      "learning_rate": 0.00041486363636363634,
+      "loss": 3.2761,
+      "step": 53050
+    },
+    {
+      "epoch": 15.463043858116373,
+      "grad_norm": 0.375919908285141,
+      "learning_rate": 0.0004146888111888112,
+      "loss": 3.2761,
+      "step": 53100
+    },
+    {
+      "epoch": 15.477604985730094,
+      "grad_norm": 0.3810451924800873,
+      "learning_rate": 0.00041451398601398593,
+      "loss": 3.2829,
+      "step": 53150
+    },
+    {
+      "epoch": 15.492166113343817,
+      "grad_norm": 0.3776414394378662,
+      "learning_rate": 0.0004143391608391608,
+      "loss": 3.3008,
+      "step": 53200
+    },
+    {
+      "epoch": 15.50672724095754,
+      "grad_norm": 0.4121120274066925,
+      "learning_rate": 0.0004141643356643356,
+      "loss": 3.2937,
+      "step": 53250
+    },
+    {
+      "epoch": 15.521288368571263,
+      "grad_norm": 0.37099769711494446,
+      "learning_rate": 0.00041398951048951044,
+      "loss": 3.2832,
+      "step": 53300
+    },
+    {
+      "epoch": 15.535849496184985,
+      "grad_norm": 0.3595023453235626,
+      "learning_rate": 0.00041381468531468524,
+      "loss": 3.2857,
+      "step": 53350
+    },
+    {
+      "epoch": 15.550410623798706,
+      "grad_norm": 0.377890408039093,
+      "learning_rate": 0.0004136398601398601,
+      "loss": 3.2899,
+      "step": 53400
+    },
+    {
+      "epoch": 15.564971751412429,
+      "grad_norm": 0.37558192014694214,
+      "learning_rate": 0.00041346503496503495,
+      "loss": 3.298,
+      "step": 53450
+    },
+    {
+      "epoch": 15.579532879026152,
+      "grad_norm": 0.38341572880744934,
+      "learning_rate": 0.00041329020979020975,
+      "loss": 3.297,
+      "step": 53500
+    },
+    {
+      "epoch": 15.594094006639875,
+      "grad_norm": 0.3785656690597534,
+      "learning_rate": 0.0004131153846153846,
+      "loss": 3.2965,
+      "step": 53550
+    },
+    {
+      "epoch": 15.608655134253597,
+      "grad_norm": 0.3675081133842468,
+      "learning_rate": 0.0004129405594405594,
+      "loss": 3.2942,
+      "step": 53600
+    },
+    {
+      "epoch": 15.623216261867318,
+      "grad_norm": 0.35816362500190735,
+      "learning_rate": 0.00041276573426573426,
+      "loss": 3.2965,
+      "step": 53650
+    },
+    {
+      "epoch": 15.637777389481041,
+      "grad_norm": 0.3761393129825592,
+      "learning_rate": 0.00041259090909090906,
+      "loss": 3.2902,
+      "step": 53700
+    },
+    {
+      "epoch": 15.652338517094764,
+      "grad_norm": 0.4039056897163391,
+      "learning_rate": 0.0004124160839160839,
+      "loss": 3.2882,
+      "step": 53750
+    },
+    {
+      "epoch": 15.666899644708487,
+      "grad_norm": 0.3910631537437439,
+      "learning_rate": 0.0004122412587412587,
+      "loss": 3.2949,
+      "step": 53800
+    },
+    {
+      "epoch": 15.68146077232221,
+      "grad_norm": 0.38190576434135437,
+      "learning_rate": 0.00041206643356643356,
+      "loss": 3.3012,
+      "step": 53850
+    },
+    {
+      "epoch": 15.69602189993593,
+      "grad_norm": 0.34941723942756653,
+      "learning_rate": 0.0004118916083916083,
+      "loss": 3.293,
+      "step": 53900
+    },
+    {
+      "epoch": 15.710583027549653,
+      "grad_norm": 0.3839094936847687,
+      "learning_rate": 0.00041171678321678316,
+      "loss": 3.3087,
+      "step": 53950
+    },
+    {
+      "epoch": 15.725144155163376,
+      "grad_norm": 0.41564396023750305,
+      "learning_rate": 0.00041154195804195796,
+      "loss": 3.2982,
+      "step": 54000
+    },
+    {
+      "epoch": 15.725144155163376,
+      "eval_accuracy": 0.3723717997400926,
+      "eval_loss": 3.539506673812866,
+      "eval_runtime": 179.7562,
+      "eval_samples_per_second": 92.603,
+      "eval_steps_per_second": 5.791,
+      "step": 54000
+    },
+    {
+      "epoch": 15.739705282777098,
+      "grad_norm": 0.3858591616153717,
+      "learning_rate": 0.0004113671328671328,
+      "loss": 3.3119,
+      "step": 54050
+    },
+    {
+      "epoch": 15.754266410390821,
+      "grad_norm": 0.39614084362983704,
+      "learning_rate": 0.00041119230769230767,
+      "loss": 3.2928,
+      "step": 54100
+    },
+    {
+      "epoch": 15.768827538004544,
+      "grad_norm": 0.3449946641921997,
+      "learning_rate": 0.00041101748251748247,
+      "loss": 3.3069,
+      "step": 54150
+    },
+    {
+      "epoch": 15.783388665618265,
+      "grad_norm": 0.3629119098186493,
+      "learning_rate": 0.0004108426573426573,
+      "loss": 3.3068,
+      "step": 54200
+    },
+    {
+      "epoch": 15.797949793231988,
+      "grad_norm": 0.36593127250671387,
+      "learning_rate": 0.0004106678321678321,
+      "loss": 3.2962,
+      "step": 54250
+    },
+    {
+      "epoch": 15.81251092084571,
+      "grad_norm": 0.38235387206077576,
+      "learning_rate": 0.000410493006993007,
+      "loss": 3.3174,
+      "step": 54300
+    },
+    {
+      "epoch": 15.827072048459433,
+      "grad_norm": 0.3705120086669922,
+      "learning_rate": 0.0004103181818181818,
+      "loss": 3.3,
+      "step": 54350
+    },
+    {
+      "epoch": 15.841633176073156,
+      "grad_norm": 0.3669654428958893,
+      "learning_rate": 0.00041014335664335663,
+      "loss": 3.3017,
+      "step": 54400
+    },
+    {
+      "epoch": 15.856194303686877,
+      "grad_norm": 0.36617377400398254,
+      "learning_rate": 0.00040996853146853143,
+      "loss": 3.3104,
+      "step": 54450
+    },
+    {
+      "epoch": 15.8707554313006,
+      "grad_norm": 0.3692075312137604,
+      "learning_rate": 0.0004097937062937063,
+      "loss": 3.3003,
+      "step": 54500
+    },
+    {
+      "epoch": 15.885316558914322,
+      "grad_norm": 0.35389435291290283,
+      "learning_rate": 0.0004096188811188811,
+      "loss": 3.3191,
+      "step": 54550
+    },
+    {
+      "epoch": 15.899877686528045,
+      "grad_norm": 0.36336150765419006,
+      "learning_rate": 0.00040944405594405594,
+      "loss": 3.3095,
+      "step": 54600
+    },
+    {
+      "epoch": 15.914438814141768,
+      "grad_norm": 0.3665081262588501,
+      "learning_rate": 0.0004092692307692307,
+      "loss": 3.3147,
+      "step": 54650
+    },
+    {
+      "epoch": 15.928999941755489,
+      "grad_norm": 0.3585745394229889,
+      "learning_rate": 0.00040909440559440554,
+      "loss": 3.3144,
+      "step": 54700
+    },
+    {
+      "epoch": 15.943561069369212,
+      "grad_norm": 0.3691560924053192,
+      "learning_rate": 0.00040891958041958034,
+      "loss": 3.3034,
+      "step": 54750
+    },
+    {
+      "epoch": 15.958122196982934,
+      "grad_norm": 0.38911911845207214,
+      "learning_rate": 0.0004087447552447552,
+      "loss": 3.3129,
+      "step": 54800
+    },
+    {
+      "epoch": 15.972683324596657,
+      "grad_norm": 0.3935786783695221,
+      "learning_rate": 0.00040856993006993005,
+      "loss": 3.3115,
+      "step": 54850
+    },
+    {
+      "epoch": 15.98724445221038,
+      "grad_norm": 0.35671958327293396,
+      "learning_rate": 0.00040839510489510485,
+      "loss": 3.3134,
+      "step": 54900
+    },
+    {
+      "epoch": 16.001747335313645,
+      "grad_norm": 0.3811984360218048,
+      "learning_rate": 0.0004082202797202797,
+      "loss": 3.3019,
+      "step": 54950
+    },
+    {
+      "epoch": 16.01630846292737,
+      "grad_norm": 0.3744508922100067,
+      "learning_rate": 0.0004080454545454545,
+      "loss": 3.2021,
+      "step": 55000
+    },
+    {
+      "epoch": 16.01630846292737,
+      "eval_accuracy": 0.37187298159698456,
+      "eval_loss": 3.5502474308013916,
+      "eval_runtime": 179.8056,
+      "eval_samples_per_second": 92.578,
+      "eval_steps_per_second": 5.79,
+      "step": 55000
+    },
+    {
+      "epoch": 16.03086959054109,
+      "grad_norm": 0.3818357586860657,
+      "learning_rate": 0.00040787062937062935,
+      "loss": 3.1967,
+      "step": 55050
+    },
+    {
+      "epoch": 16.045430718154815,
+      "grad_norm": 0.38330283761024475,
+      "learning_rate": 0.00040769580419580415,
+      "loss": 3.2128,
+      "step": 55100
+    },
+    {
+      "epoch": 16.059991845768536,
+      "grad_norm": 0.4017890989780426,
+      "learning_rate": 0.000407520979020979,
+      "loss": 3.2118,
+      "step": 55150
+    },
+    {
+      "epoch": 16.074552973382257,
+      "grad_norm": 0.3700309693813324,
+      "learning_rate": 0.0004073461538461538,
+      "loss": 3.2218,
+      "step": 55200
+    },
+    {
+      "epoch": 16.08911410099598,
+      "grad_norm": 0.38918474316596985,
+      "learning_rate": 0.00040717132867132866,
+      "loss": 3.226,
+      "step": 55250
+    },
+    {
+      "epoch": 16.103675228609703,
+      "grad_norm": 0.3746088743209839,
+      "learning_rate": 0.00040699650349650346,
+      "loss": 3.2235,
+      "step": 55300
+    },
+    {
+      "epoch": 16.118236356223427,
+      "grad_norm": 0.3673935532569885,
+      "learning_rate": 0.0004068216783216783,
+      "loss": 3.2331,
+      "step": 55350
+    },
+    {
+      "epoch": 16.132797483837148,
+      "grad_norm": 0.3882383704185486,
+      "learning_rate": 0.00040664685314685306,
+      "loss": 3.2304,
+      "step": 55400
+    },
+    {
+      "epoch": 16.14735861145087,
+      "grad_norm": 0.39152103662490845,
+      "learning_rate": 0.0004064720279720279,
+      "loss": 3.2356,
+      "step": 55450
+    },
+    {
+      "epoch": 16.161919739064594,
+      "grad_norm": 0.36934158205986023,
+      "learning_rate": 0.00040629720279720277,
+      "loss": 3.2385,
+      "step": 55500
+    },
+    {
+      "epoch": 16.176480866678315,
+      "grad_norm": 0.37601685523986816,
+      "learning_rate": 0.00040612237762237757,
+      "loss": 3.2393,
+      "step": 55550
+    },
+    {
+      "epoch": 16.19104199429204,
+      "grad_norm": 0.3896099328994751,
+      "learning_rate": 0.0004059475524475524,
+      "loss": 3.2378,
+      "step": 55600
+    },
+    {
+      "epoch": 16.20560312190576,
+      "grad_norm": 0.4255818724632263,
+      "learning_rate": 0.0004057727272727272,
+      "loss": 3.2477,
+      "step": 55650
+    },
+    {
+      "epoch": 16.22016424951948,
+      "grad_norm": 0.3997867703437805,
+      "learning_rate": 0.0004055979020979021,
+      "loss": 3.248,
+      "step": 55700
+    },
+    {
+      "epoch": 16.234725377133206,
+      "grad_norm": 0.3857765197753906,
+      "learning_rate": 0.0004054230769230769,
+      "loss": 3.2516,
+      "step": 55750
+    },
+    {
+      "epoch": 16.249286504746927,
+      "grad_norm": 0.3792831003665924,
+      "learning_rate": 0.00040524825174825173,
+      "loss": 3.25,
+      "step": 55800
+    },
+    {
+      "epoch": 16.26384763236065,
+      "grad_norm": 0.3990461230278015,
+      "learning_rate": 0.00040507342657342653,
+      "loss": 3.2654,
+      "step": 55850
+    },
+    {
+      "epoch": 16.278408759974372,
+      "grad_norm": 0.3778679072856903,
+      "learning_rate": 0.0004048986013986014,
+      "loss": 3.2594,
+      "step": 55900
+    },
+    {
+      "epoch": 16.292969887588093,
+      "grad_norm": 0.38567569851875305,
+      "learning_rate": 0.0004047237762237762,
+      "loss": 3.2516,
+      "step": 55950
+    },
+    {
+      "epoch": 16.307531015201818,
+      "grad_norm": 0.3942672908306122,
+      "learning_rate": 0.00040454895104895104,
+      "loss": 3.2393,
+      "step": 56000
+    },
+    {
+      "epoch": 16.307531015201818,
+      "eval_accuracy": 0.37220392033675576,
+      "eval_loss": 3.5524516105651855,
+      "eval_runtime": 179.7486,
+      "eval_samples_per_second": 92.607,
+      "eval_steps_per_second": 5.791,
+      "step": 56000
+    },
+    {
+      "epoch": 16.32209214281554,
+      "grad_norm": 0.36694231629371643,
+      "learning_rate": 0.00040437412587412583,
+      "loss": 3.2515,
+      "step": 56050
+    },
+    {
+      "epoch": 16.336653270429263,
+      "grad_norm": 0.36499130725860596,
+      "learning_rate": 0.0004041993006993007,
+      "loss": 3.2496,
+      "step": 56100
+    },
+    {
+      "epoch": 16.351214398042984,
+      "grad_norm": 0.3922365605831146,
+      "learning_rate": 0.00040402447552447554,
+      "loss": 3.2439,
+      "step": 56150
+    },
+    {
+      "epoch": 16.36577552565671,
+      "grad_norm": 0.3942036032676697,
+      "learning_rate": 0.0004038496503496503,
+      "loss": 3.2654,
+      "step": 56200
+    },
+    {
+      "epoch": 16.38033665327043,
+      "grad_norm": 0.3666972219944,
+      "learning_rate": 0.00040367482517482514,
+      "loss": 3.2619,
+      "step": 56250
+    },
+    {
+      "epoch": 16.39489778088415,
+      "grad_norm": 0.37733784317970276,
+      "learning_rate": 0.00040349999999999994,
+      "loss": 3.2648,
+      "step": 56300
+    },
+    {
+      "epoch": 16.409458908497875,
+      "grad_norm": 0.3841692805290222,
+      "learning_rate": 0.0004033251748251748,
+      "loss": 3.2814,
+      "step": 56350
+    },
+    {
+      "epoch": 16.424020036111596,
+      "grad_norm": 0.3820192217826843,
+      "learning_rate": 0.0004031503496503496,
+      "loss": 3.2705,
+      "step": 56400
+    },
+    {
+      "epoch": 16.43858116372532,
+      "grad_norm": 0.3828584551811218,
+      "learning_rate": 0.00040297552447552445,
+      "loss": 3.2728,
+      "step": 56450
+    },
+    {
+      "epoch": 16.45314229133904,
+      "grad_norm": 0.4050110876560211,
+      "learning_rate": 0.00040280069930069925,
+      "loss": 3.2735,
+      "step": 56500
+    },
+    {
+      "epoch": 16.467703418952762,
+      "grad_norm": 0.38829973340034485,
+      "learning_rate": 0.0004026258741258741,
+      "loss": 3.2735,
+      "step": 56550
+    },
+    {
+      "epoch": 16.482264546566487,
+      "grad_norm": 0.39007800817489624,
+      "learning_rate": 0.0004024510489510489,
+      "loss": 3.2612,
+      "step": 56600
+    },
+    {
+      "epoch": 16.496825674180208,
+      "grad_norm": 0.38671746850013733,
+      "learning_rate": 0.00040227622377622376,
+      "loss": 3.271,
+      "step": 56650
+    },
+    {
+      "epoch": 16.511386801793932,
+      "grad_norm": 0.43336355686187744,
+      "learning_rate": 0.00040210139860139856,
+      "loss": 3.2684,
+      "step": 56700
+    },
+    {
+      "epoch": 16.525947929407653,
+      "grad_norm": 0.36696234345436096,
+      "learning_rate": 0.0004019265734265734,
+      "loss": 3.2739,
+      "step": 56750
+    },
+    {
+      "epoch": 16.540509057021374,
+      "grad_norm": 0.3764398396015167,
+      "learning_rate": 0.0004017517482517482,
+      "loss": 3.2852,
+      "step": 56800
+    },
+    {
+      "epoch": 16.5550701846351,
+      "grad_norm": 0.3709736764431,
+      "learning_rate": 0.00040157692307692306,
+      "loss": 3.2853,
+      "step": 56850
+    },
+    {
+      "epoch": 16.56963131224882,
+      "grad_norm": 0.37827807664871216,
+      "learning_rate": 0.0004014020979020979,
+      "loss": 3.2614,
+      "step": 56900
+    },
+    {
+      "epoch": 16.584192439862544,
+      "grad_norm": 0.34628504514694214,
+      "learning_rate": 0.00040122727272727266,
+      "loss": 3.2794,
+      "step": 56950
+    },
+    {
+      "epoch": 16.598753567476265,
+      "grad_norm": 0.37481993436813354,
+      "learning_rate": 0.0004010524475524475,
+      "loss": 3.2732,
+      "step": 57000
+    },
+    {
+      "epoch": 16.598753567476265,
+      "eval_accuracy": 0.37256272141447566,
+      "eval_loss": 3.544576406478882,
+      "eval_runtime": 179.8109,
+      "eval_samples_per_second": 92.575,
+      "eval_steps_per_second": 5.789,
+      "step": 57000
+    },
+    {
+      "epoch": 16.613314695089986,
+      "grad_norm": 0.3837030231952667,
+      "learning_rate": 0.0004008776223776223,
+      "loss": 3.2913,
+      "step": 57050
+    },
+    {
+      "epoch": 16.62787582270371,
+      "grad_norm": 0.34924599528312683,
+      "learning_rate": 0.00040070279720279717,
+      "loss": 3.2843,
+      "step": 57100
+    },
+    {
+      "epoch": 16.642436950317432,
+      "grad_norm": 0.37274640798568726,
+      "learning_rate": 0.00040052797202797197,
+      "loss": 3.2824,
+      "step": 57150
+    },
+    {
+      "epoch": 16.656998077931156,
+      "grad_norm": 0.3472432792186737,
+      "learning_rate": 0.0004003531468531468,
+      "loss": 3.2826,
+      "step": 57200
+    },
+    {
+      "epoch": 16.671559205544877,
+      "grad_norm": 0.3804296553134918,
+      "learning_rate": 0.0004001783216783216,
+      "loss": 3.2813,
+      "step": 57250
+    },
+    {
+      "epoch": 16.6861203331586,
+      "grad_norm": 0.36797961592674255,
+      "learning_rate": 0.0004000034965034965,
+      "loss": 3.291,
+      "step": 57300
+    },
+    {
+      "epoch": 16.700681460772323,
+      "grad_norm": 0.3804137408733368,
+      "learning_rate": 0.0003998286713286713,
+      "loss": 3.2863,
+      "step": 57350
+    },
+    {
+      "epoch": 16.715242588386044,
+      "grad_norm": 0.3731144070625305,
+      "learning_rate": 0.00039965384615384613,
+      "loss": 3.302,
+      "step": 57400
+    },
+    {
+      "epoch": 16.72980371599977,
+      "grad_norm": 0.3816506266593933,
+      "learning_rate": 0.00039947902097902093,
+      "loss": 3.2834,
+      "step": 57450
+    },
+    {
+      "epoch": 16.74436484361349,
+      "grad_norm": 0.38200855255126953,
+      "learning_rate": 0.0003993041958041958,
+      "loss": 3.2961,
+      "step": 57500
+    },
+    {
+      "epoch": 16.75892597122721,
+      "grad_norm": 0.4178982377052307,
+      "learning_rate": 0.00039912937062937064,
+      "loss": 3.2958,
+      "step": 57550
+    },
+    {
+      "epoch": 16.773487098840935,
+      "grad_norm": 0.37672388553619385,
+      "learning_rate": 0.00039895454545454544,
+      "loss": 3.2928,
+      "step": 57600
+    },
+    {
+      "epoch": 16.788048226454656,
+      "grad_norm": 0.3759998381137848,
+      "learning_rate": 0.0003987797202797203,
+      "loss": 3.2941,
+      "step": 57650
+    },
+    {
+      "epoch": 16.80260935406838,
+      "grad_norm": 0.3826366066932678,
+      "learning_rate": 0.00039860489510489504,
+      "loss": 3.2902,
+      "step": 57700
+    },
+    {
+      "epoch": 16.8171704816821,
+      "grad_norm": 0.3507887125015259,
+      "learning_rate": 0.0003984300699300699,
+      "loss": 3.285,
+      "step": 57750
+    },
+    {
+      "epoch": 16.831731609295822,
+      "grad_norm": 0.3854213058948517,
+      "learning_rate": 0.0003982552447552447,
+      "loss": 3.2929,
+      "step": 57800
+    },
+    {
+      "epoch": 16.846292736909547,
+      "grad_norm": 0.37187182903289795,
+      "learning_rate": 0.00039808041958041955,
+      "loss": 3.2943,
+      "step": 57850
+    },
+    {
+      "epoch": 16.860853864523268,
+      "grad_norm": 0.38784974813461304,
+      "learning_rate": 0.00039790559440559435,
+      "loss": 3.2884,
+      "step": 57900
+    },
+    {
+      "epoch": 16.875414992136992,
+      "grad_norm": 0.3870326280593872,
+      "learning_rate": 0.0003977307692307692,
+      "loss": 3.2948,
+      "step": 57950
+    },
+    {
+      "epoch": 16.889976119750713,
+      "grad_norm": 0.35052213072776794,
+      "learning_rate": 0.000397555944055944,
+      "loss": 3.2869,
+      "step": 58000
+    },
+    {
+      "epoch": 16.889976119750713,
+      "eval_accuracy": 0.3729087081679913,
+      "eval_loss": 3.5367329120635986,
+      "eval_runtime": 179.95,
+      "eval_samples_per_second": 92.503,
+      "eval_steps_per_second": 5.785,
+      "step": 58000
+    },
+    {
+      "epoch": 16.904537247364434,
+      "grad_norm": 0.3954494297504425,
+      "learning_rate": 0.00039738111888111885,
+      "loss": 3.306,
+      "step": 58050
+    },
+    {
+      "epoch": 16.91909837497816,
+      "grad_norm": 0.3943544030189514,
+      "learning_rate": 0.00039720629370629365,
+      "loss": 3.293,
+      "step": 58100
+    },
+    {
+      "epoch": 16.93365950259188,
+      "grad_norm": 0.35315266251564026,
+      "learning_rate": 0.0003970314685314685,
+      "loss": 3.3036,
+      "step": 58150
+    },
+    {
+      "epoch": 16.948220630205604,
+      "grad_norm": 0.37690219283103943,
+      "learning_rate": 0.0003968566433566433,
+      "loss": 3.2983,
+      "step": 58200
+    },
+    {
+      "epoch": 16.962781757819325,
+      "grad_norm": 0.35092926025390625,
+      "learning_rate": 0.00039668181818181816,
+      "loss": 3.299,
+      "step": 58250
+    },
+    {
+      "epoch": 16.977342885433046,
+      "grad_norm": 0.3741217851638794,
+      "learning_rate": 0.000396506993006993,
+      "loss": 3.2969,
+      "step": 58300
+    },
+    {
+      "epoch": 16.99190401304677,
+      "grad_norm": 0.3852684199810028,
+      "learning_rate": 0.0003963321678321678,
+      "loss": 3.2911,
+      "step": 58350
+    },
+    {
+      "epoch": 17.006406896150036,
+      "grad_norm": 0.3808322846889496,
+      "learning_rate": 0.00039615734265734267,
+      "loss": 3.2516,
+      "step": 58400
+    },
+    {
+      "epoch": 17.02096802376376,
+      "grad_norm": 0.38001880049705505,
+      "learning_rate": 0.0003959825174825174,
+      "loss": 3.1904,
+      "step": 58450
+    },
+    {
+      "epoch": 17.03552915137748,
+      "grad_norm": 0.426193505525589,
+      "learning_rate": 0.00039580769230769227,
+      "loss": 3.1828,
+      "step": 58500
+    },
+    {
+      "epoch": 17.050090278991206,
+      "grad_norm": 0.4035714268684387,
+      "learning_rate": 0.00039563286713286707,
+      "loss": 3.2003,
+      "step": 58550
+    },
+    {
+      "epoch": 17.064651406604927,
+      "grad_norm": 0.40296247601509094,
+      "learning_rate": 0.0003954580419580419,
+      "loss": 3.2132,
+      "step": 58600
+    },
+    {
+      "epoch": 17.07921253421865,
+      "grad_norm": 0.36768728494644165,
+      "learning_rate": 0.0003952832167832167,
+      "loss": 3.2112,
+      "step": 58650
+    },
+    {
+      "epoch": 17.093773661832373,
+      "grad_norm": 0.38769692182540894,
+      "learning_rate": 0.0003951083916083916,
+      "loss": 3.2056,
+      "step": 58700
+    },
+    {
+      "epoch": 17.108334789446094,
+      "grad_norm": 0.5081406831741333,
+      "learning_rate": 0.0003949335664335664,
+      "loss": 3.2071,
+      "step": 58750
+    },
+    {
+      "epoch": 17.122895917059818,
+      "grad_norm": 0.3892911374568939,
+      "learning_rate": 0.00039475874125874123,
+      "loss": 3.2145,
+      "step": 58800
+    },
+    {
+      "epoch": 17.13745704467354,
+      "grad_norm": 0.3744818866252899,
+      "learning_rate": 0.00039458391608391603,
+      "loss": 3.2318,
+      "step": 58850
+    },
+    {
+      "epoch": 17.152018172287264,
+      "grad_norm": 0.3952346742153168,
+      "learning_rate": 0.0003944090909090909,
+      "loss": 3.2336,
+      "step": 58900
+    },
+    {
+      "epoch": 17.166579299900985,
+      "grad_norm": 0.38604477047920227,
+      "learning_rate": 0.00039423426573426573,
+      "loss": 3.2199,
+      "step": 58950
+    },
+    {
+      "epoch": 17.181140427514705,
+      "grad_norm": 0.3790982663631439,
+      "learning_rate": 0.00039405944055944053,
+      "loss": 3.2329,
+      "step": 59000
+    },
+    {
+      "epoch": 17.181140427514705,
+      "eval_accuracy": 0.37229738260962186,
+      "eval_loss": 3.550474166870117,
+      "eval_runtime": 179.7974,
+      "eval_samples_per_second": 92.582,
+      "eval_steps_per_second": 5.79,
+      "step": 59000
+    },
+    {
+      "epoch": 17.19570155512843,
+      "grad_norm": 0.3766101598739624,
+      "learning_rate": 0.0003938846153846154,
+      "loss": 3.2224,
+      "step": 59050
+    },
+    {
+      "epoch": 17.21026268274215,
+      "grad_norm": 0.3996836245059967,
+      "learning_rate": 0.0003937097902097902,
+      "loss": 3.2276,
+      "step": 59100
+    },
+    {
+      "epoch": 17.224823810355876,
+      "grad_norm": 0.3663976490497589,
+      "learning_rate": 0.00039353496503496504,
+      "loss": 3.2365,
+      "step": 59150
+    },
+    {
+      "epoch": 17.239384937969596,
+      "grad_norm": 0.3746318221092224,
+      "learning_rate": 0.0003933601398601398,
+      "loss": 3.2349,
+      "step": 59200
+    },
+    {
+      "epoch": 17.253946065583317,
+      "grad_norm": 0.3788388669490814,
+      "learning_rate": 0.00039318531468531464,
+      "loss": 3.2399,
+      "step": 59250
+    },
+    {
+      "epoch": 17.268507193197042,
+      "grad_norm": 0.38114452362060547,
+      "learning_rate": 0.00039301048951048944,
+      "loss": 3.2377,
+      "step": 59300
+    },
+    {
+      "epoch": 17.283068320810763,
+      "grad_norm": 0.38223931193351746,
+      "learning_rate": 0.0003928356643356643,
+      "loss": 3.2375,
+      "step": 59350
+    },
+    {
+      "epoch": 17.297629448424487,
+      "grad_norm": 0.3718387484550476,
+      "learning_rate": 0.0003926608391608391,
+      "loss": 3.2493,
+      "step": 59400
+    },
+    {
+      "epoch": 17.31219057603821,
+      "grad_norm": 0.4343428313732147,
+      "learning_rate": 0.00039248601398601395,
+      "loss": 3.2366,
+      "step": 59450
+    },
+    {
+      "epoch": 17.32675170365193,
+      "grad_norm": 0.40242546796798706,
+      "learning_rate": 0.00039231118881118875,
+      "loss": 3.2394,
+      "step": 59500
+    },
+    {
+      "epoch": 17.341312831265654,
+      "grad_norm": 0.3776035010814667,
+      "learning_rate": 0.0003921363636363636,
+      "loss": 3.2384,
+      "step": 59550
+    },
+    {
+      "epoch": 17.355873958879375,
+      "grad_norm": 0.3844600021839142,
+      "learning_rate": 0.00039196153846153846,
+      "loss": 3.2412,
+      "step": 59600
+    },
+    {
+      "epoch": 17.3704350864931,
+      "grad_norm": 0.374520480632782,
+      "learning_rate": 0.00039178671328671326,
+      "loss": 3.2549,
+      "step": 59650
+    },
+    {
+      "epoch": 17.38499621410682,
+      "grad_norm": 0.38012269139289856,
+      "learning_rate": 0.0003916118881118881,
+      "loss": 3.2481,
+      "step": 59700
+    },
+    {
+      "epoch": 17.39955734172054,
+      "grad_norm": 0.37167397141456604,
+      "learning_rate": 0.0003914370629370629,
+      "loss": 3.2559,
+      "step": 59750
+    },
+    {
+      "epoch": 17.414118469334266,
+      "grad_norm": 0.42666929960250854,
+      "learning_rate": 0.00039126223776223776,
+      "loss": 3.2479,
+      "step": 59800
+    },
+    {
+      "epoch": 17.428679596947987,
+      "grad_norm": 0.37930217385292053,
+      "learning_rate": 0.00039108741258741256,
+      "loss": 3.2507,
+      "step": 59850
+    },
+    {
+      "epoch": 17.44324072456171,
+      "grad_norm": 0.3739822506904602,
+      "learning_rate": 0.0003909125874125874,
+      "loss": 3.2631,
+      "step": 59900
+    },
+    {
+      "epoch": 17.457801852175432,
+      "grad_norm": 0.36338284611701965,
+      "learning_rate": 0.00039073776223776216,
+      "loss": 3.2534,
+      "step": 59950
+    },
+    {
+      "epoch": 17.472362979789153,
+      "grad_norm": 0.3643838167190552,
+      "learning_rate": 0.000390562937062937,
+      "loss": 3.2627,
+      "step": 60000
+    },
+    {
+      "epoch": 17.472362979789153,
+      "eval_accuracy": 0.372753407963644,
+      "eval_loss": 3.54595685005188,
+      "eval_runtime": 179.9118,
+      "eval_samples_per_second": 92.523,
+      "eval_steps_per_second": 5.786,
+      "step": 60000
+    },
+    {
+      "epoch": 17.486924107402878,
+      "grad_norm": 0.36303991079330444,
+      "learning_rate": 0.0003903881118881118,
+      "loss": 3.2644,
+      "step": 60050
+    },
+    {
+      "epoch": 17.5014852350166,
+      "grad_norm": 0.3679315149784088,
+      "learning_rate": 0.00039021328671328667,
+      "loss": 3.2594,
+      "step": 60100
+    },
+    {
+      "epoch": 17.516046362630323,
+      "grad_norm": 0.3789927363395691,
+      "learning_rate": 0.00039003846153846147,
+      "loss": 3.2575,
+      "step": 60150
+    },
+    {
+      "epoch": 17.530607490244044,
+      "grad_norm": 0.380993127822876,
+      "learning_rate": 0.0003898636363636363,
+      "loss": 3.2592,
+      "step": 60200
+    },
+    {
+      "epoch": 17.545168617857765,
+      "grad_norm": 0.36757829785346985,
+      "learning_rate": 0.0003896888111888111,
+      "loss": 3.2626,
+      "step": 60250
+    },
+    {
+      "epoch": 17.55972974547149,
+      "grad_norm": 0.39095836877822876,
+      "learning_rate": 0.000389513986013986,
+      "loss": 3.2766,
+      "step": 60300
+    },
+    {
+      "epoch": 17.57429087308521,
+      "grad_norm": 0.3795633912086487,
+      "learning_rate": 0.00038933916083916083,
+      "loss": 3.2637,
+      "step": 60350
+    },
+    {
+      "epoch": 17.588852000698935,
+      "grad_norm": 0.3707791268825531,
+      "learning_rate": 0.00038916433566433563,
+      "loss": 3.276,
+      "step": 60400
+    },
+    {
+      "epoch": 17.603413128312656,
+      "grad_norm": 0.38798531889915466,
+      "learning_rate": 0.0003889895104895105,
+      "loss": 3.2725,
+      "step": 60450
+    },
+    {
+      "epoch": 17.617974255926377,
+      "grad_norm": 0.3834350109100342,
+      "learning_rate": 0.0003888146853146853,
+      "loss": 3.2726,
+      "step": 60500
+    },
+    {
+      "epoch": 17.6325353835401,
+      "grad_norm": 0.36958327889442444,
+      "learning_rate": 0.00038863986013986014,
+      "loss": 3.2846,
+      "step": 60550
+    },
+    {
+      "epoch": 17.647096511153823,
+      "grad_norm": 0.4254196584224701,
+      "learning_rate": 0.00038846503496503494,
+      "loss": 3.265,
+      "step": 60600
+    },
+    {
+      "epoch": 17.661657638767547,
+      "grad_norm": 0.35333532094955444,
+      "learning_rate": 0.0003882902097902098,
+      "loss": 3.2839,
+      "step": 60650
+    },
+    {
+      "epoch": 17.676218766381268,
+      "grad_norm": 0.4240613579750061,
+      "learning_rate": 0.00038811538461538454,
+      "loss": 3.2719,
+      "step": 60700
+    },
+    {
+      "epoch": 17.690779893994993,
+      "grad_norm": 0.4057902991771698,
+      "learning_rate": 0.0003879405594405594,
+      "loss": 3.2586,
+      "step": 60750
+    },
+    {
+      "epoch": 17.705341021608714,
+      "grad_norm": 0.42154860496520996,
+      "learning_rate": 0.0003877657342657342,
+      "loss": 3.2783,
+      "step": 60800
+    },
+    {
+      "epoch": 17.719902149222435,
+      "grad_norm": 0.37844952940940857,
+      "learning_rate": 0.00038759090909090905,
+      "loss": 3.271,
+      "step": 60850
+    },
+    {
+      "epoch": 17.73446327683616,
+      "grad_norm": 0.38174551725387573,
+      "learning_rate": 0.00038741608391608384,
+      "loss": 3.275,
+      "step": 60900
+    },
+    {
+      "epoch": 17.74902440444988,
+      "grad_norm": 0.36706846952438354,
+      "learning_rate": 0.0003872412587412587,
+      "loss": 3.2777,
+      "step": 60950
+    },
+    {
+      "epoch": 17.763585532063605,
+      "grad_norm": 0.37525689601898193,
+      "learning_rate": 0.00038706643356643355,
+      "loss": 3.2649,
+      "step": 61000
+    },
+    {
+      "epoch": 17.763585532063605,
+      "eval_accuracy": 0.3734166961944749,
+      "eval_loss": 3.5382072925567627,
+      "eval_runtime": 179.9566,
+      "eval_samples_per_second": 92.5,
+      "eval_steps_per_second": 5.785,
+      "step": 61000
+    },
+    {
+      "epoch": 17.778146659677326,
+      "grad_norm": 0.40704724192619324,
+      "learning_rate": 0.00038689160839160835,
+      "loss": 3.2755,
+      "step": 61050
+    },
+    {
+      "epoch": 17.792707787291047,
+      "grad_norm": 0.3714872896671295,
+      "learning_rate": 0.0003867167832167832,
+      "loss": 3.2766,
+      "step": 61100
+    },
+    {
+      "epoch": 17.80726891490477,
+      "grad_norm": 0.3932179808616638,
+      "learning_rate": 0.000386541958041958,
+      "loss": 3.2844,
+      "step": 61150
+    },
+    {
+      "epoch": 17.821830042518492,
+      "grad_norm": 0.42021384835243225,
+      "learning_rate": 0.00038636713286713286,
+      "loss": 3.2882,
+      "step": 61200
+    },
+    {
+      "epoch": 17.836391170132217,
+      "grad_norm": 0.39331310987472534,
+      "learning_rate": 0.00038619230769230766,
+      "loss": 3.2748,
+      "step": 61250
+    },
+    {
+      "epoch": 17.850952297745938,
+      "grad_norm": 0.36521244049072266,
+      "learning_rate": 0.0003860174825174825,
+      "loss": 3.2788,
+      "step": 61300
+    },
+    {
+      "epoch": 17.86551342535966,
+      "grad_norm": 0.3976779580116272,
+      "learning_rate": 0.0003858426573426573,
+      "loss": 3.2742,
+      "step": 61350
+    },
+    {
+      "epoch": 17.880074552973383,
+      "grad_norm": 0.37855014204978943,
+      "learning_rate": 0.00038566783216783217,
+      "loss": 3.3008,
+      "step": 61400
+    },
+    {
+      "epoch": 17.894635680587104,
+      "grad_norm": 0.40165817737579346,
+      "learning_rate": 0.0003854930069930069,
+      "loss": 3.2812,
+      "step": 61450
+    },
+    {
+      "epoch": 17.90919680820083,
+      "grad_norm": 0.3996519148349762,
+      "learning_rate": 0.00038531818181818177,
+      "loss": 3.2909,
+      "step": 61500
+    },
+    {
+      "epoch": 17.92375793581455,
+      "grad_norm": 0.37104541063308716,
+      "learning_rate": 0.00038514335664335657,
+      "loss": 3.2924,
+      "step": 61550
+    },
+    {
+      "epoch": 17.93831906342827,
+      "grad_norm": 0.37623950839042664,
+      "learning_rate": 0.0003849685314685314,
+      "loss": 3.2858,
+      "step": 61600
+    },
+    {
+      "epoch": 17.952880191041995,
+      "grad_norm": 0.393968790769577,
+      "learning_rate": 0.0003847937062937062,
+      "loss": 3.2893,
+      "step": 61650
+    },
+    {
+      "epoch": 17.967441318655716,
+      "grad_norm": 0.42417454719543457,
+      "learning_rate": 0.0003846188811188811,
+      "loss": 3.2834,
+      "step": 61700
+    },
+    {
+      "epoch": 17.98200244626944,
+      "grad_norm": 0.36883294582366943,
+      "learning_rate": 0.00038444405594405593,
+      "loss": 3.2832,
+      "step": 61750
+    },
+    {
+      "epoch": 17.99656357388316,
+      "grad_norm": 0.3445383608341217,
+      "learning_rate": 0.00038426923076923073,
+      "loss": 3.2962,
+      "step": 61800
+    },
+    {
+      "epoch": 18.01106645698643,
+      "grad_norm": 0.38899731636047363,
+      "learning_rate": 0.0003840944055944056,
+      "loss": 3.201,
+      "step": 61850
+    },
+    {
+      "epoch": 18.02562758460015,
+      "grad_norm": 0.389384001493454,
+      "learning_rate": 0.0003839195804195804,
+      "loss": 3.1656,
+      "step": 61900
+    },
+    {
+      "epoch": 18.040188712213872,
+      "grad_norm": 0.3927420973777771,
+      "learning_rate": 0.00038374475524475523,
+      "loss": 3.1787,
+      "step": 61950
+    },
+    {
+      "epoch": 18.054749839827597,
+      "grad_norm": 0.37237685918807983,
+      "learning_rate": 0.00038356993006993003,
+      "loss": 3.1859,
+      "step": 62000
+    },
+    {
+      "epoch": 18.054749839827597,
+      "eval_accuracy": 0.3727140244901721,
+      "eval_loss": 3.55122709274292,
+      "eval_runtime": 179.7522,
+      "eval_samples_per_second": 92.605,
+      "eval_steps_per_second": 5.791,
+      "step": 62000
+    },
+    {
+      "epoch": 18.069310967441318,
+      "grad_norm": 0.3972647488117218,
+      "learning_rate": 0.0003833951048951049,
+      "loss": 3.1965,
+      "step": 62050
+    },
+    {
+      "epoch": 18.083872095055042,
+      "grad_norm": 0.413581520318985,
+      "learning_rate": 0.0003832202797202797,
+      "loss": 3.1931,
+      "step": 62100
+    },
+    {
+      "epoch": 18.098433222668763,
+      "grad_norm": 0.3956906199455261,
+      "learning_rate": 0.00038304545454545454,
+      "loss": 3.2077,
+      "step": 62150
+    },
+    {
+      "epoch": 18.112994350282484,
+      "grad_norm": 0.4045657217502594,
+      "learning_rate": 0.0003828706293706293,
+      "loss": 3.1915,
+      "step": 62200
+    },
+    {
+      "epoch": 18.12755547789621,
+      "grad_norm": 0.37881526350975037,
+      "learning_rate": 0.00038269580419580414,
+      "loss": 3.2048,
+      "step": 62250
+    },
+    {
+      "epoch": 18.14211660550993,
+      "grad_norm": 0.4110286235809326,
+      "learning_rate": 0.00038252097902097894,
+      "loss": 3.206,
+      "step": 62300
+    },
+    {
+      "epoch": 18.156677733123654,
+      "grad_norm": 0.3890237510204315,
+      "learning_rate": 0.0003823461538461538,
+      "loss": 3.2024,
+      "step": 62350
+    },
+    {
+      "epoch": 18.171238860737375,
+      "grad_norm": 0.3655931055545807,
+      "learning_rate": 0.00038217132867132865,
+      "loss": 3.2136,
+      "step": 62400
+    },
+    {
+      "epoch": 18.185799988351096,
+      "grad_norm": 0.41427183151245117,
+      "learning_rate": 0.00038199650349650345,
+      "loss": 3.2244,
+      "step": 62450
+    },
+    {
+      "epoch": 18.20036111596482,
+      "grad_norm": 0.4075052738189697,
+      "learning_rate": 0.0003818216783216783,
+      "loss": 3.2025,
+      "step": 62500
+    },
+    {
+      "epoch": 18.214922243578542,
+      "grad_norm": 0.376781702041626,
+      "learning_rate": 0.0003816468531468531,
+      "loss": 3.2109,
+      "step": 62550
+    },
+    {
+      "epoch": 18.229483371192266,
+      "grad_norm": 0.3829306662082672,
+      "learning_rate": 0.00038147202797202796,
+      "loss": 3.236,
+      "step": 62600
+    },
+    {
+      "epoch": 18.244044498805987,
+      "grad_norm": 0.4079030156135559,
+      "learning_rate": 0.00038129720279720276,
+      "loss": 3.2274,
+      "step": 62650
+    },
+    {
+      "epoch": 18.25860562641971,
+      "grad_norm": 0.4116332530975342,
+      "learning_rate": 0.0003811223776223776,
+      "loss": 3.2281,
+      "step": 62700
+    },
+    {
+      "epoch": 18.273166754033433,
+      "grad_norm": 0.36425337195396423,
+      "learning_rate": 0.0003809475524475524,
+      "loss": 3.2316,
+      "step": 62750
+    },
+    {
+      "epoch": 18.287727881647154,
+      "grad_norm": 0.3588009178638458,
+      "learning_rate": 0.00038077272727272726,
+      "loss": 3.237,
+      "step": 62800
+    },
+    {
+      "epoch": 18.30228900926088,
+      "grad_norm": 0.38670557737350464,
+      "learning_rate": 0.00038059790209790206,
+      "loss": 3.2351,
+      "step": 62850
+    },
+    {
+      "epoch": 18.3168501368746,
+      "grad_norm": 0.3748593032360077,
+      "learning_rate": 0.0003804230769230769,
+      "loss": 3.2318,
+      "step": 62900
+    },
+    {
+      "epoch": 18.33141126448832,
+      "grad_norm": 0.3898875117301941,
+      "learning_rate": 0.00038024825174825166,
+      "loss": 3.2381,
+      "step": 62950
+    },
+    {
+      "epoch": 18.345972392102045,
+      "grad_norm": 0.40329083800315857,
+      "learning_rate": 0.0003800734265734265,
+      "loss": 3.2295,
+      "step": 63000
+    },
+    {
+      "epoch": 18.345972392102045,
+      "eval_accuracy": 0.3723687431123007,
+      "eval_loss": 3.5525670051574707,
+      "eval_runtime": 179.8597,
+      "eval_samples_per_second": 92.55,
+      "eval_steps_per_second": 5.788,
+      "step": 63000
+    },
+    {
+      "epoch": 18.360533519715766,
+      "grad_norm": 0.37379077076911926,
+      "learning_rate": 0.0003798986013986013,
+      "loss": 3.2425,
+      "step": 63050
+    },
+    {
+      "epoch": 18.37509464732949,
+      "grad_norm": 0.38135528564453125,
+      "learning_rate": 0.00037972377622377617,
+      "loss": 3.2355,
+      "step": 63100
+    },
+    {
+      "epoch": 18.38965577494321,
+      "grad_norm": 0.3832929730415344,
+      "learning_rate": 0.000379548951048951,
+      "loss": 3.2411,
+      "step": 63150
+    },
+    {
+      "epoch": 18.404216902556932,
+      "grad_norm": 0.38185015320777893,
+      "learning_rate": 0.0003793741258741258,
+      "loss": 3.2397,
+      "step": 63200
+    },
+    {
+      "epoch": 18.418778030170657,
+      "grad_norm": 0.40947502851486206,
+      "learning_rate": 0.0003791993006993007,
+      "loss": 3.2413,
+      "step": 63250
+    },
+    {
+      "epoch": 18.433339157784378,
+      "grad_norm": 0.4148065745830536,
+      "learning_rate": 0.0003790244755244755,
+      "loss": 3.2406,
+      "step": 63300
+    },
+    {
+      "epoch": 18.447900285398102,
+      "grad_norm": 0.37160590291023254,
+      "learning_rate": 0.00037884965034965033,
+      "loss": 3.2575,
+      "step": 63350
+    },
+    {
+      "epoch": 18.462461413011823,
+      "grad_norm": 0.3780807852745056,
+      "learning_rate": 0.00037867482517482513,
+      "loss": 3.2459,
+      "step": 63400
+    },
+    {
+      "epoch": 18.477022540625548,
+      "grad_norm": 0.37864765524864197,
+      "learning_rate": 0.0003785,
+      "loss": 3.235,
+      "step": 63450
+    },
+    {
+      "epoch": 18.49158366823927,
+      "grad_norm": 0.393699049949646,
+      "learning_rate": 0.0003783251748251748,
+      "loss": 3.2458,
+      "step": 63500
+    },
+    {
+      "epoch": 18.50614479585299,
+      "grad_norm": 0.40041160583496094,
+      "learning_rate": 0.00037815034965034964,
+      "loss": 3.2509,
+      "step": 63550
+    },
+    {
+      "epoch": 18.520705923466714,
+      "grad_norm": 0.4013485014438629,
+      "learning_rate": 0.00037797552447552444,
+      "loss": 3.2516,
+      "step": 63600
+    },
+    {
+      "epoch": 18.535267051080435,
+      "grad_norm": 0.3757307231426239,
+      "learning_rate": 0.0003778006993006993,
+      "loss": 3.2674,
+      "step": 63650
+    },
+    {
+      "epoch": 18.54982817869416,
+      "grad_norm": 0.392844557762146,
+      "learning_rate": 0.00037762587412587404,
+      "loss": 3.2657,
+      "step": 63700
+    },
+    {
+      "epoch": 18.56438930630788,
+      "grad_norm": 0.425006628036499,
+      "learning_rate": 0.0003774510489510489,
+      "loss": 3.2543,
+      "step": 63750
+    },
+    {
+      "epoch": 18.5789504339216,
+      "grad_norm": 0.3740367591381073,
+      "learning_rate": 0.0003772762237762238,
+      "loss": 3.251,
+      "step": 63800
+    },
+    {
+      "epoch": 18.593511561535326,
+      "grad_norm": 0.40788206458091736,
+      "learning_rate": 0.00037710139860139854,
+      "loss": 3.2649,
+      "step": 63850
+    },
+    {
+      "epoch": 18.608072689149047,
+      "grad_norm": 0.3888489007949829,
+      "learning_rate": 0.0003769265734265734,
+      "loss": 3.2551,
+      "step": 63900
+    },
+    {
+      "epoch": 18.62263381676277,
+      "grad_norm": 0.3995554447174072,
+      "learning_rate": 0.0003767517482517482,
+      "loss": 3.2607,
+      "step": 63950
+    },
+    {
+      "epoch": 18.637194944376493,
+      "grad_norm": 0.3996911942958832,
+      "learning_rate": 0.00037657692307692305,
+      "loss": 3.253,
+      "step": 64000
+    },
+    {
+      "epoch": 18.637194944376493,
+      "eval_accuracy": 0.3733275837380818,
+      "eval_loss": 3.5378425121307373,
+      "eval_runtime": 179.5439,
+      "eval_samples_per_second": 92.713,
+      "eval_steps_per_second": 5.798,
+      "step": 64000
+    },
+    {
+      "epoch": 18.651756071990214,
+      "grad_norm": 0.39029937982559204,
+      "learning_rate": 0.00037640209790209785,
+      "loss": 3.2797,
+      "step": 64050
+    },
+    {
+      "epoch": 18.666317199603938,
+      "grad_norm": 0.37387433648109436,
+      "learning_rate": 0.0003762272727272727,
+      "loss": 3.2686,
+      "step": 64100
+    },
+    {
+      "epoch": 18.68087832721766,
+      "grad_norm": 0.37229660153388977,
+      "learning_rate": 0.0003760524475524475,
+      "loss": 3.2552,
+      "step": 64150
+    },
+    {
+      "epoch": 18.695439454831384,
+      "grad_norm": 0.38790515065193176,
+      "learning_rate": 0.00037587762237762236,
+      "loss": 3.2541,
+      "step": 64200
+    },
+    {
+      "epoch": 18.710000582445105,
+      "grad_norm": 0.4044416844844818,
+      "learning_rate": 0.00037570279720279716,
+      "loss": 3.2744,
+      "step": 64250
+    },
+    {
+      "epoch": 18.724561710058826,
+      "grad_norm": 0.39733484387397766,
+      "learning_rate": 0.000375527972027972,
+      "loss": 3.2686,
+      "step": 64300
+    },
+    {
+      "epoch": 18.73912283767255,
+      "grad_norm": 0.38480284810066223,
+      "learning_rate": 0.0003753531468531468,
+      "loss": 3.2625,
+      "step": 64350
+    },
+    {
+      "epoch": 18.75368396528627,
+      "grad_norm": 0.4033631682395935,
+      "learning_rate": 0.00037517832167832167,
+      "loss": 3.2693,
+      "step": 64400
+    },
+    {
+      "epoch": 18.768245092899996,
+      "grad_norm": 0.41612687706947327,
+      "learning_rate": 0.0003750034965034965,
+      "loss": 3.2638,
+      "step": 64450
+    },
+    {
+      "epoch": 18.782806220513717,
+      "grad_norm": 0.40416860580444336,
+      "learning_rate": 0.00037482867132867127,
+      "loss": 3.2583,
+      "step": 64500
+    },
+    {
+      "epoch": 18.797367348127437,
+      "grad_norm": 0.3601404130458832,
+      "learning_rate": 0.0003746538461538462,
+      "loss": 3.2644,
+      "step": 64550
+    },
+    {
+      "epoch": 18.811928475741162,
+      "grad_norm": 0.3742053508758545,
+      "learning_rate": 0.0003744790209790209,
+      "loss": 3.2667,
+      "step": 64600
+    },
+    {
+      "epoch": 18.826489603354883,
+      "grad_norm": 0.37909430265426636,
+      "learning_rate": 0.0003743041958041958,
+      "loss": 3.2681,
+      "step": 64650
+    },
+    {
+      "epoch": 18.841050730968607,
+      "grad_norm": 0.40138882398605347,
+      "learning_rate": 0.0003741293706293706,
+      "loss": 3.2699,
+      "step": 64700
+    },
+    {
+      "epoch": 18.85561185858233,
+      "grad_norm": 0.42691561579704285,
+      "learning_rate": 0.0003739545454545454,
+      "loss": 3.2697,
+      "step": 64750
+    },
+    {
+      "epoch": 18.87017298619605,
+      "grad_norm": 0.378101110458374,
+      "learning_rate": 0.0003737797202797202,
+      "loss": 3.2754,
+      "step": 64800
+    },
+    {
+      "epoch": 18.884734113809774,
+      "grad_norm": 0.40638959407806396,
+      "learning_rate": 0.0003736048951048951,
+      "loss": 3.2699,
+      "step": 64850
+    },
+    {
+      "epoch": 18.899295241423495,
+      "grad_norm": 0.371693879365921,
+      "learning_rate": 0.0003734300699300699,
+      "loss": 3.2768,
+      "step": 64900
+    },
+    {
+      "epoch": 18.91385636903722,
+      "grad_norm": 0.38783252239227295,
+      "learning_rate": 0.00037325524475524473,
+      "loss": 3.2732,
+      "step": 64950
+    },
+    {
+      "epoch": 18.92841749665094,
+      "grad_norm": 0.3685288727283478,
+      "learning_rate": 0.00037308041958041953,
+      "loss": 3.2776,
+      "step": 65000
+    },
+    {
+      "epoch": 18.92841749665094,
+      "eval_accuracy": 0.3736479418431889,
+      "eval_loss": 3.5299794673919678,
+      "eval_runtime": 179.7636,
+      "eval_samples_per_second": 92.599,
+      "eval_steps_per_second": 5.791,
+      "step": 65000
+    },
+    {
+      "epoch": 18.94297862426466,
+      "grad_norm": 0.39574894309043884,
+      "learning_rate": 0.0003729055944055944,
+      "loss": 3.2666,
+      "step": 65050
+    },
+    {
+      "epoch": 18.957539751878386,
+      "grad_norm": 0.3749750852584839,
+      "learning_rate": 0.0003727307692307692,
+      "loss": 3.272,
+      "step": 65100
+    },
+    {
+      "epoch": 18.972100879492107,
+      "grad_norm": 0.41102007031440735,
+      "learning_rate": 0.00037255594405594404,
+      "loss": 3.2792,
+      "step": 65150
+    },
+    {
+      "epoch": 18.98666200710583,
+      "grad_norm": 0.39592432975769043,
+      "learning_rate": 0.0003723811188811189,
+      "loss": 3.2632,
+      "step": 65200
+    },
+    {
+      "epoch": 19.001164890209097,
+      "grad_norm": 0.4055386781692505,
+      "learning_rate": 0.00037220629370629364,
+      "loss": 3.272,
+      "step": 65250
+    },
+    {
+      "epoch": 19.01572601782282,
+      "grad_norm": 0.4150431752204895,
+      "learning_rate": 0.00037203146853146855,
+      "loss": 3.1721,
+      "step": 65300
+    },
+    {
+      "epoch": 19.030287145436542,
+      "grad_norm": 0.36770737171173096,
+      "learning_rate": 0.0003718566433566433,
+      "loss": 3.1721,
+      "step": 65350
+    },
+    {
+      "epoch": 19.044848273050263,
+      "grad_norm": 0.38085243105888367,
+      "learning_rate": 0.00037168181818181815,
+      "loss": 3.1797,
+      "step": 65400
+    },
+    {
+      "epoch": 19.059409400663988,
+      "grad_norm": 0.4273834228515625,
+      "learning_rate": 0.00037150699300699295,
+      "loss": 3.1857,
+      "step": 65450
+    },
+    {
+      "epoch": 19.07397052827771,
+      "grad_norm": 0.395258367061615,
+      "learning_rate": 0.0003713321678321678,
+      "loss": 3.1715,
+      "step": 65500
+    },
+    {
+      "epoch": 19.088531655891433,
+      "grad_norm": 0.44449496269226074,
+      "learning_rate": 0.0003711573426573426,
+      "loss": 3.1888,
+      "step": 65550
+    },
+    {
+      "epoch": 19.103092783505154,
+      "grad_norm": 0.4193888306617737,
+      "learning_rate": 0.00037098251748251746,
+      "loss": 3.2022,
+      "step": 65600
+    },
+    {
+      "epoch": 19.11765391111888,
+      "grad_norm": 0.42027559876441956,
+      "learning_rate": 0.00037080769230769226,
+      "loss": 3.1978,
+      "step": 65650
+    },
+    {
+      "epoch": 19.1322150387326,
+      "grad_norm": 0.37804079055786133,
+      "learning_rate": 0.0003706328671328671,
+      "loss": 3.1873,
+      "step": 65700
+    },
+    {
+      "epoch": 19.14677616634632,
+      "grad_norm": 0.3857940137386322,
+      "learning_rate": 0.0003704580419580419,
+      "loss": 3.1962,
+      "step": 65750
+    },
+    {
+      "epoch": 19.161337293960045,
+      "grad_norm": 0.3942515254020691,
+      "learning_rate": 0.00037028321678321676,
+      "loss": 3.2013,
+      "step": 65800
+    },
+    {
+      "epoch": 19.175898421573766,
+      "grad_norm": 0.41163113713264465,
+      "learning_rate": 0.0003701083916083916,
+      "loss": 3.2061,
+      "step": 65850
+    },
+    {
+      "epoch": 19.19045954918749,
+      "grad_norm": 0.3958335220813751,
+      "learning_rate": 0.0003699335664335664,
+      "loss": 3.1975,
+      "step": 65900
+    },
+    {
+      "epoch": 19.20502067680121,
+      "grad_norm": 0.4099127948284149,
+      "learning_rate": 0.00036975874125874127,
+      "loss": 3.1951,
+      "step": 65950
+    },
+    {
+      "epoch": 19.219581804414933,
+      "grad_norm": 0.3889888525009155,
+      "learning_rate": 0.00036958391608391607,
+      "loss": 3.2092,
+      "step": 66000
+    },
+    {
+      "epoch": 19.219581804414933,
+      "eval_accuracy": 0.37289683434464604,
+      "eval_loss": 3.550581455230713,
+      "eval_runtime": 179.8286,
+      "eval_samples_per_second": 92.566,
+      "eval_steps_per_second": 5.789,
+      "step": 66000
+    },
+    {
+      "epoch": 19.234142932028657,
+      "grad_norm": 0.43066680431365967,
+      "learning_rate": 0.0003694090909090909,
+      "loss": 3.2092,
+      "step": 66050
+    },
+    {
+      "epoch": 19.248704059642378,
+      "grad_norm": 0.39301857352256775,
+      "learning_rate": 0.00036923426573426567,
+      "loss": 3.2095,
+      "step": 66100
+    },
+    {
+      "epoch": 19.263265187256103,
+      "grad_norm": 0.39563480019569397,
+      "learning_rate": 0.0003690594405594405,
+      "loss": 3.2217,
+      "step": 66150
+    },
+    {
+      "epoch": 19.277826314869824,
+      "grad_norm": 0.4087700843811035,
+      "learning_rate": 0.0003688846153846153,
+      "loss": 3.2138,
+      "step": 66200
+    },
+    {
+      "epoch": 19.292387442483545,
+      "grad_norm": 0.39682114124298096,
+      "learning_rate": 0.0003687097902097902,
+      "loss": 3.2162,
+      "step": 66250
+    },
+    {
+      "epoch": 19.30694857009727,
+      "grad_norm": 0.4453403949737549,
+      "learning_rate": 0.000368534965034965,
+      "loss": 3.2261,
+      "step": 66300
+    },
+    {
+      "epoch": 19.32150969771099,
+      "grad_norm": 0.3561858534812927,
+      "learning_rate": 0.00036836013986013983,
+      "loss": 3.2254,
+      "step": 66350
+    },
+    {
+      "epoch": 19.336070825324715,
+      "grad_norm": 0.3951205313205719,
+      "learning_rate": 0.00036818531468531463,
+      "loss": 3.225,
+      "step": 66400
+    },
+    {
+      "epoch": 19.350631952938436,
+      "grad_norm": 0.4216971695423126,
+      "learning_rate": 0.0003680104895104895,
+      "loss": 3.2389,
+      "step": 66450
+    },
+    {
+      "epoch": 19.365193080552157,
+      "grad_norm": 0.37545275688171387,
+      "learning_rate": 0.0003678356643356643,
+      "loss": 3.218,
+      "step": 66500
+    },
+    {
+      "epoch": 19.37975420816588,
+      "grad_norm": 0.41713809967041016,
+      "learning_rate": 0.00036766083916083914,
+      "loss": 3.2371,
+      "step": 66550
+    },
+    {
+      "epoch": 19.394315335779602,
+      "grad_norm": 0.41936424374580383,
+      "learning_rate": 0.000367486013986014,
+      "loss": 3.2311,
+      "step": 66600
+    },
+    {
+      "epoch": 19.408876463393327,
+      "grad_norm": 0.4337376058101654,
+      "learning_rate": 0.0003673111888111888,
+      "loss": 3.2338,
+      "step": 66650
+    },
+    {
+      "epoch": 19.423437591007048,
+      "grad_norm": 0.3993789553642273,
+      "learning_rate": 0.00036713636363636365,
+      "loss": 3.2278,
+      "step": 66700
+    },
+    {
+      "epoch": 19.43799871862077,
+      "grad_norm": 0.40916818380355835,
+      "learning_rate": 0.00036696153846153844,
+      "loss": 3.2349,
+      "step": 66750
+    },
+    {
+      "epoch": 19.452559846234493,
+      "grad_norm": 0.41507256031036377,
+      "learning_rate": 0.0003667867132867133,
+      "loss": 3.2452,
+      "step": 66800
+    },
+    {
+      "epoch": 19.467120973848214,
+      "grad_norm": 0.407402366399765,
+      "learning_rate": 0.00036661188811188804,
+      "loss": 3.2273,
+      "step": 66850
+    },
+    {
+      "epoch": 19.48168210146194,
+      "grad_norm": 0.393828809261322,
+      "learning_rate": 0.0003664370629370629,
+      "loss": 3.2292,
+      "step": 66900
+    },
+    {
+      "epoch": 19.49624322907566,
+      "grad_norm": 0.3818491995334625,
+      "learning_rate": 0.0003662622377622377,
+      "loss": 3.2383,
+      "step": 66950
+    },
+    {
+      "epoch": 19.51080435668938,
+      "grad_norm": 0.3816803991794586,
+      "learning_rate": 0.00036608741258741255,
+      "loss": 3.2554,
+      "step": 67000
+    },
+    {
+      "epoch": 19.51080435668938,
+      "eval_accuracy": 0.3730255653997258,
+      "eval_loss": 3.5415730476379395,
+      "eval_runtime": 179.7245,
+      "eval_samples_per_second": 92.62,
+      "eval_steps_per_second": 5.792,
+      "step": 67000
+    },
+    {
+      "epoch": 19.525365484303105,
+      "grad_norm": 0.41924670338630676,
+      "learning_rate": 0.00036591258741258735,
+      "loss": 3.2414,
+      "step": 67050
+    },
+    {
+      "epoch": 19.539926611916826,
+      "grad_norm": 0.38373491168022156,
+      "learning_rate": 0.0003657377622377622,
+      "loss": 3.2434,
+      "step": 67100
+    },
+    {
+      "epoch": 19.55448773953055,
+      "grad_norm": 0.39270487427711487,
+      "learning_rate": 0.000365562937062937,
+      "loss": 3.2356,
+      "step": 67150
+    },
+    {
+      "epoch": 19.56904886714427,
+      "grad_norm": 0.39708688855171204,
+      "learning_rate": 0.00036538811188811186,
+      "loss": 3.2399,
+      "step": 67200
+    },
+    {
+      "epoch": 19.583609994757992,
+      "grad_norm": 0.4171540439128876,
+      "learning_rate": 0.0003652132867132867,
+      "loss": 3.2436,
+      "step": 67250
+    },
+    {
+      "epoch": 19.598171122371717,
+      "grad_norm": 0.39421361684799194,
+      "learning_rate": 0.0003650384615384615,
+      "loss": 3.2415,
+      "step": 67300
+    },
+    {
+      "epoch": 19.612732249985438,
+      "grad_norm": 0.39570003747940063,
+      "learning_rate": 0.00036486363636363637,
+      "loss": 3.2317,
+      "step": 67350
+    },
+    {
+      "epoch": 19.627293377599162,
+      "grad_norm": 0.40717577934265137,
+      "learning_rate": 0.00036468881118881117,
+      "loss": 3.242,
+      "step": 67400
+    },
+    {
+      "epoch": 19.641854505212883,
+      "grad_norm": 0.41312843561172485,
+      "learning_rate": 0.000364513986013986,
+      "loss": 3.2523,
+      "step": 67450
+    },
+    {
+      "epoch": 19.656415632826604,
+      "grad_norm": 0.430477499961853,
+      "learning_rate": 0.0003643391608391608,
+      "loss": 3.2413,
+      "step": 67500
+    },
+    {
+      "epoch": 19.67097676044033,
+      "grad_norm": 0.3858359456062317,
+      "learning_rate": 0.0003641643356643357,
+      "loss": 3.2544,
+      "step": 67550
+    },
+    {
+      "epoch": 19.68553788805405,
+      "grad_norm": 0.38897889852523804,
+      "learning_rate": 0.0003639895104895104,
+      "loss": 3.2468,
+      "step": 67600
+    },
+    {
+      "epoch": 19.700099015667774,
+      "grad_norm": 0.40000104904174805,
+      "learning_rate": 0.0003638146853146853,
+      "loss": 3.26,
+      "step": 67650
+    },
+    {
+      "epoch": 19.714660143281495,
+      "grad_norm": 0.3826768398284912,
+      "learning_rate": 0.00036363986013986007,
+      "loss": 3.2475,
+      "step": 67700
+    },
+    {
+      "epoch": 19.729221270895216,
+      "grad_norm": 0.41855689883232117,
+      "learning_rate": 0.0003634650349650349,
+      "loss": 3.2509,
+      "step": 67750
+    },
+    {
+      "epoch": 19.74378239850894,
+      "grad_norm": 0.39253532886505127,
+      "learning_rate": 0.0003632902097902097,
+      "loss": 3.2466,
+      "step": 67800
+    },
+    {
+      "epoch": 19.758343526122662,
+      "grad_norm": 0.3928729295730591,
+      "learning_rate": 0.0003631153846153846,
+      "loss": 3.2505,
+      "step": 67850
+    },
+    {
+      "epoch": 19.772904653736386,
+      "grad_norm": 0.3860240876674652,
+      "learning_rate": 0.00036294055944055943,
+      "loss": 3.2621,
+      "step": 67900
+    },
+    {
+      "epoch": 19.787465781350107,
+      "grad_norm": 0.41534480452537537,
+      "learning_rate": 0.00036276573426573423,
+      "loss": 3.2547,
+      "step": 67950
+    },
+    {
+      "epoch": 19.802026908963832,
+      "grad_norm": 0.3703750967979431,
+      "learning_rate": 0.0003625909090909091,
+      "loss": 3.2672,
+      "step": 68000
+    },
+    {
+      "epoch": 19.802026908963832,
+      "eval_accuracy": 0.3736522916596619,
+      "eval_loss": 3.534688949584961,
+      "eval_runtime": 179.6625,
+      "eval_samples_per_second": 92.651,
+      "eval_steps_per_second": 5.794,
+      "step": 68000
+    },
+    {
+      "epoch": 19.816588036577553,
+      "grad_norm": 0.43742990493774414,
+      "learning_rate": 0.0003624160839160839,
+      "loss": 3.2588,
+      "step": 68050
+    },
+    {
+      "epoch": 19.831149164191274,
+      "grad_norm": 0.38435590267181396,
+      "learning_rate": 0.00036224125874125874,
+      "loss": 3.2622,
+      "step": 68100
+    },
+    {
+      "epoch": 19.845710291805,
+      "grad_norm": 0.40424367785453796,
+      "learning_rate": 0.00036206643356643354,
+      "loss": 3.2583,
+      "step": 68150
+    },
+    {
+      "epoch": 19.86027141941872,
+      "grad_norm": 0.40375852584838867,
+      "learning_rate": 0.0003618916083916084,
+      "loss": 3.259,
+      "step": 68200
+    },
+    {
+      "epoch": 19.874832547032444,
+      "grad_norm": 0.4463939666748047,
+      "learning_rate": 0.0003617167832167832,
+      "loss": 3.2576,
+      "step": 68250
+    },
+    {
+      "epoch": 19.889393674646165,
+      "grad_norm": 0.4056410491466522,
+      "learning_rate": 0.00036154195804195805,
+      "loss": 3.2519,
+      "step": 68300
+    },
+    {
+      "epoch": 19.903954802259886,
+      "grad_norm": 0.37983453273773193,
+      "learning_rate": 0.0003613671328671328,
+      "loss": 3.2664,
+      "step": 68350
+    },
+    {
+      "epoch": 19.91851592987361,
+      "grad_norm": 0.38492605090141296,
+      "learning_rate": 0.00036119230769230765,
+      "loss": 3.256,
+      "step": 68400
+    },
+    {
+      "epoch": 19.93307705748733,
+      "grad_norm": 0.3811017870903015,
+      "learning_rate": 0.00036101748251748245,
+      "loss": 3.263,
+      "step": 68450
+    },
+    {
+      "epoch": 19.947638185101056,
+      "grad_norm": 0.38036519289016724,
+      "learning_rate": 0.0003608426573426573,
+      "loss": 3.267,
+      "step": 68500
+    },
+    {
+      "epoch": 19.962199312714777,
+      "grad_norm": 0.3989908695220947,
+      "learning_rate": 0.0003606678321678321,
+      "loss": 3.2585,
+      "step": 68550
+    },
+    {
+      "epoch": 19.976760440328498,
+      "grad_norm": 0.40518391132354736,
+      "learning_rate": 0.00036049300699300696,
+      "loss": 3.2629,
+      "step": 68600
+    },
+    {
+      "epoch": 19.991321567942222,
+      "grad_norm": 0.4135623276233673,
+      "learning_rate": 0.0003603181818181818,
+      "loss": 3.268,
+      "step": 68650
+    },
+    {
+      "epoch": 20.005824451045488,
+      "grad_norm": 0.41751164197921753,
+      "learning_rate": 0.0003601433566433566,
+      "loss": 3.2203,
+      "step": 68700
+    },
+    {
+      "epoch": 20.020385578659212,
+      "grad_norm": 0.3791915774345398,
+      "learning_rate": 0.00035996853146853146,
+      "loss": 3.1551,
+      "step": 68750
+    },
+    {
+      "epoch": 20.034946706272933,
+      "grad_norm": 0.4008273184299469,
+      "learning_rate": 0.00035979370629370626,
+      "loss": 3.1711,
+      "step": 68800
+    },
+    {
+      "epoch": 20.049507833886658,
+      "grad_norm": 0.3967386484146118,
+      "learning_rate": 0.0003596188811188811,
+      "loss": 3.1668,
+      "step": 68850
+    },
+    {
+      "epoch": 20.06406896150038,
+      "grad_norm": 0.40993747115135193,
+      "learning_rate": 0.0003594440559440559,
+      "loss": 3.1773,
+      "step": 68900
+    },
+    {
+      "epoch": 20.0786300891141,
+      "grad_norm": 0.42113885283470154,
+      "learning_rate": 0.00035926923076923077,
+      "loss": 3.1719,
+      "step": 68950
+    },
+    {
+      "epoch": 20.093191216727824,
+      "grad_norm": 0.41880732774734497,
+      "learning_rate": 0.00035909440559440557,
+      "loss": 3.178,
+      "step": 69000
+    },
+    {
+      "epoch": 20.093191216727824,
+      "eval_accuracy": 0.3726620618177107,
+      "eval_loss": 3.5489065647125244,
+      "eval_runtime": 179.7764,
+      "eval_samples_per_second": 92.593,
+      "eval_steps_per_second": 5.791,
+      "step": 69000
+    },
+    {
+      "epoch": 20.107752344341545,
+      "grad_norm": 0.44065818190574646,
+      "learning_rate": 0.0003589195804195804,
+      "loss": 3.1677,
+      "step": 69050
+    },
+    {
+      "epoch": 20.12231347195527,
+      "grad_norm": 0.4093584418296814,
+      "learning_rate": 0.00035874475524475517,
+      "loss": 3.1905,
+      "step": 69100
+    },
+    {
+      "epoch": 20.13687459956899,
+      "grad_norm": 0.4320305585861206,
+      "learning_rate": 0.00035856993006993,
+      "loss": 3.185,
+      "step": 69150
+    },
+    {
+      "epoch": 20.15143572718271,
+      "grad_norm": 0.4164067208766937,
+      "learning_rate": 0.0003583951048951048,
+      "loss": 3.1896,
+      "step": 69200
+    },
+    {
+      "epoch": 20.165996854796436,
+      "grad_norm": 0.3870382606983185,
+      "learning_rate": 0.0003582202797202797,
+      "loss": 3.1809,
+      "step": 69250
+    },
+    {
+      "epoch": 20.180557982410157,
+      "grad_norm": 0.3869068920612335,
+      "learning_rate": 0.00035804545454545453,
+      "loss": 3.1896,
+      "step": 69300
+    },
+    {
+      "epoch": 20.19511911002388,
+      "grad_norm": 0.3980240225791931,
+      "learning_rate": 0.00035787062937062933,
+      "loss": 3.2019,
+      "step": 69350
+    },
+    {
+      "epoch": 20.209680237637603,
+      "grad_norm": 0.3959798812866211,
+      "learning_rate": 0.0003576958041958042,
+      "loss": 3.1961,
+      "step": 69400
+    },
+    {
+      "epoch": 20.224241365251324,
+      "grad_norm": 0.40293267369270325,
+      "learning_rate": 0.000357520979020979,
+      "loss": 3.1988,
+      "step": 69450
+    },
+    {
+      "epoch": 20.238802492865048,
+      "grad_norm": 0.4127791225910187,
+      "learning_rate": 0.00035734615384615384,
+      "loss": 3.1983,
+      "step": 69500
+    },
+    {
+      "epoch": 20.25336362047877,
+      "grad_norm": 0.4141997992992401,
+      "learning_rate": 0.00035717132867132864,
+      "loss": 3.2069,
+      "step": 69550
+    },
+    {
+      "epoch": 20.267924748092494,
+      "grad_norm": 0.39670079946517944,
+      "learning_rate": 0.0003569965034965035,
+      "loss": 3.1993,
+      "step": 69600
+    },
+    {
+      "epoch": 20.282485875706215,
+      "grad_norm": 0.41046348214149475,
+      "learning_rate": 0.0003568216783216783,
+      "loss": 3.2115,
+      "step": 69650
+    },
+    {
+      "epoch": 20.297047003319935,
+      "grad_norm": 0.3998822271823883,
+      "learning_rate": 0.00035664685314685314,
+      "loss": 3.2161,
+      "step": 69700
+    },
+    {
+      "epoch": 20.31160813093366,
+      "grad_norm": 0.3757505714893341,
+      "learning_rate": 0.00035647202797202794,
+      "loss": 3.224,
+      "step": 69750
+    },
+    {
+      "epoch": 20.32616925854738,
+      "grad_norm": 0.42314520478248596,
+      "learning_rate": 0.0003562972027972028,
+      "loss": 3.2191,
+      "step": 69800
+    },
+    {
+      "epoch": 20.340730386161106,
+      "grad_norm": 0.372994065284729,
+      "learning_rate": 0.00035612237762237754,
+      "loss": 3.2216,
+      "step": 69850
+    },
+    {
+      "epoch": 20.355291513774826,
+      "grad_norm": 0.39920496940612793,
+      "learning_rate": 0.0003559475524475524,
+      "loss": 3.211,
+      "step": 69900
+    },
+    {
+      "epoch": 20.369852641388547,
+      "grad_norm": 0.40215423703193665,
+      "learning_rate": 0.0003557727272727272,
+      "loss": 3.205,
+      "step": 69950
+    },
+    {
+      "epoch": 20.384413769002272,
+      "grad_norm": 0.40181687474250793,
+      "learning_rate": 0.00035559790209790205,
+      "loss": 3.22,
+      "step": 70000
+    },
+    {
+      "epoch": 20.384413769002272,
+      "eval_accuracy": 0.37305248723681556,
+      "eval_loss": 3.5457870960235596,
+      "eval_runtime": 179.9649,
+      "eval_samples_per_second": 92.496,
+      "eval_steps_per_second": 5.784,
+      "step": 70000
+    },
+    {
+      "epoch": 20.398974896615993,
+      "grad_norm": 0.4172757863998413,
+      "learning_rate": 0.0003554230769230769,
+      "loss": 3.221,
+      "step": 70050
+    },
+    {
+      "epoch": 20.413536024229717,
+      "grad_norm": 0.3744981586933136,
+      "learning_rate": 0.0003552482517482517,
+      "loss": 3.2222,
+      "step": 70100
+    },
+    {
+      "epoch": 20.42809715184344,
+      "grad_norm": 0.45279887318611145,
+      "learning_rate": 0.00035507342657342656,
+      "loss": 3.2334,
+      "step": 70150
+    },
+    {
+      "epoch": 20.442658279457163,
+      "grad_norm": 0.41247794032096863,
+      "learning_rate": 0.00035489860139860136,
+      "loss": 3.2169,
+      "step": 70200
+    },
+    {
+      "epoch": 20.457219407070884,
+      "grad_norm": 0.381062388420105,
+      "learning_rate": 0.0003547237762237762,
+      "loss": 3.2149,
+      "step": 70250
+    },
+    {
+      "epoch": 20.471780534684605,
+      "grad_norm": 0.4296295642852783,
+      "learning_rate": 0.000354548951048951,
+      "loss": 3.23,
+      "step": 70300
+    },
+    {
+      "epoch": 20.48634166229833,
+      "grad_norm": 0.406954824924469,
+      "learning_rate": 0.00035437412587412587,
+      "loss": 3.2253,
+      "step": 70350
+    },
+    {
+      "epoch": 20.50090278991205,
+      "grad_norm": 0.4118635058403015,
+      "learning_rate": 0.00035419930069930067,
+      "loss": 3.2125,
+      "step": 70400
+    },
+    {
+      "epoch": 20.51546391752577,
+      "grad_norm": 0.44677120447158813,
+      "learning_rate": 0.0003540244755244755,
+      "loss": 3.2222,
+      "step": 70450
+    },
+    {
+      "epoch": 20.530025045139496,
+      "grad_norm": 0.38376492261886597,
+      "learning_rate": 0.0003538496503496503,
+      "loss": 3.2291,
+      "step": 70500
+    },
+    {
+      "epoch": 20.544586172753217,
+      "grad_norm": 0.3942374885082245,
+      "learning_rate": 0.0003536748251748252,
+      "loss": 3.2361,
+      "step": 70550
+    },
+    {
+      "epoch": 20.55914730036694,
+      "grad_norm": 0.39628666639328003,
+      "learning_rate": 0.0003534999999999999,
+      "loss": 3.2164,
+      "step": 70600
+    },
+    {
+      "epoch": 20.573708427980662,
+      "grad_norm": 0.4207809567451477,
+      "learning_rate": 0.00035332517482517477,
+      "loss": 3.2193,
+      "step": 70650
+    },
+    {
+      "epoch": 20.588269555594387,
+      "grad_norm": 0.40318813920021057,
+      "learning_rate": 0.0003531503496503496,
+      "loss": 3.2358,
+      "step": 70700
+    },
+    {
+      "epoch": 20.602830683208108,
+      "grad_norm": 0.39456906914711,
+      "learning_rate": 0.0003529755244755244,
+      "loss": 3.2453,
+      "step": 70750
+    },
+    {
+      "epoch": 20.61739181082183,
+      "grad_norm": 0.4417160749435425,
+      "learning_rate": 0.0003528006993006993,
+      "loss": 3.2363,
+      "step": 70800
+    },
+    {
+      "epoch": 20.631952938435553,
+      "grad_norm": 0.41225501894950867,
+      "learning_rate": 0.0003526258741258741,
+      "loss": 3.2298,
+      "step": 70850
+    },
+    {
+      "epoch": 20.646514066049274,
+      "grad_norm": 0.39469778537750244,
+      "learning_rate": 0.00035245104895104893,
+      "loss": 3.241,
+      "step": 70900
+    },
+    {
+      "epoch": 20.661075193663,
+      "grad_norm": 0.37514132261276245,
+      "learning_rate": 0.00035227622377622373,
+      "loss": 3.242,
+      "step": 70950
+    },
+    {
+      "epoch": 20.67563632127672,
+      "grad_norm": 0.4397062659263611,
+      "learning_rate": 0.0003521013986013986,
+      "loss": 3.2396,
+      "step": 71000
+    },
+    {
+      "epoch": 20.67563632127672,
+      "eval_accuracy": 0.3737319991074647,
+      "eval_loss": 3.5377063751220703,
+      "eval_runtime": 179.6736,
+      "eval_samples_per_second": 92.646,
+      "eval_steps_per_second": 5.794,
+      "step": 71000
+    },
+    {
+      "epoch": 20.69019744889044,
+      "grad_norm": 0.4205426573753357,
+      "learning_rate": 0.0003519265734265734,
+      "loss": 3.2439,
+      "step": 71050
+    },
+    {
+      "epoch": 20.704758576504165,
+      "grad_norm": 0.3931397497653961,
+      "learning_rate": 0.00035175174825174824,
+      "loss": 3.245,
+      "step": 71100
+    },
+    {
+      "epoch": 20.719319704117886,
+      "grad_norm": 0.41281893849372864,
+      "learning_rate": 0.00035157692307692304,
+      "loss": 3.2373,
+      "step": 71150
+    },
+    {
+      "epoch": 20.73388083173161,
+      "grad_norm": 0.38877737522125244,
+      "learning_rate": 0.0003514020979020979,
+      "loss": 3.2437,
+      "step": 71200
+    },
+    {
+      "epoch": 20.74844195934533,
+      "grad_norm": 0.3918205797672272,
+      "learning_rate": 0.0003512272727272727,
+      "loss": 3.2329,
+      "step": 71250
+    },
+    {
+      "epoch": 20.763003086959053,
+      "grad_norm": 0.4087960720062256,
+      "learning_rate": 0.00035105244755244755,
+      "loss": 3.2433,
+      "step": 71300
+    },
+    {
+      "epoch": 20.777564214572777,
+      "grad_norm": 0.37705087661743164,
+      "learning_rate": 0.0003508776223776223,
+      "loss": 3.2449,
+      "step": 71350
+    },
+    {
+      "epoch": 20.792125342186498,
+      "grad_norm": 0.4165264666080475,
+      "learning_rate": 0.00035070279720279715,
+      "loss": 3.2455,
+      "step": 71400
+    },
+    {
+      "epoch": 20.806686469800223,
+      "grad_norm": 0.3990912437438965,
+      "learning_rate": 0.000350527972027972,
+      "loss": 3.252,
+      "step": 71450
+    },
+    {
+      "epoch": 20.821247597413944,
+      "grad_norm": 0.4242381453514099,
+      "learning_rate": 0.0003503531468531468,
+      "loss": 3.2341,
+      "step": 71500
+    },
+    {
+      "epoch": 20.835808725027665,
+      "grad_norm": 0.39090588688850403,
+      "learning_rate": 0.00035017832167832166,
+      "loss": 3.2474,
+      "step": 71550
+    },
+    {
+      "epoch": 20.85036985264139,
+      "grad_norm": 0.38443315029144287,
+      "learning_rate": 0.00035000349650349645,
+      "loss": 3.2553,
+      "step": 71600
+    },
+    {
+      "epoch": 20.86493098025511,
+      "grad_norm": 0.3968809247016907,
+      "learning_rate": 0.0003498286713286713,
+      "loss": 3.2484,
+      "step": 71650
+    },
+    {
+      "epoch": 20.879492107868835,
+      "grad_norm": 0.4066787660121918,
+      "learning_rate": 0.0003496538461538461,
+      "loss": 3.2533,
+      "step": 71700
+    },
+    {
+      "epoch": 20.894053235482556,
+      "grad_norm": 0.4234001040458679,
+      "learning_rate": 0.00034947902097902096,
+      "loss": 3.2429,
+      "step": 71750
+    },
+    {
+      "epoch": 20.908614363096277,
+      "grad_norm": 0.37977680563926697,
+      "learning_rate": 0.00034930419580419576,
+      "loss": 3.2499,
+      "step": 71800
+    },
+    {
+      "epoch": 20.92317549071,
+      "grad_norm": 0.435090571641922,
+      "learning_rate": 0.0003491293706293706,
+      "loss": 3.2546,
+      "step": 71850
+    },
+    {
+      "epoch": 20.937736618323722,
+      "grad_norm": 0.41951361298561096,
+      "learning_rate": 0.0003489545454545454,
+      "loss": 3.2535,
+      "step": 71900
+    },
+    {
+      "epoch": 20.952297745937447,
+      "grad_norm": 0.40699416399002075,
+      "learning_rate": 0.00034877972027972027,
+      "loss": 3.253,
+      "step": 71950
+    },
+    {
+      "epoch": 20.966858873551168,
+      "grad_norm": 0.43681085109710693,
+      "learning_rate": 0.00034860489510489507,
+      "loss": 3.2587,
+      "step": 72000
+    },
+    {
+      "epoch": 20.966858873551168,
+      "eval_accuracy": 0.37395536806148433,
+      "eval_loss": 3.530937910079956,
+      "eval_runtime": 179.6973,
+      "eval_samples_per_second": 92.634,
+      "eval_steps_per_second": 5.793,
+      "step": 72000
+    },
+    {
+      "epoch": 20.98142000116489,
+      "grad_norm": 0.40609315037727356,
+      "learning_rate": 0.0003484300699300699,
+      "loss": 3.2681,
+      "step": 72050
+    },
+    {
+      "epoch": 20.995981128778613,
+      "grad_norm": 0.3836963176727295,
+      "learning_rate": 0.0003482552447552448,
+      "loss": 3.2512,
+      "step": 72100
+    },
+    {
+      "epoch": 21.01048401188188,
+      "grad_norm": 0.42151519656181335,
+      "learning_rate": 0.0003480804195804195,
+      "loss": 3.1914,
+      "step": 72150
+    },
+    {
+      "epoch": 21.025045139495603,
+      "grad_norm": 0.39965352416038513,
+      "learning_rate": 0.0003479055944055944,
+      "loss": 3.1574,
+      "step": 72200
+    },
+    {
+      "epoch": 21.039606267109324,
+      "grad_norm": 0.39035817980766296,
+      "learning_rate": 0.0003477307692307692,
+      "loss": 3.1594,
+      "step": 72250
+    },
+    {
+      "epoch": 21.05416739472305,
+      "grad_norm": 0.43687039613723755,
+      "learning_rate": 0.00034755594405594403,
+      "loss": 3.1593,
+      "step": 72300
+    },
+    {
+      "epoch": 21.06872852233677,
+      "grad_norm": 0.43884846568107605,
+      "learning_rate": 0.00034738111888111883,
+      "loss": 3.1601,
+      "step": 72350
+    },
+    {
+      "epoch": 21.08328964995049,
+      "grad_norm": 0.4526152014732361,
+      "learning_rate": 0.0003472062937062937,
+      "loss": 3.1574,
+      "step": 72400
+    },
+    {
+      "epoch": 21.097850777564215,
+      "grad_norm": 0.4564531445503235,
+      "learning_rate": 0.0003470314685314685,
+      "loss": 3.1738,
+      "step": 72450
+    },
+    {
+      "epoch": 21.112411905177936,
+      "grad_norm": 0.3975127339363098,
+      "learning_rate": 0.00034685664335664334,
+      "loss": 3.1673,
+      "step": 72500
+    },
+    {
+      "epoch": 21.12697303279166,
+      "grad_norm": 0.4158114194869995,
+      "learning_rate": 0.00034668181818181814,
+      "loss": 3.1641,
+      "step": 72550
+    },
+    {
+      "epoch": 21.14153416040538,
+      "grad_norm": 0.40853938460350037,
+      "learning_rate": 0.000346506993006993,
+      "loss": 3.1721,
+      "step": 72600
+    },
+    {
+      "epoch": 21.156095288019102,
+      "grad_norm": 0.4014092981815338,
+      "learning_rate": 0.0003463321678321678,
+      "loss": 3.1741,
+      "step": 72650
+    },
+    {
+      "epoch": 21.170656415632827,
+      "grad_norm": 0.4352574944496155,
+      "learning_rate": 0.00034615734265734264,
+      "loss": 3.1833,
+      "step": 72700
+    },
+    {
+      "epoch": 21.185217543246548,
+      "grad_norm": 0.3742693066596985,
+      "learning_rate": 0.0003459825174825175,
+      "loss": 3.1811,
+      "step": 72750
+    },
+    {
+      "epoch": 21.199778670860272,
+      "grad_norm": 0.38294392824172974,
+      "learning_rate": 0.0003458076923076923,
+      "loss": 3.2017,
+      "step": 72800
+    },
+    {
+      "epoch": 21.214339798473993,
+      "grad_norm": 0.4095962643623352,
+      "learning_rate": 0.00034563286713286715,
+      "loss": 3.1751,
+      "step": 72850
+    },
+    {
+      "epoch": 21.228900926087718,
+      "grad_norm": 0.4042835831642151,
+      "learning_rate": 0.0003454580419580419,
+      "loss": 3.1879,
+      "step": 72900
+    },
+    {
+      "epoch": 21.24346205370144,
+      "grad_norm": 0.3950826823711395,
+      "learning_rate": 0.00034528321678321675,
+      "loss": 3.1906,
+      "step": 72950
+    },
+    {
+      "epoch": 21.25802318131516,
+      "grad_norm": 0.387794554233551,
+      "learning_rate": 0.00034510839160839155,
+      "loss": 3.1993,
+      "step": 73000
+    },
+    {
+      "epoch": 21.25802318131516,
+      "eval_accuracy": 0.37353320073838725,
+      "eval_loss": 3.5477094650268555,
+      "eval_runtime": 180.0332,
+      "eval_samples_per_second": 92.461,
+      "eval_steps_per_second": 5.782,
+      "step": 73000
+    },
+    {
+      "epoch": 21.272584308928884,
+      "grad_norm": 0.40166184306144714,
+      "learning_rate": 0.0003449335664335664,
+      "loss": 3.1922,
+      "step": 73050
+    },
+    {
+      "epoch": 21.287145436542605,
+      "grad_norm": 0.40153566002845764,
+      "learning_rate": 0.0003447587412587412,
+      "loss": 3.1882,
+      "step": 73100
+    },
+    {
+      "epoch": 21.30170656415633,
+      "grad_norm": 0.39476677775382996,
+      "learning_rate": 0.00034458391608391606,
+      "loss": 3.2036,
+      "step": 73150
+    },
+    {
+      "epoch": 21.31626769177005,
+      "grad_norm": 0.4329933524131775,
+      "learning_rate": 0.00034440909090909086,
+      "loss": 3.1981,
+      "step": 73200
+    },
+    {
+      "epoch": 21.330828819383772,
+      "grad_norm": 0.4002084732055664,
+      "learning_rate": 0.0003442342657342657,
+      "loss": 3.2047,
+      "step": 73250
+    },
+    {
+      "epoch": 21.345389946997496,
+      "grad_norm": 0.42490726709365845,
+      "learning_rate": 0.0003440594405594405,
+      "loss": 3.1977,
+      "step": 73300
+    },
+    {
+      "epoch": 21.359951074611217,
+      "grad_norm": 0.3994070887565613,
+      "learning_rate": 0.00034388461538461537,
+      "loss": 3.2103,
+      "step": 73350
+    },
+    {
+      "epoch": 21.374512202224942,
+      "grad_norm": 0.399056613445282,
+      "learning_rate": 0.00034370979020979017,
+      "loss": 3.2099,
+      "step": 73400
+    },
+    {
+      "epoch": 21.389073329838663,
+      "grad_norm": 0.39403849840164185,
+      "learning_rate": 0.000343534965034965,
+      "loss": 3.2125,
+      "step": 73450
+    },
+    {
+      "epoch": 21.403634457452384,
+      "grad_norm": 0.4174330532550812,
+      "learning_rate": 0.0003433601398601399,
+      "loss": 3.2097,
+      "step": 73500
+    },
+    {
+      "epoch": 21.41819558506611,
+      "grad_norm": 0.4052690863609314,
+      "learning_rate": 0.0003431853146853147,
+      "loss": 3.2114,
+      "step": 73550
+    },
+    {
+      "epoch": 21.43275671267983,
+      "grad_norm": 0.3890800178050995,
+      "learning_rate": 0.0003430104895104895,
+      "loss": 3.2104,
+      "step": 73600
+    },
+    {
+      "epoch": 21.447317840293554,
+      "grad_norm": 0.4229435324668884,
+      "learning_rate": 0.00034283566433566427,
+      "loss": 3.2104,
+      "step": 73650
+    },
+    {
+      "epoch": 21.461878967907275,
+      "grad_norm": 0.39021503925323486,
+      "learning_rate": 0.0003426608391608391,
+      "loss": 3.2126,
+      "step": 73700
+    },
+    {
+      "epoch": 21.476440095520996,
+      "grad_norm": 0.38516533374786377,
+      "learning_rate": 0.0003424860139860139,
+      "loss": 3.2203,
+      "step": 73750
+    },
+    {
+      "epoch": 21.49100122313472,
+      "grad_norm": 0.39541691541671753,
+      "learning_rate": 0.0003423111888111888,
+      "loss": 3.2282,
+      "step": 73800
+    },
+    {
+      "epoch": 21.50556235074844,
+      "grad_norm": 0.3969518840312958,
+      "learning_rate": 0.0003421363636363636,
+      "loss": 3.2145,
+      "step": 73850
+    },
+    {
+      "epoch": 21.520123478362166,
+      "grad_norm": 0.4029087722301483,
+      "learning_rate": 0.00034196153846153843,
+      "loss": 3.229,
+      "step": 73900
+    },
+    {
+      "epoch": 21.534684605975887,
+      "grad_norm": 0.4271200895309448,
+      "learning_rate": 0.00034178671328671323,
+      "loss": 3.2216,
+      "step": 73950
+    },
+    {
+      "epoch": 21.549245733589608,
+      "grad_norm": 0.4122142493724823,
+      "learning_rate": 0.0003416118881118881,
+      "loss": 3.2144,
+      "step": 74000
+    },
+    {
+      "epoch": 21.549245733589608,
+      "eval_accuracy": 0.37348217856678484,
+      "eval_loss": 3.5398857593536377,
+      "eval_runtime": 179.8183,
+      "eval_samples_per_second": 92.571,
+      "eval_steps_per_second": 5.789,
+      "step": 74000
+    },
+    {
+      "epoch": 21.563806861203332,
+      "grad_norm": 0.37525999546051025,
+      "learning_rate": 0.0003414370629370629,
+      "loss": 3.2275,
+      "step": 74050
+    },
+    {
+      "epoch": 21.578367988817053,
+      "grad_norm": 0.42617496848106384,
+      "learning_rate": 0.00034126223776223774,
+      "loss": 3.2097,
+      "step": 74100
+    },
+    {
+      "epoch": 21.592929116430778,
+      "grad_norm": 0.41545069217681885,
+      "learning_rate": 0.0003410874125874126,
+      "loss": 3.2252,
+      "step": 74150
+    },
+    {
+      "epoch": 21.6074902440445,
+      "grad_norm": 0.3902042508125305,
+      "learning_rate": 0.0003409125874125874,
+      "loss": 3.2274,
+      "step": 74200
+    },
+    {
+      "epoch": 21.62205137165822,
+      "grad_norm": 0.3986893892288208,
+      "learning_rate": 0.00034073776223776225,
+      "loss": 3.2347,
+      "step": 74250
+    },
+    {
+      "epoch": 21.636612499271944,
+      "grad_norm": 0.3976801335811615,
+      "learning_rate": 0.00034056293706293705,
+      "loss": 3.2224,
+      "step": 74300
+    },
+    {
+      "epoch": 21.651173626885665,
+      "grad_norm": 0.41786202788352966,
+      "learning_rate": 0.0003403881118881119,
+      "loss": 3.2307,
+      "step": 74350
+    },
+    {
+      "epoch": 21.66573475449939,
+      "grad_norm": 0.43465039134025574,
+      "learning_rate": 0.00034021328671328665,
+      "loss": 3.2236,
+      "step": 74400
+    },
+    {
+      "epoch": 21.68029588211311,
+      "grad_norm": 0.3973064720630646,
+      "learning_rate": 0.0003400384615384615,
+      "loss": 3.2318,
+      "step": 74450
+    },
+    {
+      "epoch": 21.69485700972683,
+      "grad_norm": 0.4066312909126282,
+      "learning_rate": 0.0003398636363636363,
+      "loss": 3.2169,
+      "step": 74500
+    },
+    {
+      "epoch": 21.709418137340556,
+      "grad_norm": 0.41648659110069275,
+      "learning_rate": 0.00033968881118881115,
+      "loss": 3.2215,
+      "step": 74550
+    },
+    {
+      "epoch": 21.723979264954277,
+      "grad_norm": 0.40081432461738586,
+      "learning_rate": 0.00033951398601398595,
+      "loss": 3.2344,
+      "step": 74600
+    },
+    {
+      "epoch": 21.738540392568,
+      "grad_norm": 0.3875221610069275,
+      "learning_rate": 0.0003393391608391608,
+      "loss": 3.2356,
+      "step": 74650
+    },
+    {
+      "epoch": 21.753101520181723,
+      "grad_norm": 0.38530296087265015,
+      "learning_rate": 0.0003391643356643356,
+      "loss": 3.23,
+      "step": 74700
+    },
+    {
+      "epoch": 21.767662647795444,
+      "grad_norm": 0.39854732155799866,
+      "learning_rate": 0.00033898951048951046,
+      "loss": 3.2332,
+      "step": 74750
+    },
+    {
+      "epoch": 21.782223775409168,
+      "grad_norm": 0.4148358404636383,
+      "learning_rate": 0.00033881468531468526,
+      "loss": 3.2353,
+      "step": 74800
+    },
+    {
+      "epoch": 21.79678490302289,
+      "grad_norm": 0.3934561014175415,
+      "learning_rate": 0.0003386398601398601,
+      "loss": 3.2407,
+      "step": 74850
+    },
+    {
+      "epoch": 21.811346030636614,
+      "grad_norm": 0.414039671421051,
+      "learning_rate": 0.00033846503496503497,
+      "loss": 3.2336,
+      "step": 74900
+    },
+    {
+      "epoch": 21.825907158250335,
+      "grad_norm": 0.39278024435043335,
+      "learning_rate": 0.00033829020979020977,
+      "loss": 3.2404,
+      "step": 74950
+    },
+    {
+      "epoch": 21.840468285864056,
+      "grad_norm": 0.42450013756752014,
+      "learning_rate": 0.0003381153846153846,
+      "loss": 3.2363,
+      "step": 75000
+    },
+    {
+      "epoch": 21.840468285864056,
+      "eval_accuracy": 0.3742865419264702,
+      "eval_loss": 3.531919479370117,
+      "eval_runtime": 179.9028,
+      "eval_samples_per_second": 92.528,
+      "eval_steps_per_second": 5.786,
+      "step": 75000
+    },
+    {
+      "epoch": 21.85502941347778,
+      "grad_norm": 0.3962397873401642,
+      "learning_rate": 0.0003379405594405594,
+      "loss": 3.2311,
+      "step": 75050
+    },
+    {
+      "epoch": 21.8695905410915,
+      "grad_norm": 0.4015287756919861,
+      "learning_rate": 0.0003377657342657343,
+      "loss": 3.2483,
+      "step": 75100
+    },
+    {
+      "epoch": 21.884151668705226,
+      "grad_norm": 0.4102756977081299,
+      "learning_rate": 0.000337590909090909,
+      "loss": 3.2399,
+      "step": 75150
+    },
+    {
+      "epoch": 21.898712796318947,
+      "grad_norm": 0.4179084599018097,
+      "learning_rate": 0.0003374160839160839,
+      "loss": 3.2523,
+      "step": 75200
+    },
+    {
+      "epoch": 21.91327392393267,
+      "grad_norm": 0.42700445652008057,
+      "learning_rate": 0.0003372412587412587,
+      "loss": 3.2424,
+      "step": 75250
+    },
+    {
+      "epoch": 21.927835051546392,
+      "grad_norm": 0.4214681386947632,
+      "learning_rate": 0.00033706643356643353,
+      "loss": 3.2407,
+      "step": 75300
+    },
+    {
+      "epoch": 21.942396179160113,
+      "grad_norm": 0.43577635288238525,
+      "learning_rate": 0.00033689160839160833,
+      "loss": 3.2301,
+      "step": 75350
+    },
+    {
+      "epoch": 21.956957306773838,
+      "grad_norm": 0.4034266173839569,
+      "learning_rate": 0.0003367167832167832,
+      "loss": 3.2496,
+      "step": 75400
+    },
+    {
+      "epoch": 21.97151843438756,
+      "grad_norm": 0.40454426407814026,
+      "learning_rate": 0.000336541958041958,
+      "loss": 3.2426,
+      "step": 75450
+    },
+    {
+      "epoch": 21.986079562001283,
+      "grad_norm": 0.4080248475074768,
+      "learning_rate": 0.00033636713286713284,
+      "loss": 3.2446,
+      "step": 75500
+    },
+    {
+      "epoch": 22.00058244510455,
+      "grad_norm": 0.4675055742263794,
+      "learning_rate": 0.0003361923076923077,
+      "loss": 3.2306,
+      "step": 75550
+    },
+    {
+      "epoch": 22.015143572718273,
+      "grad_norm": 0.43351686000823975,
+      "learning_rate": 0.0003360174825174825,
+      "loss": 3.1425,
+      "step": 75600
+    },
+    {
+      "epoch": 22.029704700331994,
+      "grad_norm": 0.4114014506340027,
+      "learning_rate": 0.00033584265734265734,
+      "loss": 3.1431,
+      "step": 75650
+    },
+    {
+      "epoch": 22.044265827945715,
+      "grad_norm": 0.40984776616096497,
+      "learning_rate": 0.00033566783216783214,
+      "loss": 3.1486,
+      "step": 75700
+    },
+    {
+      "epoch": 22.05882695555944,
+      "grad_norm": 0.3943866193294525,
+      "learning_rate": 0.000335493006993007,
+      "loss": 3.1488,
+      "step": 75750
+    },
+    {
+      "epoch": 22.07338808317316,
+      "grad_norm": 0.44786471128463745,
+      "learning_rate": 0.0003353181818181818,
+      "loss": 3.1494,
+      "step": 75800
+    },
+    {
+      "epoch": 22.087949210786885,
+      "grad_norm": 0.42942312359809875,
+      "learning_rate": 0.00033514335664335665,
+      "loss": 3.1642,
+      "step": 75850
+    },
+    {
+      "epoch": 22.102510338400606,
+      "grad_norm": 0.40645474195480347,
+      "learning_rate": 0.0003349685314685314,
+      "loss": 3.1632,
+      "step": 75900
+    },
+    {
+      "epoch": 22.117071466014327,
+      "grad_norm": 0.41174495220184326,
+      "learning_rate": 0.00033479370629370625,
+      "loss": 3.1626,
+      "step": 75950
+    },
+    {
+      "epoch": 22.13163259362805,
+      "grad_norm": 0.41596174240112305,
+      "learning_rate": 0.00033461888111888105,
+      "loss": 3.1602,
+      "step": 76000
+    },
+    {
+      "epoch": 22.13163259362805,
+      "eval_accuracy": 0.37349816708138833,
+      "eval_loss": 3.5504817962646484,
+      "eval_runtime": 203.6866,
+      "eval_samples_per_second": 81.724,
+      "eval_steps_per_second": 5.111,
+      "step": 76000
+    },
+    {
+      "epoch": 22.146193721241772,
+      "grad_norm": 0.4284675419330597,
+      "learning_rate": 0.0003344440559440559,
+      "loss": 3.1731,
+      "step": 76050
+    },
+    {
+      "epoch": 22.160754848855497,
+      "grad_norm": 0.39755699038505554,
+      "learning_rate": 0.0003342692307692307,
+      "loss": 3.1743,
+      "step": 76100
+    },
+    {
+      "epoch": 22.175315976469218,
+      "grad_norm": 0.41730695962905884,
+      "learning_rate": 0.00033409440559440556,
+      "loss": 3.1718,
+      "step": 76150
+    },
+    {
+      "epoch": 22.18987710408294,
+      "grad_norm": 0.42204800248146057,
+      "learning_rate": 0.00033391958041958036,
+      "loss": 3.1784,
+      "step": 76200
+    },
+    {
+      "epoch": 22.204438231696663,
+      "grad_norm": 0.42969995737075806,
+      "learning_rate": 0.0003337447552447552,
+      "loss": 3.176,
+      "step": 76250
+    },
+    {
+      "epoch": 22.218999359310384,
+      "grad_norm": 0.39928796887397766,
+      "learning_rate": 0.00033356993006993007,
+      "loss": 3.1751,
+      "step": 76300
+    },
+    {
+      "epoch": 22.23356048692411,
+      "grad_norm": 0.44051143527030945,
+      "learning_rate": 0.00033339510489510487,
+      "loss": 3.1718,
+      "step": 76350
+    },
+    {
+      "epoch": 22.24812161453783,
+      "grad_norm": 0.4043222963809967,
+      "learning_rate": 0.0003332202797202797,
+      "loss": 3.1673,
+      "step": 76400
+    },
+    {
+      "epoch": 22.26268274215155,
+      "grad_norm": 0.4258318841457367,
+      "learning_rate": 0.0003330454545454545,
+      "loss": 3.1765,
+      "step": 76450
+    },
+    {
+      "epoch": 22.277243869765275,
+      "grad_norm": 0.41968485713005066,
+      "learning_rate": 0.0003328706293706294,
+      "loss": 3.187,
+      "step": 76500
+    },
+    {
+      "epoch": 22.291804997378996,
+      "grad_norm": 0.41846907138824463,
+      "learning_rate": 0.00033269580419580417,
+      "loss": 3.1817,
+      "step": 76550
+    },
+    {
+      "epoch": 22.30636612499272,
+      "grad_norm": 0.42832231521606445,
+      "learning_rate": 0.000332520979020979,
+      "loss": 3.1935,
+      "step": 76600
+    },
+    {
+      "epoch": 22.32092725260644,
+      "grad_norm": 0.4351508319377899,
+      "learning_rate": 0.00033234615384615377,
+      "loss": 3.1889,
+      "step": 76650
+    },
+    {
+      "epoch": 22.335488380220163,
+      "grad_norm": 0.45581719279289246,
+      "learning_rate": 0.0003321713286713286,
+      "loss": 3.1928,
+      "step": 76700
+    },
+    {
+      "epoch": 22.350049507833887,
+      "grad_norm": 0.40861231088638306,
+      "learning_rate": 0.0003319965034965034,
+      "loss": 3.1976,
+      "step": 76750
+    },
+    {
+      "epoch": 22.364610635447608,
+      "grad_norm": 0.45059776306152344,
+      "learning_rate": 0.0003318216783216783,
+      "loss": 3.2065,
+      "step": 76800
+    },
+    {
+      "epoch": 22.379171763061333,
+      "grad_norm": 0.41020745038986206,
+      "learning_rate": 0.0003316468531468531,
+      "loss": 3.1938,
+      "step": 76850
+    },
+    {
+      "epoch": 22.393732890675054,
+      "grad_norm": 0.43098288774490356,
+      "learning_rate": 0.00033147202797202793,
+      "loss": 3.199,
+      "step": 76900
+    },
+    {
+      "epoch": 22.408294018288775,
+      "grad_norm": 0.4355289340019226,
+      "learning_rate": 0.0003312972027972028,
+      "loss": 3.2009,
+      "step": 76950
+    },
+    {
+      "epoch": 22.4228551459025,
+      "grad_norm": 0.4274457097053528,
+      "learning_rate": 0.0003311223776223776,
+      "loss": 3.1957,
+      "step": 77000
+    },
+    {
+      "epoch": 22.4228551459025,
+      "eval_accuracy": 0.37429806306199337,
+      "eval_loss": 3.5408267974853516,
+      "eval_runtime": 209.8461,
+      "eval_samples_per_second": 79.325,
+      "eval_steps_per_second": 4.961,
+      "step": 77000
+    },
+    {
+      "epoch": 22.43741627351622,
+      "grad_norm": 0.4628225266933441,
+      "learning_rate": 0.00033094755244755244,
+      "loss": 3.2019,
+      "step": 77050
+    },
+    {
+      "epoch": 22.451977401129945,
+      "grad_norm": 0.40951475501060486,
+      "learning_rate": 0.00033077272727272724,
+      "loss": 3.2059,
+      "step": 77100
+    },
+    {
+      "epoch": 22.466538528743666,
+      "grad_norm": 0.4228519797325134,
+      "learning_rate": 0.0003305979020979021,
+      "loss": 3.196,
+      "step": 77150
+    },
+    {
+      "epoch": 22.481099656357387,
+      "grad_norm": 0.39467620849609375,
+      "learning_rate": 0.0003304230769230769,
+      "loss": 3.2024,
+      "step": 77200
+    },
+    {
+      "epoch": 22.49566078397111,
+      "grad_norm": 0.41270583868026733,
+      "learning_rate": 0.00033024825174825175,
+      "loss": 3.1946,
+      "step": 77250
+    },
+    {
+      "epoch": 22.510221911584832,
+      "grad_norm": 0.44770345091819763,
+      "learning_rate": 0.00033007342657342655,
+      "loss": 3.1961,
+      "step": 77300
+    },
+    {
+      "epoch": 22.524783039198557,
+      "grad_norm": 0.41133832931518555,
+      "learning_rate": 0.0003298986013986014,
+      "loss": 3.2093,
+      "step": 77350
+    },
+    {
+      "epoch": 22.539344166812278,
+      "grad_norm": 0.40371203422546387,
+      "learning_rate": 0.00032972377622377615,
+      "loss": 3.217,
+      "step": 77400
+    },
+    {
+      "epoch": 22.553905294426002,
+      "grad_norm": 0.4077425003051758,
+      "learning_rate": 0.000329548951048951,
+      "loss": 3.2111,
+      "step": 77450
+    },
+    {
+      "epoch": 22.568466422039723,
+      "grad_norm": 0.42343926429748535,
+      "learning_rate": 0.0003293741258741258,
+      "loss": 3.2109,
+      "step": 77500
+    },
+    {
+      "epoch": 22.583027549653444,
+      "grad_norm": 0.4148765802383423,
+      "learning_rate": 0.00032919930069930065,
+      "loss": 3.2178,
+      "step": 77550
+    },
+    {
+      "epoch": 22.59758867726717,
+      "grad_norm": 0.4103222191333771,
+      "learning_rate": 0.0003290244755244755,
+      "loss": 3.2097,
+      "step": 77600
+    },
+    {
+      "epoch": 22.61214980488089,
+      "grad_norm": 0.44261687994003296,
+      "learning_rate": 0.0003288496503496503,
+      "loss": 3.2068,
+      "step": 77650
+    },
+    {
+      "epoch": 22.626710932494614,
+      "grad_norm": 0.41240397095680237,
+      "learning_rate": 0.00032867482517482516,
+      "loss": 3.2092,
+      "step": 77700
+    },
+    {
+      "epoch": 22.641272060108335,
+      "grad_norm": 0.40966787934303284,
+      "learning_rate": 0.00032849999999999996,
+      "loss": 3.2144,
+      "step": 77750
+    },
+    {
+      "epoch": 22.655833187722056,
+      "grad_norm": 0.39857950806617737,
+      "learning_rate": 0.0003283251748251748,
+      "loss": 3.2212,
+      "step": 77800
+    },
+    {
+      "epoch": 22.67039431533578,
+      "grad_norm": 0.4325350522994995,
+      "learning_rate": 0.0003281503496503496,
+      "loss": 3.2199,
+      "step": 77850
+    },
+    {
+      "epoch": 22.6849554429495,
+      "grad_norm": 0.4444861114025116,
+      "learning_rate": 0.00032797552447552447,
+      "loss": 3.2181,
+      "step": 77900
+    },
+    {
+      "epoch": 22.699516570563226,
+      "grad_norm": 0.39821311831474304,
+      "learning_rate": 0.00032780069930069927,
+      "loss": 3.2296,
+      "step": 77950
+    },
+    {
+      "epoch": 22.714077698176947,
+      "grad_norm": 0.4173693060874939,
+      "learning_rate": 0.0003276258741258741,
+      "loss": 3.2258,
+      "step": 78000
+    },
+    {
+      "epoch": 22.714077698176947,
+      "eval_accuracy": 0.3743928185235406,
+      "eval_loss": 3.53715443611145,
+      "eval_runtime": 179.68,
+      "eval_samples_per_second": 92.642,
+      "eval_steps_per_second": 5.794,
+      "step": 78000
+    },
+    {
+      "epoch": 22.728638825790668,
+      "grad_norm": 0.4090815782546997,
+      "learning_rate": 0.0003274510489510489,
+      "loss": 3.225,
+      "step": 78050
+    },
+    {
+      "epoch": 22.743199953404392,
+      "grad_norm": 0.3890690803527832,
+      "learning_rate": 0.0003272762237762238,
+      "loss": 3.2165,
+      "step": 78100
+    },
+    {
+      "epoch": 22.757761081018113,
+      "grad_norm": 0.4822121858596802,
+      "learning_rate": 0.0003271013986013985,
+      "loss": 3.2231,
+      "step": 78150
+    },
+    {
+      "epoch": 22.772322208631838,
+      "grad_norm": 0.4495407044887543,
+      "learning_rate": 0.0003269265734265734,
+      "loss": 3.2311,
+      "step": 78200
+    },
+    {
+      "epoch": 22.78688333624556,
+      "grad_norm": 0.42291054129600525,
+      "learning_rate": 0.0003267517482517482,
+      "loss": 3.2286,
+      "step": 78250
+    },
+    {
+      "epoch": 22.80144446385928,
+      "grad_norm": 0.43875908851623535,
+      "learning_rate": 0.00032657692307692303,
+      "loss": 3.2133,
+      "step": 78300
+    },
+    {
+      "epoch": 22.816005591473004,
+      "grad_norm": 0.39126718044281006,
+      "learning_rate": 0.0003264020979020979,
+      "loss": 3.2198,
+      "step": 78350
+    },
+    {
+      "epoch": 22.830566719086725,
+      "grad_norm": 0.39724770188331604,
+      "learning_rate": 0.0003262272727272727,
+      "loss": 3.2341,
+      "step": 78400
+    },
+    {
+      "epoch": 22.84512784670045,
+      "grad_norm": 0.40128400921821594,
+      "learning_rate": 0.00032605244755244754,
+      "loss": 3.2236,
+      "step": 78450
+    },
+    {
+      "epoch": 22.85968897431417,
+      "grad_norm": 0.42113637924194336,
+      "learning_rate": 0.00032587762237762234,
+      "loss": 3.2344,
+      "step": 78500
+    },
+    {
+      "epoch": 22.874250101927892,
+      "grad_norm": 0.3953815698623657,
+      "learning_rate": 0.0003257027972027972,
+      "loss": 3.2346,
+      "step": 78550
+    },
+    {
+      "epoch": 22.888811229541616,
+      "grad_norm": 0.40962186455726624,
+      "learning_rate": 0.000325527972027972,
+      "loss": 3.2355,
+      "step": 78600
+    },
+    {
+      "epoch": 22.903372357155337,
+      "grad_norm": 0.41248080134391785,
+      "learning_rate": 0.00032535314685314684,
+      "loss": 3.2279,
+      "step": 78650
+    },
+    {
+      "epoch": 22.917933484769062,
+      "grad_norm": 0.3938314914703369,
+      "learning_rate": 0.00032517832167832164,
+      "loss": 3.2427,
+      "step": 78700
+    },
+    {
+      "epoch": 22.932494612382783,
+      "grad_norm": 0.4408927857875824,
+      "learning_rate": 0.0003250034965034965,
+      "loss": 3.2283,
+      "step": 78750
+    },
+    {
+      "epoch": 22.947055739996504,
+      "grad_norm": 0.4145766496658325,
+      "learning_rate": 0.0003248286713286713,
+      "loss": 3.2249,
+      "step": 78800
+    },
+    {
+      "epoch": 22.96161686761023,
+      "grad_norm": 0.39652279019355774,
+      "learning_rate": 0.00032465384615384615,
+      "loss": 3.2319,
+      "step": 78850
+    },
+    {
+      "epoch": 22.97617799522395,
+      "grad_norm": 0.39880838990211487,
+      "learning_rate": 0.0003244790209790209,
+      "loss": 3.2527,
+      "step": 78900
+    },
+    {
+      "epoch": 22.990739122837674,
+      "grad_norm": 0.42180368304252625,
+      "learning_rate": 0.00032430419580419575,
+      "loss": 3.258,
+      "step": 78950
+    },
+    {
+      "epoch": 23.00524200594094,
+      "grad_norm": 0.45832839608192444,
+      "learning_rate": 0.00032412937062937066,
+      "loss": 3.1934,
+      "step": 79000
+    },
+    {
+      "epoch": 23.00524200594094,
+      "eval_accuracy": 0.37386425704076576,
+      "eval_loss": 3.5435686111450195,
+      "eval_runtime": 204.0982,
+      "eval_samples_per_second": 81.559,
+      "eval_steps_per_second": 5.1,
+      "step": 79000
+    },
+    {
+      "epoch": 23.019803133554664,
+      "grad_norm": 0.43102172017097473,
+      "learning_rate": 0.0003239545454545454,
+      "loss": 3.1394,
+      "step": 79050
+    },
+    {
+      "epoch": 23.034364261168385,
+      "grad_norm": 0.4099760949611664,
+      "learning_rate": 0.00032377972027972026,
+      "loss": 3.1418,
+      "step": 79100
+    },
+    {
+      "epoch": 23.048925388782106,
+      "grad_norm": 0.4064216911792755,
+      "learning_rate": 0.00032360489510489506,
+      "loss": 3.1343,
+      "step": 79150
+    },
+    {
+      "epoch": 23.06348651639583,
+      "grad_norm": 0.41040489077568054,
+      "learning_rate": 0.0003234300699300699,
+      "loss": 3.1439,
+      "step": 79200
+    },
+    {
+      "epoch": 23.07804764400955,
+      "grad_norm": 0.40389707684516907,
+      "learning_rate": 0.0003232552447552447,
+      "loss": 3.1514,
+      "step": 79250
+    },
+    {
+      "epoch": 23.092608771623276,
+      "grad_norm": 0.44415801763534546,
+      "learning_rate": 0.00032308041958041957,
+      "loss": 3.1359,
+      "step": 79300
+    },
+    {
+      "epoch": 23.107169899236997,
+      "grad_norm": 0.4046581983566284,
+      "learning_rate": 0.00032290559440559437,
+      "loss": 3.1528,
+      "step": 79350
+    },
+    {
+      "epoch": 23.121731026850718,
+      "grad_norm": 0.4351769983768463,
+      "learning_rate": 0.0003227307692307692,
+      "loss": 3.1586,
+      "step": 79400
+    },
+    {
+      "epoch": 23.136292154464442,
+      "grad_norm": 0.45303016901016235,
+      "learning_rate": 0.000322555944055944,
+      "loss": 3.1487,
+      "step": 79450
+    },
+    {
+      "epoch": 23.150853282078163,
+      "grad_norm": 0.4389405846595764,
+      "learning_rate": 0.00032238111888111887,
+      "loss": 3.1571,
+      "step": 79500
+    },
+    {
+      "epoch": 23.165414409691888,
+      "grad_norm": 0.425350546836853,
+      "learning_rate": 0.00032220629370629367,
+      "loss": 3.1615,
+      "step": 79550
+    },
+    {
+      "epoch": 23.17997553730561,
+      "grad_norm": 0.4540613293647766,
+      "learning_rate": 0.0003220314685314685,
+      "loss": 3.1741,
+      "step": 79600
+    },
+    {
+      "epoch": 23.19453666491933,
+      "grad_norm": 0.44594645500183105,
+      "learning_rate": 0.00032185664335664327,
+      "loss": 3.1742,
+      "step": 79650
+    },
+    {
+      "epoch": 23.209097792533054,
+      "grad_norm": 0.4110971689224243,
+      "learning_rate": 0.0003216818181818181,
+      "loss": 3.1683,
+      "step": 79700
+    },
+    {
+      "epoch": 23.223658920146775,
+      "grad_norm": 0.4295291602611542,
+      "learning_rate": 0.00032150699300699303,
+      "loss": 3.1743,
+      "step": 79750
+    },
+    {
+      "epoch": 23.2382200477605,
+      "grad_norm": 0.4476734399795532,
+      "learning_rate": 0.0003213321678321678,
+      "loss": 3.1666,
+      "step": 79800
+    },
+    {
+      "epoch": 23.25278117537422,
+      "grad_norm": 0.42437073588371277,
+      "learning_rate": 0.00032115734265734263,
+      "loss": 3.1704,
+      "step": 79850
+    },
+    {
+      "epoch": 23.26734230298794,
+      "grad_norm": 0.41974642872810364,
+      "learning_rate": 0.00032098251748251743,
+      "loss": 3.1811,
+      "step": 79900
+    },
+    {
+      "epoch": 23.281903430601666,
+      "grad_norm": 0.42279666662216187,
+      "learning_rate": 0.0003208076923076923,
+      "loss": 3.1837,
+      "step": 79950
+    },
+    {
+      "epoch": 23.296464558215387,
+      "grad_norm": 0.4157053828239441,
+      "learning_rate": 0.0003206328671328671,
+      "loss": 3.183,
+      "step": 80000
+    },
+    {
+      "epoch": 23.296464558215387,
+      "eval_accuracy": 0.37421988392808647,
+      "eval_loss": 3.5450761318206787,
+      "eval_runtime": 924.0109,
+      "eval_samples_per_second": 18.015,
+      "eval_steps_per_second": 1.127,
+      "step": 80000
+    },
+    {
+      "epoch": 23.31102568582911,
+      "grad_norm": 0.4075993001461029,
+      "learning_rate": 0.00032045804195804194,
+      "loss": 3.1425,
+      "step": 80050
+    },
+    {
+      "epoch": 23.325586813442833,
+      "grad_norm": 0.435520738363266,
+      "learning_rate": 0.00032028321678321674,
+      "loss": 3.1409,
+      "step": 80100
+    },
+    {
+      "epoch": 23.340147941056557,
+      "grad_norm": 0.44043847918510437,
+      "learning_rate": 0.0003201083916083916,
+      "loss": 3.1473,
+      "step": 80150
+    },
+    {
+      "epoch": 23.354709068670278,
+      "grad_norm": 0.406053751707077,
+      "learning_rate": 0.0003199335664335664,
+      "loss": 3.149,
+      "step": 80200
+    },
+    {
+      "epoch": 23.369270196284,
+      "grad_norm": 0.423145055770874,
+      "learning_rate": 0.00031975874125874125,
+      "loss": 3.1526,
+      "step": 80250
+    },
+    {
+      "epoch": 23.383831323897724,
+      "grad_norm": 0.4597681164741516,
+      "learning_rate": 0.00031958391608391605,
+      "loss": 3.1547,
+      "step": 80300
+    },
+    {
+      "epoch": 23.398392451511445,
+      "grad_norm": 0.41860097646713257,
+      "learning_rate": 0.0003194090909090909,
+      "loss": 3.1435,
+      "step": 80350
+    },
+    {
+      "epoch": 23.41295357912517,
+      "grad_norm": 0.4147387742996216,
+      "learning_rate": 0.00031923426573426576,
+      "loss": 3.1565,
+      "step": 80400
+    },
+    {
+      "epoch": 23.42751470673889,
+      "grad_norm": 0.43652892112731934,
+      "learning_rate": 0.0003190594405594405,
+      "loss": 3.155,
+      "step": 80450
+    },
+    {
+      "epoch": 23.44207583435261,
+      "grad_norm": 0.43268704414367676,
+      "learning_rate": 0.0003188846153846154,
+      "loss": 3.1696,
+      "step": 80500
+    },
+    {
+      "epoch": 23.456636961966336,
+      "grad_norm": 0.4125154912471771,
+      "learning_rate": 0.00031870979020979015,
+      "loss": 3.1671,
+      "step": 80550
+    },
+    {
+      "epoch": 23.471198089580056,
+      "grad_norm": 0.436293363571167,
+      "learning_rate": 0.000318534965034965,
+      "loss": 3.1677,
+      "step": 80600
+    },
+    {
+      "epoch": 23.48575921719378,
+      "grad_norm": 0.44055047631263733,
+      "learning_rate": 0.0003183601398601398,
+      "loss": 3.168,
+      "step": 80650
+    },
+    {
+      "epoch": 23.500320344807502,
+      "grad_norm": 0.4154927432537079,
+      "learning_rate": 0.00031818531468531466,
+      "loss": 3.1656,
+      "step": 80700
+    },
+    {
+      "epoch": 23.514881472421223,
+      "grad_norm": 0.4015692174434662,
+      "learning_rate": 0.00031801048951048946,
+      "loss": 3.1698,
+      "step": 80750
+    },
+    {
+      "epoch": 23.529442600034947,
+      "grad_norm": 0.462243914604187,
+      "learning_rate": 0.0003178356643356643,
+      "loss": 3.1756,
+      "step": 80800
+    },
+    {
+      "epoch": 23.54400372764867,
+      "grad_norm": 0.3996904194355011,
+      "learning_rate": 0.0003176608391608391,
+      "loss": 3.1665,
+      "step": 80850
+    },
+    {
+      "epoch": 23.558564855262393,
+      "grad_norm": 0.4055446982383728,
+      "learning_rate": 0.00031748601398601397,
+      "loss": 3.1848,
+      "step": 80900
+    },
+    {
+      "epoch": 23.573125982876114,
+      "grad_norm": 0.41298335790634155,
+      "learning_rate": 0.00031731118881118877,
+      "loss": 3.1912,
+      "step": 80950
+    },
+    {
+      "epoch": 23.587687110489835,
+      "grad_norm": 0.447311669588089,
+      "learning_rate": 0.0003171363636363636,
+      "loss": 3.182,
+      "step": 81000
+    },
+    {
+      "epoch": 23.587687110489835,
+      "eval_accuracy": 0.3740028633548653,
+      "eval_loss": 3.5492684841156006,
+      "eval_runtime": 80.4358,
+      "eval_samples_per_second": 206.948,
+      "eval_steps_per_second": 12.942,
+      "step": 81000
+    },
+    {
+      "epoch": 23.60224823810356,
+      "grad_norm": 0.42425885796546936,
+      "learning_rate": 0.0003169615384615385,
+      "loss": 3.1837,
+      "step": 81050
+    },
+    {
+      "epoch": 23.61680936571728,
+      "grad_norm": 0.40934208035469055,
+      "learning_rate": 0.0003167867132867133,
+      "loss": 3.1819,
+      "step": 81100
+    },
+    {
+      "epoch": 23.631370493331005,
+      "grad_norm": 0.4387608766555786,
+      "learning_rate": 0.00031661188811188813,
+      "loss": 3.1764,
+      "step": 81150
+    },
+    {
+      "epoch": 23.645931620944726,
+      "grad_norm": 0.3999146521091461,
+      "learning_rate": 0.0003164370629370629,
+      "loss": 3.1878,
+      "step": 81200
+    },
+    {
+      "epoch": 23.660492748558447,
+      "grad_norm": 0.4529256820678711,
+      "learning_rate": 0.0003162622377622378,
+      "loss": 3.179,
+      "step": 81250
+    },
+    {
+      "epoch": 23.67505387617217,
+      "grad_norm": 0.4573615491390228,
+      "learning_rate": 0.00031608741258741253,
+      "loss": 3.1939,
+      "step": 81300
+    },
+    {
+      "epoch": 23.689615003785892,
+      "grad_norm": 0.43424728512763977,
+      "learning_rate": 0.0003159125874125874,
+      "loss": 3.1878,
+      "step": 81350
+    },
+    {
+      "epoch": 23.704176131399617,
+      "grad_norm": 0.4275583326816559,
+      "learning_rate": 0.0003157377622377622,
+      "loss": 3.1769,
+      "step": 81400
+    },
+    {
+      "epoch": 23.718737259013338,
+      "grad_norm": 0.46233272552490234,
+      "learning_rate": 0.00031556293706293704,
+      "loss": 3.1916,
+      "step": 81450
+    },
+    {
+      "epoch": 23.73329838662706,
+      "grad_norm": 0.4323495030403137,
+      "learning_rate": 0.00031538811188811184,
+      "loss": 3.2002,
+      "step": 81500
+    },
+    {
+      "epoch": 23.747859514240783,
+      "grad_norm": 0.4355345666408539,
+      "learning_rate": 0.0003152132867132867,
+      "loss": 3.1995,
+      "step": 81550
+    },
+    {
+      "epoch": 23.762420641854504,
+      "grad_norm": 0.45010533928871155,
+      "learning_rate": 0.0003150384615384615,
+      "loss": 3.1912,
+      "step": 81600
+    },
+    {
+      "epoch": 23.77698176946823,
+      "grad_norm": 0.41172489523887634,
+      "learning_rate": 0.00031486363636363634,
+      "loss": 3.1936,
+      "step": 81650
+    },
+    {
+      "epoch": 23.79154289708195,
+      "grad_norm": 0.4244674742221832,
+      "learning_rate": 0.00031468881118881114,
+      "loss": 3.2111,
+      "step": 81700
+    },
+    {
+      "epoch": 23.80610402469567,
+      "grad_norm": 0.45701536536216736,
+      "learning_rate": 0.000314513986013986,
+      "loss": 3.2003,
+      "step": 81750
+    },
+    {
+      "epoch": 23.820665152309395,
+      "grad_norm": 0.4102361798286438,
+      "learning_rate": 0.00031433916083916085,
+      "loss": 3.1871,
+      "step": 81800
+    },
+    {
+      "epoch": 23.835226279923116,
+      "grad_norm": 0.4519789516925812,
+      "learning_rate": 0.00031416433566433565,
+      "loss": 3.199,
+      "step": 81850
+    },
+    {
+      "epoch": 23.84978740753684,
+      "grad_norm": 0.42273131012916565,
+      "learning_rate": 0.0003139895104895105,
+      "loss": 3.189,
+      "step": 81900
+    },
+    {
+      "epoch": 23.86434853515056,
+      "grad_norm": 0.39564603567123413,
+      "learning_rate": 0.00031381468531468525,
+      "loss": 3.2031,
+      "step": 81950
+    },
+    {
+      "epoch": 23.878909662764286,
+      "grad_norm": 0.4374904930591583,
+      "learning_rate": 0.00031363986013986016,
+      "loss": 3.2035,
+      "step": 82000
+    },
+    {
+      "epoch": 23.878909662764286,
+      "eval_accuracy": 0.37404953570999466,
+      "eval_loss": 3.541025400161743,
+      "eval_runtime": 80.2075,
+      "eval_samples_per_second": 207.537,
+      "eval_steps_per_second": 12.979,
+      "step": 82000
+    },
+    {
+      "epoch": 23.893470790378007,
+      "grad_norm": 0.41857510805130005,
+      "learning_rate": 0.0003134650349650349,
+      "loss": 3.2,
+      "step": 82050
+    },
+    {
+      "epoch": 23.908031917991728,
+      "grad_norm": 0.43600863218307495,
+      "learning_rate": 0.00031329020979020976,
+      "loss": 3.2011,
+      "step": 82100
+    },
+    {
+      "epoch": 23.922593045605453,
+      "grad_norm": 0.4216744899749756,
+      "learning_rate": 0.00031311538461538456,
+      "loss": 3.2035,
+      "step": 82150
+    },
+    {
+      "epoch": 23.937154173219174,
+      "grad_norm": 0.411630243062973,
+      "learning_rate": 0.0003129405594405594,
+      "loss": 3.2061,
+      "step": 82200
+    },
+    {
+      "epoch": 23.951715300832895,
+      "grad_norm": 0.438644140958786,
+      "learning_rate": 0.0003127657342657342,
+      "loss": 3.2096,
+      "step": 82250
+    },
+    {
+      "epoch": 23.96627642844662,
+      "grad_norm": 0.41276854276657104,
+      "learning_rate": 0.00031259090909090907,
+      "loss": 3.2023,
+      "step": 82300
+    },
+    {
+      "epoch": 23.98083755606034,
+      "grad_norm": 0.4013693928718567,
+      "learning_rate": 0.00031241608391608386,
+      "loss": 3.2011,
+      "step": 82350
+    },
+    {
+      "epoch": 23.995398683674065,
+      "grad_norm": 0.4175088703632355,
+      "learning_rate": 0.0003122412587412587,
+      "loss": 3.2096,
+      "step": 82400
+    },
+    {
+      "epoch": 24.010192789329604,
+      "grad_norm": 0.3980584442615509,
+      "learning_rate": 0.00031206643356643357,
+      "loss": 3.2127,
+      "step": 82450
+    },
+    {
+      "epoch": 24.02475391694333,
+      "grad_norm": 0.40012335777282715,
+      "learning_rate": 0.00031189160839160837,
+      "loss": 3.1319,
+      "step": 82500
+    },
+    {
+      "epoch": 24.03931504455705,
+      "grad_norm": 0.45777830481529236,
+      "learning_rate": 0.0003117167832167832,
+      "loss": 3.1226,
+      "step": 82550
+    },
+    {
+      "epoch": 24.053876172170774,
+      "grad_norm": 0.44569453597068787,
+      "learning_rate": 0.000311541958041958,
+      "loss": 3.1373,
+      "step": 82600
+    },
+    {
+      "epoch": 24.068437299784495,
+      "grad_norm": 0.4426841735839844,
+      "learning_rate": 0.0003113671328671329,
+      "loss": 3.1402,
+      "step": 82650
+    },
+    {
+      "epoch": 24.082998427398216,
+      "grad_norm": 0.4288923442363739,
+      "learning_rate": 0.0003111923076923076,
+      "loss": 3.1401,
+      "step": 82700
+    },
+    {
+      "epoch": 24.09755955501194,
+      "grad_norm": 0.45076754689216614,
+      "learning_rate": 0.00031101748251748253,
+      "loss": 3.1331,
+      "step": 82750
+    },
+    {
+      "epoch": 24.11212068262566,
+      "grad_norm": 0.4328001141548157,
+      "learning_rate": 0.0003108426573426573,
+      "loss": 3.1486,
+      "step": 82800
+    },
+    {
+      "epoch": 24.126681810239386,
+      "grad_norm": 0.4192425608634949,
+      "learning_rate": 0.00031066783216783213,
+      "loss": 3.1511,
+      "step": 82850
+    },
+    {
+      "epoch": 24.141242937853107,
+      "grad_norm": 0.432831346988678,
+      "learning_rate": 0.00031049300699300693,
+      "loss": 3.1592,
+      "step": 82900
+    },
+    {
+      "epoch": 24.15580406546683,
+      "grad_norm": 0.4342346787452698,
+      "learning_rate": 0.0003103181818181818,
+      "loss": 3.1628,
+      "step": 82950
+    },
+    {
+      "epoch": 24.170365193080553,
+      "grad_norm": 0.4602639973163605,
+      "learning_rate": 0.0003101433566433566,
+      "loss": 3.1533,
+      "step": 83000
+    },
+    {
+      "epoch": 24.170365193080553,
+      "eval_accuracy": 0.3738702527337421,
+      "eval_loss": 3.5505053997039795,
+      "eval_runtime": 81.647,
+      "eval_samples_per_second": 203.878,
+      "eval_steps_per_second": 12.75,
+      "step": 83000
+    },
+    {
+      "epoch": 24.184926320694274,
+      "grad_norm": 0.42068788409233093,
+      "learning_rate": 0.00030996853146853144,
+      "loss": 3.1589,
+      "step": 83050
+    },
+    {
+      "epoch": 24.199487448308,
+      "grad_norm": 0.4067046344280243,
+      "learning_rate": 0.00030979370629370624,
+      "loss": 3.1652,
+      "step": 83100
+    },
+    {
+      "epoch": 24.21404857592172,
+      "grad_norm": 0.43839457631111145,
+      "learning_rate": 0.0003096188811188811,
+      "loss": 3.1579,
+      "step": 83150
+    },
+    {
+      "epoch": 24.22860970353544,
+      "grad_norm": 0.439429372549057,
+      "learning_rate": 0.00030944405594405595,
+      "loss": 3.1643,
+      "step": 83200
+    },
+    {
+      "epoch": 24.243170831149165,
+      "grad_norm": 0.41719990968704224,
+      "learning_rate": 0.00030926923076923075,
+      "loss": 3.1661,
+      "step": 83250
+    },
+    {
+      "epoch": 24.257731958762886,
+      "grad_norm": 0.44479116797447205,
+      "learning_rate": 0.0003090944055944056,
+      "loss": 3.154,
+      "step": 83300
+    },
+    {
+      "epoch": 24.27229308637661,
+      "grad_norm": 0.4219229519367218,
+      "learning_rate": 0.0003089195804195804,
+      "loss": 3.1805,
+      "step": 83350
+    },
+    {
+      "epoch": 24.28685421399033,
+      "grad_norm": 0.42215579748153687,
+      "learning_rate": 0.00030874475524475525,
+      "loss": 3.1685,
+      "step": 83400
+    },
+    {
+      "epoch": 24.301415341604052,
+      "grad_norm": 0.40854838490486145,
+      "learning_rate": 0.00030856993006993,
+      "loss": 3.1756,
+      "step": 83450
+    },
+    {
+      "epoch": 24.315976469217777,
+      "grad_norm": 0.4285014271736145,
+      "learning_rate": 0.0003083951048951049,
+      "loss": 3.1724,
+      "step": 83500
+    },
+    {
+      "epoch": 24.330537596831498,
+      "grad_norm": 0.44286397099494934,
+      "learning_rate": 0.00030822027972027965,
+      "loss": 3.1803,
+      "step": 83550
+    },
+    {
+      "epoch": 24.345098724445222,
+      "grad_norm": 0.4677809476852417,
+      "learning_rate": 0.0003080454545454545,
+      "loss": 3.175,
+      "step": 83600
+    },
+    {
+      "epoch": 24.359659852058943,
+      "grad_norm": 0.44377604126930237,
+      "learning_rate": 0.0003078706293706293,
+      "loss": 3.1706,
+      "step": 83650
+    },
+    {
+      "epoch": 24.374220979672664,
+      "grad_norm": 0.42284804582595825,
+      "learning_rate": 0.00030769580419580416,
+      "loss": 3.1799,
+      "step": 83700
+    },
+    {
+      "epoch": 24.38878210728639,
+      "grad_norm": 0.4500799775123596,
+      "learning_rate": 0.00030752097902097896,
+      "loss": 3.1823,
+      "step": 83750
+    },
+    {
+      "epoch": 24.40334323490011,
+      "grad_norm": 0.43594327569007874,
+      "learning_rate": 0.0003073461538461538,
+      "loss": 3.1918,
+      "step": 83800
+    },
+    {
+      "epoch": 24.417904362513834,
+      "grad_norm": 0.487556517124176,
+      "learning_rate": 0.00030717132867132867,
+      "loss": 3.1907,
+      "step": 83850
+    },
+    {
+      "epoch": 24.432465490127555,
+      "grad_norm": 0.4531272053718567,
+      "learning_rate": 0.00030699650349650347,
+      "loss": 3.2009,
+      "step": 83900
+    },
+    {
+      "epoch": 24.44702661774128,
+      "grad_norm": 0.4245949387550354,
+      "learning_rate": 0.0003068216783216783,
+      "loss": 3.1898,
+      "step": 83950
+    },
+    {
+      "epoch": 24.461587745355,
+      "grad_norm": 0.4252057373523712,
+      "learning_rate": 0.0003066468531468531,
+      "loss": 3.1841,
+      "step": 84000
+    },
+    {
+      "epoch": 24.461587745355,
+      "eval_accuracy": 0.3741536961801322,
+      "eval_loss": 3.5409674644470215,
+      "eval_runtime": 81.8197,
+      "eval_samples_per_second": 203.447,
+      "eval_steps_per_second": 12.723,
+      "step": 84000
+    },
+    {
+      "epoch": 24.47614887296872,
+      "grad_norm": 0.42920979857444763,
+      "learning_rate": 0.000306472027972028,
+      "loss": 3.1956,
+      "step": 84050
+    },
+    {
+      "epoch": 24.490710000582446,
+      "grad_norm": 0.43671944737434387,
+      "learning_rate": 0.0003062972027972028,
+      "loss": 3.184,
+      "step": 84100
+    },
+    {
+      "epoch": 24.505271128196167,
+      "grad_norm": 0.4089336693286896,
+      "learning_rate": 0.00030612237762237763,
+      "loss": 3.186,
+      "step": 84150
+    },
+    {
+      "epoch": 24.51983225580989,
+      "grad_norm": 0.4428396224975586,
+      "learning_rate": 0.0003059475524475524,
+      "loss": 3.2047,
+      "step": 84200
+    },
+    {
+      "epoch": 24.534393383423613,
+      "grad_norm": 0.419969767332077,
+      "learning_rate": 0.0003057727272727273,
+      "loss": 3.1867,
+      "step": 84250
+    },
+    {
+      "epoch": 24.548954511037334,
+      "grad_norm": 0.40729889273643494,
+      "learning_rate": 0.00030559790209790203,
+      "loss": 3.2113,
+      "step": 84300
+    },
+    {
+      "epoch": 24.563515638651058,
+      "grad_norm": 0.45397377014160156,
+      "learning_rate": 0.0003054230769230769,
+      "loss": 3.1974,
+      "step": 84350
+    },
+    {
+      "epoch": 24.57807676626478,
+      "grad_norm": 0.4194105863571167,
+      "learning_rate": 0.0003052482517482517,
+      "loss": 3.2091,
+      "step": 84400
+    },
+    {
+      "epoch": 24.592637893878504,
+      "grad_norm": 0.4194582402706146,
+      "learning_rate": 0.00030507342657342654,
+      "loss": 3.1941,
+      "step": 84450
+    },
+    {
+      "epoch": 24.607199021492224,
+      "grad_norm": 0.40591809153556824,
+      "learning_rate": 0.00030489860139860134,
+      "loss": 3.1961,
+      "step": 84500
+    },
+    {
+      "epoch": 24.621760149105945,
+      "grad_norm": 0.4254491627216339,
+      "learning_rate": 0.0003047237762237762,
+      "loss": 3.2122,
+      "step": 84550
+    },
+    {
+      "epoch": 24.63632127671967,
+      "grad_norm": 0.4454302489757538,
+      "learning_rate": 0.00030454895104895104,
+      "loss": 3.2067,
+      "step": 84600
+    },
+    {
+      "epoch": 24.65088240433339,
+      "grad_norm": 0.4131137430667877,
+      "learning_rate": 0.00030437412587412584,
+      "loss": 3.2015,
+      "step": 84650
+    },
+    {
+      "epoch": 24.665443531947115,
+      "grad_norm": 0.4385775923728943,
+      "learning_rate": 0.0003041993006993007,
+      "loss": 3.1937,
+      "step": 84700
+    },
+    {
+      "epoch": 24.680004659560836,
+      "grad_norm": 0.42315569519996643,
+      "learning_rate": 0.0003040244755244755,
+      "loss": 3.2043,
+      "step": 84750
+    },
+    {
+      "epoch": 24.694565787174557,
+      "grad_norm": 0.42199307680130005,
+      "learning_rate": 0.00030384965034965035,
+      "loss": 3.2053,
+      "step": 84800
+    },
+    {
+      "epoch": 24.709126914788282,
+      "grad_norm": 0.4560045599937439,
+      "learning_rate": 0.00030367482517482515,
+      "loss": 3.2146,
+      "step": 84850
+    },
+    {
+      "epoch": 24.723688042402003,
+      "grad_norm": 0.4480164349079132,
+      "learning_rate": 0.0003035,
+      "loss": 3.189,
+      "step": 84900
+    },
+    {
+      "epoch": 24.738249170015727,
+      "grad_norm": 0.45350882411003113,
+      "learning_rate": 0.00030332517482517475,
+      "loss": 3.2061,
+      "step": 84950
+    },
+    {
+      "epoch": 24.75281029762945,
+      "grad_norm": 0.4041248857975006,
+      "learning_rate": 0.00030315034965034966,
+      "loss": 3.1957,
+      "step": 85000
+    },
+    {
+      "epoch": 24.75281029762945,
+      "eval_accuracy": 0.37445571451848825,
+      "eval_loss": 3.537726640701294,
+      "eval_runtime": 81.755,
+      "eval_samples_per_second": 203.608,
+      "eval_steps_per_second": 12.733,
+      "step": 85000
+    },
+    {
+      "epoch": 24.76737142524317,
+      "grad_norm": 0.4523855447769165,
+      "learning_rate": 0.0003029755244755244,
+      "loss": 3.2056,
+      "step": 85050
+    },
+    {
+      "epoch": 24.781932552856894,
+      "grad_norm": 0.39477360248565674,
+      "learning_rate": 0.00030280069930069926,
+      "loss": 3.2026,
+      "step": 85100
+    },
+    {
+      "epoch": 24.796493680470615,
+      "grad_norm": 0.4192032814025879,
+      "learning_rate": 0.00030262587412587406,
+      "loss": 3.2077,
+      "step": 85150
+    },
+    {
+      "epoch": 24.81105480808434,
+      "grad_norm": 0.44437819719314575,
+      "learning_rate": 0.0003024510489510489,
+      "loss": 3.2079,
+      "step": 85200
+    },
+    {
+      "epoch": 24.82561593569806,
+      "grad_norm": 0.454645037651062,
+      "learning_rate": 0.00030227622377622377,
+      "loss": 3.2258,
+      "step": 85250
+    },
+    {
+      "epoch": 24.84017706331178,
+      "grad_norm": 0.4044639766216278,
+      "learning_rate": 0.00030210139860139856,
+      "loss": 3.2006,
+      "step": 85300
+    },
+    {
+      "epoch": 24.854738190925506,
+      "grad_norm": 0.3925352990627289,
+      "learning_rate": 0.0003019265734265734,
+      "loss": 3.2019,
+      "step": 85350
+    },
+    {
+      "epoch": 24.869299318539227,
+      "grad_norm": 0.4119468331336975,
+      "learning_rate": 0.0003017517482517482,
+      "loss": 3.2163,
+      "step": 85400
+    },
+    {
+      "epoch": 24.88386044615295,
+      "grad_norm": 0.4199124574661255,
+      "learning_rate": 0.00030157692307692307,
+      "loss": 3.2147,
+      "step": 85450
+    },
+    {
+      "epoch": 24.898421573766672,
+      "grad_norm": 0.435290664434433,
+      "learning_rate": 0.00030140209790209787,
+      "loss": 3.219,
+      "step": 85500
+    },
+    {
+      "epoch": 24.912982701380393,
+      "grad_norm": 0.4147189259529114,
+      "learning_rate": 0.0003012272727272727,
+      "loss": 3.209,
+      "step": 85550
+    },
+    {
+      "epoch": 24.927543828994118,
+      "grad_norm": 0.4204513132572174,
+      "learning_rate": 0.0003010524475524475,
+      "loss": 3.2265,
+      "step": 85600
+    },
+    {
+      "epoch": 24.94210495660784,
+      "grad_norm": 0.4550034701824188,
+      "learning_rate": 0.0003008776223776224,
+      "loss": 3.2328,
+      "step": 85650
+    },
+    {
+      "epoch": 24.956666084221563,
+      "grad_norm": 0.4185858964920044,
+      "learning_rate": 0.0003007027972027972,
+      "loss": 3.2214,
+      "step": 85700
+    },
+    {
+      "epoch": 24.971227211835284,
+      "grad_norm": 0.44467946887016296,
+      "learning_rate": 0.00030052797202797203,
+      "loss": 3.2178,
+      "step": 85750
+    },
+    {
+      "epoch": 24.985788339449005,
+      "grad_norm": 0.4270879626274109,
+      "learning_rate": 0.0003003531468531468,
+      "loss": 3.2316,
+      "step": 85800
+    },
+    {
+      "epoch": 25.000291222552274,
+      "grad_norm": 0.42271360754966736,
+      "learning_rate": 0.00030017832167832163,
+      "loss": 3.2205,
+      "step": 85850
+    },
+    {
+      "epoch": 25.014852350165995,
+      "grad_norm": 0.4256053566932678,
+      "learning_rate": 0.0003000034965034965,
+      "loss": 3.1192,
+      "step": 85900
+    },
+    {
+      "epoch": 25.02941347777972,
+      "grad_norm": 0.4629895091056824,
+      "learning_rate": 0.0002998286713286713,
+      "loss": 3.1012,
+      "step": 85950
+    },
+    {
+      "epoch": 25.04397460539344,
+      "grad_norm": 0.465717613697052,
+      "learning_rate": 0.00029965384615384614,
+      "loss": 3.1198,
+      "step": 86000
+    },
+    {
+      "epoch": 25.04397460539344,
+      "eval_accuracy": 0.3741750925746752,
+      "eval_loss": 3.5509843826293945,
+      "eval_runtime": 81.7203,
+      "eval_samples_per_second": 203.695,
+      "eval_steps_per_second": 12.739,
+      "step": 86000
+    },
+    {
+      "epoch": 25.058535733007165,
+      "grad_norm": 0.4088675379753113,
+      "learning_rate": 0.00029947902097902094,
+      "loss": 3.1269,
+      "step": 86050
+    },
+    {
+      "epoch": 25.073096860620886,
+      "grad_norm": 0.46389642357826233,
+      "learning_rate": 0.0002993041958041958,
+      "loss": 3.134,
+      "step": 86100
+    },
+    {
+      "epoch": 25.087657988234607,
+      "grad_norm": 0.44134199619293213,
+      "learning_rate": 0.0002991293706293706,
+      "loss": 3.129,
+      "step": 86150
+    },
+    {
+      "epoch": 25.10221911584833,
+      "grad_norm": 0.4049500524997711,
+      "learning_rate": 0.0002989545454545454,
+      "loss": 3.139,
+      "step": 86200
+    },
+    {
+      "epoch": 25.116780243462053,
+      "grad_norm": 0.4358852803707123,
+      "learning_rate": 0.00029877972027972025,
+      "loss": 3.1309,
+      "step": 86250
+    },
+    {
+      "epoch": 25.131341371075777,
+      "grad_norm": 0.4504960775375366,
+      "learning_rate": 0.0002986048951048951,
+      "loss": 3.1381,
+      "step": 86300
+    },
+    {
+      "epoch": 25.145902498689498,
+      "grad_norm": 0.4623953402042389,
+      "learning_rate": 0.0002984300699300699,
+      "loss": 3.138,
+      "step": 86350
+    },
+    {
+      "epoch": 25.160463626303223,
+      "grad_norm": 0.4341890811920166,
+      "learning_rate": 0.00029825524475524475,
+      "loss": 3.1464,
+      "step": 86400
+    },
+    {
+      "epoch": 25.175024753916944,
+      "grad_norm": 0.4155861437320709,
+      "learning_rate": 0.00029808041958041955,
+      "loss": 3.1528,
+      "step": 86450
+    },
+    {
+      "epoch": 25.189585881530665,
+      "grad_norm": 0.43031230568885803,
+      "learning_rate": 0.0002979055944055944,
+      "loss": 3.1627,
+      "step": 86500
+    },
+    {
+      "epoch": 25.20414700914439,
+      "grad_norm": 0.42100533843040466,
+      "learning_rate": 0.0002977307692307692,
+      "loss": 3.1528,
+      "step": 86550
+    },
+    {
+      "epoch": 25.21870813675811,
+      "grad_norm": 0.45310893654823303,
+      "learning_rate": 0.000297555944055944,
+      "loss": 3.1452,
+      "step": 86600
+    },
+    {
+      "epoch": 25.233269264371835,
+      "grad_norm": 0.43537208437919617,
+      "learning_rate": 0.00029738111888111886,
+      "loss": 3.1542,
+      "step": 86650
+    },
+    {
+      "epoch": 25.247830391985556,
+      "grad_norm": 0.4328289031982422,
+      "learning_rate": 0.00029720629370629366,
+      "loss": 3.1518,
+      "step": 86700
+    },
+    {
+      "epoch": 25.262391519599277,
+      "grad_norm": 0.4341360032558441,
+      "learning_rate": 0.0002970314685314685,
+      "loss": 3.1585,
+      "step": 86750
+    },
+    {
+      "epoch": 25.276952647213,
+      "grad_norm": 0.42555415630340576,
+      "learning_rate": 0.0002968566433566433,
+      "loss": 3.1713,
+      "step": 86800
+    },
+    {
+      "epoch": 25.291513774826722,
+      "grad_norm": 0.4444054663181305,
+      "learning_rate": 0.00029668181818181817,
+      "loss": 3.167,
+      "step": 86850
+    },
+    {
+      "epoch": 25.306074902440447,
+      "grad_norm": 0.4327361285686493,
+      "learning_rate": 0.00029650699300699297,
+      "loss": 3.161,
+      "step": 86900
+    },
+    {
+      "epoch": 25.320636030054168,
+      "grad_norm": 0.4254668354988098,
+      "learning_rate": 0.0002963321678321678,
+      "loss": 3.1667,
+      "step": 86950
+    },
+    {
+      "epoch": 25.33519715766789,
+      "grad_norm": 0.4326876997947693,
+      "learning_rate": 0.0002961573426573426,
+      "loss": 3.1721,
+      "step": 87000
+    },
+    {
+      "epoch": 25.33519715766789,
+      "eval_accuracy": 0.374514613384785,
+      "eval_loss": 3.5434107780456543,
+      "eval_runtime": 81.7833,
+      "eval_samples_per_second": 203.538,
+      "eval_steps_per_second": 12.729,
+      "step": 87000
+    },
+    {
+      "epoch": 25.349758285281613,
+      "grad_norm": 0.42808645963668823,
+      "learning_rate": 0.0002959825174825175,
+      "loss": 3.1783,
+      "step": 87050
+    },
+    {
+      "epoch": 25.364319412895334,
+      "grad_norm": 0.43070703744888306,
+      "learning_rate": 0.0002958076923076923,
+      "loss": 3.1639,
+      "step": 87100
+    },
+    {
+      "epoch": 25.37888054050906,
+      "grad_norm": 0.4267195463180542,
+      "learning_rate": 0.00029563286713286713,
+      "loss": 3.1752,
+      "step": 87150
+    },
+    {
+      "epoch": 25.39344166812278,
+      "grad_norm": 0.4095643162727356,
+      "learning_rate": 0.00029545804195804193,
+      "loss": 3.1775,
+      "step": 87200
+    },
+    {
+      "epoch": 25.4080027957365,
+      "grad_norm": 0.43954792618751526,
+      "learning_rate": 0.0002952832167832168,
+      "loss": 3.174,
+      "step": 87250
+    },
+    {
+      "epoch": 25.422563923350225,
+      "grad_norm": 0.42223548889160156,
+      "learning_rate": 0.0002951083916083916,
+      "loss": 3.1674,
+      "step": 87300
+    },
+    {
+      "epoch": 25.437125050963946,
+      "grad_norm": 0.41911086440086365,
+      "learning_rate": 0.0002949335664335664,
+      "loss": 3.1803,
+      "step": 87350
+    },
+    {
+      "epoch": 25.45168617857767,
+      "grad_norm": 0.42769038677215576,
+      "learning_rate": 0.00029475874125874124,
+      "loss": 3.1638,
+      "step": 87400
+    },
+    {
+      "epoch": 25.46624730619139,
+      "grad_norm": 0.4431706964969635,
+      "learning_rate": 0.00029458391608391604,
+      "loss": 3.1688,
+      "step": 87450
+    },
+    {
+      "epoch": 25.480808433805112,
+      "grad_norm": 0.4112902879714966,
+      "learning_rate": 0.0002944090909090909,
+      "loss": 3.1988,
+      "step": 87500
+    },
+    {
+      "epoch": 25.495369561418837,
+      "grad_norm": 0.42165741324424744,
+      "learning_rate": 0.0002942342657342657,
+      "loss": 3.1791,
+      "step": 87550
+    },
+    {
+      "epoch": 25.509930689032558,
+      "grad_norm": 0.4093731939792633,
+      "learning_rate": 0.00029405944055944054,
+      "loss": 3.1786,
+      "step": 87600
+    },
+    {
+      "epoch": 25.524491816646282,
+      "grad_norm": 0.4248475432395935,
+      "learning_rate": 0.0002938846153846154,
+      "loss": 3.184,
+      "step": 87650
+    },
+    {
+      "epoch": 25.539052944260003,
+      "grad_norm": 0.42811527848243713,
+      "learning_rate": 0.0002937097902097902,
+      "loss": 3.1883,
+      "step": 87700
+    },
+    {
+      "epoch": 25.553614071873724,
+      "grad_norm": 0.44041386246681213,
+      "learning_rate": 0.000293534965034965,
+      "loss": 3.1955,
+      "step": 87750
+    },
+    {
+      "epoch": 25.56817519948745,
+      "grad_norm": 0.4249640107154846,
+      "learning_rate": 0.00029336013986013985,
+      "loss": 3.1888,
+      "step": 87800
+    },
+    {
+      "epoch": 25.58273632710117,
+      "grad_norm": 0.4414690136909485,
+      "learning_rate": 0.00029318531468531465,
+      "loss": 3.201,
+      "step": 87850
+    },
+    {
+      "epoch": 25.597297454714894,
+      "grad_norm": 0.45681437849998474,
+      "learning_rate": 0.0002930104895104895,
+      "loss": 3.1867,
+      "step": 87900
+    },
+    {
+      "epoch": 25.611858582328615,
+      "grad_norm": 0.41531237959861755,
+      "learning_rate": 0.0002928356643356643,
+      "loss": 3.1941,
+      "step": 87950
+    },
+    {
+      "epoch": 25.626419709942336,
+      "grad_norm": 0.4413832426071167,
+      "learning_rate": 0.00029266083916083916,
+      "loss": 3.1847,
+      "step": 88000
+    },
+    {
+      "epoch": 25.626419709942336,
+      "eval_accuracy": 0.37498650969080327,
+      "eval_loss": 3.533761501312256,
+      "eval_runtime": 81.8224,
+      "eval_samples_per_second": 203.441,
+      "eval_steps_per_second": 12.723,
+      "step": 88000
+    },
+    {
+      "epoch": 25.64098083755606,
+      "grad_norm": 0.422968327999115,
+      "learning_rate": 0.00029248601398601396,
+      "loss": 3.1973,
+      "step": 88050
+    },
+    {
+      "epoch": 25.655541965169782,
+      "grad_norm": 0.4194158911705017,
+      "learning_rate": 0.00029231118881118876,
+      "loss": 3.2006,
+      "step": 88100
+    },
+    {
+      "epoch": 25.670103092783506,
+      "grad_norm": 0.42744433879852295,
+      "learning_rate": 0.0002921363636363636,
+      "loss": 3.1893,
+      "step": 88150
+    },
+    {
+      "epoch": 25.684664220397227,
+      "grad_norm": 0.4123415946960449,
+      "learning_rate": 0.0002919615384615384,
+      "loss": 3.2009,
+      "step": 88200
+    },
+    {
+      "epoch": 25.69922534801095,
+      "grad_norm": 0.4174706041812897,
+      "learning_rate": 0.00029178671328671326,
+      "loss": 3.1865,
+      "step": 88250
+    },
+    {
+      "epoch": 25.713786475624673,
+      "grad_norm": 0.4339861571788788,
+      "learning_rate": 0.00029161188811188806,
+      "loss": 3.1874,
+      "step": 88300
+    },
+    {
+      "epoch": 25.728347603238394,
+      "grad_norm": 0.4318697154521942,
+      "learning_rate": 0.0002914370629370629,
+      "loss": 3.1928,
+      "step": 88350
+    },
+    {
+      "epoch": 25.74290873085212,
+      "grad_norm": 0.4214717149734497,
+      "learning_rate": 0.00029126223776223777,
+      "loss": 3.1945,
+      "step": 88400
+    },
+    {
+      "epoch": 25.75746985846584,
+      "grad_norm": 0.45795249938964844,
+      "learning_rate": 0.00029108741258741257,
+      "loss": 3.1971,
+      "step": 88450
+    },
+    {
+      "epoch": 25.772030986079564,
+      "grad_norm": 0.4096623361110687,
+      "learning_rate": 0.00029091258741258737,
+      "loss": 3.2021,
+      "step": 88500
+    },
+    {
+      "epoch": 25.786592113693285,
+      "grad_norm": 0.4361957311630249,
+      "learning_rate": 0.0002907377622377622,
+      "loss": 3.2035,
+      "step": 88550
+    },
+    {
+      "epoch": 25.801153241307006,
+      "grad_norm": 0.44716793298721313,
+      "learning_rate": 0.000290562937062937,
+      "loss": 3.1993,
+      "step": 88600
+    },
+    {
+      "epoch": 25.81571436892073,
+      "grad_norm": 0.4102550148963928,
+      "learning_rate": 0.0002903881118881119,
+      "loss": 3.1993,
+      "step": 88650
+    },
+    {
+      "epoch": 25.83027549653445,
+      "grad_norm": 0.4365271329879761,
+      "learning_rate": 0.0002902132867132867,
+      "loss": 3.1944,
+      "step": 88700
+    },
+    {
+      "epoch": 25.844836624148176,
+      "grad_norm": 0.44143885374069214,
+      "learning_rate": 0.00029003846153846153,
+      "loss": 3.2149,
+      "step": 88750
+    },
+    {
+      "epoch": 25.859397751761897,
+      "grad_norm": 0.4330017864704132,
+      "learning_rate": 0.00028986363636363633,
+      "loss": 3.2087,
+      "step": 88800
+    },
+    {
+      "epoch": 25.873958879375618,
+      "grad_norm": 0.41513675451278687,
+      "learning_rate": 0.00028968881118881113,
+      "loss": 3.1979,
+      "step": 88850
+    },
+    {
+      "epoch": 25.888520006989342,
+      "grad_norm": 0.45582523941993713,
+      "learning_rate": 0.000289513986013986,
+      "loss": 3.1936,
+      "step": 88900
+    },
+    {
+      "epoch": 25.903081134603063,
+      "grad_norm": 0.43732136487960815,
+      "learning_rate": 0.0002893391608391608,
+      "loss": 3.2025,
+      "step": 88950
+    },
+    {
+      "epoch": 25.917642262216788,
+      "grad_norm": 0.4278513491153717,
+      "learning_rate": 0.00028916433566433564,
+      "loss": 3.2055,
+      "step": 89000
+    },
+    {
+      "epoch": 25.917642262216788,
+      "eval_accuracy": 0.37510172104603445,
+      "eval_loss": 3.530043840408325,
+      "eval_runtime": 81.6291,
+      "eval_samples_per_second": 203.922,
+      "eval_steps_per_second": 12.753,
+      "step": 89000
+    },
+    {
+      "epoch": 25.93220338983051,
+      "grad_norm": 0.42948028445243835,
+      "learning_rate": 0.0002889895104895105,
+      "loss": 3.201,
+      "step": 89050
+    },
+    {
+      "epoch": 25.94676451744423,
+      "grad_norm": 0.415725976228714,
+      "learning_rate": 0.0002888146853146853,
+      "loss": 3.1962,
+      "step": 89100
+    },
+    {
+      "epoch": 25.961325645057954,
+      "grad_norm": 0.4632090628147125,
+      "learning_rate": 0.00028863986013986015,
+      "loss": 3.1898,
+      "step": 89150
+    },
+    {
+      "epoch": 25.975886772671675,
+      "grad_norm": 0.42952993512153625,
+      "learning_rate": 0.00028846503496503495,
+      "loss": 3.1991,
+      "step": 89200
+    },
+    {
+      "epoch": 25.9904479002854,
+      "grad_norm": 0.440868616104126,
+      "learning_rate": 0.00028829020979020975,
+      "loss": 3.1998,
+      "step": 89250
+    },
+    {
+      "epoch": 26.004950783388665,
+      "grad_norm": 0.40771979093551636,
+      "learning_rate": 0.0002881153846153846,
+      "loss": 3.1739,
+      "step": 89300
+    },
+    {
+      "epoch": 26.01951191100239,
+      "grad_norm": 0.4043092727661133,
+      "learning_rate": 0.0002879405594405594,
+      "loss": 3.1053,
+      "step": 89350
+    },
+    {
+      "epoch": 26.03407303861611,
+      "grad_norm": 0.4216521680355072,
+      "learning_rate": 0.00028776573426573425,
+      "loss": 3.1138,
+      "step": 89400
+    },
+    {
+      "epoch": 26.04863416622983,
+      "grad_norm": 0.45783573389053345,
+      "learning_rate": 0.00028759090909090905,
+      "loss": 3.1126,
+      "step": 89450
+    },
+    {
+      "epoch": 26.063195293843556,
+      "grad_norm": 0.4315570890903473,
+      "learning_rate": 0.0002874160839160839,
+      "loss": 3.1133,
+      "step": 89500
+    },
+    {
+      "epoch": 26.077756421457277,
+      "grad_norm": 0.45492488145828247,
+      "learning_rate": 0.0002872412587412587,
+      "loss": 3.1265,
+      "step": 89550
+    },
+    {
+      "epoch": 26.092317549071,
+      "grad_norm": 0.44426026940345764,
+      "learning_rate": 0.0002870664335664335,
+      "loss": 3.127,
+      "step": 89600
+    },
+    {
+      "epoch": 26.106878676684723,
+      "grad_norm": 0.4351698160171509,
+      "learning_rate": 0.00028689160839160836,
+      "loss": 3.1366,
+      "step": 89650
+    },
+    {
+      "epoch": 26.121439804298443,
+      "grad_norm": 0.4132281243801117,
+      "learning_rate": 0.0002867167832167832,
+      "loss": 3.1335,
+      "step": 89700
+    },
+    {
+      "epoch": 26.136000931912168,
+      "grad_norm": 0.42897453904151917,
+      "learning_rate": 0.000286541958041958,
+      "loss": 3.1487,
+      "step": 89750
+    },
+    {
+      "epoch": 26.15056205952589,
+      "grad_norm": 0.42802971601486206,
+      "learning_rate": 0.00028636713286713287,
+      "loss": 3.1416,
+      "step": 89800
+    },
+    {
+      "epoch": 26.165123187139613,
+      "grad_norm": 0.4399700462818146,
+      "learning_rate": 0.00028619230769230767,
+      "loss": 3.1312,
+      "step": 89850
+    },
+    {
+      "epoch": 26.179684314753334,
+      "grad_norm": 0.444806843996048,
+      "learning_rate": 0.0002860174825174825,
+      "loss": 3.1316,
+      "step": 89900
+    },
+    {
+      "epoch": 26.194245442367055,
+      "grad_norm": 0.4164326786994934,
+      "learning_rate": 0.0002858426573426573,
+      "loss": 3.1531,
+      "step": 89950
+    },
+    {
+      "epoch": 26.20880656998078,
+      "grad_norm": 0.47919753193855286,
+      "learning_rate": 0.0002856678321678321,
+      "loss": 3.1387,
+      "step": 90000
+    },
+    {
+      "epoch": 26.20880656998078,
+      "eval_accuracy": 0.3742905390551211,
+      "eval_loss": 3.5481929779052734,
+      "eval_runtime": 81.7603,
+      "eval_samples_per_second": 203.595,
+      "eval_steps_per_second": 12.732,
+      "step": 90000
+    },
+    {
+      "epoch": 26.2233676975945,
+      "grad_norm": 0.41104620695114136,
+      "learning_rate": 0.000285493006993007,
+      "loss": 3.1324,
+      "step": 90050
+    },
+    {
+      "epoch": 26.237928825208225,
+      "grad_norm": 0.42326322197914124,
+      "learning_rate": 0.0002853181818181818,
+      "loss": 3.1438,
+      "step": 90100
+    },
+    {
+      "epoch": 26.252489952821946,
+      "grad_norm": 0.44323936104774475,
+      "learning_rate": 0.00028514335664335663,
+      "loss": 3.1566,
+      "step": 90150
+    },
+    {
+      "epoch": 26.267051080435667,
+      "grad_norm": 0.44852617383003235,
+      "learning_rate": 0.00028496853146853143,
+      "loss": 3.1535,
+      "step": 90200
+    },
+    {
+      "epoch": 26.281612208049392,
+      "grad_norm": 0.4421542286872864,
+      "learning_rate": 0.0002847937062937063,
+      "loss": 3.1578,
+      "step": 90250
+    },
+    {
+      "epoch": 26.296173335663113,
+      "grad_norm": 0.45970240235328674,
+      "learning_rate": 0.0002846188811188811,
+      "loss": 3.1538,
+      "step": 90300
+    },
+    {
+      "epoch": 26.310734463276837,
+      "grad_norm": 0.48239070177078247,
+      "learning_rate": 0.0002844440559440559,
+      "loss": 3.1532,
+      "step": 90350
+    },
+    {
+      "epoch": 26.32529559089056,
+      "grad_norm": 0.42468351125717163,
+      "learning_rate": 0.00028426923076923074,
+      "loss": 3.1495,
+      "step": 90400
+    },
+    {
+      "epoch": 26.33985671850428,
+      "grad_norm": 0.4387561082839966,
+      "learning_rate": 0.0002840944055944056,
+      "loss": 3.1623,
+      "step": 90450
+    },
+    {
+      "epoch": 26.354417846118004,
+      "grad_norm": 0.43278950452804565,
+      "learning_rate": 0.0002839195804195804,
+      "loss": 3.1566,
+      "step": 90500
+    },
+    {
+      "epoch": 26.368978973731725,
+      "grad_norm": 0.4446139931678772,
+      "learning_rate": 0.00028374475524475524,
+      "loss": 3.168,
+      "step": 90550
+    },
+    {
+      "epoch": 26.38354010134545,
+      "grad_norm": 0.4542044401168823,
+      "learning_rate": 0.00028356993006993004,
+      "loss": 3.1542,
+      "step": 90600
+    },
+    {
+      "epoch": 26.39810122895917,
+      "grad_norm": 0.4319124221801758,
+      "learning_rate": 0.0002833951048951049,
+      "loss": 3.1641,
+      "step": 90650
+    },
+    {
+      "epoch": 26.41266235657289,
+      "grad_norm": 0.40921565890312195,
+      "learning_rate": 0.0002832202797202797,
+      "loss": 3.1606,
+      "step": 90700
+    },
+    {
+      "epoch": 26.427223484186616,
+      "grad_norm": 0.4638734757900238,
+      "learning_rate": 0.0002830454545454545,
+      "loss": 3.1725,
+      "step": 90750
+    },
+    {
+      "epoch": 26.441784611800337,
+      "grad_norm": 0.43601810932159424,
+      "learning_rate": 0.00028287062937062935,
+      "loss": 3.159,
+      "step": 90800
+    },
+    {
+      "epoch": 26.45634573941406,
+      "grad_norm": 0.4467451870441437,
+      "learning_rate": 0.00028269580419580415,
+      "loss": 3.1726,
+      "step": 90850
+    },
+    {
+      "epoch": 26.470906867027782,
+      "grad_norm": 0.4496610760688782,
+      "learning_rate": 0.000282520979020979,
+      "loss": 3.1601,
+      "step": 90900
+    },
+    {
+      "epoch": 26.485467994641503,
+      "grad_norm": 0.43101951479911804,
+      "learning_rate": 0.0002823461538461538,
+      "loss": 3.1734,
+      "step": 90950
+    },
+    {
+      "epoch": 26.500029122255228,
+      "grad_norm": 0.4648696482181549,
+      "learning_rate": 0.00028217132867132866,
+      "loss": 3.1767,
+      "step": 91000
+    },
+    {
+      "epoch": 26.500029122255228,
+      "eval_accuracy": 0.3745937330195509,
+      "eval_loss": 3.539630889892578,
+      "eval_runtime": 81.2454,
+      "eval_samples_per_second": 204.885,
+      "eval_steps_per_second": 12.813,
+      "step": 91000
+    },
+    {
+      "epoch": 26.51459024986895,
+      "grad_norm": 0.4609490931034088,
+      "learning_rate": 0.00028199650349650346,
+      "loss": 3.1757,
+      "step": 91050
+    },
+    {
+      "epoch": 26.529151377482673,
+      "grad_norm": 0.4083974361419678,
+      "learning_rate": 0.0002818216783216783,
+      "loss": 3.1822,
+      "step": 91100
+    },
+    {
+      "epoch": 26.543712505096394,
+      "grad_norm": 0.4239809215068817,
+      "learning_rate": 0.0002816468531468531,
+      "loss": 3.1685,
+      "step": 91150
+    },
+    {
+      "epoch": 26.55827363271012,
+      "grad_norm": 0.4246228337287903,
+      "learning_rate": 0.00028147202797202796,
+      "loss": 3.177,
+      "step": 91200
+    },
+    {
+      "epoch": 26.57283476032384,
+      "grad_norm": 0.4565730690956116,
+      "learning_rate": 0.00028129720279720276,
+      "loss": 3.191,
+      "step": 91250
+    },
+    {
+      "epoch": 26.58739588793756,
+      "grad_norm": 0.4267881512641907,
+      "learning_rate": 0.0002811223776223776,
+      "loss": 3.1774,
+      "step": 91300
+    },
+    {
+      "epoch": 26.601957015551285,
+      "grad_norm": 0.418037474155426,
+      "learning_rate": 0.0002809475524475524,
+      "loss": 3.1792,
+      "step": 91350
+    },
+    {
+      "epoch": 26.616518143165006,
+      "grad_norm": 0.4386206567287445,
+      "learning_rate": 0.00028077272727272727,
+      "loss": 3.1748,
+      "step": 91400
+    },
+    {
+      "epoch": 26.63107927077873,
+      "grad_norm": 0.47127339243888855,
+      "learning_rate": 0.00028059790209790207,
+      "loss": 3.1774,
+      "step": 91450
+    },
+    {
+      "epoch": 26.64564039839245,
+      "grad_norm": 0.43801349401474,
+      "learning_rate": 0.00028042307692307687,
+      "loss": 3.1796,
+      "step": 91500
+    },
+    {
+      "epoch": 26.660201526006173,
+      "grad_norm": 0.4200217127799988,
+      "learning_rate": 0.0002802482517482517,
+      "loss": 3.1718,
+      "step": 91550
+    },
+    {
+      "epoch": 26.674762653619897,
+      "grad_norm": 0.4898620545864105,
+      "learning_rate": 0.0002800734265734265,
+      "loss": 3.1915,
+      "step": 91600
+    },
+    {
+      "epoch": 26.689323781233618,
+      "grad_norm": 0.4548785090446472,
+      "learning_rate": 0.0002798986013986014,
+      "loss": 3.1842,
+      "step": 91650
+    },
+    {
+      "epoch": 26.703884908847343,
+      "grad_norm": 0.42794618010520935,
+      "learning_rate": 0.0002797237762237762,
+      "loss": 3.1773,
+      "step": 91700
+    },
+    {
+      "epoch": 26.718446036461064,
+      "grad_norm": 0.4374929368495941,
+      "learning_rate": 0.00027954895104895103,
+      "loss": 3.1894,
+      "step": 91750
+    },
+    {
+      "epoch": 26.733007164074785,
+      "grad_norm": 0.46282780170440674,
+      "learning_rate": 0.0002793741258741259,
+      "loss": 3.1916,
+      "step": 91800
+    },
+    {
+      "epoch": 26.74756829168851,
+      "grad_norm": 0.413688987493515,
+      "learning_rate": 0.0002791993006993007,
+      "loss": 3.195,
+      "step": 91850
+    },
+    {
+      "epoch": 26.76212941930223,
+      "grad_norm": 0.43770772218704224,
+      "learning_rate": 0.0002790244755244755,
+      "loss": 3.183,
+      "step": 91900
+    },
+    {
+      "epoch": 26.776690546915955,
+      "grad_norm": 0.43841320276260376,
+      "learning_rate": 0.00027884965034965034,
+      "loss": 3.1939,
+      "step": 91950
+    },
+    {
+      "epoch": 26.791251674529676,
+      "grad_norm": 0.4358888566493988,
+      "learning_rate": 0.00027867482517482514,
+      "loss": 3.1893,
+      "step": 92000
+    },
+    {
+      "epoch": 26.791251674529676,
+      "eval_accuracy": 0.37525760906341865,
+      "eval_loss": 3.532517194747925,
+      "eval_runtime": 80.5056,
+      "eval_samples_per_second": 206.768,
+      "eval_steps_per_second": 12.931,
+      "step": 92000
+    },
+    {
+      "epoch": 26.805812802143397,
+      "grad_norm": 0.43341144919395447,
+      "learning_rate": 0.0002785,
+      "loss": 3.1936,
+      "step": 92050
+    },
+    {
+      "epoch": 26.82037392975712,
+      "grad_norm": 0.41889843344688416,
+      "learning_rate": 0.0002783251748251748,
+      "loss": 3.1841,
+      "step": 92100
+    },
+    {
+      "epoch": 26.834935057370842,
+      "grad_norm": 0.39708298444747925,
+      "learning_rate": 0.00027815034965034965,
+      "loss": 3.1769,
+      "step": 92150
+    },
+    {
+      "epoch": 26.849496184984567,
+      "grad_norm": 0.41780245304107666,
+      "learning_rate": 0.00027797552447552445,
+      "loss": 3.1899,
+      "step": 92200
+    },
+    {
+      "epoch": 26.864057312598288,
+      "grad_norm": 0.42979490756988525,
+      "learning_rate": 0.00027780069930069925,
+      "loss": 3.1975,
+      "step": 92250
+    },
+    {
+      "epoch": 26.87861844021201,
+      "grad_norm": 0.44316792488098145,
+      "learning_rate": 0.0002776258741258741,
+      "loss": 3.1838,
+      "step": 92300
+    },
+    {
+      "epoch": 26.893179567825733,
+      "grad_norm": 0.4461120367050171,
+      "learning_rate": 0.0002774510489510489,
+      "loss": 3.1812,
+      "step": 92350
+    },
+    {
+      "epoch": 26.907740695439454,
+      "grad_norm": 0.4495375156402588,
+      "learning_rate": 0.00027727622377622375,
+      "loss": 3.1994,
+      "step": 92400
+    },
+    {
+      "epoch": 26.92230182305318,
+      "grad_norm": 0.43633168935775757,
+      "learning_rate": 0.00027710139860139855,
+      "loss": 3.1962,
+      "step": 92450
+    },
+    {
+      "epoch": 26.9368629506669,
+      "grad_norm": 0.4445933401584625,
+      "learning_rate": 0.0002769265734265734,
+      "loss": 3.1962,
+      "step": 92500
+    },
+    {
+      "epoch": 26.95142407828062,
+      "grad_norm": 0.4221359193325043,
+      "learning_rate": 0.00027675174825174826,
+      "loss": 3.1908,
+      "step": 92550
+    },
+    {
+      "epoch": 26.965985205894345,
+      "grad_norm": 0.4284641146659851,
+      "learning_rate": 0.00027657692307692306,
+      "loss": 3.2026,
+      "step": 92600
+    },
+    {
+      "epoch": 26.980546333508066,
+      "grad_norm": 0.428451806306839,
+      "learning_rate": 0.00027640209790209786,
+      "loss": 3.2005,
+      "step": 92650
+    },
+    {
+      "epoch": 26.99510746112179,
+      "grad_norm": 0.4614730477333069,
+      "learning_rate": 0.0002762272727272727,
+      "loss": 3.1913,
+      "step": 92700
+    },
+    {
+      "epoch": 27.009610344225056,
+      "grad_norm": 0.43082916736602783,
+      "learning_rate": 0.0002760524475524475,
+      "loss": 3.141,
+      "step": 92750
+    },
+    {
+      "epoch": 27.02417147183878,
+      "grad_norm": 0.4590015113353729,
+      "learning_rate": 0.00027587762237762237,
+      "loss": 3.0963,
+      "step": 92800
+    },
+    {
+      "epoch": 27.0387325994525,
+      "grad_norm": 0.4303778111934662,
+      "learning_rate": 0.00027570279720279717,
+      "loss": 3.1013,
+      "step": 92850
+    },
+    {
+      "epoch": 27.053293727066222,
+      "grad_norm": 0.47176748514175415,
+      "learning_rate": 0.000275527972027972,
+      "loss": 3.1143,
+      "step": 92900
+    },
+    {
+      "epoch": 27.067854854679947,
+      "grad_norm": 0.4238791763782501,
+      "learning_rate": 0.0002753531468531468,
+      "loss": 3.1279,
+      "step": 92950
+    },
+    {
+      "epoch": 27.082415982293668,
+      "grad_norm": 0.46568360924720764,
+      "learning_rate": 0.0002751783216783216,
+      "loss": 3.1194,
+      "step": 93000
+    },
+    {
+      "epoch": 27.082415982293668,
+      "eval_accuracy": 0.3744267941170731,
+      "eval_loss": 3.5482656955718994,
+      "eval_runtime": 80.1211,
+      "eval_samples_per_second": 207.761,
+      "eval_steps_per_second": 12.993,
+      "step": 93000
+    },
+    {
+      "epoch": 27.096977109907392,
+      "grad_norm": 0.4266977608203888,
+      "learning_rate": 0.0002750034965034965,
+      "loss": 3.1087,
+      "step": 93050
+    },
+    {
+      "epoch": 27.111538237521113,
+      "grad_norm": 0.4253590404987335,
+      "learning_rate": 0.0002748286713286713,
+      "loss": 3.1223,
+      "step": 93100
+    },
+    {
+      "epoch": 27.126099365134834,
+      "grad_norm": 0.47660887241363525,
+      "learning_rate": 0.00027465384615384613,
+      "loss": 3.1237,
+      "step": 93150
+    },
+    {
+      "epoch": 27.14066049274856,
+      "grad_norm": 0.4275610148906708,
+      "learning_rate": 0.000274479020979021,
+      "loss": 3.1237,
+      "step": 93200
+    },
+    {
+      "epoch": 27.15522162036228,
+      "grad_norm": 0.4558410346508026,
+      "learning_rate": 0.0002743041958041958,
+      "loss": 3.1228,
+      "step": 93250
+    },
+    {
+      "epoch": 27.169782747976004,
+      "grad_norm": 0.4413674473762512,
+      "learning_rate": 0.00027412937062937064,
+      "loss": 3.1332,
+      "step": 93300
+    },
+    {
+      "epoch": 27.184343875589725,
+      "grad_norm": 0.43724343180656433,
+      "learning_rate": 0.00027395454545454544,
+      "loss": 3.1409,
+      "step": 93350
+    },
+    {
+      "epoch": 27.19890500320345,
+      "grad_norm": 0.4715915024280548,
+      "learning_rate": 0.00027377972027972024,
+      "loss": 3.1249,
+      "step": 93400
+    },
+    {
+      "epoch": 27.21346613081717,
+      "grad_norm": 0.43383172154426575,
+      "learning_rate": 0.0002736048951048951,
+      "loss": 3.1361,
+      "step": 93450
+    },
+    {
+      "epoch": 27.228027258430892,
+      "grad_norm": 0.4315720200538635,
+      "learning_rate": 0.0002734300699300699,
+      "loss": 3.1299,
+      "step": 93500
+    },
+    {
+      "epoch": 27.242588386044616,
+      "grad_norm": 0.47162047028541565,
+      "learning_rate": 0.00027325524475524474,
+      "loss": 3.1471,
+      "step": 93550
+    },
+    {
+      "epoch": 27.257149513658337,
+      "grad_norm": 0.44325777888298035,
+      "learning_rate": 0.00027308041958041954,
+      "loss": 3.1333,
+      "step": 93600
+    },
+    {
+      "epoch": 27.271710641272062,
+      "grad_norm": 0.43961724638938904,
+      "learning_rate": 0.0002729055944055944,
+      "loss": 3.1455,
+      "step": 93650
+    },
+    {
+      "epoch": 27.286271768885783,
+      "grad_norm": 0.4498037099838257,
+      "learning_rate": 0.0002727307692307692,
+      "loss": 3.1441,
+      "step": 93700
+    },
+    {
+      "epoch": 27.300832896499504,
+      "grad_norm": 0.43977901339530945,
+      "learning_rate": 0.000272555944055944,
+      "loss": 3.1543,
+      "step": 93750
+    },
+    {
+      "epoch": 27.31539402411323,
+      "grad_norm": 0.44024720788002014,
+      "learning_rate": 0.00027238111888111885,
+      "loss": 3.1355,
+      "step": 93800
+    },
+    {
+      "epoch": 27.32995515172695,
+      "grad_norm": 0.4535662531852722,
+      "learning_rate": 0.0002722062937062937,
+      "loss": 3.1427,
+      "step": 93850
+    },
+    {
+      "epoch": 27.344516279340674,
+      "grad_norm": 0.46223586797714233,
+      "learning_rate": 0.0002720314685314685,
+      "loss": 3.1344,
+      "step": 93900
+    },
+    {
+      "epoch": 27.359077406954395,
+      "grad_norm": 0.44665613770484924,
+      "learning_rate": 0.00027185664335664336,
+      "loss": 3.1499,
+      "step": 93950
+    },
+    {
+      "epoch": 27.373638534568116,
+      "grad_norm": 0.41817039251327515,
+      "learning_rate": 0.00027168181818181816,
+      "loss": 3.146,
+      "step": 94000
+    },
+    {
+      "epoch": 27.373638534568116,
+      "eval_accuracy": 0.3747087092495673,
+      "eval_loss": 3.545140027999878,
+      "eval_runtime": 80.3242,
+      "eval_samples_per_second": 207.235,
+      "eval_steps_per_second": 12.96,
+      "step": 94000
+    },
+    {
+      "epoch": 27.38819966218184,
+      "grad_norm": 0.4406947195529938,
+      "learning_rate": 0.000271506993006993,
+      "loss": 3.1496,
+      "step": 94050
+    },
+    {
+      "epoch": 27.40276078979556,
+      "grad_norm": 0.44461363554000854,
+      "learning_rate": 0.0002713321678321678,
+      "loss": 3.1488,
+      "step": 94100
+    },
+    {
+      "epoch": 27.417321917409286,
+      "grad_norm": 0.437137246131897,
+      "learning_rate": 0.0002711573426573426,
+      "loss": 3.1507,
+      "step": 94150
+    },
+    {
+      "epoch": 27.431883045023007,
+      "grad_norm": 0.43815845251083374,
+      "learning_rate": 0.00027098251748251746,
+      "loss": 3.1596,
+      "step": 94200
+    },
+    {
+      "epoch": 27.446444172636728,
+      "grad_norm": 0.44346702098846436,
+      "learning_rate": 0.00027080769230769226,
+      "loss": 3.1428,
+      "step": 94250
+    },
+    {
+      "epoch": 27.461005300250452,
+      "grad_norm": 0.45342469215393066,
+      "learning_rate": 0.0002706328671328671,
+      "loss": 3.1459,
+      "step": 94300
+    },
+    {
+      "epoch": 27.475566427864173,
+      "grad_norm": 0.431231826543808,
+      "learning_rate": 0.0002704580419580419,
+      "loss": 3.1567,
+      "step": 94350
+    },
+    {
+      "epoch": 27.490127555477898,
+      "grad_norm": 0.48505812883377075,
+      "learning_rate": 0.00027028321678321677,
+      "loss": 3.1563,
+      "step": 94400
+    },
+    {
+      "epoch": 27.50468868309162,
+      "grad_norm": 0.4472305178642273,
+      "learning_rate": 0.00027010839160839157,
+      "loss": 3.1576,
+      "step": 94450
+    },
+    {
+      "epoch": 27.51924981070534,
+      "grad_norm": 0.43050718307495117,
+      "learning_rate": 0.00026993356643356637,
+      "loss": 3.1602,
+      "step": 94500
+    },
+    {
+      "epoch": 27.533810938319064,
+      "grad_norm": 0.4264449179172516,
+      "learning_rate": 0.0002697587412587412,
+      "loss": 3.1585,
+      "step": 94550
+    },
+    {
+      "epoch": 27.548372065932785,
+      "grad_norm": 0.4252346158027649,
+      "learning_rate": 0.0002695839160839161,
+      "loss": 3.1651,
+      "step": 94600
+    },
+    {
+      "epoch": 27.56293319354651,
+      "grad_norm": 0.4452660381793976,
+      "learning_rate": 0.0002694090909090909,
+      "loss": 3.1683,
+      "step": 94650
+    },
+    {
+      "epoch": 27.57749432116023,
+      "grad_norm": 0.4676416516304016,
+      "learning_rate": 0.00026923426573426573,
+      "loss": 3.1722,
+      "step": 94700
+    },
+    {
+      "epoch": 27.59205544877395,
+      "grad_norm": 0.4459725022315979,
+      "learning_rate": 0.00026905944055944053,
+      "loss": 3.1786,
+      "step": 94750
+    },
+    {
+      "epoch": 27.606616576387676,
+      "grad_norm": 0.43419715762138367,
+      "learning_rate": 0.0002688846153846154,
+      "loss": 3.1628,
+      "step": 94800
+    },
+    {
+      "epoch": 27.621177704001397,
+      "grad_norm": 0.4442163109779358,
+      "learning_rate": 0.0002687097902097902,
+      "loss": 3.1747,
+      "step": 94850
+    },
+    {
+      "epoch": 27.63573883161512,
+      "grad_norm": 0.4572891592979431,
+      "learning_rate": 0.000268534965034965,
+      "loss": 3.1667,
+      "step": 94900
+    },
+    {
+      "epoch": 27.650299959228843,
+      "grad_norm": 0.4328926205635071,
+      "learning_rate": 0.00026836013986013984,
+      "loss": 3.1682,
+      "step": 94950
+    },
+    {
+      "epoch": 27.664861086842564,
+      "grad_norm": 0.4510846734046936,
+      "learning_rate": 0.00026818531468531464,
+      "loss": 3.1737,
+      "step": 95000
+    },
+    {
+      "epoch": 27.664861086842564,
+      "eval_accuracy": 0.3751563876584656,
+      "eval_loss": 3.538522720336914,
+      "eval_runtime": 80.1488,
+      "eval_samples_per_second": 207.689,
+      "eval_steps_per_second": 12.988,
+      "step": 95000
+    },
+    {
+      "epoch": 27.679422214456288,
+      "grad_norm": 0.46054887771606445,
+      "learning_rate": 0.0002680104895104895,
+      "loss": 3.1805,
+      "step": 95050
+    },
+    {
+      "epoch": 27.69398334207001,
+      "grad_norm": 0.44147053360939026,
+      "learning_rate": 0.0002678356643356643,
+      "loss": 3.1851,
+      "step": 95100
+    },
+    {
+      "epoch": 27.708544469683734,
+      "grad_norm": 0.424288272857666,
+      "learning_rate": 0.00026766083916083915,
+      "loss": 3.1731,
+      "step": 95150
+    },
+    {
+      "epoch": 27.723105597297454,
+      "grad_norm": 0.4261254072189331,
+      "learning_rate": 0.00026748601398601395,
+      "loss": 3.1827,
+      "step": 95200
+    },
+    {
+      "epoch": 27.737666724911175,
+      "grad_norm": 0.4687895178794861,
+      "learning_rate": 0.0002673111888111888,
+      "loss": 3.1915,
+      "step": 95250
+    },
+    {
+      "epoch": 27.7522278525249,
+      "grad_norm": 0.4428876042366028,
+      "learning_rate": 0.0002671363636363636,
+      "loss": 3.1873,
+      "step": 95300
+    },
+    {
+      "epoch": 27.76678898013862,
+      "grad_norm": 0.4460248649120331,
+      "learning_rate": 0.00026696153846153845,
+      "loss": 3.1759,
+      "step": 95350
+    },
+    {
+      "epoch": 27.781350107752345,
+      "grad_norm": 0.45214036107063293,
+      "learning_rate": 0.00026678671328671325,
+      "loss": 3.183,
+      "step": 95400
+    },
+    {
+      "epoch": 27.795911235366066,
+      "grad_norm": 0.4358888864517212,
+      "learning_rate": 0.0002666118881118881,
+      "loss": 3.1758,
+      "step": 95450
+    },
+    {
+      "epoch": 27.810472362979787,
+      "grad_norm": 0.4265310764312744,
+      "learning_rate": 0.0002664370629370629,
+      "loss": 3.1986,
+      "step": 95500
+    },
+    {
+      "epoch": 27.825033490593512,
+      "grad_norm": 0.4481215178966522,
+      "learning_rate": 0.00026626223776223776,
+      "loss": 3.172,
+      "step": 95550
+    },
+    {
+      "epoch": 27.839594618207233,
+      "grad_norm": 0.42975902557373047,
+      "learning_rate": 0.00026608741258741256,
+      "loss": 3.1959,
+      "step": 95600
+    },
+    {
+      "epoch": 27.854155745820957,
+      "grad_norm": 0.4381426274776459,
+      "learning_rate": 0.00026591258741258736,
+      "loss": 3.1879,
+      "step": 95650
+    },
+    {
+      "epoch": 27.86871687343468,
+      "grad_norm": 0.42458629608154297,
+      "learning_rate": 0.0002657377622377622,
+      "loss": 3.1921,
+      "step": 95700
+    },
+    {
+      "epoch": 27.883278001048403,
+      "grad_norm": 0.4169227182865143,
+      "learning_rate": 0.000265562937062937,
+      "loss": 3.1862,
+      "step": 95750
+    },
+    {
+      "epoch": 27.897839128662124,
+      "grad_norm": 0.45456182956695557,
+      "learning_rate": 0.00026538811188811187,
+      "loss": 3.184,
+      "step": 95800
+    },
+    {
+      "epoch": 27.912400256275845,
+      "grad_norm": 0.4321167469024658,
+      "learning_rate": 0.00026521328671328667,
+      "loss": 3.184,
+      "step": 95850
+    },
+    {
+      "epoch": 27.92696138388957,
+      "grad_norm": 0.48365333676338196,
+      "learning_rate": 0.0002650384615384615,
+      "loss": 3.1859,
+      "step": 95900
+    },
+    {
+      "epoch": 27.94152251150329,
+      "grad_norm": 0.47256672382354736,
+      "learning_rate": 0.0002648636363636364,
+      "loss": 3.1861,
+      "step": 95950
+    },
+    {
+      "epoch": 27.956083639117015,
+      "grad_norm": 0.46820148825645447,
+      "learning_rate": 0.0002646888111888112,
+      "loss": 3.1959,
+      "step": 96000
+    },
+    {
+      "epoch": 27.956083639117015,
+      "eval_accuracy": 0.3755887829284046,
+      "eval_loss": 3.529139995574951,
+      "eval_runtime": 80.2502,
+      "eval_samples_per_second": 207.426,
+      "eval_steps_per_second": 12.972,
+      "step": 96000
+    },
+    {
+      "epoch": 27.970644766730736,
+      "grad_norm": 0.42508140206336975,
+      "learning_rate": 0.000264513986013986,
+      "loss": 3.1972,
+      "step": 96050
+    },
+    {
+      "epoch": 27.985205894344457,
+      "grad_norm": 0.41875147819519043,
+      "learning_rate": 0.00026433916083916083,
+      "loss": 3.191,
+      "step": 96100
+    },
+    {
+      "epoch": 27.99976702195818,
+      "grad_norm": 0.4364998936653137,
+      "learning_rate": 0.00026416433566433563,
+      "loss": 3.2029,
+      "step": 96150
+    },
+    {
+      "epoch": 28.014269905061447,
+      "grad_norm": 0.4501911699771881,
+      "learning_rate": 0.0002639895104895105,
+      "loss": 3.0886,
+      "step": 96200
+    },
+    {
+      "epoch": 28.02883103267517,
+      "grad_norm": 0.4514945149421692,
+      "learning_rate": 0.0002638146853146853,
+      "loss": 3.0965,
+      "step": 96250
+    },
+    {
+      "epoch": 28.043392160288892,
+      "grad_norm": 0.4345646798610687,
+      "learning_rate": 0.00026363986013986014,
+      "loss": 3.0975,
+      "step": 96300
+    },
+    {
+      "epoch": 28.057953287902617,
+      "grad_norm": 0.4820605516433716,
+      "learning_rate": 0.00026346503496503494,
+      "loss": 3.1039,
+      "step": 96350
+    },
+    {
+      "epoch": 28.072514415516338,
+      "grad_norm": 0.4498361647129059,
+      "learning_rate": 0.00026329020979020974,
+      "loss": 3.1032,
+      "step": 96400
+    },
+    {
+      "epoch": 28.08707554313006,
+      "grad_norm": 0.4363076984882355,
+      "learning_rate": 0.0002631153846153846,
+      "loss": 3.1,
+      "step": 96450
+    },
+    {
+      "epoch": 28.101636670743783,
+      "grad_norm": 0.4439297020435333,
+      "learning_rate": 0.0002629405594405594,
+      "loss": 3.1075,
+      "step": 96500
+    },
+    {
+      "epoch": 28.116197798357504,
+      "grad_norm": 0.4495980739593506,
+      "learning_rate": 0.00026276573426573424,
+      "loss": 3.1011,
+      "step": 96550
+    },
+    {
+      "epoch": 28.13075892597123,
+      "grad_norm": 0.4267682731151581,
+      "learning_rate": 0.00026259090909090904,
+      "loss": 3.1196,
+      "step": 96600
+    },
+    {
+      "epoch": 28.14532005358495,
+      "grad_norm": 0.45746278762817383,
+      "learning_rate": 0.0002624160839160839,
+      "loss": 3.1064,
+      "step": 96650
+    },
+    {
+      "epoch": 28.15988118119867,
+      "grad_norm": 0.4494858682155609,
+      "learning_rate": 0.00026224125874125875,
+      "loss": 3.1358,
+      "step": 96700
+    },
+    {
+      "epoch": 28.174442308812395,
+      "grad_norm": 0.4345605671405792,
+      "learning_rate": 0.00026206643356643355,
+      "loss": 3.1165,
+      "step": 96750
+    },
+    {
+      "epoch": 28.189003436426116,
+      "grad_norm": 0.4938182532787323,
+      "learning_rate": 0.00026189160839160835,
+      "loss": 3.1316,
+      "step": 96800
+    },
+    {
+      "epoch": 28.20356456403984,
+      "grad_norm": 0.4320889711380005,
+      "learning_rate": 0.0002617167832167832,
+      "loss": 3.1396,
+      "step": 96850
+    },
+    {
+      "epoch": 28.21812569165356,
+      "grad_norm": 0.4658863842487335,
+      "learning_rate": 0.000261541958041958,
+      "loss": 3.1285,
+      "step": 96900
+    },
+    {
+      "epoch": 28.232686819267283,
+      "grad_norm": 0.4570460319519043,
+      "learning_rate": 0.00026136713286713286,
+      "loss": 3.1254,
+      "step": 96950
+    },
+    {
+      "epoch": 28.247247946881007,
+      "grad_norm": 0.428600013256073,
+      "learning_rate": 0.00026119230769230766,
+      "loss": 3.1176,
+      "step": 97000
+    },
+    {
+      "epoch": 28.247247946881007,
+      "eval_accuracy": 0.37457974306927283,
+      "eval_loss": 3.549133539199829,
+      "eval_runtime": 80.2696,
+      "eval_samples_per_second": 207.376,
+      "eval_steps_per_second": 12.969,
+      "step": 97000
+    },
+    {
+      "epoch": 28.261809074494728,
+      "grad_norm": 0.4301176965236664,
+      "learning_rate": 0.0002610174825174825,
+      "loss": 3.1302,
+      "step": 97050
+    },
+    {
+      "epoch": 28.276370202108453,
+      "grad_norm": 0.4620283544063568,
+      "learning_rate": 0.0002608426573426573,
+      "loss": 3.1245,
+      "step": 97100
+    },
+    {
+      "epoch": 28.290931329722174,
+      "grad_norm": 0.47182488441467285,
+      "learning_rate": 0.0002606678321678321,
+      "loss": 3.1322,
+      "step": 97150
+    },
+    {
+      "epoch": 28.305492457335895,
+      "grad_norm": 0.4671400189399719,
+      "learning_rate": 0.00026049300699300696,
+      "loss": 3.1412,
+      "step": 97200
+    },
+    {
+      "epoch": 28.32005358494962,
+      "grad_norm": 0.47812679409980774,
+      "learning_rate": 0.00026031818181818176,
+      "loss": 3.1458,
+      "step": 97250
+    },
+    {
+      "epoch": 28.33461471256334,
+      "grad_norm": 0.4420274794101715,
+      "learning_rate": 0.0002601433566433566,
+      "loss": 3.1302,
+      "step": 97300
+    },
+    {
+      "epoch": 28.349175840177065,
+      "grad_norm": 0.44124430418014526,
+      "learning_rate": 0.00025996853146853147,
+      "loss": 3.1475,
+      "step": 97350
+    },
+    {
+      "epoch": 28.363736967790786,
+      "grad_norm": 0.44124090671539307,
+      "learning_rate": 0.00025979370629370627,
+      "loss": 3.14,
+      "step": 97400
+    },
+    {
+      "epoch": 28.378298095404507,
+      "grad_norm": 0.4414704740047455,
+      "learning_rate": 0.0002596188811188811,
+      "loss": 3.1497,
+      "step": 97450
+    },
+    {
+      "epoch": 28.39285922301823,
+      "grad_norm": 0.46400997042655945,
+      "learning_rate": 0.0002594440559440559,
+      "loss": 3.1499,
+      "step": 97500
+    },
+    {
+      "epoch": 28.407420350631952,
+      "grad_norm": 0.4541986286640167,
+      "learning_rate": 0.0002592692307692307,
+      "loss": 3.1394,
+      "step": 97550
+    },
+    {
+      "epoch": 28.421981478245677,
+      "grad_norm": 0.441388338804245,
+      "learning_rate": 0.0002590944055944056,
+      "loss": 3.146,
+      "step": 97600
+    },
+    {
+      "epoch": 28.436542605859398,
+      "grad_norm": 0.44498082995414734,
+      "learning_rate": 0.0002589195804195804,
+      "loss": 3.1438,
+      "step": 97650
+    },
+    {
+      "epoch": 28.45110373347312,
+      "grad_norm": 0.4672899842262268,
+      "learning_rate": 0.00025874475524475523,
+      "loss": 3.1599,
+      "step": 97700
+    },
+    {
+      "epoch": 28.465664861086843,
+      "grad_norm": 0.4834342300891876,
+      "learning_rate": 0.00025856993006993003,
+      "loss": 3.1441,
+      "step": 97750
+    },
+    {
+      "epoch": 28.480225988700564,
+      "grad_norm": 0.47349074482917786,
+      "learning_rate": 0.0002583951048951049,
+      "loss": 3.1528,
+      "step": 97800
+    },
+    {
+      "epoch": 28.49478711631429,
+      "grad_norm": 0.4718509614467621,
+      "learning_rate": 0.0002582202797202797,
+      "loss": 3.1566,
+      "step": 97850
+    },
+    {
+      "epoch": 28.50934824392801,
+      "grad_norm": 0.4297626316547394,
+      "learning_rate": 0.0002580454545454545,
+      "loss": 3.1597,
+      "step": 97900
+    },
+    {
+      "epoch": 28.523909371541734,
+      "grad_norm": 0.44546571373939514,
+      "learning_rate": 0.00025787062937062934,
+      "loss": 3.1509,
+      "step": 97950
+    },
+    {
+      "epoch": 28.538470499155455,
+      "grad_norm": 0.4164583683013916,
+      "learning_rate": 0.0002576958041958042,
+      "loss": 3.1518,
+      "step": 98000
+    },
+    {
+      "epoch": 28.538470499155455,
+      "eval_accuracy": 0.3749666416101563,
+      "eval_loss": 3.5419790744781494,
+      "eval_runtime": 80.1957,
+      "eval_samples_per_second": 207.567,
+      "eval_steps_per_second": 12.981,
+      "step": 98000
+    },
+    {
+      "epoch": 28.553031626769176,
+      "grad_norm": 0.4906649589538574,
+      "learning_rate": 0.000257520979020979,
+      "loss": 3.1513,
+      "step": 98050
+    },
+    {
+      "epoch": 28.5675927543829,
+      "grad_norm": 0.44152840971946716,
+      "learning_rate": 0.00025734615384615385,
+      "loss": 3.1588,
+      "step": 98100
+    },
+    {
+      "epoch": 28.58215388199662,
+      "grad_norm": 0.4337315261363983,
+      "learning_rate": 0.00025717132867132865,
+      "loss": 3.1519,
+      "step": 98150
+    },
+    {
+      "epoch": 28.596715009610342,
+      "grad_norm": 0.47403421998023987,
+      "learning_rate": 0.0002569965034965035,
+      "loss": 3.1559,
+      "step": 98200
+    },
+    {
+      "epoch": 28.611276137224067,
+      "grad_norm": 0.4357426166534424,
+      "learning_rate": 0.0002568216783216783,
+      "loss": 3.1732,
+      "step": 98250
+    },
+    {
+      "epoch": 28.625837264837788,
+      "grad_norm": 0.46377402544021606,
+      "learning_rate": 0.0002566468531468531,
+      "loss": 3.1715,
+      "step": 98300
+    },
+    {
+      "epoch": 28.640398392451512,
+      "grad_norm": 0.46978867053985596,
+      "learning_rate": 0.00025647202797202795,
+      "loss": 3.179,
+      "step": 98350
+    },
+    {
+      "epoch": 28.654959520065233,
+      "grad_norm": 0.434087872505188,
+      "learning_rate": 0.00025629720279720275,
+      "loss": 3.1732,
+      "step": 98400
+    },
+    {
+      "epoch": 28.669520647678958,
+      "grad_norm": 0.4620792865753174,
+      "learning_rate": 0.0002561223776223776,
+      "loss": 3.1526,
+      "step": 98450
+    },
+    {
+      "epoch": 28.68408177529268,
+      "grad_norm": 0.4453633725643158,
+      "learning_rate": 0.0002559475524475524,
+      "loss": 3.1641,
+      "step": 98500
+    },
+    {
+      "epoch": 28.6986429029064,
+      "grad_norm": 0.4334501624107361,
+      "learning_rate": 0.00025577272727272726,
+      "loss": 3.1682,
+      "step": 98550
+    },
+    {
+      "epoch": 28.713204030520124,
+      "grad_norm": 0.4319615662097931,
+      "learning_rate": 0.00025559790209790206,
+      "loss": 3.1638,
+      "step": 98600
+    },
+    {
+      "epoch": 28.727765158133845,
+      "grad_norm": 0.4342224895954132,
+      "learning_rate": 0.00025542307692307686,
+      "loss": 3.177,
+      "step": 98650
+    },
+    {
+      "epoch": 28.74232628574757,
+      "grad_norm": 0.46854275465011597,
+      "learning_rate": 0.00025524825174825177,
+      "loss": 3.1656,
+      "step": 98700
+    },
+    {
+      "epoch": 28.75688741336129,
+      "grad_norm": 0.4229051172733307,
+      "learning_rate": 0.00025507342657342657,
+      "loss": 3.1708,
+      "step": 98750
+    },
+    {
+      "epoch": 28.771448540975012,
+      "grad_norm": 0.4276522099971771,
+      "learning_rate": 0.00025489860139860137,
+      "loss": 3.1722,
+      "step": 98800
+    },
+    {
+      "epoch": 28.786009668588736,
+      "grad_norm": 0.43881532549858093,
+      "learning_rate": 0.0002547237762237762,
+      "loss": 3.1635,
+      "step": 98850
+    },
+    {
+      "epoch": 28.800570796202457,
+      "grad_norm": 0.44678550958633423,
+      "learning_rate": 0.000254548951048951,
+      "loss": 3.183,
+      "step": 98900
+    },
+    {
+      "epoch": 28.815131923816182,
+      "grad_norm": 0.43881022930145264,
+      "learning_rate": 0.0002543741258741259,
+      "loss": 3.1755,
+      "step": 98950
+    },
+    {
+      "epoch": 28.829693051429903,
+      "grad_norm": 0.45341137051582336,
+      "learning_rate": 0.0002541993006993007,
+      "loss": 3.1707,
+      "step": 99000
+    },
+    {
+      "epoch": 28.829693051429903,
+      "eval_accuracy": 0.37557420516508966,
+      "eval_loss": 3.5332367420196533,
+      "eval_runtime": 80.287,
+      "eval_samples_per_second": 207.331,
+      "eval_steps_per_second": 12.966,
+      "step": 99000
+    },
+    {
+      "epoch": 28.844254179043624,
+      "grad_norm": 0.4361846446990967,
+      "learning_rate": 0.0002540244755244755,
+      "loss": 3.1643,
+      "step": 99050
+    },
+    {
+      "epoch": 28.85881530665735,
+      "grad_norm": 0.44548898935317993,
+      "learning_rate": 0.00025384965034965033,
+      "loss": 3.1791,
+      "step": 99100
+    },
+    {
+      "epoch": 28.87337643427107,
+      "grad_norm": 0.45371025800704956,
+      "learning_rate": 0.00025367482517482513,
+      "loss": 3.1785,
+      "step": 99150
+    },
+    {
+      "epoch": 28.887937561884794,
+      "grad_norm": 0.4528813660144806,
+      "learning_rate": 0.0002535,
+      "loss": 3.1653,
+      "step": 99200
+    },
+    {
+      "epoch": 28.902498689498515,
+      "grad_norm": 0.464300274848938,
+      "learning_rate": 0.0002533251748251748,
+      "loss": 3.1823,
+      "step": 99250
+    },
+    {
+      "epoch": 28.917059817112236,
+      "grad_norm": 0.4450153410434723,
+      "learning_rate": 0.00025315034965034964,
+      "loss": 3.1798,
+      "step": 99300
+    },
+    {
+      "epoch": 28.93162094472596,
+      "grad_norm": 0.43621668219566345,
+      "learning_rate": 0.00025297552447552444,
+      "loss": 3.18,
+      "step": 99350
+    },
+    {
+      "epoch": 28.94618207233968,
+      "grad_norm": 0.43210044503211975,
+      "learning_rate": 0.0002528006993006993,
+      "loss": 3.178,
+      "step": 99400
+    },
+    {
+      "epoch": 28.960743199953406,
+      "grad_norm": 0.44858497381210327,
+      "learning_rate": 0.00025262587412587414,
+      "loss": 3.1737,
+      "step": 99450
+    },
+    {
+      "epoch": 28.975304327567127,
+      "grad_norm": 0.42016497254371643,
+      "learning_rate": 0.00025245104895104894,
+      "loss": 3.1891,
+      "step": 99500
+    },
+    {
+      "epoch": 28.989865455180848,
+      "grad_norm": 0.457013875246048,
+      "learning_rate": 0.00025227622377622374,
+      "loss": 3.1854,
+      "step": 99550
+    },
+    {
+      "epoch": 29.004368338284117,
+      "grad_norm": 0.4366385042667389,
+      "learning_rate": 0.0002521013986013986,
+      "loss": 3.156,
+      "step": 99600
+    },
+    {
+      "epoch": 29.018929465897838,
+      "grad_norm": 0.4410390853881836,
+      "learning_rate": 0.0002519265734265734,
+      "loss": 3.0916,
+      "step": 99650
+    },
+    {
+      "epoch": 29.033490593511562,
+      "grad_norm": 0.4760029911994934,
+      "learning_rate": 0.00025175174825174825,
+      "loss": 3.0856,
+      "step": 99700
+    },
+    {
+      "epoch": 29.048051721125283,
+      "grad_norm": 0.4599713981151581,
+      "learning_rate": 0.00025157692307692305,
+      "loss": 3.0921,
+      "step": 99750
+    },
+    {
+      "epoch": 29.062612848739008,
+      "grad_norm": 0.4557690918445587,
+      "learning_rate": 0.0002514020979020979,
+      "loss": 3.0904,
+      "step": 99800
+    },
+    {
+      "epoch": 29.07717397635273,
+      "grad_norm": 0.46123120188713074,
+      "learning_rate": 0.0002512272727272727,
+      "loss": 3.0978,
+      "step": 99850
+    },
+    {
+      "epoch": 29.09173510396645,
+      "grad_norm": 0.47671201825141907,
+      "learning_rate": 0.0002510524475524475,
+      "loss": 3.1055,
+      "step": 99900
+    },
+    {
+      "epoch": 29.106296231580174,
+      "grad_norm": 0.465494841337204,
+      "learning_rate": 0.00025087762237762236,
+      "loss": 3.1087,
+      "step": 99950
+    },
+    {
+      "epoch": 29.120857359193895,
+      "grad_norm": 0.4717271029949188,
+      "learning_rate": 0.00025070279720279716,
+      "loss": 3.1001,
+      "step": 100000
+    },
+    {
+      "epoch": 29.120857359193895,
+      "eval_accuracy": 0.3747177615703355,
+      "eval_loss": 3.5496444702148438,
+      "eval_runtime": 80.0651,
+      "eval_samples_per_second": 207.906,
+      "eval_steps_per_second": 13.002,
+      "step": 100000
+    },
+    {
+      "epoch": 29.13541848680762,
+      "grad_norm": 0.4883511960506439,
+      "learning_rate": 0.000250527972027972,
+      "loss": 3.0934,
+      "step": 100050
+    },
+    {
+      "epoch": 29.14997961442134,
+      "grad_norm": 0.46403738856315613,
+      "learning_rate": 0.00025035314685314686,
+      "loss": 3.1031,
+      "step": 100100
+    },
+    {
+      "epoch": 29.16454074203506,
+      "grad_norm": 0.44273096323013306,
+      "learning_rate": 0.00025017832167832166,
+      "loss": 3.115,
+      "step": 100150
+    },
+    {
+      "epoch": 29.179101869648786,
+      "grad_norm": 0.44171518087387085,
+      "learning_rate": 0.0002500034965034965,
+      "loss": 3.1134,
+      "step": 100200
+    },
+    {
+      "epoch": 29.193662997262507,
+      "grad_norm": 0.4661337733268738,
+      "learning_rate": 0.0002498286713286713,
+      "loss": 3.1086,
+      "step": 100250
+    },
+    {
+      "epoch": 29.20822412487623,
+      "grad_norm": 0.4632498621940613,
+      "learning_rate": 0.0002496538461538461,
+      "loss": 3.1203,
+      "step": 100300
+    },
+    {
+      "epoch": 29.222785252489953,
+      "grad_norm": 0.44383248686790466,
+      "learning_rate": 0.00024947902097902097,
+      "loss": 3.1172,
+      "step": 100350
+    },
+    {
+      "epoch": 29.237346380103673,
+      "grad_norm": 0.4826676845550537,
+      "learning_rate": 0.00024930419580419577,
+      "loss": 3.1148,
+      "step": 100400
+    },
+    {
+      "epoch": 29.251907507717398,
+      "grad_norm": 0.4492054879665375,
+      "learning_rate": 0.0002491293706293706,
+      "loss": 3.1228,
+      "step": 100450
+    },
+    {
+      "epoch": 29.26646863533112,
+      "grad_norm": 0.4762059450149536,
+      "learning_rate": 0.0002489545454545454,
+      "loss": 3.1254,
+      "step": 100500
+    },
+    {
+      "epoch": 29.281029762944843,
+      "grad_norm": 0.44990840554237366,
+      "learning_rate": 0.0002487797202797203,
+      "loss": 3.1239,
+      "step": 100550
+    },
+    {
+      "epoch": 29.295590890558564,
+      "grad_norm": 0.4655245840549469,
+      "learning_rate": 0.0002486048951048951,
+      "loss": 3.1264,
+      "step": 100600
+    },
+    {
+      "epoch": 29.31015201817229,
+      "grad_norm": 0.4644462764263153,
+      "learning_rate": 0.0002484300699300699,
+      "loss": 3.1268,
+      "step": 100650
+    },
+    {
+      "epoch": 29.32471314578601,
+      "grad_norm": 0.4611171782016754,
+      "learning_rate": 0.00024825524475524473,
+      "loss": 3.1363,
+      "step": 100700
+    },
+    {
+      "epoch": 29.33927427339973,
+      "grad_norm": 0.4311342239379883,
+      "learning_rate": 0.00024808041958041953,
+      "loss": 3.1333,
+      "step": 100750
+    },
+    {
+      "epoch": 29.353835401013455,
+      "grad_norm": 0.4353893995285034,
+      "learning_rate": 0.0002479055944055944,
+      "loss": 3.1292,
+      "step": 100800
+    },
+    {
+      "epoch": 29.368396528627176,
+      "grad_norm": 0.44847074151039124,
+      "learning_rate": 0.00024773076923076924,
+      "loss": 3.1416,
+      "step": 100850
+    },
+    {
+      "epoch": 29.3829576562409,
+      "grad_norm": 0.47350096702575684,
+      "learning_rate": 0.00024755594405594404,
+      "loss": 3.142,
+      "step": 100900
+    },
+    {
+      "epoch": 29.397518783854622,
+      "grad_norm": 0.4682367444038391,
+      "learning_rate": 0.0002473811188811189,
+      "loss": 3.1421,
+      "step": 100950
+    },
+    {
+      "epoch": 29.412079911468343,
+      "grad_norm": 0.4489864408969879,
+      "learning_rate": 0.0002472062937062937,
+      "loss": 3.1358,
+      "step": 101000
+    },
+    {
+      "epoch": 29.412079911468343,
+      "eval_accuracy": 0.374867536332136,
+      "eval_loss": 3.5422158241271973,
+      "eval_runtime": 80.1833,
+      "eval_samples_per_second": 207.599,
+      "eval_steps_per_second": 12.983,
+      "step": 101000
+    },
+    {
+      "epoch": 29.426641039082067,
+      "grad_norm": 0.44524815678596497,
+      "learning_rate": 0.0002470314685314685,
+      "loss": 3.1352,
+      "step": 101050
+    },
+    {
+      "epoch": 29.44120216669579,
+      "grad_norm": 0.4737188518047333,
+      "learning_rate": 0.00024685664335664335,
+      "loss": 3.1558,
+      "step": 101100
+    },
+    {
+      "epoch": 29.455763294309513,
+      "grad_norm": 0.42699918150901794,
+      "learning_rate": 0.00024668181818181815,
+      "loss": 3.1368,
+      "step": 101150
+    },
+    {
+      "epoch": 29.470324421923234,
+      "grad_norm": 0.4792460501194,
+      "learning_rate": 0.000246506993006993,
+      "loss": 3.1303,
+      "step": 101200
+    },
+    {
+      "epoch": 29.484885549536955,
+      "grad_norm": 0.4665248394012451,
+      "learning_rate": 0.0002463321678321678,
+      "loss": 3.1459,
+      "step": 101250
+    },
+    {
+      "epoch": 29.49944667715068,
+      "grad_norm": 0.46637818217277527,
+      "learning_rate": 0.00024615734265734265,
+      "loss": 3.1432,
+      "step": 101300
+    },
+    {
+      "epoch": 29.5140078047644,
+      "grad_norm": 0.44114309549331665,
+      "learning_rate": 0.00024598251748251745,
+      "loss": 3.1468,
+      "step": 101350
+    },
+    {
+      "epoch": 29.528568932378125,
+      "grad_norm": 0.42972317337989807,
+      "learning_rate": 0.00024580769230769225,
+      "loss": 3.1451,
+      "step": 101400
+    },
+    {
+      "epoch": 29.543130059991846,
+      "grad_norm": 0.4702269732952118,
+      "learning_rate": 0.0002456328671328671,
+      "loss": 3.1502,
+      "step": 101450
+    },
+    {
+      "epoch": 29.557691187605567,
+      "grad_norm": 0.4438580274581909,
+      "learning_rate": 0.00024545804195804196,
+      "loss": 3.1464,
+      "step": 101500
+    },
+    {
+      "epoch": 29.57225231521929,
+      "grad_norm": 0.4511452913284302,
+      "learning_rate": 0.00024528321678321676,
+      "loss": 3.1685,
+      "step": 101550
+    },
+    {
+      "epoch": 29.586813442833012,
+      "grad_norm": 0.47536587715148926,
+      "learning_rate": 0.0002451083916083916,
+      "loss": 3.1497,
+      "step": 101600
+    },
+    {
+      "epoch": 29.601374570446737,
+      "grad_norm": 0.45455479621887207,
+      "learning_rate": 0.0002449335664335664,
+      "loss": 3.1497,
+      "step": 101650
+    },
+    {
+      "epoch": 29.615935698060458,
+      "grad_norm": 0.44872206449508667,
+      "learning_rate": 0.00024475874125874127,
+      "loss": 3.1571,
+      "step": 101700
+    },
+    {
+      "epoch": 29.63049682567418,
+      "grad_norm": 0.445295512676239,
+      "learning_rate": 0.00024458391608391607,
+      "loss": 3.1487,
+      "step": 101750
+    },
+    {
+      "epoch": 29.645057953287903,
+      "grad_norm": 0.441080778837204,
+      "learning_rate": 0.00024440909090909087,
+      "loss": 3.1633,
+      "step": 101800
+    },
+    {
+      "epoch": 29.659619080901624,
+      "grad_norm": 0.46299949288368225,
+      "learning_rate": 0.0002442342657342657,
+      "loss": 3.1565,
+      "step": 101850
+    },
+    {
+      "epoch": 29.67418020851535,
+      "grad_norm": 0.47477272152900696,
+      "learning_rate": 0.00024405944055944052,
+      "loss": 3.1519,
+      "step": 101900
+    },
+    {
+      "epoch": 29.68874133612907,
+      "grad_norm": 0.49416160583496094,
+      "learning_rate": 0.00024388461538461535,
+      "loss": 3.1549,
+      "step": 101950
+    },
+    {
+      "epoch": 29.70330246374279,
+      "grad_norm": 0.42633551359176636,
+      "learning_rate": 0.00024370979020979017,
+      "loss": 3.1647,
+      "step": 102000
+    },
+    {
+      "epoch": 29.70330246374279,
+      "eval_accuracy": 0.37523515460540935,
+      "eval_loss": 3.538114070892334,
+      "eval_runtime": 80.0602,
+      "eval_samples_per_second": 207.919,
+      "eval_steps_per_second": 13.003,
+      "step": 102000
+    },
+    {
+      "epoch": 29.717863591356515,
+      "grad_norm": 0.44595080614089966,
+      "learning_rate": 0.000243534965034965,
+      "loss": 3.1493,
+      "step": 102050
+    },
+    {
+      "epoch": 29.732424718970236,
+      "grad_norm": 0.4457579553127289,
+      "learning_rate": 0.00024336013986013983,
+      "loss": 3.1555,
+      "step": 102100
+    },
+    {
+      "epoch": 29.74698584658396,
+      "grad_norm": 0.4522918164730072,
+      "learning_rate": 0.00024318531468531468,
+      "loss": 3.1614,
+      "step": 102150
+    },
+    {
+      "epoch": 29.76154697419768,
+      "grad_norm": 0.4831245541572571,
+      "learning_rate": 0.0002430104895104895,
+      "loss": 3.1669,
+      "step": 102200
+    },
+    {
+      "epoch": 29.776108101811403,
+      "grad_norm": 0.4426896274089813,
+      "learning_rate": 0.00024283566433566434,
+      "loss": 3.1637,
+      "step": 102250
+    },
+    {
+      "epoch": 29.790669229425127,
+      "grad_norm": 0.4679453372955322,
+      "learning_rate": 0.00024266083916083916,
+      "loss": 3.143,
+      "step": 102300
+    },
+    {
+      "epoch": 29.805230357038848,
+      "grad_norm": 0.46347737312316895,
+      "learning_rate": 0.00024248601398601396,
+      "loss": 3.1639,
+      "step": 102350
+    },
+    {
+      "epoch": 29.819791484652573,
+      "grad_norm": 0.43473029136657715,
+      "learning_rate": 0.0002423111888111888,
+      "loss": 3.1701,
+      "step": 102400
+    },
+    {
+      "epoch": 29.834352612266294,
+      "grad_norm": 0.476441353559494,
+      "learning_rate": 0.00024213636363636362,
+      "loss": 3.1764,
+      "step": 102450
+    },
+    {
+      "epoch": 29.848913739880015,
+      "grad_norm": 0.4864274263381958,
+      "learning_rate": 0.00024196153846153844,
+      "loss": 3.175,
+      "step": 102500
+    },
+    {
+      "epoch": 29.86347486749374,
+      "grad_norm": 0.4513048827648163,
+      "learning_rate": 0.00024178671328671327,
+      "loss": 3.1733,
+      "step": 102550
+    },
+    {
+      "epoch": 29.87803599510746,
+      "grad_norm": 0.45398372411727905,
+      "learning_rate": 0.0002416118881118881,
+      "loss": 3.1719,
+      "step": 102600
+    },
+    {
+      "epoch": 29.892597122721185,
+      "grad_norm": 0.46702417731285095,
+      "learning_rate": 0.0002414370629370629,
+      "loss": 3.1643,
+      "step": 102650
+    },
+    {
+      "epoch": 29.907158250334906,
+      "grad_norm": 0.4604516625404358,
+      "learning_rate": 0.00024126223776223772,
+      "loss": 3.1713,
+      "step": 102700
+    },
+    {
+      "epoch": 29.921719377948627,
+      "grad_norm": 0.4356239438056946,
+      "learning_rate": 0.00024108741258741255,
+      "loss": 3.1608,
+      "step": 102750
+    },
+    {
+      "epoch": 29.93628050556235,
+      "grad_norm": 0.4850304126739502,
+      "learning_rate": 0.00024091258741258738,
+      "loss": 3.161,
+      "step": 102800
+    },
+    {
+      "epoch": 29.950841633176072,
+      "grad_norm": 0.44736698269844055,
+      "learning_rate": 0.00024073776223776223,
+      "loss": 3.1603,
+      "step": 102850
+    },
+    {
+      "epoch": 29.965402760789797,
+      "grad_norm": 0.4641413986682892,
+      "learning_rate": 0.00024056293706293706,
+      "loss": 3.1901,
+      "step": 102900
+    },
+    {
+      "epoch": 29.979963888403518,
+      "grad_norm": 0.4486852288246155,
+      "learning_rate": 0.00024038811188811188,
+      "loss": 3.1877,
+      "step": 102950
+    },
+    {
+      "epoch": 29.994525016017242,
+      "grad_norm": 0.49141523241996765,
+      "learning_rate": 0.0002402132867132867,
+      "loss": 3.1656,
+      "step": 103000
+    },
+    {
+      "epoch": 29.994525016017242,
+      "eval_accuracy": 0.37589832527363287,
+      "eval_loss": 3.530726432800293,
+      "eval_runtime": 80.1567,
+      "eval_samples_per_second": 207.668,
+      "eval_steps_per_second": 12.987,
+      "step": 103000
+    },
+    {
+      "epoch": 30.009027899120507,
+      "grad_norm": 0.45091068744659424,
+      "learning_rate": 0.00024003846153846154,
+      "loss": 3.1075,
+      "step": 103050
+    },
+    {
+      "epoch": 30.023589026734232,
+      "grad_norm": 0.49528542160987854,
+      "learning_rate": 0.00023986363636363634,
+      "loss": 3.0731,
+      "step": 103100
+    },
+    {
+      "epoch": 30.038150154347953,
+      "grad_norm": 0.4450590908527374,
+      "learning_rate": 0.00023968881118881116,
+      "loss": 3.0853,
+      "step": 103150
+    },
+    {
+      "epoch": 30.052711281961674,
+      "grad_norm": 0.47571080923080444,
+      "learning_rate": 0.000239513986013986,
+      "loss": 3.0899,
+      "step": 103200
+    },
+    {
+      "epoch": 30.0672724095754,
+      "grad_norm": 0.4562253952026367,
+      "learning_rate": 0.00023933916083916082,
+      "loss": 3.084,
+      "step": 103250
+    },
+    {
+      "epoch": 30.08183353718912,
+      "grad_norm": 0.5024160742759705,
+      "learning_rate": 0.00023916433566433564,
+      "loss": 3.086,
+      "step": 103300
+    },
+    {
+      "epoch": 30.096394664802844,
+      "grad_norm": 0.4524730145931244,
+      "learning_rate": 0.00023898951048951047,
+      "loss": 3.0895,
+      "step": 103350
+    },
+    {
+      "epoch": 30.110955792416565,
+      "grad_norm": 0.47511914372444153,
+      "learning_rate": 0.00023881468531468527,
+      "loss": 3.1031,
+      "step": 103400
+    },
+    {
+      "epoch": 30.125516920030286,
+      "grad_norm": 0.4788551330566406,
+      "learning_rate": 0.0002386398601398601,
+      "loss": 3.0969,
+      "step": 103450
+    },
+    {
+      "epoch": 30.14007804764401,
+      "grad_norm": 0.47147780656814575,
+      "learning_rate": 0.00023846503496503492,
+      "loss": 3.1024,
+      "step": 103500
+    },
+    {
+      "epoch": 30.15463917525773,
+      "grad_norm": 0.4830690324306488,
+      "learning_rate": 0.00023829020979020978,
+      "loss": 3.1028,
+      "step": 103550
+    },
+    {
+      "epoch": 30.169200302871456,
+      "grad_norm": 0.43729934096336365,
+      "learning_rate": 0.0002381153846153846,
+      "loss": 3.0977,
+      "step": 103600
+    },
+    {
+      "epoch": 30.183761430485177,
+      "grad_norm": 0.4548850655555725,
+      "learning_rate": 0.00023794055944055943,
+      "loss": 3.1001,
+      "step": 103650
+    },
+    {
+      "epoch": 30.198322558098898,
+      "grad_norm": 0.45362600684165955,
+      "learning_rate": 0.00023776573426573426,
+      "loss": 3.1122,
+      "step": 103700
+    },
+    {
+      "epoch": 30.212883685712622,
+      "grad_norm": 0.48883917927742004,
+      "learning_rate": 0.00023759090909090909,
+      "loss": 3.1036,
+      "step": 103750
+    },
+    {
+      "epoch": 30.227444813326343,
+      "grad_norm": 0.4714140295982361,
+      "learning_rate": 0.0002374160839160839,
+      "loss": 3.12,
+      "step": 103800
+    },
+    {
+      "epoch": 30.242005940940068,
+      "grad_norm": 0.45753130316734314,
+      "learning_rate": 0.0002372412587412587,
+      "loss": 3.1131,
+      "step": 103850
+    },
+    {
+      "epoch": 30.25656706855379,
+      "grad_norm": 0.4977312684059143,
+      "learning_rate": 0.00023706643356643354,
+      "loss": 3.116,
+      "step": 103900
+    },
+    {
+      "epoch": 30.27112819616751,
+      "grad_norm": 0.4684438109397888,
+      "learning_rate": 0.00023689160839160837,
+      "loss": 3.1174,
+      "step": 103950
+    },
+    {
+      "epoch": 30.285689323781234,
+      "grad_norm": 0.5155289173126221,
+      "learning_rate": 0.0002367167832167832,
+      "loss": 3.1186,
+      "step": 104000
+    },
+    {
+      "epoch": 30.285689323781234,
+      "eval_accuracy": 0.3748508424418882,
+      "eval_loss": 3.547801971435547,
+      "eval_runtime": 80.077,
+      "eval_samples_per_second": 207.875,
+      "eval_steps_per_second": 13.0,
+      "step": 104000
+    },
+    {
+      "epoch": 30.300250451394955,
+      "grad_norm": 0.46649569272994995,
+      "learning_rate": 0.00023654195804195802,
+      "loss": 3.1257,
+      "step": 104050
+    },
+    {
+      "epoch": 30.31481157900868,
+      "grad_norm": 0.4594707489013672,
+      "learning_rate": 0.00023636713286713285,
+      "loss": 3.1221,
+      "step": 104100
+    },
+    {
+      "epoch": 30.3293727066224,
+      "grad_norm": 0.46418672800064087,
+      "learning_rate": 0.00023619230769230765,
+      "loss": 3.1134,
+      "step": 104150
+    },
+    {
+      "epoch": 30.343933834236122,
+      "grad_norm": 0.4466555714607239,
+      "learning_rate": 0.00023601748251748247,
+      "loss": 3.1242,
+      "step": 104200
+    },
+    {
+      "epoch": 30.358494961849846,
+      "grad_norm": 0.43967121839523315,
+      "learning_rate": 0.00023584265734265733,
+      "loss": 3.1281,
+      "step": 104250
+    },
+    {
+      "epoch": 30.373056089463567,
+      "grad_norm": 0.4631558656692505,
+      "learning_rate": 0.00023566783216783215,
+      "loss": 3.1192,
+      "step": 104300
+    },
+    {
+      "epoch": 30.387617217077292,
+      "grad_norm": 0.4498026371002197,
+      "learning_rate": 0.00023549300699300698,
+      "loss": 3.1147,
+      "step": 104350
+    },
+    {
+      "epoch": 30.402178344691013,
+      "grad_norm": 0.4484608471393585,
+      "learning_rate": 0.0002353181818181818,
+      "loss": 3.1144,
+      "step": 104400
+    },
+    {
+      "epoch": 30.416739472304734,
+      "grad_norm": 0.4739231765270233,
+      "learning_rate": 0.00023514335664335663,
+      "loss": 3.1434,
+      "step": 104450
+    },
+    {
+      "epoch": 30.43130059991846,
+      "grad_norm": 0.46061864495277405,
+      "learning_rate": 0.00023496853146853146,
+      "loss": 3.1278,
+      "step": 104500
+    },
+    {
+      "epoch": 30.44586172753218,
+      "grad_norm": 0.46449682116508484,
+      "learning_rate": 0.0002347937062937063,
+      "loss": 3.1416,
+      "step": 104550
+    },
+    {
+      "epoch": 30.460422855145904,
+      "grad_norm": 0.48044365644454956,
+      "learning_rate": 0.0002346188811188811,
+      "loss": 3.1387,
+      "step": 104600
+    },
+    {
+      "epoch": 30.474983982759625,
+      "grad_norm": 0.4746876060962677,
+      "learning_rate": 0.0002344440559440559,
+      "loss": 3.1409,
+      "step": 104650
+    },
+    {
+      "epoch": 30.489545110373346,
+      "grad_norm": 0.4719257354736328,
+      "learning_rate": 0.00023426923076923074,
+      "loss": 3.1386,
+      "step": 104700
+    },
+    {
+      "epoch": 30.50410623798707,
+      "grad_norm": 0.4413412809371948,
+      "learning_rate": 0.00023409440559440557,
+      "loss": 3.1297,
+      "step": 104750
+    },
+    {
+      "epoch": 30.51866736560079,
+      "grad_norm": 0.4679592251777649,
+      "learning_rate": 0.0002339195804195804,
+      "loss": 3.1452,
+      "step": 104800
+    },
+    {
+      "epoch": 30.533228493214516,
+      "grad_norm": 0.4566100835800171,
+      "learning_rate": 0.00023374475524475522,
+      "loss": 3.1453,
+      "step": 104850
+    },
+    {
+      "epoch": 30.547789620828237,
+      "grad_norm": 0.47029706835746765,
+      "learning_rate": 0.00023356993006993002,
+      "loss": 3.1506,
+      "step": 104900
+    },
+    {
+      "epoch": 30.562350748441958,
+      "grad_norm": 0.4564160406589508,
+      "learning_rate": 0.0002333951048951049,
+      "loss": 3.1419,
+      "step": 104950
+    },
+    {
+      "epoch": 30.576911876055682,
+      "grad_norm": 0.4709474742412567,
+      "learning_rate": 0.0002332202797202797,
+      "loss": 3.146,
+      "step": 105000
+    },
+    {
+      "epoch": 30.576911876055682,
+      "eval_accuracy": 0.3752152865247623,
+      "eval_loss": 3.5399692058563232,
+      "eval_runtime": 80.0664,
+      "eval_samples_per_second": 207.902,
+      "eval_steps_per_second": 13.002,
+      "step": 105000
+    },
+    {
+      "epoch": 30.591473003669403,
+      "grad_norm": 0.46142178773880005,
+      "learning_rate": 0.00023304545454545453,
+      "loss": 3.1563,
+      "step": 105050
+    },
+    {
+      "epoch": 30.606034131283128,
+      "grad_norm": 0.46358320116996765,
+      "learning_rate": 0.00023287062937062935,
+      "loss": 3.1385,
+      "step": 105100
+    },
+    {
+      "epoch": 30.62059525889685,
+      "grad_norm": 0.4504646360874176,
+      "learning_rate": 0.00023269580419580418,
+      "loss": 3.1471,
+      "step": 105150
+    },
+    {
+      "epoch": 30.635156386510573,
+      "grad_norm": 0.4714089632034302,
+      "learning_rate": 0.000232520979020979,
+      "loss": 3.1404,
+      "step": 105200
+    },
+    {
+      "epoch": 30.649717514124294,
+      "grad_norm": 0.4826344847679138,
+      "learning_rate": 0.00023234615384615384,
+      "loss": 3.1385,
+      "step": 105250
+    },
+    {
+      "epoch": 30.664278641738015,
+      "grad_norm": 0.4676797688007355,
+      "learning_rate": 0.00023217132867132866,
+      "loss": 3.158,
+      "step": 105300
+    },
+    {
+      "epoch": 30.67883976935174,
+      "grad_norm": 0.4501577317714691,
+      "learning_rate": 0.00023199650349650346,
+      "loss": 3.1386,
+      "step": 105350
+    },
+    {
+      "epoch": 30.69340089696546,
+      "grad_norm": 0.4723682701587677,
+      "learning_rate": 0.0002318216783216783,
+      "loss": 3.15,
+      "step": 105400
+    },
+    {
+      "epoch": 30.707962024579185,
+      "grad_norm": 0.48127326369285583,
+      "learning_rate": 0.00023164685314685312,
+      "loss": 3.1437,
+      "step": 105450
+    },
+    {
+      "epoch": 30.722523152192906,
+      "grad_norm": 0.4738757908344269,
+      "learning_rate": 0.00023147202797202794,
+      "loss": 3.144,
+      "step": 105500
+    },
+    {
+      "epoch": 30.737084279806627,
+      "grad_norm": 0.47248905897140503,
+      "learning_rate": 0.00023129720279720277,
+      "loss": 3.1454,
+      "step": 105550
+    },
+    {
+      "epoch": 30.75164540742035,
+      "grad_norm": 0.4754147231578827,
+      "learning_rate": 0.0002311223776223776,
+      "loss": 3.1639,
+      "step": 105600
+    },
+    {
+      "epoch": 30.766206535034073,
+      "grad_norm": 0.44767364859580994,
+      "learning_rate": 0.00023094755244755245,
+      "loss": 3.1555,
+      "step": 105650
+    },
+    {
+      "epoch": 30.780767662647797,
+      "grad_norm": 0.46636345982551575,
+      "learning_rate": 0.00023077272727272728,
+      "loss": 3.1481,
+      "step": 105700
+    },
+    {
+      "epoch": 30.795328790261518,
+      "grad_norm": 0.4550693929195404,
+      "learning_rate": 0.00023059790209790208,
+      "loss": 3.1523,
+      "step": 105750
+    },
+    {
+      "epoch": 30.80988991787524,
+      "grad_norm": 0.4555010199546814,
+      "learning_rate": 0.0002304230769230769,
+      "loss": 3.1685,
+      "step": 105800
+    },
+    {
+      "epoch": 30.824451045488964,
+      "grad_norm": 0.45012399554252625,
+      "learning_rate": 0.00023024825174825173,
+      "loss": 3.1568,
+      "step": 105850
+    },
+    {
+      "epoch": 30.839012173102684,
+      "grad_norm": 0.45288777351379395,
+      "learning_rate": 0.00023007342657342656,
+      "loss": 3.1639,
+      "step": 105900
+    },
+    {
+      "epoch": 30.85357330071641,
+      "grad_norm": 0.4628019332885742,
+      "learning_rate": 0.00022989860139860138,
+      "loss": 3.1595,
+      "step": 105950
+    },
+    {
+      "epoch": 30.86813442833013,
+      "grad_norm": 0.46279942989349365,
+      "learning_rate": 0.0002297237762237762,
+      "loss": 3.1564,
+      "step": 106000
+    },
+    {
+      "epoch": 30.86813442833013,
+      "eval_accuracy": 0.375562213779137,
+      "eval_loss": 3.534041166305542,
+      "eval_runtime": 80.0374,
+      "eval_samples_per_second": 207.978,
+      "eval_steps_per_second": 13.006,
+      "step": 106000
+    },
+    {
+      "epoch": 30.88269555594385,
+      "grad_norm": 0.5104995369911194,
+      "learning_rate": 0.00022954895104895104,
+      "loss": 3.1669,
+      "step": 106050
+    },
+    {
+      "epoch": 30.897256683557575,
+      "grad_norm": 0.4528321921825409,
+      "learning_rate": 0.00022937412587412584,
+      "loss": 3.1579,
+      "step": 106100
+    },
+    {
+      "epoch": 30.911817811171296,
+      "grad_norm": 0.5047301650047302,
+      "learning_rate": 0.00022919930069930066,
+      "loss": 3.1495,
+      "step": 106150
+    },
+    {
+      "epoch": 30.92637893878502,
+      "grad_norm": 0.4677218496799469,
+      "learning_rate": 0.0002290244755244755,
+      "loss": 3.1518,
+      "step": 106200
+    },
+    {
+      "epoch": 30.940940066398742,
+      "grad_norm": 0.4636547267436981,
+      "learning_rate": 0.00022884965034965032,
+      "loss": 3.1625,
+      "step": 106250
+    },
+    {
+      "epoch": 30.955501194012463,
+      "grad_norm": 0.4806773066520691,
+      "learning_rate": 0.00022867482517482517,
+      "loss": 3.1572,
+      "step": 106300
+    },
+    {
+      "epoch": 30.970062321626187,
+      "grad_norm": 0.446891188621521,
+      "learning_rate": 0.0002285,
+      "loss": 3.174,
+      "step": 106350
+    },
+    {
+      "epoch": 30.98462344923991,
+      "grad_norm": 0.4715198874473572,
+      "learning_rate": 0.00022832517482517482,
+      "loss": 3.1645,
+      "step": 106400
+    },
+    {
+      "epoch": 30.999184576853633,
+      "grad_norm": 0.47789454460144043,
+      "learning_rate": 0.00022815034965034965,
+      "loss": 3.1705,
+      "step": 106450
+    },
+    {
+      "epoch": 31.0136874599569,
+      "grad_norm": 0.47417598962783813,
+      "learning_rate": 0.00022797552447552445,
+      "loss": 3.0832,
+      "step": 106500
+    },
+    {
+      "epoch": 31.028248587570623,
+      "grad_norm": 0.4684296250343323,
+      "learning_rate": 0.00022780069930069928,
+      "loss": 3.0746,
+      "step": 106550
+    },
+    {
+      "epoch": 31.042809715184344,
+      "grad_norm": 0.48018065094947815,
+      "learning_rate": 0.0002276258741258741,
+      "loss": 3.0733,
+      "step": 106600
+    },
+    {
+      "epoch": 31.057370842798065,
+      "grad_norm": 0.45850443840026855,
+      "learning_rate": 0.00022745104895104893,
+      "loss": 3.0727,
+      "step": 106650
+    },
+    {
+      "epoch": 31.07193197041179,
+      "grad_norm": 0.48787158727645874,
+      "learning_rate": 0.00022727622377622376,
+      "loss": 3.0854,
+      "step": 106700
+    },
+    {
+      "epoch": 31.08649309802551,
+      "grad_norm": 0.4667603671550751,
+      "learning_rate": 0.00022710139860139858,
+      "loss": 3.0871,
+      "step": 106750
+    },
+    {
+      "epoch": 31.101054225639235,
+      "grad_norm": 0.47687122225761414,
+      "learning_rate": 0.0002269265734265734,
+      "loss": 3.0864,
+      "step": 106800
+    },
+    {
+      "epoch": 31.115615353252956,
+      "grad_norm": 0.5060867667198181,
+      "learning_rate": 0.0002267517482517482,
+      "loss": 3.0812,
+      "step": 106850
+    },
+    {
+      "epoch": 31.130176480866677,
+      "grad_norm": 0.45447468757629395,
+      "learning_rate": 0.00022657692307692304,
+      "loss": 3.1017,
+      "step": 106900
+    },
+    {
+      "epoch": 31.1447376084804,
+      "grad_norm": 0.45640525221824646,
+      "learning_rate": 0.00022640209790209787,
+      "loss": 3.0849,
+      "step": 106950
+    },
+    {
+      "epoch": 31.159298736094122,
+      "grad_norm": 0.47501760721206665,
+      "learning_rate": 0.00022622727272727272,
+      "loss": 3.1052,
+      "step": 107000
+    },
+    {
+      "epoch": 31.159298736094122,
+      "eval_accuracy": 0.37493631045745257,
+      "eval_loss": 3.5480411052703857,
+      "eval_runtime": 80.1188,
+      "eval_samples_per_second": 207.766,
+      "eval_steps_per_second": 12.993,
+      "step": 107000
+    },
+    {
+      "epoch": 31.173859863707847,
+      "grad_norm": 0.4731229543685913,
+      "learning_rate": 0.00022605244755244755,
+      "loss": 3.0897,
+      "step": 107050
+    },
+    {
+      "epoch": 31.188420991321568,
+      "grad_norm": 0.43716487288475037,
+      "learning_rate": 0.00022587762237762237,
+      "loss": 3.0957,
+      "step": 107100
+    },
+    {
+      "epoch": 31.20298211893529,
+      "grad_norm": 0.46976327896118164,
+      "learning_rate": 0.0002257027972027972,
+      "loss": 3.0931,
+      "step": 107150
+    },
+    {
+      "epoch": 31.217543246549013,
+      "grad_norm": 0.48122093081474304,
+      "learning_rate": 0.00022552797202797203,
+      "loss": 3.1004,
+      "step": 107200
+    },
+    {
+      "epoch": 31.232104374162734,
+      "grad_norm": 0.48461681604385376,
+      "learning_rate": 0.00022535314685314683,
+      "loss": 3.0987,
+      "step": 107250
+    },
+    {
+      "epoch": 31.24666550177646,
+      "grad_norm": 0.47971364855766296,
+      "learning_rate": 0.00022517832167832165,
+      "loss": 3.1053,
+      "step": 107300
+    },
+    {
+      "epoch": 31.26122662939018,
+      "grad_norm": 0.48928123712539673,
+      "learning_rate": 0.00022500349650349648,
+      "loss": 3.111,
+      "step": 107350
+    },
+    {
+      "epoch": 31.2757877570039,
+      "grad_norm": 0.4729492962360382,
+      "learning_rate": 0.0002248286713286713,
+      "loss": 3.1053,
+      "step": 107400
+    },
+    {
+      "epoch": 31.290348884617625,
+      "grad_norm": 0.4527254104614258,
+      "learning_rate": 0.00022465384615384613,
+      "loss": 3.1115,
+      "step": 107450
+    },
+    {
+      "epoch": 31.304910012231346,
+      "grad_norm": 0.515654444694519,
+      "learning_rate": 0.00022447902097902096,
+      "loss": 3.1026,
+      "step": 107500
+    },
+    {
+      "epoch": 31.31947113984507,
+      "grad_norm": 0.46846842765808105,
+      "learning_rate": 0.0002243041958041958,
+      "loss": 3.1171,
+      "step": 107550
+    },
+    {
+      "epoch": 31.33403226745879,
+      "grad_norm": 0.4890732765197754,
+      "learning_rate": 0.00022412937062937059,
+      "loss": 3.1184,
+      "step": 107600
+    },
+    {
+      "epoch": 31.348593395072513,
+      "grad_norm": 0.48433053493499756,
+      "learning_rate": 0.0002239545454545454,
+      "loss": 3.1206,
+      "step": 107650
+    },
+    {
+      "epoch": 31.363154522686237,
+      "grad_norm": 0.46679550409317017,
+      "learning_rate": 0.00022377972027972027,
+      "loss": 3.1204,
+      "step": 107700
+    },
+    {
+      "epoch": 31.377715650299958,
+      "grad_norm": 0.4765881896018982,
+      "learning_rate": 0.0002236048951048951,
+      "loss": 3.1142,
+      "step": 107750
+    },
+    {
+      "epoch": 31.392276777913683,
+      "grad_norm": 0.4770815968513489,
+      "learning_rate": 0.00022343006993006992,
+      "loss": 3.1075,
+      "step": 107800
+    },
+    {
+      "epoch": 31.406837905527404,
+      "grad_norm": 0.44743603467941284,
+      "learning_rate": 0.00022325524475524475,
+      "loss": 3.1302,
+      "step": 107850
+    },
+    {
+      "epoch": 31.421399033141128,
+      "grad_norm": 0.464642733335495,
+      "learning_rate": 0.00022308041958041957,
+      "loss": 3.1183,
+      "step": 107900
+    },
+    {
+      "epoch": 31.43596016075485,
+      "grad_norm": 0.4641686975955963,
+      "learning_rate": 0.0002229055944055944,
+      "loss": 3.1279,
+      "step": 107950
+    },
+    {
+      "epoch": 31.45052128836857,
+      "grad_norm": 0.4865407943725586,
+      "learning_rate": 0.0002227307692307692,
+      "loss": 3.1158,
+      "step": 108000
+    },
+    {
+      "epoch": 31.45052128836857,
+      "eval_accuracy": 0.3751401640186473,
+      "eval_loss": 3.5436835289001465,
+      "eval_runtime": 80.2711,
+      "eval_samples_per_second": 207.372,
+      "eval_steps_per_second": 12.969,
+      "step": 108000
+    },
+    {
+      "epoch": 31.465082415982295,
+      "grad_norm": 0.48484715819358826,
+      "learning_rate": 0.00022255594405594403,
+      "loss": 3.1279,
+      "step": 108050
+    },
+    {
+      "epoch": 31.479643543596016,
+      "grad_norm": 0.49689507484436035,
+      "learning_rate": 0.00022238111888111885,
+      "loss": 3.1381,
+      "step": 108100
+    },
+    {
+      "epoch": 31.49420467120974,
+      "grad_norm": 0.48077550530433655,
+      "learning_rate": 0.00022220629370629368,
+      "loss": 3.1143,
+      "step": 108150
+    },
+    {
+      "epoch": 31.50876579882346,
+      "grad_norm": 0.46697720885276794,
+      "learning_rate": 0.0002220314685314685,
+      "loss": 3.1276,
+      "step": 108200
+    },
+    {
+      "epoch": 31.523326926437182,
+      "grad_norm": 0.450339138507843,
+      "learning_rate": 0.00022185664335664333,
+      "loss": 3.127,
+      "step": 108250
+    },
+    {
+      "epoch": 31.537888054050907,
+      "grad_norm": 0.48000800609588623,
+      "learning_rate": 0.00022168181818181816,
+      "loss": 3.1256,
+      "step": 108300
+    },
+    {
+      "epoch": 31.552449181664628,
+      "grad_norm": 0.45078784227371216,
+      "learning_rate": 0.00022150699300699296,
+      "loss": 3.1354,
+      "step": 108350
+    },
+    {
+      "epoch": 31.567010309278352,
+      "grad_norm": 0.5220247507095337,
+      "learning_rate": 0.00022133216783216782,
+      "loss": 3.1319,
+      "step": 108400
+    },
+    {
+      "epoch": 31.581571436892073,
+      "grad_norm": 0.47364139556884766,
+      "learning_rate": 0.00022115734265734264,
+      "loss": 3.138,
+      "step": 108450
+    },
+    {
+      "epoch": 31.596132564505794,
+      "grad_norm": 0.5164151191711426,
+      "learning_rate": 0.00022098251748251747,
+      "loss": 3.1304,
+      "step": 108500
+    },
+    {
+      "epoch": 31.61069369211952,
+      "grad_norm": 0.48746195435523987,
+      "learning_rate": 0.0002208076923076923,
+      "loss": 3.1443,
+      "step": 108550
+    },
+    {
+      "epoch": 31.62525481973324,
+      "grad_norm": 0.4802226424217224,
+      "learning_rate": 0.00022063286713286712,
+      "loss": 3.1355,
+      "step": 108600
+    },
+    {
+      "epoch": 31.639815947346964,
+      "grad_norm": 0.4758455455303192,
+      "learning_rate": 0.00022045804195804195,
+      "loss": 3.1396,
+      "step": 108650
+    },
+    {
+      "epoch": 31.654377074960685,
+      "grad_norm": 0.5246302485466003,
+      "learning_rate": 0.00022028321678321678,
+      "loss": 3.129,
+      "step": 108700
+    },
+    {
+      "epoch": 31.668938202574406,
+      "grad_norm": 0.530346691608429,
+      "learning_rate": 0.00022010839160839158,
+      "loss": 3.1192,
+      "step": 108750
+    },
+    {
+      "epoch": 31.68349933018813,
+      "grad_norm": 0.5023282170295715,
+      "learning_rate": 0.0002199335664335664,
+      "loss": 3.1508,
+      "step": 108800
+    },
+    {
+      "epoch": 31.69806045780185,
+      "grad_norm": 0.4601937532424927,
+      "learning_rate": 0.00021975874125874123,
+      "loss": 3.1397,
+      "step": 108850
+    },
+    {
+      "epoch": 31.712621585415576,
+      "grad_norm": 0.461478590965271,
+      "learning_rate": 0.00021958391608391606,
+      "loss": 3.146,
+      "step": 108900
+    },
+    {
+      "epoch": 31.727182713029297,
+      "grad_norm": 0.4497092068195343,
+      "learning_rate": 0.00021940909090909088,
+      "loss": 3.1376,
+      "step": 108950
+    },
+    {
+      "epoch": 31.741743840643018,
+      "grad_norm": 0.4443832039833069,
+      "learning_rate": 0.0002192342657342657,
+      "loss": 3.1372,
+      "step": 109000
+    },
+    {
+      "epoch": 31.741743840643018,
+      "eval_accuracy": 0.37569035702117987,
+      "eval_loss": 3.5374155044555664,
+      "eval_runtime": 80.5794,
+      "eval_samples_per_second": 206.579,
+      "eval_steps_per_second": 12.919,
+      "step": 109000
+    },
+    {
+      "epoch": 31.756304968256742,
+      "grad_norm": 0.4915781319141388,
+      "learning_rate": 0.00021905944055944054,
+      "loss": 3.1481,
+      "step": 109050
+    },
+    {
+      "epoch": 31.770866095870463,
+      "grad_norm": 0.469692587852478,
+      "learning_rate": 0.0002188846153846154,
+      "loss": 3.1523,
+      "step": 109100
+    },
+    {
+      "epoch": 31.785427223484188,
+      "grad_norm": 0.4651733636856079,
+      "learning_rate": 0.0002187097902097902,
+      "loss": 3.1517,
+      "step": 109150
+    },
+    {
+      "epoch": 31.79998835109791,
+      "grad_norm": 0.46926674246788025,
+      "learning_rate": 0.00021853496503496502,
+      "loss": 3.1418,
+      "step": 109200
+    },
+    {
+      "epoch": 31.81454947871163,
+      "grad_norm": 0.47990190982818604,
+      "learning_rate": 0.00021836013986013984,
+      "loss": 3.1477,
+      "step": 109250
+    },
+    {
+      "epoch": 31.829110606325354,
+      "grad_norm": 0.46162647008895874,
+      "learning_rate": 0.00021818531468531467,
+      "loss": 3.1466,
+      "step": 109300
+    },
+    {
+      "epoch": 31.843671733939075,
+      "grad_norm": 0.47720441222190857,
+      "learning_rate": 0.0002180104895104895,
+      "loss": 3.1627,
+      "step": 109350
+    },
+    {
+      "epoch": 31.8582328615528,
+      "grad_norm": 0.47837400436401367,
+      "learning_rate": 0.00021783566433566432,
+      "loss": 3.1456,
+      "step": 109400
+    },
+    {
+      "epoch": 31.87279398916652,
+      "grad_norm": 0.491738885641098,
+      "learning_rate": 0.00021766083916083915,
+      "loss": 3.1459,
+      "step": 109450
+    },
+    {
+      "epoch": 31.887355116780242,
+      "grad_norm": 0.48230868577957153,
+      "learning_rate": 0.00021748601398601395,
+      "loss": 3.1453,
+      "step": 109500
+    },
+    {
+      "epoch": 31.901916244393966,
+      "grad_norm": 0.476310133934021,
+      "learning_rate": 0.00021731118881118878,
+      "loss": 3.1492,
+      "step": 109550
+    },
+    {
+      "epoch": 31.916477372007687,
+      "grad_norm": 0.4597647786140442,
+      "learning_rate": 0.0002171363636363636,
+      "loss": 3.1604,
+      "step": 109600
+    },
+    {
+      "epoch": 31.931038499621412,
+      "grad_norm": 0.45928624272346497,
+      "learning_rate": 0.00021696153846153843,
+      "loss": 3.1674,
+      "step": 109650
+    },
+    {
+      "epoch": 31.945599627235133,
+      "grad_norm": 0.4845750033855438,
+      "learning_rate": 0.00021678671328671326,
+      "loss": 3.1557,
+      "step": 109700
+    },
+    {
+      "epoch": 31.960160754848857,
+      "grad_norm": 0.46560871601104736,
+      "learning_rate": 0.00021661188811188808,
+      "loss": 3.1529,
+      "step": 109750
+    },
+    {
+      "epoch": 31.97472188246258,
+      "grad_norm": 0.44942712783813477,
+      "learning_rate": 0.00021643706293706294,
+      "loss": 3.1677,
+      "step": 109800
+    },
+    {
+      "epoch": 31.9892830100763,
+      "grad_norm": 0.4887540340423584,
+      "learning_rate": 0.00021626223776223777,
+      "loss": 3.1662,
+      "step": 109850
+    },
+    {
+      "epoch": 32.00378589317957,
+      "grad_norm": 0.4859778881072998,
+      "learning_rate": 0.00021608741258741256,
+      "loss": 3.1318,
+      "step": 109900
+    },
+    {
+      "epoch": 32.01834702079329,
+      "grad_norm": 0.47409725189208984,
+      "learning_rate": 0.0002159125874125874,
+      "loss": 3.0582,
+      "step": 109950
+    },
+    {
+      "epoch": 32.03290814840701,
+      "grad_norm": 0.5324541926383972,
+      "learning_rate": 0.00021573776223776222,
+      "loss": 3.0647,
+      "step": 110000
+    },
+    {
+      "epoch": 32.03290814840701,
+      "eval_accuracy": 0.3751038371729673,
+      "eval_loss": 3.548638343811035,
+      "eval_runtime": 80.3445,
+      "eval_samples_per_second": 207.183,
+      "eval_steps_per_second": 12.957,
+      "step": 110000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 171700,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 50,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 20,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 14
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.29922355806208e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}