diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,5569 @@
+{
+  "best_metric": 3.2710256576538086,
+  "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_6910/checkpoint-30000",
+  "epoch": 20.0,
+  "eval_steps": 1000,
+  "global_step": 37100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.026954177897574125,
+      "grad_norm": 8.296930313110352,
+      "learning_rate": 0.000276,
+      "loss": 44.82,
+      "step": 50
+    },
+    {
+      "epoch": 0.05390835579514825,
+      "grad_norm": 11.741185188293457,
+      "learning_rate": 0.0005759999999999999,
+      "loss": 34.5172,
+      "step": 100
+    },
+    {
+      "epoch": 0.08086253369272237,
+      "grad_norm": 9.069363594055176,
+      "learning_rate": 0.000599254054054054,
+      "loss": 31.9796,
+      "step": 150
+    },
+    {
+      "epoch": 0.1078167115902965,
+      "grad_norm": 8.488612174987793,
+      "learning_rate": 0.0005984432432432432,
+      "loss": 30.1663,
+      "step": 200
+    },
+    {
+      "epoch": 0.1347708894878706,
+      "grad_norm": 3.753924608230591,
+      "learning_rate": 0.0005976324324324324,
+      "loss": 29.1509,
+      "step": 250
+    },
+    {
+      "epoch": 0.16172506738544473,
+      "grad_norm": 5.127024173736572,
+      "learning_rate": 0.0005968216216216216,
+      "loss": 28.2834,
+      "step": 300
+    },
+    {
+      "epoch": 0.18867924528301888,
+      "grad_norm": 8.49015998840332,
+      "learning_rate": 0.0005960108108108108,
+      "loss": 27.4926,
+      "step": 350
+    },
+    {
+      "epoch": 0.215633423180593,
+      "grad_norm": 5.718660831451416,
+      "learning_rate": 0.0005951999999999999,
+      "loss": 26.72,
+      "step": 400
+    },
+    {
+      "epoch": 0.24258760107816713,
+      "grad_norm": 4.080533027648926,
+      "learning_rate": 0.0005943891891891891,
+      "loss": 26.0067,
+      "step": 450
+    },
+    {
+      "epoch": 0.2695417789757412,
+      "grad_norm": 4.703437805175781,
+      "learning_rate": 0.0005935783783783783,
+      "loss": 25.2921,
+      "step": 500
+    },
+    {
+      "epoch": 0.29649595687331537,
+      "grad_norm": 4.108513355255127,
+      "learning_rate": 0.0005927675675675675,
+      "loss": 24.7137,
+      "step": 550
+    },
+    {
+      "epoch": 0.32345013477088946,
+      "grad_norm": 3.727118968963623,
+      "learning_rate": 0.0005919567567567567,
+      "loss": 24.2733,
+      "step": 600
+    },
+    {
+      "epoch": 0.3504043126684636,
+      "grad_norm": 7.353233814239502,
+      "learning_rate": 0.0005911459459459459,
+      "loss": 23.7954,
+      "step": 650
+    },
+    {
+      "epoch": 0.37735849056603776,
+      "grad_norm": 6.545207977294922,
+      "learning_rate": 0.0005903351351351351,
+      "loss": 23.4134,
+      "step": 700
+    },
+    {
+      "epoch": 0.40431266846361186,
+      "grad_norm": 4.7703776359558105,
+      "learning_rate": 0.0005895243243243242,
+      "loss": 23.0314,
+      "step": 750
+    },
+    {
+      "epoch": 0.431266846361186,
+      "grad_norm": 5.744662761688232,
+      "learning_rate": 0.0005887135135135134,
+      "loss": 22.6726,
+      "step": 800
+    },
+    {
+      "epoch": 0.4582210242587601,
+      "grad_norm": 4.685823440551758,
+      "learning_rate": 0.0005879027027027026,
+      "loss": 22.3204,
+      "step": 850
+    },
+    {
+      "epoch": 0.48517520215633425,
+      "grad_norm": 4.1006999015808105,
+      "learning_rate": 0.0005870918918918918,
+      "loss": 22.0621,
+      "step": 900
+    },
+    {
+      "epoch": 0.5121293800539084,
+      "grad_norm": 4.774529457092285,
+      "learning_rate": 0.000586281081081081,
+      "loss": 21.7707,
+      "step": 950
+    },
+    {
+      "epoch": 0.5390835579514824,
+      "grad_norm": 6.737414836883545,
+      "learning_rate": 0.0005854702702702703,
+      "loss": 21.4198,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5390835579514824,
+      "eval_accuracy": 0.29736351021476004,
+      "eval_loss": 4.217971324920654,
+      "eval_runtime": 184.1069,
+      "eval_samples_per_second": 97.834,
+      "eval_steps_per_second": 6.116,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5660377358490566,
+      "grad_norm": 3.665602445602417,
+      "learning_rate": 0.0005846594594594594,
+      "loss": 21.278,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5929919137466307,
+      "grad_norm": 4.725819110870361,
+      "learning_rate": 0.0005838486486486486,
+      "loss": 21.0817,
+      "step": 1100
+    },
+    {
+      "epoch": 0.6199460916442049,
+      "grad_norm": 2.7761611938476562,
+      "learning_rate": 0.0005830378378378377,
+      "loss": 20.9031,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6469002695417789,
+      "grad_norm": 3.616361379623413,
+      "learning_rate": 0.0005822270270270269,
+      "loss": 20.7398,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6738544474393531,
+      "grad_norm": 3.293269395828247,
+      "learning_rate": 0.0005814162162162161,
+      "loss": 20.6258,
+      "step": 1250
+    },
+    {
+      "epoch": 0.7008086253369272,
+      "grad_norm": 3.5149900913238525,
+      "learning_rate": 0.0005806054054054054,
+      "loss": 20.5009,
+      "step": 1300
+    },
+    {
+      "epoch": 0.7277628032345014,
+      "grad_norm": 3.3984060287475586,
+      "learning_rate": 0.0005797945945945946,
+      "loss": 20.3903,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7547169811320755,
+      "grad_norm": 3.0169084072113037,
+      "learning_rate": 0.0005789837837837838,
+      "loss": 20.2629,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7816711590296496,
+      "grad_norm": 3.1698763370513916,
+      "learning_rate": 0.0005781729729729729,
+      "loss": 20.1576,
+      "step": 1450
+    },
+    {
+      "epoch": 0.8086253369272237,
+      "grad_norm": 2.82014536857605,
+      "learning_rate": 0.0005773621621621621,
+      "loss": 20.074,
+      "step": 1500
+    },
+    {
+      "epoch": 0.8355795148247979,
+      "grad_norm": 3.329674482345581,
+      "learning_rate": 0.0005765513513513512,
+      "loss": 19.9387,
+      "step": 1550
+    },
+    {
+      "epoch": 0.862533692722372,
+      "grad_norm": 3.1232547760009766,
+      "learning_rate": 0.0005757405405405405,
+      "loss": 19.8412,
+      "step": 1600
+    },
+    {
+      "epoch": 0.889487870619946,
+      "grad_norm": 3.4831297397613525,
+      "learning_rate": 0.0005749297297297297,
+      "loss": 19.7664,
+      "step": 1650
+    },
+    {
+      "epoch": 0.9164420485175202,
+      "grad_norm": 3.227527618408203,
+      "learning_rate": 0.0005741189189189189,
+      "loss": 19.7121,
+      "step": 1700
+    },
+    {
+      "epoch": 0.9433962264150944,
+      "grad_norm": 2.779829740524292,
+      "learning_rate": 0.0005733081081081081,
+      "loss": 19.6274,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9703504043126685,
+      "grad_norm": 2.9527933597564697,
+      "learning_rate": 0.0005724972972972973,
+      "loss": 19.5337,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9973045822102425,
+      "grad_norm": 3.239156484603882,
+      "learning_rate": 0.0005716864864864864,
+      "loss": 19.4532,
+      "step": 1850
+    },
+    {
+      "epoch": 1.0242587601078168,
+      "grad_norm": 2.773244857788086,
+      "learning_rate": 0.0005708756756756756,
+      "loss": 19.1807,
+      "step": 1900
+    },
+    {
+      "epoch": 1.0512129380053907,
+      "grad_norm": 3.259617328643799,
+      "learning_rate": 0.0005700648648648648,
+      "loss": 19.1206,
+      "step": 1950
+    },
+    {
+      "epoch": 1.0781671159029649,
+      "grad_norm": 3.177616834640503,
+      "learning_rate": 0.000569254054054054,
+      "loss": 19.135,
+      "step": 2000
+    },
+    {
+      "epoch": 1.0781671159029649,
+      "eval_accuracy": 0.3386392111716781,
+      "eval_loss": 3.795865297317505,
+      "eval_runtime": 182.4488,
+      "eval_samples_per_second": 98.724,
+      "eval_steps_per_second": 6.172,
+      "step": 2000
+    },
+    {
+      "epoch": 1.105121293800539,
+      "grad_norm": 2.7435619831085205,
+      "learning_rate": 0.0005684432432432432,
+      "loss": 19.0787,
+      "step": 2050
+    },
+    {
+      "epoch": 1.1320754716981132,
+      "grad_norm": 2.1673481464385986,
+      "learning_rate": 0.0005676324324324324,
+      "loss": 19.0351,
+      "step": 2100
+    },
+    {
+      "epoch": 1.1590296495956873,
+      "grad_norm": 2.312748670578003,
+      "learning_rate": 0.0005668216216216216,
+      "loss": 18.9722,
+      "step": 2150
+    },
+    {
+      "epoch": 1.1859838274932615,
+      "grad_norm": 2.636430501937866,
+      "learning_rate": 0.0005660108108108108,
+      "loss": 18.9062,
+      "step": 2200
+    },
+    {
+      "epoch": 1.2129380053908356,
+      "grad_norm": 2.2114217281341553,
+      "learning_rate": 0.0005652,
+      "loss": 18.8991,
+      "step": 2250
+    },
+    {
+      "epoch": 1.2398921832884098,
+      "grad_norm": 2.093247652053833,
+      "learning_rate": 0.0005643891891891892,
+      "loss": 18.8394,
+      "step": 2300
+    },
+    {
+      "epoch": 1.266846361185984,
+      "grad_norm": 1.8652660846710205,
+      "learning_rate": 0.0005635783783783783,
+      "loss": 18.7991,
+      "step": 2350
+    },
+    {
+      "epoch": 1.2938005390835579,
+      "grad_norm": 2.7426042556762695,
+      "learning_rate": 0.0005627675675675675,
+      "loss": 18.8135,
+      "step": 2400
+    },
+    {
+      "epoch": 1.320754716981132,
+      "grad_norm": 2.2966415882110596,
+      "learning_rate": 0.0005619567567567567,
+      "loss": 18.7167,
+      "step": 2450
+    },
+    {
+      "epoch": 1.3477088948787062,
+      "grad_norm": 2.343276023864746,
+      "learning_rate": 0.0005611459459459459,
+      "loss": 18.711,
+      "step": 2500
+    },
+    {
+      "epoch": 1.3746630727762803,
+      "grad_norm": 2.7991366386413574,
+      "learning_rate": 0.0005603351351351351,
+      "loss": 18.6614,
+      "step": 2550
+    },
+    {
+      "epoch": 1.4016172506738545,
+      "grad_norm": 2.226125717163086,
+      "learning_rate": 0.0005595243243243243,
+      "loss": 18.6536,
+      "step": 2600
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 2.165311098098755,
+      "learning_rate": 0.0005587135135135135,
+      "loss": 18.5774,
+      "step": 2650
+    },
+    {
+      "epoch": 1.4555256064690028,
+      "grad_norm": 2.3691980838775635,
+      "learning_rate": 0.0005579027027027027,
+      "loss": 18.5688,
+      "step": 2700
+    },
+    {
+      "epoch": 1.482479784366577,
+      "grad_norm": 2.169313669204712,
+      "learning_rate": 0.0005570918918918918,
+      "loss": 18.5461,
+      "step": 2750
+    },
+    {
+      "epoch": 1.509433962264151,
+      "grad_norm": 1.9214165210723877,
+      "learning_rate": 0.000556281081081081,
+      "loss": 18.5095,
+      "step": 2800
+    },
+    {
+      "epoch": 1.536388140161725,
+      "grad_norm": 2.0874290466308594,
+      "learning_rate": 0.0005554702702702702,
+      "loss": 18.5082,
+      "step": 2850
+    },
+    {
+      "epoch": 1.5633423180592994,
+      "grad_norm": 2.0279717445373535,
+      "learning_rate": 0.0005546594594594594,
+      "loss": 18.4448,
+      "step": 2900
+    },
+    {
+      "epoch": 1.5902964959568733,
+      "grad_norm": 1.8327255249023438,
+      "learning_rate": 0.0005538486486486486,
+      "loss": 18.4225,
+      "step": 2950
+    },
+    {
+      "epoch": 1.6172506738544474,
+      "grad_norm": 2.1154870986938477,
+      "learning_rate": 0.0005530378378378378,
+      "loss": 18.3835,
+      "step": 3000
+    },
+    {
+      "epoch": 1.6172506738544474,
+      "eval_accuracy": 0.353348908946547,
+      "eval_loss": 3.639807939529419,
+      "eval_runtime": 179.8343,
+      "eval_samples_per_second": 100.159,
+      "eval_steps_per_second": 6.261,
+      "step": 3000
+    },
+    {
+      "epoch": 1.6442048517520216,
+      "grad_norm": 1.7121838331222534,
+      "learning_rate": 0.000552227027027027,
+      "loss": 18.3714,
+      "step": 3050
+    },
+    {
+      "epoch": 1.6711590296495957,
+      "grad_norm": 1.950905680656433,
+      "learning_rate": 0.0005514162162162161,
+      "loss": 18.3744,
+      "step": 3100
+    },
+    {
+      "epoch": 1.6981132075471699,
+      "grad_norm": 1.8936762809753418,
+      "learning_rate": 0.0005506054054054053,
+      "loss": 18.3124,
+      "step": 3150
+    },
+    {
+      "epoch": 1.7250673854447438,
+      "grad_norm": 2.1329703330993652,
+      "learning_rate": 0.0005497945945945945,
+      "loss": 18.2764,
+      "step": 3200
+    },
+    {
+      "epoch": 1.7520215633423182,
+      "grad_norm": 1.8802683353424072,
+      "learning_rate": 0.0005489837837837837,
+      "loss": 18.2335,
+      "step": 3250
+    },
+    {
+      "epoch": 1.778975741239892,
+      "grad_norm": 1.9462956190109253,
+      "learning_rate": 0.0005481729729729729,
+      "loss": 18.2418,
+      "step": 3300
+    },
+    {
+      "epoch": 1.8059299191374663,
+      "grad_norm": 1.771466851234436,
+      "learning_rate": 0.0005473621621621621,
+      "loss": 18.1747,
+      "step": 3350
+    },
+    {
+      "epoch": 1.8328840970350404,
+      "grad_norm": 1.7610357999801636,
+      "learning_rate": 0.0005465513513513514,
+      "loss": 18.1823,
+      "step": 3400
+    },
+    {
+      "epoch": 1.8598382749326146,
+      "grad_norm": 1.9204082489013672,
+      "learning_rate": 0.0005457405405405405,
+      "loss": 18.106,
+      "step": 3450
+    },
+    {
+      "epoch": 1.8867924528301887,
+      "grad_norm": 2.1296563148498535,
+      "learning_rate": 0.0005449297297297296,
+      "loss": 18.1452,
+      "step": 3500
+    },
+    {
+      "epoch": 1.9137466307277629,
+      "grad_norm": 2.0194156169891357,
+      "learning_rate": 0.0005441189189189188,
+      "loss": 18.1186,
+      "step": 3550
+    },
+    {
+      "epoch": 1.940700808625337,
+      "grad_norm": 2.01413631439209,
+      "learning_rate": 0.000543308108108108,
+      "loss": 18.0921,
+      "step": 3600
+    },
+    {
+      "epoch": 1.967654986522911,
+      "grad_norm": 1.983556866645813,
+      "learning_rate": 0.0005424972972972972,
+      "loss": 18.0646,
+      "step": 3650
+    },
+    {
+      "epoch": 1.9946091644204853,
+      "grad_norm": 1.752462387084961,
+      "learning_rate": 0.0005416864864864865,
+      "loss": 18.0691,
+      "step": 3700
+    },
+    {
+      "epoch": 2.0215633423180592,
+      "grad_norm": 1.784954309463501,
+      "learning_rate": 0.0005408756756756757,
+      "loss": 17.6995,
+      "step": 3750
+    },
+    {
+      "epoch": 2.0485175202156336,
+      "grad_norm": 1.8124125003814697,
+      "learning_rate": 0.0005400648648648649,
+      "loss": 17.6682,
+      "step": 3800
+    },
+    {
+      "epoch": 2.0754716981132075,
+      "grad_norm": 1.8763898611068726,
+      "learning_rate": 0.000539254054054054,
+      "loss": 17.6483,
+      "step": 3850
+    },
+    {
+      "epoch": 2.1024258760107815,
+      "grad_norm": 2.0498359203338623,
+      "learning_rate": 0.0005384432432432431,
+      "loss": 17.6595,
+      "step": 3900
+    },
+    {
+      "epoch": 2.129380053908356,
+      "grad_norm": 2.153012990951538,
+      "learning_rate": 0.0005376324324324323,
+      "loss": 17.6197,
+      "step": 3950
+    },
+    {
+      "epoch": 2.1563342318059298,
+      "grad_norm": 1.7588380575180054,
+      "learning_rate": 0.0005368216216216215,
+      "loss": 17.6178,
+      "step": 4000
+    },
+    {
+      "epoch": 2.1563342318059298,
+      "eval_accuracy": 0.36234291294388216,
+      "eval_loss": 3.5501065254211426,
+      "eval_runtime": 179.4768,
+      "eval_samples_per_second": 100.358,
+      "eval_steps_per_second": 6.274,
+      "step": 4000
+    },
+    {
+      "epoch": 2.183288409703504,
+      "grad_norm": 1.8876287937164307,
+      "learning_rate": 0.0005360108108108108,
+      "loss": 17.5874,
+      "step": 4050
+    },
+    {
+      "epoch": 2.210242587601078,
+      "grad_norm": 1.8467564582824707,
+      "learning_rate": 0.0005352,
+      "loss": 17.6174,
+      "step": 4100
+    },
+    {
+      "epoch": 2.2371967654986524,
+      "grad_norm": 1.6003799438476562,
+      "learning_rate": 0.0005343891891891892,
+      "loss": 17.6635,
+      "step": 4150
+    },
+    {
+      "epoch": 2.2641509433962264,
+      "grad_norm": 1.851845145225525,
+      "learning_rate": 0.0005335783783783784,
+      "loss": 17.5831,
+      "step": 4200
+    },
+    {
+      "epoch": 2.2911051212938007,
+      "grad_norm": 1.834288477897644,
+      "learning_rate": 0.0005327675675675675,
+      "loss": 17.6081,
+      "step": 4250
+    },
+    {
+      "epoch": 2.3180592991913747,
+      "grad_norm": 1.741883397102356,
+      "learning_rate": 0.0005319567567567566,
+      "loss": 17.5878,
+      "step": 4300
+    },
+    {
+      "epoch": 2.3450134770889486,
+      "grad_norm": 2.0378310680389404,
+      "learning_rate": 0.0005311459459459459,
+      "loss": 17.5712,
+      "step": 4350
+    },
+    {
+      "epoch": 2.371967654986523,
+      "grad_norm": 1.803305983543396,
+      "learning_rate": 0.0005303351351351351,
+      "loss": 17.5698,
+      "step": 4400
+    },
+    {
+      "epoch": 2.398921832884097,
+      "grad_norm": 1.83433198928833,
+      "learning_rate": 0.0005295243243243243,
+      "loss": 17.5659,
+      "step": 4450
+    },
+    {
+      "epoch": 2.4258760107816713,
+      "grad_norm": 1.8511073589324951,
+      "learning_rate": 0.0005287135135135135,
+      "loss": 17.5527,
+      "step": 4500
+    },
+    {
+      "epoch": 2.452830188679245,
+      "grad_norm": 1.9463386535644531,
+      "learning_rate": 0.0005279027027027027,
+      "loss": 17.5418,
+      "step": 4550
+    },
+    {
+      "epoch": 2.4797843665768196,
+      "grad_norm": 1.6681801080703735,
+      "learning_rate": 0.0005270918918918919,
+      "loss": 17.5826,
+      "step": 4600
+    },
+    {
+      "epoch": 2.5067385444743935,
+      "grad_norm": 1.8517515659332275,
+      "learning_rate": 0.000526281081081081,
+      "loss": 17.5022,
+      "step": 4650
+    },
+    {
+      "epoch": 2.533692722371968,
+      "grad_norm": 1.640101671218872,
+      "learning_rate": 0.0005254702702702702,
+      "loss": 17.5124,
+      "step": 4700
+    },
+    {
+      "epoch": 2.560646900269542,
+      "grad_norm": 1.7293671369552612,
+      "learning_rate": 0.0005246594594594594,
+      "loss": 17.5609,
+      "step": 4750
+    },
+    {
+      "epoch": 2.5876010781671157,
+      "grad_norm": 1.7008033990859985,
+      "learning_rate": 0.0005238486486486486,
+      "loss": 17.5315,
+      "step": 4800
+    },
+    {
+      "epoch": 2.61455525606469,
+      "grad_norm": 1.646754503250122,
+      "learning_rate": 0.0005230378378378378,
+      "loss": 17.4806,
+      "step": 4850
+    },
+    {
+      "epoch": 2.641509433962264,
+      "grad_norm": 1.6908066272735596,
+      "learning_rate": 0.000522227027027027,
+      "loss": 17.5326,
+      "step": 4900
+    },
+    {
+      "epoch": 2.6684636118598384,
+      "grad_norm": 1.6482735872268677,
+      "learning_rate": 0.0005214162162162162,
+      "loss": 17.4713,
+      "step": 4950
+    },
+    {
+      "epoch": 2.6954177897574123,
+      "grad_norm": 1.7912228107452393,
+      "learning_rate": 0.0005206054054054054,
+      "loss": 17.4994,
+      "step": 5000
+    },
+    {
+      "epoch": 2.6954177897574123,
+      "eval_accuracy": 0.36892821615335375,
+      "eval_loss": 3.4884941577911377,
+      "eval_runtime": 179.4879,
+      "eval_samples_per_second": 100.352,
+      "eval_steps_per_second": 6.273,
+      "step": 5000
+    },
+    {
+      "epoch": 2.7223719676549867,
+      "grad_norm": 1.4762933254241943,
+      "learning_rate": 0.0005197945945945946,
+      "loss": 17.4871,
+      "step": 5050
+    },
+    {
+      "epoch": 2.7493261455525606,
+      "grad_norm": 1.655876636505127,
+      "learning_rate": 0.0005189837837837837,
+      "loss": 17.4543,
+      "step": 5100
+    },
+    {
+      "epoch": 2.776280323450135,
+      "grad_norm": 1.5306168794631958,
+      "learning_rate": 0.0005181729729729729,
+      "loss": 17.4413,
+      "step": 5150
+    },
+    {
+      "epoch": 2.803234501347709,
+      "grad_norm": 1.7954528331756592,
+      "learning_rate": 0.0005173621621621621,
+      "loss": 17.3965,
+      "step": 5200
+    },
+    {
+      "epoch": 2.830188679245283,
+      "grad_norm": 1.7866185903549194,
+      "learning_rate": 0.0005165513513513513,
+      "loss": 17.4073,
+      "step": 5250
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 1.7150987386703491,
+      "learning_rate": 0.0005157405405405405,
+      "loss": 17.4216,
+      "step": 5300
+    },
+    {
+      "epoch": 2.884097035040431,
+      "grad_norm": 1.7467018365859985,
+      "learning_rate": 0.0005149297297297297,
+      "loss": 17.3859,
+      "step": 5350
+    },
+    {
+      "epoch": 2.9110512129380055,
+      "grad_norm": 1.8833541870117188,
+      "learning_rate": 0.0005141189189189189,
+      "loss": 17.3729,
+      "step": 5400
+    },
+    {
+      "epoch": 2.9380053908355794,
+      "grad_norm": 1.8507732152938843,
+      "learning_rate": 0.000513308108108108,
+      "loss": 17.3817,
+      "step": 5450
+    },
+    {
+      "epoch": 2.964959568733154,
+      "grad_norm": 1.6320147514343262,
+      "learning_rate": 0.0005124972972972972,
+      "loss": 17.3824,
+      "step": 5500
+    },
+    {
+      "epoch": 2.9919137466307277,
+      "grad_norm": 1.676006555557251,
+      "learning_rate": 0.0005116864864864864,
+      "loss": 17.3853,
+      "step": 5550
+    },
+    {
+      "epoch": 3.018867924528302,
+      "grad_norm": 1.6603399515151978,
+      "learning_rate": 0.0005108756756756756,
+      "loss": 17.0273,
+      "step": 5600
+    },
+    {
+      "epoch": 3.045822102425876,
+      "grad_norm": 1.6063870191574097,
+      "learning_rate": 0.0005100648648648648,
+      "loss": 16.9003,
+      "step": 5650
+    },
+    {
+      "epoch": 3.07277628032345,
+      "grad_norm": 1.7241891622543335,
+      "learning_rate": 0.000509254054054054,
+      "loss": 16.8804,
+      "step": 5700
+    },
+    {
+      "epoch": 3.0997304582210243,
+      "grad_norm": 1.7697309255599976,
+      "learning_rate": 0.0005084432432432432,
+      "loss": 16.9783,
+      "step": 5750
+    },
+    {
+      "epoch": 3.1266846361185983,
+      "grad_norm": 1.793578863143921,
+      "learning_rate": 0.0005076324324324324,
+      "loss": 16.9443,
+      "step": 5800
+    },
+    {
+      "epoch": 3.1536388140161726,
+      "grad_norm": 1.615790843963623,
+      "learning_rate": 0.0005068216216216217,
+      "loss": 16.9679,
+      "step": 5850
+    },
+    {
+      "epoch": 3.1805929919137466,
+      "grad_norm": 1.551426887512207,
+      "learning_rate": 0.0005060108108108107,
+      "loss": 16.9544,
+      "step": 5900
+    },
+    {
+      "epoch": 3.207547169811321,
+      "grad_norm": 1.5588505268096924,
+      "learning_rate": 0.0005051999999999999,
+      "loss": 16.9667,
+      "step": 5950
+    },
+    {
+      "epoch": 3.234501347708895,
+      "grad_norm": 1.5159425735473633,
+      "learning_rate": 0.0005043891891891891,
+      "loss": 16.9783,
+      "step": 6000
+    },
+    {
+      "epoch": 3.234501347708895,
+      "eval_accuracy": 0.3732482324243068,
+      "eval_loss": 3.449159860610962,
+      "eval_runtime": 179.7805,
+      "eval_samples_per_second": 100.189,
+      "eval_steps_per_second": 6.263,
+      "step": 6000
+    },
+    {
+      "epoch": 3.2614555256064692,
+      "grad_norm": 1.9565342664718628,
+      "learning_rate": 0.0005035783783783783,
+      "loss": 17.0079,
+      "step": 6050
+    },
+    {
+      "epoch": 3.288409703504043,
+      "grad_norm": 1.7779537439346313,
+      "learning_rate": 0.0005027675675675675,
+      "loss": 16.9515,
+      "step": 6100
+    },
+    {
+      "epoch": 3.315363881401617,
+      "grad_norm": 1.6085190773010254,
+      "learning_rate": 0.0005019567567567568,
+      "loss": 17.0043,
+      "step": 6150
+    },
+    {
+      "epoch": 3.3423180592991915,
+      "grad_norm": 1.5527286529541016,
+      "learning_rate": 0.000501145945945946,
+      "loss": 16.9876,
+      "step": 6200
+    },
+    {
+      "epoch": 3.3692722371967654,
+      "grad_norm": 1.6721524000167847,
+      "learning_rate": 0.0005003351351351352,
+      "loss": 17.0027,
+      "step": 6250
+    },
+    {
+      "epoch": 3.3962264150943398,
+      "grad_norm": 1.5313137769699097,
+      "learning_rate": 0.0004995243243243242,
+      "loss": 17.0007,
+      "step": 6300
+    },
+    {
+      "epoch": 3.4231805929919137,
+      "grad_norm": 1.630896806716919,
+      "learning_rate": 0.0004987135135135134,
+      "loss": 16.9898,
+      "step": 6350
+    },
+    {
+      "epoch": 3.450134770889488,
+      "grad_norm": 1.446964979171753,
+      "learning_rate": 0.0004979027027027026,
+      "loss": 17.0057,
+      "step": 6400
+    },
+    {
+      "epoch": 3.477088948787062,
+      "grad_norm": 1.7500450611114502,
+      "learning_rate": 0.0004970918918918919,
+      "loss": 16.9788,
+      "step": 6450
+    },
+    {
+      "epoch": 3.5040431266846364,
+      "grad_norm": 1.640507459640503,
+      "learning_rate": 0.0004962810810810811,
+      "loss": 16.9703,
+      "step": 6500
+    },
+    {
+      "epoch": 3.5309973045822103,
+      "grad_norm": 1.54189133644104,
+      "learning_rate": 0.0004954702702702703,
+      "loss": 16.9812,
+      "step": 6550
+    },
+    {
+      "epoch": 3.557951482479784,
+      "grad_norm": 1.5099515914916992,
+      "learning_rate": 0.0004946594594594595,
+      "loss": 16.9529,
+      "step": 6600
+    },
+    {
+      "epoch": 3.5849056603773586,
+      "grad_norm": 1.5417841672897339,
+      "learning_rate": 0.0004938486486486486,
+      "loss": 16.9888,
+      "step": 6650
+    },
+    {
+      "epoch": 3.6118598382749325,
+      "grad_norm": 1.670837640762329,
+      "learning_rate": 0.0004930378378378377,
+      "loss": 16.9116,
+      "step": 6700
+    },
+    {
+      "epoch": 3.638814016172507,
+      "grad_norm": 1.6270161867141724,
+      "learning_rate": 0.0004922270270270269,
+      "loss": 16.9732,
+      "step": 6750
+    },
+    {
+      "epoch": 3.665768194070081,
+      "grad_norm": 1.6039149761199951,
+      "learning_rate": 0.0004914162162162162,
+      "loss": 16.9689,
+      "step": 6800
+    },
+    {
+      "epoch": 3.6927223719676547,
+      "grad_norm": 1.5096150636672974,
+      "learning_rate": 0.0004906054054054054,
+      "loss": 16.984,
+      "step": 6850
+    },
+    {
+      "epoch": 3.719676549865229,
+      "grad_norm": 1.5823392868041992,
+      "learning_rate": 0.0004897945945945946,
+      "loss": 16.9951,
+      "step": 6900
+    },
+    {
+      "epoch": 3.7466307277628035,
+      "grad_norm": 1.5228644609451294,
+      "learning_rate": 0.0004889837837837838,
+      "loss": 16.9585,
+      "step": 6950
+    },
+    {
+      "epoch": 3.7735849056603774,
+      "grad_norm": 1.5134717226028442,
+      "learning_rate": 0.00048817297297297296,
+      "loss": 16.9938,
+      "step": 7000
+    },
+    {
+      "epoch": 3.7735849056603774,
+      "eval_accuracy": 0.3765476201340876,
+      "eval_loss": 3.413353681564331,
+      "eval_runtime": 183.6576,
+      "eval_samples_per_second": 98.074,
+      "eval_steps_per_second": 6.131,
+      "step": 7000
+    },
+    {
+      "epoch": 3.8005390835579513,
+      "grad_norm": 1.524044156074524,
+      "learning_rate": 0.00048736216216216214,
+      "loss": 16.9498,
+      "step": 7050
+    },
+    {
+      "epoch": 3.8274932614555257,
+      "grad_norm": 1.5392922163009644,
+      "learning_rate": 0.0004865513513513513,
+      "loss": 16.9014,
+      "step": 7100
+    },
+    {
+      "epoch": 3.8544474393530996,
+      "grad_norm": 1.5926542282104492,
+      "learning_rate": 0.00048574054054054046,
+      "loss": 16.9565,
+      "step": 7150
+    },
+    {
+      "epoch": 3.881401617250674,
+      "grad_norm": 1.546481966972351,
+      "learning_rate": 0.00048492972972972965,
+      "loss": 16.9432,
+      "step": 7200
+    },
+    {
+      "epoch": 3.908355795148248,
+      "grad_norm": 1.512964129447937,
+      "learning_rate": 0.0004841189189189189,
+      "loss": 16.9287,
+      "step": 7250
+    },
+    {
+      "epoch": 3.935309973045822,
+      "grad_norm": 1.6536102294921875,
+      "learning_rate": 0.0004833081081081081,
+      "loss": 16.8681,
+      "step": 7300
+    },
+    {
+      "epoch": 3.9622641509433962,
+      "grad_norm": 1.5245862007141113,
+      "learning_rate": 0.00048249729729729727,
+      "loss": 16.9772,
+      "step": 7350
+    },
+    {
+      "epoch": 3.9892183288409706,
+      "grad_norm": 1.6729086637496948,
+      "learning_rate": 0.00048168648648648645,
+      "loss": 16.9178,
+      "step": 7400
+    },
+    {
+      "epoch": 4.0161725067385445,
+      "grad_norm": 1.7279188632965088,
+      "learning_rate": 0.00048087567567567564,
+      "loss": 16.6391,
+      "step": 7450
+    },
+    {
+      "epoch": 4.0431266846361185,
+      "grad_norm": 1.4739909172058105,
+      "learning_rate": 0.0004800648648648648,
+      "loss": 16.469,
+      "step": 7500
+    },
+    {
+      "epoch": 4.070080862533692,
+      "grad_norm": 1.708237648010254,
+      "learning_rate": 0.000479254054054054,
+      "loss": 16.4581,
+      "step": 7550
+    },
+    {
+      "epoch": 4.097035040431267,
+      "grad_norm": 1.564699649810791,
+      "learning_rate": 0.0004784432432432432,
+      "loss": 16.5009,
+      "step": 7600
+    },
+    {
+      "epoch": 4.123989218328841,
+      "grad_norm": 1.540399193763733,
+      "learning_rate": 0.0004776324324324324,
+      "loss": 16.5215,
+      "step": 7650
+    },
+    {
+      "epoch": 4.150943396226415,
+      "grad_norm": 1.4679442644119263,
+      "learning_rate": 0.0004768216216216216,
+      "loss": 16.5071,
+      "step": 7700
+    },
+    {
+      "epoch": 4.177897574123989,
+      "grad_norm": 1.6866313219070435,
+      "learning_rate": 0.00047601081081081076,
+      "loss": 16.5778,
+      "step": 7750
+    },
+    {
+      "epoch": 4.204851752021563,
+      "grad_norm": 1.5899074077606201,
+      "learning_rate": 0.0004752,
+      "loss": 16.5755,
+      "step": 7800
+    },
+    {
+      "epoch": 4.231805929919138,
+      "grad_norm": 1.5043095350265503,
+      "learning_rate": 0.0004743891891891892,
+      "loss": 16.5264,
+      "step": 7850
+    },
+    {
+      "epoch": 4.258760107816712,
+      "grad_norm": 1.496482253074646,
+      "learning_rate": 0.0004735783783783783,
+      "loss": 16.5654,
+      "step": 7900
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 1.5001450777053833,
+      "learning_rate": 0.0004727675675675675,
+      "loss": 16.6061,
+      "step": 7950
+    },
+    {
+      "epoch": 4.3126684636118595,
+      "grad_norm": 1.57184898853302,
+      "learning_rate": 0.0004719567567567567,
+      "loss": 16.5676,
+      "step": 8000
+    },
+    {
+      "epoch": 4.3126684636118595,
+      "eval_accuracy": 0.37935440300073925,
+      "eval_loss": 3.3927013874053955,
+      "eval_runtime": 183.3993,
+      "eval_samples_per_second": 98.212,
+      "eval_steps_per_second": 6.14,
+      "step": 8000
+    },
+    {
+      "epoch": 4.339622641509434,
+      "grad_norm": 1.6450532674789429,
+      "learning_rate": 0.0004711459459459459,
+      "loss": 16.5523,
+      "step": 8050
+    },
+    {
+      "epoch": 4.366576819407008,
+      "grad_norm": 1.6096779108047485,
+      "learning_rate": 0.0004703351351351351,
+      "loss": 16.6157,
+      "step": 8100
+    },
+    {
+      "epoch": 4.393530997304582,
+      "grad_norm": 1.5400686264038086,
+      "learning_rate": 0.0004695243243243243,
+      "loss": 16.5685,
+      "step": 8150
+    },
+    {
+      "epoch": 4.420485175202156,
+      "grad_norm": 1.546473503112793,
+      "learning_rate": 0.0004687135135135135,
+      "loss": 16.5769,
+      "step": 8200
+    },
+    {
+      "epoch": 4.44743935309973,
+      "grad_norm": 1.4172500371932983,
+      "learning_rate": 0.0004679027027027027,
+      "loss": 16.6169,
+      "step": 8250
+    },
+    {
+      "epoch": 4.474393530997305,
+      "grad_norm": 1.545454740524292,
+      "learning_rate": 0.0004670918918918918,
+      "loss": 16.614,
+      "step": 8300
+    },
+    {
+      "epoch": 4.501347708894879,
+      "grad_norm": 1.4791597127914429,
+      "learning_rate": 0.000466281081081081,
+      "loss": 16.6006,
+      "step": 8350
+    },
+    {
+      "epoch": 4.528301886792453,
+      "grad_norm": 1.5513116121292114,
+      "learning_rate": 0.0004654702702702702,
+      "loss": 16.6113,
+      "step": 8400
+    },
+    {
+      "epoch": 4.555256064690027,
+      "grad_norm": 1.5687803030014038,
+      "learning_rate": 0.00046465945945945944,
+      "loss": 16.6075,
+      "step": 8450
+    },
+    {
+      "epoch": 4.5822102425876015,
+      "grad_norm": 1.5172430276870728,
+      "learning_rate": 0.0004638486486486486,
+      "loss": 16.5888,
+      "step": 8500
+    },
+    {
+      "epoch": 4.609164420485175,
+      "grad_norm": 1.5713242292404175,
+      "learning_rate": 0.0004630378378378378,
+      "loss": 16.5863,
+      "step": 8550
+    },
+    {
+      "epoch": 4.636118598382749,
+      "grad_norm": 1.5270150899887085,
+      "learning_rate": 0.000462227027027027,
+      "loss": 16.6009,
+      "step": 8600
+    },
+    {
+      "epoch": 4.663072776280323,
+      "grad_norm": 1.6063131093978882,
+      "learning_rate": 0.0004614162162162162,
+      "loss": 16.6324,
+      "step": 8650
+    },
+    {
+      "epoch": 4.690026954177897,
+      "grad_norm": 1.5824447870254517,
+      "learning_rate": 0.0004606054054054053,
+      "loss": 16.6239,
+      "step": 8700
+    },
+    {
+      "epoch": 4.716981132075472,
+      "grad_norm": 1.4564862251281738,
+      "learning_rate": 0.00045979459459459456,
+      "loss": 16.5982,
+      "step": 8750
+    },
+    {
+      "epoch": 4.743935309973046,
+      "grad_norm": 1.5050652027130127,
+      "learning_rate": 0.00045898378378378375,
+      "loss": 16.5798,
+      "step": 8800
+    },
+    {
+      "epoch": 4.77088948787062,
+      "grad_norm": 1.4407291412353516,
+      "learning_rate": 0.00045817297297297293,
+      "loss": 16.6124,
+      "step": 8850
+    },
+    {
+      "epoch": 4.797843665768194,
+      "grad_norm": 1.6084630489349365,
+      "learning_rate": 0.0004573621621621621,
+      "loss": 16.5858,
+      "step": 8900
+    },
+    {
+      "epoch": 4.824797843665769,
+      "grad_norm": 1.5382035970687866,
+      "learning_rate": 0.0004565513513513513,
+      "loss": 16.5519,
+      "step": 8950
+    },
+    {
+      "epoch": 4.8517520215633425,
+      "grad_norm": 1.5417052507400513,
+      "learning_rate": 0.00045574054054054055,
+      "loss": 16.623,
+      "step": 9000
+    },
+    {
+      "epoch": 4.8517520215633425,
+      "eval_accuracy": 0.3819663820553638,
+      "eval_loss": 3.366989850997925,
+      "eval_runtime": 183.4323,
+      "eval_samples_per_second": 98.194,
+      "eval_steps_per_second": 6.139,
+      "step": 9000
+    },
+    {
+      "epoch": 4.878706199460916,
+      "grad_norm": 1.4613817930221558,
+      "learning_rate": 0.00045492972972972973,
+      "loss": 16.6013,
+      "step": 9050
+    },
+    {
+      "epoch": 4.90566037735849,
+      "grad_norm": 1.5268949270248413,
+      "learning_rate": 0.00045411891891891887,
+      "loss": 16.5575,
+      "step": 9100
+    },
+    {
+      "epoch": 4.932614555256064,
+      "grad_norm": 1.5352474451065063,
+      "learning_rate": 0.00045330810810810805,
+      "loss": 16.5424,
+      "step": 9150
+    },
+    {
+      "epoch": 4.959568733153639,
+      "grad_norm": 1.538854718208313,
+      "learning_rate": 0.00045249729729729724,
+      "loss": 16.5602,
+      "step": 9200
+    },
+    {
+      "epoch": 4.986522911051213,
+      "grad_norm": 1.450854778289795,
+      "learning_rate": 0.00045168648648648643,
+      "loss": 16.6054,
+      "step": 9250
+    },
+    {
+      "epoch": 5.013477088948787,
+      "grad_norm": 1.4519016742706299,
+      "learning_rate": 0.0004508756756756756,
+      "loss": 16.3572,
+      "step": 9300
+    },
+    {
+      "epoch": 5.040431266846361,
+      "grad_norm": 1.5664149522781372,
+      "learning_rate": 0.00045006486486486486,
+      "loss": 16.1281,
+      "step": 9350
+    },
+    {
+      "epoch": 5.067385444743936,
+      "grad_norm": 1.5345157384872437,
+      "learning_rate": 0.00044925405405405404,
+      "loss": 16.1432,
+      "step": 9400
+    },
+    {
+      "epoch": 5.09433962264151,
+      "grad_norm": 1.5099859237670898,
+      "learning_rate": 0.00044844324324324323,
+      "loss": 16.1704,
+      "step": 9450
+    },
+    {
+      "epoch": 5.121293800539084,
+      "grad_norm": 1.5200412273406982,
+      "learning_rate": 0.00044763243243243236,
+      "loss": 16.1838,
+      "step": 9500
+    },
+    {
+      "epoch": 5.1482479784366575,
+      "grad_norm": 1.4238975048065186,
+      "learning_rate": 0.00044682162162162155,
+      "loss": 16.1983,
+      "step": 9550
+    },
+    {
+      "epoch": 5.175202156334231,
+      "grad_norm": 1.560627818107605,
+      "learning_rate": 0.00044601081081081074,
+      "loss": 16.2052,
+      "step": 9600
+    },
+    {
+      "epoch": 5.202156334231806,
+      "grad_norm": 1.5408422946929932,
+      "learning_rate": 0.0004452,
+      "loss": 16.2084,
+      "step": 9650
+    },
+    {
+      "epoch": 5.22911051212938,
+      "grad_norm": 1.4684669971466064,
+      "learning_rate": 0.00044438918918918917,
+      "loss": 16.2638,
+      "step": 9700
+    },
+    {
+      "epoch": 5.256064690026954,
+      "grad_norm": 1.408656120300293,
+      "learning_rate": 0.00044357837837837835,
+      "loss": 16.2383,
+      "step": 9750
+    },
+    {
+      "epoch": 5.283018867924528,
+      "grad_norm": 1.4579087495803833,
+      "learning_rate": 0.00044276756756756754,
+      "loss": 16.2952,
+      "step": 9800
+    },
+    {
+      "epoch": 5.309973045822103,
+      "grad_norm": 1.5319942235946655,
+      "learning_rate": 0.00044195675675675673,
+      "loss": 16.2531,
+      "step": 9850
+    },
+    {
+      "epoch": 5.336927223719677,
+      "grad_norm": 1.6440680027008057,
+      "learning_rate": 0.00044114594594594586,
+      "loss": 16.2352,
+      "step": 9900
+    },
+    {
+      "epoch": 5.363881401617251,
+      "grad_norm": 1.5315848588943481,
+      "learning_rate": 0.0004403351351351351,
+      "loss": 16.2721,
+      "step": 9950
+    },
+    {
+      "epoch": 5.390835579514825,
+      "grad_norm": 1.5545754432678223,
+      "learning_rate": 0.0004395243243243243,
+      "loss": 16.268,
+      "step": 10000
+    },
+    {
+      "epoch": 5.390835579514825,
+      "eval_accuracy": 0.38350960199180106,
+      "eval_loss": 3.3579905033111572,
+      "eval_runtime": 183.2768,
+      "eval_samples_per_second": 98.278,
+      "eval_steps_per_second": 6.144,
+      "step": 10000
+    },
+    {
+      "epoch": 5.4177897574123985,
+      "grad_norm": 1.5124561786651611,
+      "learning_rate": 0.0004387135135135135,
+      "loss": 16.2922,
+      "step": 10050
+    },
+    {
+      "epoch": 5.444743935309973,
+      "grad_norm": 1.5418624877929688,
+      "learning_rate": 0.00043790270270270266,
+      "loss": 16.3084,
+      "step": 10100
+    },
+    {
+      "epoch": 5.471698113207547,
+      "grad_norm": 1.556877851486206,
+      "learning_rate": 0.00043709189189189185,
+      "loss": 16.2663,
+      "step": 10150
+    },
+    {
+      "epoch": 5.498652291105121,
+      "grad_norm": 1.4898020029067993,
+      "learning_rate": 0.00043628108108108104,
+      "loss": 16.3106,
+      "step": 10200
+    },
+    {
+      "epoch": 5.525606469002695,
+      "grad_norm": 1.553478717803955,
+      "learning_rate": 0.0004354702702702703,
+      "loss": 16.2869,
+      "step": 10250
+    },
+    {
+      "epoch": 5.55256064690027,
+      "grad_norm": 1.482102870941162,
+      "learning_rate": 0.0004346594594594594,
+      "loss": 16.2996,
+      "step": 10300
+    },
+    {
+      "epoch": 5.579514824797844,
+      "grad_norm": 1.4998143911361694,
+      "learning_rate": 0.0004338486486486486,
+      "loss": 16.3236,
+      "step": 10350
+    },
+    {
+      "epoch": 5.606469002695418,
+      "grad_norm": 1.3639193773269653,
+      "learning_rate": 0.0004330378378378378,
+      "loss": 16.3022,
+      "step": 10400
+    },
+    {
+      "epoch": 5.633423180592992,
+      "grad_norm": 1.5346744060516357,
+      "learning_rate": 0.00043222702702702697,
+      "loss": 16.3686,
+      "step": 10450
+    },
+    {
+      "epoch": 5.660377358490566,
+      "grad_norm": 1.5229798555374146,
+      "learning_rate": 0.00043141621621621616,
+      "loss": 16.339,
+      "step": 10500
+    },
+    {
+      "epoch": 5.6873315363881405,
+      "grad_norm": 1.6072639226913452,
+      "learning_rate": 0.0004306054054054054,
+      "loss": 16.3501,
+      "step": 10550
+    },
+    {
+      "epoch": 5.714285714285714,
+      "grad_norm": 1.42572021484375,
+      "learning_rate": 0.0004297945945945946,
+      "loss": 16.3437,
+      "step": 10600
+    },
+    {
+      "epoch": 5.741239892183288,
+      "grad_norm": 1.48056960105896,
+      "learning_rate": 0.0004289837837837838,
+      "loss": 16.3218,
+      "step": 10650
+    },
+    {
+      "epoch": 5.768194070080862,
+      "grad_norm": 1.6410192251205444,
+      "learning_rate": 0.0004281729729729729,
+      "loss": 16.301,
+      "step": 10700
+    },
+    {
+      "epoch": 5.795148247978437,
+      "grad_norm": 1.4807794094085693,
+      "learning_rate": 0.0004273621621621621,
+      "loss": 16.2993,
+      "step": 10750
+    },
+    {
+      "epoch": 5.822102425876011,
+      "grad_norm": 1.5122348070144653,
+      "learning_rate": 0.0004265513513513513,
+      "loss": 16.3178,
+      "step": 10800
+    },
+    {
+      "epoch": 5.849056603773585,
+      "grad_norm": 1.4910032749176025,
+      "learning_rate": 0.0004257405405405405,
+      "loss": 16.3674,
+      "step": 10850
+    },
+    {
+      "epoch": 5.876010781671159,
+      "grad_norm": 1.449251413345337,
+      "learning_rate": 0.0004249297297297297,
+      "loss": 16.3492,
+      "step": 10900
+    },
+    {
+      "epoch": 5.902964959568733,
+      "grad_norm": 1.4819191694259644,
+      "learning_rate": 0.0004241189189189189,
+      "loss": 16.3509,
+      "step": 10950
+    },
+    {
+      "epoch": 5.929919137466308,
+      "grad_norm": 1.4536081552505493,
+      "learning_rate": 0.0004233081081081081,
+      "loss": 16.3595,
+      "step": 11000
+    },
+    {
+      "epoch": 5.929919137466308,
+      "eval_accuracy": 0.3857778223954198,
+      "eval_loss": 3.337559938430786,
+      "eval_runtime": 183.0486,
+      "eval_samples_per_second": 98.4,
+      "eval_steps_per_second": 6.151,
+      "step": 11000
+    },
+    {
+      "epoch": 5.9568733153638815,
+      "grad_norm": 1.6155593395233154,
+      "learning_rate": 0.00042249729729729727,
+      "loss": 16.3047,
+      "step": 11050
+    },
+    {
+      "epoch": 5.9838274932614555,
+      "grad_norm": 1.4833221435546875,
+      "learning_rate": 0.0004216864864864864,
+      "loss": 16.3674,
+      "step": 11100
+    },
+    {
+      "epoch": 6.010781671159029,
+      "grad_norm": 1.4600006341934204,
+      "learning_rate": 0.0004208756756756756,
+      "loss": 16.168,
+      "step": 11150
+    },
+    {
+      "epoch": 6.037735849056604,
+      "grad_norm": 1.5126168727874756,
+      "learning_rate": 0.00042006486486486483,
+      "loss": 15.8955,
+      "step": 11200
+    },
+    {
+      "epoch": 6.064690026954178,
+      "grad_norm": 1.5764695405960083,
+      "learning_rate": 0.000419254054054054,
+      "loss": 15.8718,
+      "step": 11250
+    },
+    {
+      "epoch": 6.091644204851752,
+      "grad_norm": 1.6239944696426392,
+      "learning_rate": 0.0004184432432432432,
+      "loss": 15.8961,
+      "step": 11300
+    },
+    {
+      "epoch": 6.118598382749326,
+      "grad_norm": 1.5485799312591553,
+      "learning_rate": 0.0004176324324324324,
+      "loss": 15.9413,
+      "step": 11350
+    },
+    {
+      "epoch": 6.1455525606469,
+      "grad_norm": 1.5715981721878052,
+      "learning_rate": 0.0004168216216216216,
+      "loss": 15.9672,
+      "step": 11400
+    },
+    {
+      "epoch": 6.172506738544475,
+      "grad_norm": 1.5469777584075928,
+      "learning_rate": 0.0004160108108108108,
+      "loss": 15.9807,
+      "step": 11450
+    },
+    {
+      "epoch": 6.199460916442049,
+      "grad_norm": 1.4775831699371338,
+      "learning_rate": 0.00041519999999999995,
+      "loss": 16.0002,
+      "step": 11500
+    },
+    {
+      "epoch": 6.226415094339623,
+      "grad_norm": 1.545695424079895,
+      "learning_rate": 0.00041438918918918914,
+      "loss": 16.028,
+      "step": 11550
+    },
+    {
+      "epoch": 6.2533692722371965,
+      "grad_norm": 1.4563783407211304,
+      "learning_rate": 0.00041357837837837833,
+      "loss": 16.0287,
+      "step": 11600
+    },
+    {
+      "epoch": 6.280323450134771,
+      "grad_norm": 1.4710512161254883,
+      "learning_rate": 0.0004127675675675675,
+      "loss": 16.0216,
+      "step": 11650
+    },
+    {
+      "epoch": 6.307277628032345,
+      "grad_norm": 1.4847939014434814,
+      "learning_rate": 0.0004119567567567567,
+      "loss": 16.0212,
+      "step": 11700
+    },
+    {
+      "epoch": 6.334231805929919,
+      "grad_norm": 1.4909292459487915,
+      "learning_rate": 0.00041114594594594594,
+      "loss": 16.0188,
+      "step": 11750
+    },
+    {
+      "epoch": 6.361185983827493,
+      "grad_norm": 1.4875826835632324,
+      "learning_rate": 0.00041033513513513513,
+      "loss": 16.0297,
+      "step": 11800
+    },
+    {
+      "epoch": 6.388140161725067,
+      "grad_norm": 1.471168041229248,
+      "learning_rate": 0.0004095243243243243,
+      "loss": 16.0682,
+      "step": 11850
+    },
+    {
+      "epoch": 6.415094339622642,
+      "grad_norm": 1.4281312227249146,
+      "learning_rate": 0.00040871351351351345,
+      "loss": 16.0746,
+      "step": 11900
+    },
+    {
+      "epoch": 6.442048517520216,
+      "grad_norm": 1.4156763553619385,
+      "learning_rate": 0.00040790270270270264,
+      "loss": 16.0395,
+      "step": 11950
+    },
+    {
+      "epoch": 6.46900269541779,
+      "grad_norm": 1.5514986515045166,
+      "learning_rate": 0.0004070918918918918,
+      "loss": 16.0848,
+      "step": 12000
+    },
+    {
+      "epoch": 6.46900269541779,
+      "eval_accuracy": 0.38665525440095816,
+      "eval_loss": 3.334074020385742,
+      "eval_runtime": 183.4671,
+      "eval_samples_per_second": 98.176,
+      "eval_steps_per_second": 6.137,
+      "step": 12000
+    },
+    {
+      "epoch": 6.495956873315364,
+      "grad_norm": 1.50123929977417,
+      "learning_rate": 0.000406281081081081,
+      "loss": 16.0249,
+      "step": 12050
+    },
+    {
+      "epoch": 6.5229110512129385,
+      "grad_norm": 1.5448544025421143,
+      "learning_rate": 0.00040547027027027025,
+      "loss": 16.055,
+      "step": 12100
+    },
+    {
+      "epoch": 6.549865229110512,
+      "grad_norm": 1.4838643074035645,
+      "learning_rate": 0.00040465945945945944,
+      "loss": 16.0818,
+      "step": 12150
+    },
+    {
+      "epoch": 6.576819407008086,
+      "grad_norm": 1.5144941806793213,
+      "learning_rate": 0.00040384864864864863,
+      "loss": 16.0781,
+      "step": 12200
+    },
+    {
+      "epoch": 6.60377358490566,
+      "grad_norm": 1.5219935178756714,
+      "learning_rate": 0.0004030378378378378,
+      "loss": 16.1189,
+      "step": 12250
+    },
+    {
+      "epoch": 6.630727762803234,
+      "grad_norm": 1.5085090398788452,
+      "learning_rate": 0.00040222702702702695,
+      "loss": 16.0967,
+      "step": 12300
+    },
+    {
+      "epoch": 6.657681940700809,
+      "grad_norm": 1.3718434572219849,
+      "learning_rate": 0.00040141621621621614,
+      "loss": 16.063,
+      "step": 12350
+    },
+    {
+      "epoch": 6.684636118598383,
+      "grad_norm": 1.4403940439224243,
+      "learning_rate": 0.0004006054054054054,
+      "loss": 16.0543,
+      "step": 12400
+    },
+    {
+      "epoch": 6.711590296495957,
+      "grad_norm": 1.4802608489990234,
+      "learning_rate": 0.00039979459459459456,
+      "loss": 16.0814,
+      "step": 12450
+    },
+    {
+      "epoch": 6.738544474393531,
+      "grad_norm": 1.5132447481155396,
+      "learning_rate": 0.00039898378378378375,
+      "loss": 16.0892,
+      "step": 12500
+    },
+    {
+      "epoch": 6.765498652291106,
+      "grad_norm": 1.5833419561386108,
+      "learning_rate": 0.00039817297297297294,
+      "loss": 16.1005,
+      "step": 12550
+    },
+    {
+      "epoch": 6.7924528301886795,
+      "grad_norm": 1.4338456392288208,
+      "learning_rate": 0.0003973621621621621,
+      "loss": 16.1218,
+      "step": 12600
+    },
+    {
+      "epoch": 6.819407008086253,
+      "grad_norm": 1.5977522134780884,
+      "learning_rate": 0.00039655135135135137,
+      "loss": 16.1079,
+      "step": 12650
+    },
+    {
+      "epoch": 6.846361185983827,
+      "grad_norm": 1.5367788076400757,
+      "learning_rate": 0.0003957405405405405,
+      "loss": 16.0879,
+      "step": 12700
+    },
+    {
+      "epoch": 6.873315363881401,
+      "grad_norm": 1.3250383138656616,
+      "learning_rate": 0.0003949297297297297,
+      "loss": 16.1008,
+      "step": 12750
+    },
+    {
+      "epoch": 6.900269541778976,
+      "grad_norm": 1.6440327167510986,
+      "learning_rate": 0.00039411891891891887,
+      "loss": 16.1511,
+      "step": 12800
+    },
+    {
+      "epoch": 6.92722371967655,
+      "grad_norm": 1.5747231245040894,
+      "learning_rate": 0.00039330810810810806,
+      "loss": 16.101,
+      "step": 12850
+    },
+    {
+      "epoch": 6.954177897574124,
+      "grad_norm": 1.536387324333191,
+      "learning_rate": 0.00039249729729729725,
+      "loss": 16.1572,
+      "step": 12900
+    },
+    {
+      "epoch": 6.981132075471698,
+      "grad_norm": 1.4417625665664673,
+      "learning_rate": 0.0003916864864864865,
+      "loss": 16.1219,
+      "step": 12950
+    },
+    {
+      "epoch": 7.008086253369272,
+      "grad_norm": 1.502989649772644,
+      "learning_rate": 0.0003908756756756757,
+      "loss": 15.9683,
+      "step": 13000
+    },
+    {
+      "epoch": 7.008086253369272,
+      "eval_accuracy": 0.387830161497032,
+      "eval_loss": 3.324005365371704,
+      "eval_runtime": 183.0109,
+      "eval_samples_per_second": 98.42,
+      "eval_steps_per_second": 6.153,
+      "step": 13000
+    },
+    {
+      "epoch": 7.035040431266847,
+      "grad_norm": 1.597719430923462,
+      "learning_rate": 0.00039006486486486486,
+      "loss": 15.6299,
+      "step": 13050
+    },
+    {
+      "epoch": 7.061994609164421,
+      "grad_norm": 1.4646968841552734,
+      "learning_rate": 0.000389254054054054,
+      "loss": 15.6497,
+      "step": 13100
+    },
+    {
+      "epoch": 7.0889487870619945,
+      "grad_norm": 1.446700930595398,
+      "learning_rate": 0.0003884432432432432,
+      "loss": 15.6701,
+      "step": 13150
+    },
+    {
+      "epoch": 7.115902964959568,
+      "grad_norm": 1.491384744644165,
+      "learning_rate": 0.00038763243243243237,
+      "loss": 15.7227,
+      "step": 13200
+    },
+    {
+      "epoch": 7.142857142857143,
+      "grad_norm": 1.5890462398529053,
+      "learning_rate": 0.00038682162162162156,
+      "loss": 15.7391,
+      "step": 13250
+    },
+    {
+      "epoch": 7.169811320754717,
+      "grad_norm": 1.4061617851257324,
+      "learning_rate": 0.0003860108108108108,
+      "loss": 15.7767,
+      "step": 13300
+    },
+    {
+      "epoch": 7.196765498652291,
+      "grad_norm": 1.529066562652588,
+      "learning_rate": 0.0003852,
+      "loss": 15.7941,
+      "step": 13350
+    },
+    {
+      "epoch": 7.223719676549865,
+      "grad_norm": 1.6018165349960327,
+      "learning_rate": 0.00038438918918918917,
+      "loss": 15.8158,
+      "step": 13400
+    },
+    {
+      "epoch": 7.250673854447439,
+      "grad_norm": 1.4185563325881958,
+      "learning_rate": 0.00038357837837837836,
+      "loss": 15.813,
+      "step": 13450
+    },
+    {
+      "epoch": 7.277628032345014,
+      "grad_norm": 1.3626817464828491,
+      "learning_rate": 0.0003827675675675675,
+      "loss": 15.7934,
+      "step": 13500
+    },
+    {
+      "epoch": 7.304582210242588,
+      "grad_norm": 1.5028986930847168,
+      "learning_rate": 0.0003819567567567567,
+      "loss": 15.8513,
+      "step": 13550
+    },
+    {
+      "epoch": 7.331536388140162,
+      "grad_norm": 1.55814790725708,
+      "learning_rate": 0.0003811459459459459,
+      "loss": 15.8259,
+      "step": 13600
+    },
+    {
+      "epoch": 7.3584905660377355,
+      "grad_norm": 1.5069361925125122,
+      "learning_rate": 0.0003803351351351351,
+      "loss": 15.8,
+      "step": 13650
+    },
+    {
+      "epoch": 7.38544474393531,
+      "grad_norm": 1.5427130460739136,
+      "learning_rate": 0.0003795243243243243,
+      "loss": 15.8371,
+      "step": 13700
+    },
+    {
+      "epoch": 7.412398921832884,
+      "grad_norm": 1.6525657176971436,
+      "learning_rate": 0.0003787135135135135,
+      "loss": 15.8431,
+      "step": 13750
+    },
+    {
+      "epoch": 7.439353099730458,
+      "grad_norm": 1.4115492105484009,
+      "learning_rate": 0.00037790270270270267,
+      "loss": 15.8876,
+      "step": 13800
+    },
+    {
+      "epoch": 7.466307277628032,
+      "grad_norm": 1.4626988172531128,
+      "learning_rate": 0.0003770918918918919,
+      "loss": 15.8694,
+      "step": 13850
+    },
+    {
+      "epoch": 7.493261455525606,
+      "grad_norm": 1.446192979812622,
+      "learning_rate": 0.000376281081081081,
+      "loss": 15.8921,
+      "step": 13900
+    },
+    {
+      "epoch": 7.520215633423181,
+      "grad_norm": 1.570351004600525,
+      "learning_rate": 0.00037547027027027023,
+      "loss": 15.8805,
+      "step": 13950
+    },
+    {
+      "epoch": 7.547169811320755,
+      "grad_norm": 1.4021625518798828,
+      "learning_rate": 0.0003746594594594594,
+      "loss": 15.9139,
+      "step": 14000
+    },
+    {
+      "epoch": 7.547169811320755,
+      "eval_accuracy": 0.38896671625309154,
+      "eval_loss": 3.3171634674072266,
+      "eval_runtime": 183.4636,
+      "eval_samples_per_second": 98.178,
+      "eval_steps_per_second": 6.137,
+      "step": 14000
+    },
+    {
+      "epoch": 7.574123989218329,
+      "grad_norm": 1.5256940126419067,
+      "learning_rate": 0.0003738486486486486,
+      "loss": 15.844,
+      "step": 14050
+    },
+    {
+      "epoch": 7.601078167115903,
+      "grad_norm": 1.5494848489761353,
+      "learning_rate": 0.0003730378378378378,
+      "loss": 15.8598,
+      "step": 14100
+    },
+    {
+      "epoch": 7.628032345013477,
+      "grad_norm": 1.4843692779541016,
+      "learning_rate": 0.000372227027027027,
+      "loss": 15.8758,
+      "step": 14150
+    },
+    {
+      "epoch": 7.654986522911051,
+      "grad_norm": 1.4996877908706665,
+      "learning_rate": 0.0003714162162162162,
+      "loss": 15.9204,
+      "step": 14200
+    },
+    {
+      "epoch": 7.681940700808625,
+      "grad_norm": 1.4797159433364868,
+      "learning_rate": 0.0003706054054054054,
+      "loss": 15.9329,
+      "step": 14250
+    },
+    {
+      "epoch": 7.708894878706199,
+      "grad_norm": 1.4859975576400757,
+      "learning_rate": 0.0003697945945945946,
+      "loss": 15.9547,
+      "step": 14300
+    },
+    {
+      "epoch": 7.735849056603773,
+      "grad_norm": 1.5032356977462769,
+      "learning_rate": 0.0003689837837837837,
+      "loss": 15.9601,
+      "step": 14350
+    },
+    {
+      "epoch": 7.762803234501348,
+      "grad_norm": 1.4542951583862305,
+      "learning_rate": 0.0003681729729729729,
+      "loss": 15.8923,
+      "step": 14400
+    },
+    {
+      "epoch": 7.789757412398922,
+      "grad_norm": 1.4771339893341064,
+      "learning_rate": 0.0003673621621621621,
+      "loss": 15.9012,
+      "step": 14450
+    },
+    {
+      "epoch": 7.816711590296496,
+      "grad_norm": 1.440063714981079,
+      "learning_rate": 0.00036655135135135134,
+      "loss": 15.9271,
+      "step": 14500
+    },
+    {
+      "epoch": 7.84366576819407,
+      "grad_norm": 1.5838462114334106,
+      "learning_rate": 0.00036574054054054053,
+      "loss": 15.931,
+      "step": 14550
+    },
+    {
+      "epoch": 7.870619946091644,
+      "grad_norm": 1.4777487516403198,
+      "learning_rate": 0.0003649297297297297,
+      "loss": 15.9197,
+      "step": 14600
+    },
+    {
+      "epoch": 7.8975741239892185,
+      "grad_norm": 1.497223973274231,
+      "learning_rate": 0.0003641189189189189,
+      "loss": 15.9195,
+      "step": 14650
+    },
+    {
+      "epoch": 7.9245283018867925,
+      "grad_norm": 1.3650174140930176,
+      "learning_rate": 0.0003633081081081081,
+      "loss": 15.9278,
+      "step": 14700
+    },
+    {
+      "epoch": 7.951482479784366,
+      "grad_norm": 1.4432156085968018,
+      "learning_rate": 0.0003624972972972972,
+      "loss": 15.973,
+      "step": 14750
+    },
+    {
+      "epoch": 7.97843665768194,
+      "grad_norm": 1.8180121183395386,
+      "learning_rate": 0.00036168648648648646,
+      "loss": 15.9264,
+      "step": 14800
+    },
+    {
+      "epoch": 8.005390835579515,
+      "grad_norm": 1.5880870819091797,
+      "learning_rate": 0.00036087567567567565,
+      "loss": 15.8274,
+      "step": 14850
+    },
+    {
+      "epoch": 8.032345013477089,
+      "grad_norm": 1.5161232948303223,
+      "learning_rate": 0.00036006486486486484,
+      "loss": 15.4744,
+      "step": 14900
+    },
+    {
+      "epoch": 8.059299191374663,
+      "grad_norm": 1.5016933679580688,
+      "learning_rate": 0.000359254054054054,
+      "loss": 15.5281,
+      "step": 14950
+    },
+    {
+      "epoch": 8.086253369272237,
+      "grad_norm": 1.4770663976669312,
+      "learning_rate": 0.0003584432432432432,
+      "loss": 15.5172,
+      "step": 15000
+    },
+    {
+      "epoch": 8.086253369272237,
+      "eval_accuracy": 0.38934643701328925,
+      "eval_loss": 3.314887285232544,
+      "eval_runtime": 182.5262,
+      "eval_samples_per_second": 98.682,
+      "eval_steps_per_second": 6.169,
+      "step": 15000
+    },
+    {
+      "epoch": 8.11320754716981,
+      "grad_norm": 1.571670413017273,
+      "learning_rate": 0.0003576324324324324,
+      "loss": 15.5307,
+      "step": 15050
+    },
+    {
+      "epoch": 8.140161725067385,
+      "grad_norm": 1.412194013595581,
+      "learning_rate": 0.00035682162162162164,
+      "loss": 15.5542,
+      "step": 15100
+    },
+    {
+      "epoch": 8.167115902964959,
+      "grad_norm": 1.5343204736709595,
+      "learning_rate": 0.0003560108108108108,
+      "loss": 15.5985,
+      "step": 15150
+    },
+    {
+      "epoch": 8.194070080862534,
+      "grad_norm": 1.4480562210083008,
+      "learning_rate": 0.00035519999999999996,
+      "loss": 15.6071,
+      "step": 15200
+    },
+    {
+      "epoch": 8.221024258760108,
+      "grad_norm": 1.4651609659194946,
+      "learning_rate": 0.00035438918918918915,
+      "loss": 15.6506,
+      "step": 15250
+    },
+    {
+      "epoch": 8.247978436657682,
+      "grad_norm": 1.5500885248184204,
+      "learning_rate": 0.00035357837837837833,
+      "loss": 15.6198,
+      "step": 15300
+    },
+    {
+      "epoch": 8.274932614555256,
+      "grad_norm": 1.457398533821106,
+      "learning_rate": 0.0003527675675675675,
+      "loss": 15.6218,
+      "step": 15350
+    },
+    {
+      "epoch": 8.30188679245283,
+      "grad_norm": 1.457667350769043,
+      "learning_rate": 0.00035195675675675676,
+      "loss": 15.6641,
+      "step": 15400
+    },
+    {
+      "epoch": 8.328840970350404,
+      "grad_norm": 1.4689645767211914,
+      "learning_rate": 0.0003511621621621621,
+      "loss": 15.6165,
+      "step": 15450
+    },
+    {
+      "epoch": 8.355795148247978,
+      "grad_norm": 1.434046983718872,
+      "learning_rate": 0.0003503513513513513,
+      "loss": 15.6322,
+      "step": 15500
+    },
+    {
+      "epoch": 8.382749326145552,
+      "grad_norm": 1.5870915651321411,
+      "learning_rate": 0.0003495405405405405,
+      "loss": 15.6622,
+      "step": 15550
+    },
+    {
+      "epoch": 8.409703504043126,
+      "grad_norm": 1.5007177591323853,
+      "learning_rate": 0.0003487297297297297,
+      "loss": 15.6481,
+      "step": 15600
+    },
+    {
+      "epoch": 8.436657681940702,
+      "grad_norm": 1.6240930557250977,
+      "learning_rate": 0.0003479189189189189,
+      "loss": 15.6799,
+      "step": 15650
+    },
+    {
+      "epoch": 8.463611859838275,
+      "grad_norm": 1.5161348581314087,
+      "learning_rate": 0.0003471081081081081,
+      "loss": 15.7144,
+      "step": 15700
+    },
+    {
+      "epoch": 8.49056603773585,
+      "grad_norm": 1.4712945222854614,
+      "learning_rate": 0.0003462972972972973,
+      "loss": 15.7262,
+      "step": 15750
+    },
+    {
+      "epoch": 8.517520215633423,
+      "grad_norm": 1.4982839822769165,
+      "learning_rate": 0.00034548648648648645,
+      "loss": 15.6994,
+      "step": 15800
+    },
+    {
+      "epoch": 8.544474393530997,
+      "grad_norm": 1.5805519819259644,
+      "learning_rate": 0.00034467567567567564,
+      "loss": 15.6952,
+      "step": 15850
+    },
+    {
+      "epoch": 8.571428571428571,
+      "grad_norm": 1.38987398147583,
+      "learning_rate": 0.0003438648648648648,
+      "loss": 15.7478,
+      "step": 15900
+    },
+    {
+      "epoch": 8.598382749326145,
+      "grad_norm": 1.5049515962600708,
+      "learning_rate": 0.000343054054054054,
+      "loss": 15.7328,
+      "step": 15950
+    },
+    {
+      "epoch": 8.625336927223719,
+      "grad_norm": 1.4291828870773315,
+      "learning_rate": 0.0003422432432432432,
+      "loss": 15.7165,
+      "step": 16000
+    },
+    {
+      "epoch": 8.625336927223719,
+      "eval_accuracy": 0.39062988231807194,
+      "eval_loss": 3.300835609436035,
+      "eval_runtime": 182.7741,
+      "eval_samples_per_second": 98.548,
+      "eval_steps_per_second": 6.161,
+      "step": 16000
+    },
+    {
+      "epoch": 8.652291105121293,
+      "grad_norm": 1.511680006980896,
+      "learning_rate": 0.00034143243243243244,
+      "loss": 15.7899,
+      "step": 16050
+    },
+    {
+      "epoch": 8.679245283018869,
+      "grad_norm": 1.4265791177749634,
+      "learning_rate": 0.0003406216216216216,
+      "loss": 15.7614,
+      "step": 16100
+    },
+    {
+      "epoch": 8.706199460916443,
+      "grad_norm": 1.4805567264556885,
+      "learning_rate": 0.0003398108108108108,
+      "loss": 15.7489,
+      "step": 16150
+    },
+    {
+      "epoch": 8.733153638814017,
+      "grad_norm": 1.4429032802581787,
+      "learning_rate": 0.00033899999999999995,
+      "loss": 15.7055,
+      "step": 16200
+    },
+    {
+      "epoch": 8.76010781671159,
+      "grad_norm": 1.5268951654434204,
+      "learning_rate": 0.00033818918918918913,
+      "loss": 15.7464,
+      "step": 16250
+    },
+    {
+      "epoch": 8.787061994609164,
+      "grad_norm": 1.5547935962677002,
+      "learning_rate": 0.0003373783783783783,
+      "loss": 15.7365,
+      "step": 16300
+    },
+    {
+      "epoch": 8.814016172506738,
+      "grad_norm": 1.6019865274429321,
+      "learning_rate": 0.0003365675675675675,
+      "loss": 15.749,
+      "step": 16350
+    },
+    {
+      "epoch": 8.840970350404312,
+      "grad_norm": 1.576707124710083,
+      "learning_rate": 0.00033575675675675675,
+      "loss": 15.7725,
+      "step": 16400
+    },
+    {
+      "epoch": 8.867924528301886,
+      "grad_norm": 1.544109582901001,
+      "learning_rate": 0.00033494594594594594,
+      "loss": 15.7704,
+      "step": 16450
+    },
+    {
+      "epoch": 8.89487870619946,
+      "grad_norm": 1.5295876264572144,
+      "learning_rate": 0.0003341351351351351,
+      "loss": 15.7795,
+      "step": 16500
+    },
+    {
+      "epoch": 8.921832884097036,
+      "grad_norm": 1.4180580377578735,
+      "learning_rate": 0.0003333243243243243,
+      "loss": 15.7632,
+      "step": 16550
+    },
+    {
+      "epoch": 8.94878706199461,
+      "grad_norm": 1.4332422018051147,
+      "learning_rate": 0.00033251351351351344,
+      "loss": 15.7384,
+      "step": 16600
+    },
+    {
+      "epoch": 8.975741239892184,
+      "grad_norm": 1.5407085418701172,
+      "learning_rate": 0.00033170270270270263,
+      "loss": 15.7877,
+      "step": 16650
+    },
+    {
+      "epoch": 9.002695417789758,
+      "grad_norm": 1.6193679571151733,
+      "learning_rate": 0.00033089189189189187,
+      "loss": 15.7523,
+      "step": 16700
+    },
+    {
+      "epoch": 9.029649595687331,
+      "grad_norm": 1.5814419984817505,
+      "learning_rate": 0.00033008108108108106,
+      "loss": 15.3115,
+      "step": 16750
+    },
+    {
+      "epoch": 9.056603773584905,
+      "grad_norm": 1.523964524269104,
+      "learning_rate": 0.00032927027027027024,
+      "loss": 15.3445,
+      "step": 16800
+    },
+    {
+      "epoch": 9.08355795148248,
+      "grad_norm": 1.4931163787841797,
+      "learning_rate": 0.00032845945945945943,
+      "loss": 15.3418,
+      "step": 16850
+    },
+    {
+      "epoch": 9.110512129380053,
+      "grad_norm": 1.443772792816162,
+      "learning_rate": 0.0003276486486486486,
+      "loss": 15.415,
+      "step": 16900
+    },
+    {
+      "epoch": 9.137466307277627,
+      "grad_norm": 1.5072815418243408,
+      "learning_rate": 0.00032683783783783786,
+      "loss": 15.464,
+      "step": 16950
+    },
+    {
+      "epoch": 9.164420485175203,
+      "grad_norm": 1.5310089588165283,
+      "learning_rate": 0.00032602702702702694,
+      "loss": 15.4386,
+      "step": 17000
+    },
+    {
+      "epoch": 9.164420485175203,
+      "eval_accuracy": 0.39104013284468325,
+      "eval_loss": 3.3030104637145996,
+      "eval_runtime": 182.8107,
+      "eval_samples_per_second": 98.528,
+      "eval_steps_per_second": 6.159,
+      "step": 17000
+    },
+    {
+      "epoch": 9.191374663072777,
+      "grad_norm": 1.5395615100860596,
+      "learning_rate": 0.0003252162162162162,
+      "loss": 15.4729,
+      "step": 17050
+    },
+    {
+      "epoch": 9.21832884097035,
+      "grad_norm": 1.4140956401824951,
+      "learning_rate": 0.00032440540540540537,
+      "loss": 15.5002,
+      "step": 17100
+    },
+    {
+      "epoch": 9.245283018867925,
+      "grad_norm": 1.447230577468872,
+      "learning_rate": 0.00032359459459459455,
+      "loss": 15.4569,
+      "step": 17150
+    },
+    {
+      "epoch": 9.272237196765499,
+      "grad_norm": 1.4686763286590576,
+      "learning_rate": 0.00032278378378378374,
+      "loss": 15.4912,
+      "step": 17200
+    },
+    {
+      "epoch": 9.299191374663073,
+      "grad_norm": 1.5187294483184814,
+      "learning_rate": 0.00032197297297297293,
+      "loss": 15.4744,
+      "step": 17250
+    },
+    {
+      "epoch": 9.326145552560646,
+      "grad_norm": 1.6057308912277222,
+      "learning_rate": 0.00032116216216216217,
+      "loss": 15.4729,
+      "step": 17300
+    },
+    {
+      "epoch": 9.35309973045822,
+      "grad_norm": 1.606978416442871,
+      "learning_rate": 0.00032035135135135136,
+      "loss": 15.4983,
+      "step": 17350
+    },
+    {
+      "epoch": 9.380053908355794,
+      "grad_norm": 1.5377631187438965,
+      "learning_rate": 0.0003195405405405405,
+      "loss": 15.5095,
+      "step": 17400
+    },
+    {
+      "epoch": 9.40700808625337,
+      "grad_norm": 1.4473826885223389,
+      "learning_rate": 0.0003187297297297297,
+      "loss": 15.5087,
+      "step": 17450
+    },
+    {
+      "epoch": 9.433962264150944,
+      "grad_norm": 1.4930968284606934,
+      "learning_rate": 0.00031791891891891886,
+      "loss": 15.5576,
+      "step": 17500
+    },
+    {
+      "epoch": 9.460916442048518,
+      "grad_norm": 1.4278721809387207,
+      "learning_rate": 0.00031710810810810805,
+      "loss": 15.493,
+      "step": 17550
+    },
+    {
+      "epoch": 9.487870619946092,
+      "grad_norm": 1.4520282745361328,
+      "learning_rate": 0.0003162972972972973,
+      "loss": 15.5464,
+      "step": 17600
+    },
+    {
+      "epoch": 9.514824797843666,
+      "grad_norm": 1.5626846551895142,
+      "learning_rate": 0.0003154864864864865,
+      "loss": 15.5072,
+      "step": 17650
+    },
+    {
+      "epoch": 9.54177897574124,
+      "grad_norm": 1.4320954084396362,
+      "learning_rate": 0.00031467567567567567,
+      "loss": 15.5693,
+      "step": 17700
+    },
+    {
+      "epoch": 9.568733153638814,
+      "grad_norm": 1.511391282081604,
+      "learning_rate": 0.00031386486486486485,
+      "loss": 15.5792,
+      "step": 17750
+    },
+    {
+      "epoch": 9.595687331536388,
+      "grad_norm": 1.561791181564331,
+      "learning_rate": 0.00031305405405405404,
+      "loss": 15.6085,
+      "step": 17800
+    },
+    {
+      "epoch": 9.622641509433961,
+      "grad_norm": 1.5459599494934082,
+      "learning_rate": 0.0003122432432432432,
+      "loss": 15.5633,
+      "step": 17850
+    },
+    {
+      "epoch": 9.649595687331537,
+      "grad_norm": 1.5561130046844482,
+      "learning_rate": 0.0003114324324324324,
+      "loss": 15.5673,
+      "step": 17900
+    },
+    {
+      "epoch": 9.676549865229111,
+      "grad_norm": 1.5501707792282104,
+      "learning_rate": 0.0003106216216216216,
+      "loss": 15.6434,
+      "step": 17950
+    },
+    {
+      "epoch": 9.703504043126685,
+      "grad_norm": 1.5101507902145386,
+      "learning_rate": 0.0003098108108108108,
+      "loss": 15.6082,
+      "step": 18000
+    },
+    {
+      "epoch": 9.703504043126685,
+      "eval_accuracy": 0.39216082515983036,
+      "eval_loss": 3.289290189743042,
+      "eval_runtime": 183.0971,
+      "eval_samples_per_second": 98.374,
+      "eval_steps_per_second": 6.15,
+      "step": 18000
+    },
+    {
+      "epoch": 9.730458221024259,
+      "grad_norm": 1.5629507303237915,
+      "learning_rate": 0.000309,
+      "loss": 15.5575,
+      "step": 18050
+    },
+    {
+      "epoch": 9.757412398921833,
+      "grad_norm": 1.4830306768417358,
+      "learning_rate": 0.00030818918918918916,
+      "loss": 15.6156,
+      "step": 18100
+    },
+    {
+      "epoch": 9.784366576819407,
+      "grad_norm": 1.5295294523239136,
+      "learning_rate": 0.00030737837837837835,
+      "loss": 15.5877,
+      "step": 18150
+    },
+    {
+      "epoch": 9.81132075471698,
+      "grad_norm": 1.597743272781372,
+      "learning_rate": 0.0003065675675675676,
+      "loss": 15.6086,
+      "step": 18200
+    },
+    {
+      "epoch": 9.838274932614555,
+      "grad_norm": 1.510759949684143,
+      "learning_rate": 0.0003057567567567567,
+      "loss": 15.6034,
+      "step": 18250
+    },
+    {
+      "epoch": 9.865229110512129,
+      "grad_norm": 1.4675997495651245,
+      "learning_rate": 0.0003049459459459459,
+      "loss": 15.6475,
+      "step": 18300
+    },
+    {
+      "epoch": 9.892183288409704,
+      "grad_norm": 1.540273666381836,
+      "learning_rate": 0.0003041351351351351,
+      "loss": 15.6564,
+      "step": 18350
+    },
+    {
+      "epoch": 9.919137466307278,
+      "grad_norm": 1.4667071104049683,
+      "learning_rate": 0.0003033243243243243,
+      "loss": 15.5939,
+      "step": 18400
+    },
+    {
+      "epoch": 9.946091644204852,
+      "grad_norm": 1.4966003894805908,
+      "learning_rate": 0.00030251351351351347,
+      "loss": 15.603,
+      "step": 18450
+    },
+    {
+      "epoch": 9.973045822102426,
+      "grad_norm": 1.6007273197174072,
+      "learning_rate": 0.0003017027027027027,
+      "loss": 15.6328,
+      "step": 18500
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 2.202394485473633,
+      "learning_rate": 0.0003008918918918919,
+      "loss": 15.623,
+      "step": 18550
+    },
+    {
+      "epoch": 10.026954177897574,
+      "grad_norm": 1.4779033660888672,
+      "learning_rate": 0.0003000810810810811,
+      "loss": 15.1902,
+      "step": 18600
+    },
+    {
+      "epoch": 10.053908355795148,
+      "grad_norm": 1.5733294486999512,
+      "learning_rate": 0.0002992702702702703,
+      "loss": 15.2046,
+      "step": 18650
+    },
+    {
+      "epoch": 10.080862533692722,
+      "grad_norm": 1.5170866250991821,
+      "learning_rate": 0.00029845945945945946,
+      "loss": 15.2229,
+      "step": 18700
+    },
+    {
+      "epoch": 10.107816711590296,
+      "grad_norm": 1.6261968612670898,
+      "learning_rate": 0.0002976486486486486,
+      "loss": 15.2227,
+      "step": 18750
+    },
+    {
+      "epoch": 10.134770889487871,
+      "grad_norm": 1.468407154083252,
+      "learning_rate": 0.00029683783783783784,
+      "loss": 15.3124,
+      "step": 18800
+    },
+    {
+      "epoch": 10.161725067385445,
+      "grad_norm": 1.5138444900512695,
+      "learning_rate": 0.000296027027027027,
+      "loss": 15.3033,
+      "step": 18850
+    },
+    {
+      "epoch": 10.18867924528302,
+      "grad_norm": 1.4424211978912354,
+      "learning_rate": 0.0002952162162162162,
+      "loss": 15.2999,
+      "step": 18900
+    },
+    {
+      "epoch": 10.215633423180593,
+      "grad_norm": 1.5810476541519165,
+      "learning_rate": 0.0002944054054054054,
+      "loss": 15.317,
+      "step": 18950
+    },
+    {
+      "epoch": 10.242587601078167,
+      "grad_norm": 1.403139591217041,
+      "learning_rate": 0.0002935945945945946,
+      "loss": 15.3025,
+      "step": 19000
+    },
+    {
+      "epoch": 10.242587601078167,
+      "eval_accuracy": 0.39254825984677316,
+      "eval_loss": 3.2937002182006836,
+      "eval_runtime": 182.7882,
+      "eval_samples_per_second": 98.54,
+      "eval_steps_per_second": 6.16,
+      "step": 19000
+    },
+    {
+      "epoch": 10.269541778975741,
+      "grad_norm": 1.54314386844635,
+      "learning_rate": 0.00029278378378378377,
+      "loss": 15.3189,
+      "step": 19050
+    },
+    {
+      "epoch": 10.296495956873315,
+      "grad_norm": 1.4563575983047485,
+      "learning_rate": 0.00029197297297297296,
+      "loss": 15.3523,
+      "step": 19100
+    },
+    {
+      "epoch": 10.323450134770889,
+      "grad_norm": 1.448618769645691,
+      "learning_rate": 0.00029116216216216215,
+      "loss": 15.3741,
+      "step": 19150
+    },
+    {
+      "epoch": 10.350404312668463,
+      "grad_norm": 1.5495871305465698,
+      "learning_rate": 0.00029035135135135133,
+      "loss": 15.3719,
+      "step": 19200
+    },
+    {
+      "epoch": 10.377358490566039,
+      "grad_norm": 1.5040892362594604,
+      "learning_rate": 0.0002895405405405405,
+      "loss": 15.3823,
+      "step": 19250
+    },
+    {
+      "epoch": 10.404312668463612,
+      "grad_norm": 1.4656543731689453,
+      "learning_rate": 0.0002887297297297297,
+      "loss": 15.3649,
+      "step": 19300
+    },
+    {
+      "epoch": 10.431266846361186,
+      "grad_norm": 1.5206462144851685,
+      "learning_rate": 0.0002879189189189189,
+      "loss": 15.4259,
+      "step": 19350
+    },
+    {
+      "epoch": 10.45822102425876,
+      "grad_norm": 1.6246916055679321,
+      "learning_rate": 0.0002871081081081081,
+      "loss": 15.4196,
+      "step": 19400
+    },
+    {
+      "epoch": 10.485175202156334,
+      "grad_norm": 1.4970046281814575,
+      "learning_rate": 0.00028631351351351346,
+      "loss": 15.4277,
+      "step": 19450
+    },
+    {
+      "epoch": 10.512129380053908,
+      "grad_norm": 1.501575231552124,
+      "learning_rate": 0.0002855027027027027,
+      "loss": 15.3925,
+      "step": 19500
+    },
+    {
+      "epoch": 10.539083557951482,
+      "grad_norm": 1.5174660682678223,
+      "learning_rate": 0.0002846918918918919,
+      "loss": 15.4452,
+      "step": 19550
+    },
+    {
+      "epoch": 10.566037735849056,
+      "grad_norm": 1.4966565370559692,
+      "learning_rate": 0.000283881081081081,
+      "loss": 15.4459,
+      "step": 19600
+    },
+    {
+      "epoch": 10.59299191374663,
+      "grad_norm": 1.4583094120025635,
+      "learning_rate": 0.00028307027027027026,
+      "loss": 15.4223,
+      "step": 19650
+    },
+    {
+      "epoch": 10.619946091644206,
+      "grad_norm": 1.563902735710144,
+      "learning_rate": 0.00028225945945945945,
+      "loss": 15.4786,
+      "step": 19700
+    },
+    {
+      "epoch": 10.64690026954178,
+      "grad_norm": 1.456220030784607,
+      "learning_rate": 0.00028144864864864863,
+      "loss": 15.4053,
+      "step": 19750
+    },
+    {
+      "epoch": 10.673854447439354,
+      "grad_norm": 1.4244598150253296,
+      "learning_rate": 0.000280654054054054,
+      "loss": 15.4523,
+      "step": 19800
+    },
+    {
+      "epoch": 10.700808625336927,
+      "grad_norm": 1.6043477058410645,
+      "learning_rate": 0.00027984324324324325,
+      "loss": 15.4974,
+      "step": 19850
+    },
+    {
+      "epoch": 10.727762803234501,
+      "grad_norm": 1.5195151567459106,
+      "learning_rate": 0.0002790324324324324,
+      "loss": 15.4587,
+      "step": 19900
+    },
+    {
+      "epoch": 10.754716981132075,
+      "grad_norm": 1.4490087032318115,
+      "learning_rate": 0.0002782216216216216,
+      "loss": 15.4793,
+      "step": 19950
+    },
+    {
+      "epoch": 10.78167115902965,
+      "grad_norm": 1.4315603971481323,
+      "learning_rate": 0.0002774108108108108,
+      "loss": 15.4821,
+      "step": 20000
+    },
+    {
+      "epoch": 10.78167115902965,
+      "eval_accuracy": 0.3933073754266019,
+      "eval_loss": 3.2815403938293457,
+      "eval_runtime": 182.6356,
+      "eval_samples_per_second": 98.623,
+      "eval_steps_per_second": 6.165,
+      "step": 20000
+    },
+    {
+      "epoch": 10.808625336927223,
+      "grad_norm": 1.5369899272918701,
+      "learning_rate": 0.0002766,
+      "loss": 15.4788,
+      "step": 20050
+    },
+    {
+      "epoch": 10.835579514824797,
+      "grad_norm": 1.517513394355774,
+      "learning_rate": 0.00027578918918918913,
+      "loss": 15.497,
+      "step": 20100
+    },
+    {
+      "epoch": 10.862533692722373,
+      "grad_norm": 1.4283959865570068,
+      "learning_rate": 0.0002749783783783784,
+      "loss": 15.4969,
+      "step": 20150
+    },
+    {
+      "epoch": 10.889487870619947,
+      "grad_norm": 1.5472018718719482,
+      "learning_rate": 0.00027416756756756756,
+      "loss": 15.4344,
+      "step": 20200
+    },
+    {
+      "epoch": 10.91644204851752,
+      "grad_norm": 1.4912693500518799,
+      "learning_rate": 0.00027335675675675675,
+      "loss": 15.4865,
+      "step": 20250
+    },
+    {
+      "epoch": 10.943396226415095,
+      "grad_norm": 1.49799644947052,
+      "learning_rate": 0.0002725459459459459,
+      "loss": 15.4216,
+      "step": 20300
+    },
+    {
+      "epoch": 10.970350404312669,
+      "grad_norm": 1.434645652770996,
+      "learning_rate": 0.0002717351351351351,
+      "loss": 15.5138,
+      "step": 20350
+    },
+    {
+      "epoch": 10.997304582210242,
+      "grad_norm": 1.4343045949935913,
+      "learning_rate": 0.0002709243243243243,
+      "loss": 15.4722,
+      "step": 20400
+    },
+    {
+      "epoch": 11.024258760107816,
+      "grad_norm": 1.537205696105957,
+      "learning_rate": 0.0002701135135135135,
+      "loss": 15.1417,
+      "step": 20450
+    },
+    {
+      "epoch": 11.05121293800539,
+      "grad_norm": 1.5110833644866943,
+      "learning_rate": 0.0002693027027027027,
+      "loss": 15.0783,
+      "step": 20500
+    },
+    {
+      "epoch": 11.078167115902964,
+      "grad_norm": 1.4787135124206543,
+      "learning_rate": 0.00026849189189189187,
+      "loss": 15.0941,
+      "step": 20550
+    },
+    {
+      "epoch": 11.10512129380054,
+      "grad_norm": 1.525460124015808,
+      "learning_rate": 0.00026768108108108106,
+      "loss": 15.1286,
+      "step": 20600
+    },
+    {
+      "epoch": 11.132075471698114,
+      "grad_norm": 1.5442931652069092,
+      "learning_rate": 0.00026687027027027025,
+      "loss": 15.1526,
+      "step": 20650
+    },
+    {
+      "epoch": 11.159029649595688,
+      "grad_norm": 1.5607445240020752,
+      "learning_rate": 0.00026605945945945943,
+      "loss": 15.1498,
+      "step": 20700
+    },
+    {
+      "epoch": 11.185983827493262,
+      "grad_norm": 1.588240146636963,
+      "learning_rate": 0.0002652486486486486,
+      "loss": 15.2072,
+      "step": 20750
+    },
+    {
+      "epoch": 11.212938005390836,
+      "grad_norm": 1.4865835905075073,
+      "learning_rate": 0.0002644378378378378,
+      "loss": 15.1718,
+      "step": 20800
+    },
+    {
+      "epoch": 11.23989218328841,
+      "grad_norm": 1.5161540508270264,
+      "learning_rate": 0.000263627027027027,
+      "loss": 15.2105,
+      "step": 20850
+    },
+    {
+      "epoch": 11.266846361185983,
+      "grad_norm": 1.5474003553390503,
+      "learning_rate": 0.0002628162162162162,
+      "loss": 15.2186,
+      "step": 20900
+    },
+    {
+      "epoch": 11.293800539083557,
+      "grad_norm": 1.5761972665786743,
+      "learning_rate": 0.00026200540540540537,
+      "loss": 15.2277,
+      "step": 20950
+    },
+    {
+      "epoch": 11.320754716981131,
+      "grad_norm": 1.513260841369629,
+      "learning_rate": 0.00026119459459459456,
+      "loss": 15.1951,
+      "step": 21000
+    },
+    {
+      "epoch": 11.320754716981131,
+      "eval_accuracy": 0.39346752089170384,
+      "eval_loss": 3.285801410675049,
+      "eval_runtime": 182.9596,
+      "eval_samples_per_second": 98.448,
+      "eval_steps_per_second": 6.154,
+      "step": 21000
+    },
+    {
+      "epoch": 11.347708894878707,
+      "grad_norm": 1.5032051801681519,
+      "learning_rate": 0.0002603837837837838,
+      "loss": 15.2475,
+      "step": 21050
+    },
+    {
+      "epoch": 11.374663072776281,
+      "grad_norm": 1.6460040807724,
+      "learning_rate": 0.00025957297297297293,
+      "loss": 15.2432,
+      "step": 21100
+    },
+    {
+      "epoch": 11.401617250673855,
+      "grad_norm": 1.6060497760772705,
+      "learning_rate": 0.0002587621621621621,
+      "loss": 15.2565,
+      "step": 21150
+    },
+    {
+      "epoch": 11.428571428571429,
+      "grad_norm": 1.4609140157699585,
+      "learning_rate": 0.00025795135135135136,
+      "loss": 15.2865,
+      "step": 21200
+    },
+    {
+      "epoch": 11.455525606469003,
+      "grad_norm": 1.5956884622573853,
+      "learning_rate": 0.00025714054054054054,
+      "loss": 15.2832,
+      "step": 21250
+    },
+    {
+      "epoch": 11.482479784366577,
+      "grad_norm": 1.6202877759933472,
+      "learning_rate": 0.0002563297297297297,
+      "loss": 15.2709,
+      "step": 21300
+    },
+    {
+      "epoch": 11.50943396226415,
+      "grad_norm": 1.5032237768173218,
+      "learning_rate": 0.00025551891891891886,
+      "loss": 15.2972,
+      "step": 21350
+    },
+    {
+      "epoch": 11.536388140161725,
+      "grad_norm": 1.5729618072509766,
+      "learning_rate": 0.0002547081081081081,
+      "loss": 15.3052,
+      "step": 21400
+    },
+    {
+      "epoch": 11.563342318059298,
+      "grad_norm": 1.5918726921081543,
+      "learning_rate": 0.0002538972972972973,
+      "loss": 15.3656,
+      "step": 21450
+    },
+    {
+      "epoch": 11.590296495956874,
+      "grad_norm": 1.525824785232544,
+      "learning_rate": 0.0002530864864864864,
+      "loss": 15.2748,
+      "step": 21500
+    },
+    {
+      "epoch": 11.617250673854448,
+      "grad_norm": 1.4660574197769165,
+      "learning_rate": 0.00025227567567567567,
+      "loss": 15.3363,
+      "step": 21550
+    },
+    {
+      "epoch": 11.644204851752022,
+      "grad_norm": 1.5372847318649292,
+      "learning_rate": 0.00025146486486486485,
+      "loss": 15.293,
+      "step": 21600
+    },
+    {
+      "epoch": 11.671159029649596,
+      "grad_norm": 1.4234718084335327,
+      "learning_rate": 0.00025065405405405404,
+      "loss": 15.308,
+      "step": 21650
+    },
+    {
+      "epoch": 11.69811320754717,
+      "grad_norm": 1.3779908418655396,
+      "learning_rate": 0.00024984324324324323,
+      "loss": 15.329,
+      "step": 21700
+    },
+    {
+      "epoch": 11.725067385444744,
+      "grad_norm": 1.5348743200302124,
+      "learning_rate": 0.0002490324324324324,
+      "loss": 15.3045,
+      "step": 21750
+    },
+    {
+      "epoch": 11.752021563342318,
+      "grad_norm": 1.5083909034729004,
+      "learning_rate": 0.0002482216216216216,
+      "loss": 15.3429,
+      "step": 21800
+    },
+    {
+      "epoch": 11.778975741239892,
+      "grad_norm": 1.4889590740203857,
+      "learning_rate": 0.0002474108108108108,
+      "loss": 15.3267,
+      "step": 21850
+    },
+    {
+      "epoch": 11.805929919137466,
+      "grad_norm": 1.5269172191619873,
+      "learning_rate": 0.0002466,
+      "loss": 15.3488,
+      "step": 21900
+    },
+    {
+      "epoch": 11.832884097035041,
+      "grad_norm": 1.5171701908111572,
+      "learning_rate": 0.00024578918918918916,
+      "loss": 15.4014,
+      "step": 21950
+    },
+    {
+      "epoch": 11.859838274932615,
+      "grad_norm": 1.5388139486312866,
+      "learning_rate": 0.00024497837837837835,
+      "loss": 15.3396,
+      "step": 22000
+    },
+    {
+      "epoch": 11.859838274932615,
+      "eval_accuracy": 0.3947092458039498,
+      "eval_loss": 3.2750062942504883,
+      "eval_runtime": 182.5517,
+      "eval_samples_per_second": 98.668,
+      "eval_steps_per_second": 6.168,
+      "step": 22000
+    },
+    {
+      "epoch": 11.88679245283019,
+      "grad_norm": 1.6454235315322876,
+      "learning_rate": 0.00024416756756756754,
+      "loss": 15.3485,
+      "step": 22050
+    },
+    {
+      "epoch": 11.913746630727763,
+      "grad_norm": 1.439768671989441,
+      "learning_rate": 0.00024335675675675673,
+      "loss": 15.3763,
+      "step": 22100
+    },
+    {
+      "epoch": 11.940700808625337,
+      "grad_norm": 1.5389134883880615,
+      "learning_rate": 0.0002425459459459459,
+      "loss": 15.3453,
+      "step": 22150
+    },
+    {
+      "epoch": 11.967654986522911,
+      "grad_norm": 1.624977469444275,
+      "learning_rate": 0.00024173513513513513,
+      "loss": 15.362,
+      "step": 22200
+    },
+    {
+      "epoch": 11.994609164420485,
+      "grad_norm": 1.4279649257659912,
+      "learning_rate": 0.0002409243243243243,
+      "loss": 15.3672,
+      "step": 22250
+    },
+    {
+      "epoch": 12.021563342318059,
+      "grad_norm": 1.496293544769287,
+      "learning_rate": 0.00024012972972972972,
+      "loss": 15.0297,
+      "step": 22300
+    },
+    {
+      "epoch": 12.048517520215633,
+      "grad_norm": 1.4558320045471191,
+      "learning_rate": 0.0002393189189189189,
+      "loss": 14.9965,
+      "step": 22350
+    },
+    {
+      "epoch": 12.075471698113208,
+      "grad_norm": 1.4995672702789307,
+      "learning_rate": 0.00023850810810810806,
+      "loss": 15.0038,
+      "step": 22400
+    },
+    {
+      "epoch": 12.102425876010782,
+      "grad_norm": 1.54188072681427,
+      "learning_rate": 0.00023769729729729728,
+      "loss": 15.046,
+      "step": 22450
+    },
+    {
+      "epoch": 12.129380053908356,
+      "grad_norm": 1.5988883972167969,
+      "learning_rate": 0.00023688648648648647,
+      "loss": 14.9948,
+      "step": 22500
+    },
+    {
+      "epoch": 12.15633423180593,
+      "grad_norm": 1.6539379358291626,
+      "learning_rate": 0.00023607567567567568,
+      "loss": 15.0689,
+      "step": 22550
+    },
+    {
+      "epoch": 12.183288409703504,
+      "grad_norm": 1.5286942720413208,
+      "learning_rate": 0.00023526486486486484,
+      "loss": 15.0572,
+      "step": 22600
+    },
+    {
+      "epoch": 12.210242587601078,
+      "grad_norm": 1.512137532234192,
+      "learning_rate": 0.00023445405405405403,
+      "loss": 15.0635,
+      "step": 22650
+    },
+    {
+      "epoch": 12.237196765498652,
+      "grad_norm": 1.6056221723556519,
+      "learning_rate": 0.00023364324324324321,
+      "loss": 15.084,
+      "step": 22700
+    },
+    {
+      "epoch": 12.264150943396226,
+      "grad_norm": 1.5421860218048096,
+      "learning_rate": 0.00023283243243243243,
+      "loss": 15.1055,
+      "step": 22750
+    },
+    {
+      "epoch": 12.2911051212938,
+      "grad_norm": 1.4648276567459106,
+      "learning_rate": 0.0002320216216216216,
+      "loss": 15.1192,
+      "step": 22800
+    },
+    {
+      "epoch": 12.318059299191376,
+      "grad_norm": 1.506853461265564,
+      "learning_rate": 0.00023121081081081078,
+      "loss": 15.1146,
+      "step": 22850
+    },
+    {
+      "epoch": 12.34501347708895,
+      "grad_norm": 1.4823317527770996,
+      "learning_rate": 0.0002304,
+      "loss": 15.13,
+      "step": 22900
+    },
+    {
+      "epoch": 12.371967654986523,
+      "grad_norm": 1.5213923454284668,
+      "learning_rate": 0.00022958918918918918,
+      "loss": 15.121,
+      "step": 22950
+    },
+    {
+      "epoch": 12.398921832884097,
+      "grad_norm": 1.553358793258667,
+      "learning_rate": 0.00022877837837837834,
+      "loss": 15.155,
+      "step": 23000
+    },
+    {
+      "epoch": 12.398921832884097,
+      "eval_accuracy": 0.39454377664292517,
+      "eval_loss": 3.280299186706543,
+      "eval_runtime": 182.8131,
+      "eval_samples_per_second": 98.527,
+      "eval_steps_per_second": 6.159,
+      "step": 23000
+    },
+    {
+      "epoch": 12.425876010781671,
+      "grad_norm": 1.6617335081100464,
+      "learning_rate": 0.00022796756756756755,
+      "loss": 15.129,
+      "step": 23050
+    },
+    {
+      "epoch": 12.452830188679245,
+      "grad_norm": 1.5141173601150513,
+      "learning_rate": 0.00022715675675675674,
+      "loss": 15.1529,
+      "step": 23100
+    },
+    {
+      "epoch": 12.479784366576819,
+      "grad_norm": 1.6448322534561157,
+      "learning_rate": 0.00022634594594594595,
+      "loss": 15.1113,
+      "step": 23150
+    },
+    {
+      "epoch": 12.506738544474393,
+      "grad_norm": 1.4851939678192139,
+      "learning_rate": 0.0002255351351351351,
+      "loss": 15.168,
+      "step": 23200
+    },
+    {
+      "epoch": 12.533692722371967,
+      "grad_norm": 1.6145434379577637,
+      "learning_rate": 0.0002247243243243243,
+      "loss": 15.2239,
+      "step": 23250
+    },
+    {
+      "epoch": 12.560646900269543,
+      "grad_norm": 1.4803235530853271,
+      "learning_rate": 0.00022391351351351349,
+      "loss": 15.1449,
+      "step": 23300
+    },
+    {
+      "epoch": 12.587601078167117,
+      "grad_norm": 1.469650149345398,
+      "learning_rate": 0.0002231027027027027,
+      "loss": 15.1656,
+      "step": 23350
+    },
+    {
+      "epoch": 12.61455525606469,
+      "grad_norm": 1.5211254358291626,
+      "learning_rate": 0.00022229189189189186,
+      "loss": 15.2007,
+      "step": 23400
+    },
+    {
+      "epoch": 12.641509433962264,
+      "grad_norm": 1.5244157314300537,
+      "learning_rate": 0.00022148108108108105,
+      "loss": 15.2137,
+      "step": 23450
+    },
+    {
+      "epoch": 12.668463611859838,
+      "grad_norm": 1.5942649841308594,
+      "learning_rate": 0.00022067027027027026,
+      "loss": 15.1884,
+      "step": 23500
+    },
+    {
+      "epoch": 12.695417789757412,
+      "grad_norm": 1.4888988733291626,
+      "learning_rate": 0.00021985945945945945,
+      "loss": 15.2281,
+      "step": 23550
+    },
+    {
+      "epoch": 12.722371967654986,
+      "grad_norm": 1.5697029829025269,
+      "learning_rate": 0.0002190486486486486,
+      "loss": 15.2005,
+      "step": 23600
+    },
+    {
+      "epoch": 12.74932614555256,
+      "grad_norm": 1.4188930988311768,
+      "learning_rate": 0.00021823783783783782,
+      "loss": 15.2384,
+      "step": 23650
+    },
+    {
+      "epoch": 12.776280323450134,
+      "grad_norm": 1.516971468925476,
+      "learning_rate": 0.000217427027027027,
+      "loss": 15.2365,
+      "step": 23700
+    },
+    {
+      "epoch": 12.80323450134771,
+      "grad_norm": 1.4627147912979126,
+      "learning_rate": 0.0002166162162162162,
+      "loss": 15.2397,
+      "step": 23750
+    },
+    {
+      "epoch": 12.830188679245284,
+      "grad_norm": 1.5219035148620605,
+      "learning_rate": 0.00021580540540540538,
+      "loss": 15.2338,
+      "step": 23800
+    },
+    {
+      "epoch": 12.857142857142858,
+      "grad_norm": 1.5930004119873047,
+      "learning_rate": 0.00021499459459459457,
+      "loss": 15.1914,
+      "step": 23850
+    },
+    {
+      "epoch": 12.884097035040432,
+      "grad_norm": 1.6950451135635376,
+      "learning_rate": 0.00021418378378378376,
+      "loss": 15.2401,
+      "step": 23900
+    },
+    {
+      "epoch": 12.911051212938006,
+      "grad_norm": 1.4628324508666992,
+      "learning_rate": 0.00021337297297297297,
+      "loss": 15.2338,
+      "step": 23950
+    },
+    {
+      "epoch": 12.93800539083558,
+      "grad_norm": 1.5722259283065796,
+      "learning_rate": 0.00021256216216216213,
+      "loss": 15.2463,
+      "step": 24000
+    },
+    {
+      "epoch": 12.93800539083558,
+      "eval_accuracy": 0.3955899372151551,
+      "eval_loss": 3.267047166824341,
+      "eval_runtime": 182.8794,
+      "eval_samples_per_second": 98.491,
+      "eval_steps_per_second": 6.157,
+      "step": 24000
+    },
+    {
+      "epoch": 12.964959568733153,
+      "grad_norm": 1.5204074382781982,
+      "learning_rate": 0.00021175135135135132,
+      "loss": 15.2522,
+      "step": 24050
+    },
+    {
+      "epoch": 12.991913746630727,
+      "grad_norm": 1.557715892791748,
+      "learning_rate": 0.00021094054054054053,
+      "loss": 15.2408,
+      "step": 24100
+    },
+    {
+      "epoch": 13.018867924528301,
+      "grad_norm": 1.4913039207458496,
+      "learning_rate": 0.00021012972972972972,
+      "loss": 14.9635,
+      "step": 24150
+    },
+    {
+      "epoch": 13.045822102425875,
+      "grad_norm": 1.510380744934082,
+      "learning_rate": 0.00020931891891891888,
+      "loss": 14.8479,
+      "step": 24200
+    },
+    {
+      "epoch": 13.07277628032345,
+      "grad_norm": 1.5142250061035156,
+      "learning_rate": 0.0002085081081081081,
+      "loss": 14.9016,
+      "step": 24250
+    },
+    {
+      "epoch": 13.099730458221025,
+      "grad_norm": 1.5466632843017578,
+      "learning_rate": 0.00020769729729729728,
+      "loss": 14.8995,
+      "step": 24300
+    },
+    {
+      "epoch": 13.126684636118599,
+      "grad_norm": 1.5550708770751953,
+      "learning_rate": 0.00020688648648648647,
+      "loss": 14.9194,
+      "step": 24350
+    },
+    {
+      "epoch": 13.153638814016173,
+      "grad_norm": 1.6031028032302856,
+      "learning_rate": 0.00020607567567567566,
+      "loss": 14.9441,
+      "step": 24400
+    },
+    {
+      "epoch": 13.180592991913747,
+      "grad_norm": 1.5483187437057495,
+      "learning_rate": 0.00020526486486486484,
+      "loss": 14.9451,
+      "step": 24450
+    },
+    {
+      "epoch": 13.20754716981132,
+      "grad_norm": 1.516534686088562,
+      "learning_rate": 0.00020445405405405403,
+      "loss": 14.9854,
+      "step": 24500
+    },
+    {
+      "epoch": 13.234501347708894,
+      "grad_norm": 1.5218985080718994,
+      "learning_rate": 0.00020364324324324324,
+      "loss": 15.0095,
+      "step": 24550
+    },
+    {
+      "epoch": 13.261455525606468,
+      "grad_norm": 1.6100311279296875,
+      "learning_rate": 0.0002028324324324324,
+      "loss": 14.9844,
+      "step": 24600
+    },
+    {
+      "epoch": 13.288409703504042,
+      "grad_norm": 1.5485996007919312,
+      "learning_rate": 0.0002020216216216216,
+      "loss": 15.007,
+      "step": 24650
+    },
+    {
+      "epoch": 13.315363881401618,
+      "grad_norm": 1.6040297746658325,
+      "learning_rate": 0.0002012108108108108,
+      "loss": 14.998,
+      "step": 24700
+    },
+    {
+      "epoch": 13.342318059299192,
+      "grad_norm": 1.5006548166275024,
+      "learning_rate": 0.0002004,
+      "loss": 15.019,
+      "step": 24750
+    },
+    {
+      "epoch": 13.369272237196766,
+      "grad_norm": 1.5284956693649292,
+      "learning_rate": 0.00019958918918918915,
+      "loss": 15.0376,
+      "step": 24800
+    },
+    {
+      "epoch": 13.39622641509434,
+      "grad_norm": 1.576499104499817,
+      "learning_rate": 0.00019877837837837837,
+      "loss": 15.0245,
+      "step": 24850
+    },
+    {
+      "epoch": 13.423180592991914,
+      "grad_norm": 1.5415410995483398,
+      "learning_rate": 0.00019796756756756755,
+      "loss": 15.0273,
+      "step": 24900
+    },
+    {
+      "epoch": 13.450134770889488,
+      "grad_norm": 1.5509371757507324,
+      "learning_rate": 0.00019715675675675674,
+      "loss": 15.04,
+      "step": 24950
+    },
+    {
+      "epoch": 13.477088948787062,
+      "grad_norm": 1.5401982069015503,
+      "learning_rate": 0.00019634594594594593,
+      "loss": 15.0772,
+      "step": 25000
+    },
+    {
+      "epoch": 13.477088948787062,
+      "eval_accuracy": 0.39549693550679194,
+      "eval_loss": 3.273712635040283,
+      "eval_runtime": 183.4928,
+      "eval_samples_per_second": 98.162,
+      "eval_steps_per_second": 6.136,
+      "step": 25000
+    },
+    {
+      "epoch": 13.504043126684635,
+      "grad_norm": 1.4463169574737549,
+      "learning_rate": 0.00019553513513513511,
+      "loss": 15.0517,
+      "step": 25050
+    },
+    {
+      "epoch": 13.530997304582211,
+      "grad_norm": 1.5428580045700073,
+      "learning_rate": 0.0001947243243243243,
+      "loss": 15.081,
+      "step": 25100
+    },
+    {
+      "epoch": 13.557951482479785,
+      "grad_norm": 1.4901021718978882,
+      "learning_rate": 0.00019391351351351352,
+      "loss": 15.0555,
+      "step": 25150
+    },
+    {
+      "epoch": 13.584905660377359,
+      "grad_norm": 1.4984480142593384,
+      "learning_rate": 0.00019310270270270268,
+      "loss": 15.0935,
+      "step": 25200
+    },
+    {
+      "epoch": 13.611859838274933,
+      "grad_norm": 1.5072040557861328,
+      "learning_rate": 0.00019229189189189186,
+      "loss": 15.0852,
+      "step": 25250
+    },
+    {
+      "epoch": 13.638814016172507,
+      "grad_norm": 1.5810104608535767,
+      "learning_rate": 0.00019148108108108108,
+      "loss": 15.0621,
+      "step": 25300
+    },
+    {
+      "epoch": 13.66576819407008,
+      "grad_norm": 1.5162920951843262,
+      "learning_rate": 0.00019067027027027026,
+      "loss": 15.0682,
+      "step": 25350
+    },
+    {
+      "epoch": 13.692722371967655,
+      "grad_norm": 1.6046135425567627,
+      "learning_rate": 0.00018985945945945942,
+      "loss": 15.1206,
+      "step": 25400
+    },
+    {
+      "epoch": 13.719676549865229,
+      "grad_norm": 1.5759787559509277,
+      "learning_rate": 0.00018904864864864864,
+      "loss": 15.0669,
+      "step": 25450
+    },
+    {
+      "epoch": 13.746630727762803,
+      "grad_norm": 1.5286028385162354,
+      "learning_rate": 0.00018823783783783783,
+      "loss": 15.0961,
+      "step": 25500
+    },
+    {
+      "epoch": 13.773584905660378,
+      "grad_norm": 1.506309151649475,
+      "learning_rate": 0.000187427027027027,
+      "loss": 15.141,
+      "step": 25550
+    },
+    {
+      "epoch": 13.800539083557952,
+      "grad_norm": 1.6738903522491455,
+      "learning_rate": 0.00018661621621621617,
+      "loss": 15.1184,
+      "step": 25600
+    },
+    {
+      "epoch": 13.827493261455526,
+      "grad_norm": 1.4504151344299316,
+      "learning_rate": 0.00018580540540540539,
+      "loss": 15.1011,
+      "step": 25650
+    },
+    {
+      "epoch": 13.8544474393531,
+      "grad_norm": 1.4721909761428833,
+      "learning_rate": 0.00018499459459459457,
+      "loss": 15.1252,
+      "step": 25700
+    },
+    {
+      "epoch": 13.881401617250674,
+      "grad_norm": 1.5899988412857056,
+      "learning_rate": 0.0001841837837837838,
+      "loss": 15.0654,
+      "step": 25750
+    },
+    {
+      "epoch": 13.908355795148248,
+      "grad_norm": 1.5403255224227905,
+      "learning_rate": 0.00018337297297297295,
+      "loss": 15.1186,
+      "step": 25800
+    },
+    {
+      "epoch": 13.935309973045822,
+      "grad_norm": 1.5205345153808594,
+      "learning_rate": 0.00018256216216216213,
+      "loss": 15.1377,
+      "step": 25850
+    },
+    {
+      "epoch": 13.962264150943396,
+      "grad_norm": 1.550255298614502,
+      "learning_rate": 0.00018175135135135135,
+      "loss": 15.1167,
+      "step": 25900
+    },
+    {
+      "epoch": 13.98921832884097,
+      "grad_norm": 1.5580016374588013,
+      "learning_rate": 0.00018094054054054054,
+      "loss": 15.1598,
+      "step": 25950
+    },
+    {
+      "epoch": 14.016172506738544,
+      "grad_norm": 1.486087441444397,
+      "learning_rate": 0.0001801297297297297,
+      "loss": 14.8798,
+      "step": 26000
+    },
+    {
+      "epoch": 14.016172506738544,
+      "eval_accuracy": 0.395819942608385,
+      "eval_loss": 3.2739694118499756,
+      "eval_runtime": 183.4607,
+      "eval_samples_per_second": 98.179,
+      "eval_steps_per_second": 6.138,
+      "step": 26000
+    },
+    {
+      "epoch": 14.04312668463612,
+      "grad_norm": 1.5213587284088135,
+      "learning_rate": 0.0001793189189189189,
+      "loss": 14.7999,
+      "step": 26050
+    },
+    {
+      "epoch": 14.070080862533693,
+      "grad_norm": 1.5840412378311157,
+      "learning_rate": 0.0001785081081081081,
+      "loss": 14.8034,
+      "step": 26100
+    },
+    {
+      "epoch": 14.097035040431267,
+      "grad_norm": 1.479959487915039,
+      "learning_rate": 0.00017769729729729728,
+      "loss": 14.8473,
+      "step": 26150
+    },
+    {
+      "epoch": 14.123989218328841,
+      "grad_norm": 1.521485447883606,
+      "learning_rate": 0.00017688648648648644,
+      "loss": 14.8314,
+      "step": 26200
+    },
+    {
+      "epoch": 14.150943396226415,
+      "grad_norm": 1.5491563081741333,
+      "learning_rate": 0.00017607567567567566,
+      "loss": 14.8476,
+      "step": 26250
+    },
+    {
+      "epoch": 14.177897574123989,
+      "grad_norm": 1.4828426837921143,
+      "learning_rate": 0.00017528108108108106,
+      "loss": 14.8784,
+      "step": 26300
+    },
+    {
+      "epoch": 14.204851752021563,
+      "grad_norm": 1.517014980316162,
+      "learning_rate": 0.00017447027027027025,
+      "loss": 14.8559,
+      "step": 26350
+    },
+    {
+      "epoch": 14.231805929919137,
+      "grad_norm": 1.5260605812072754,
+      "learning_rate": 0.00017365945945945944,
+      "loss": 14.8986,
+      "step": 26400
+    },
+    {
+      "epoch": 14.25876010781671,
+      "grad_norm": 1.5517903566360474,
+      "learning_rate": 0.00017284864864864865,
+      "loss": 14.9058,
+      "step": 26450
+    },
+    {
+      "epoch": 14.285714285714286,
+      "grad_norm": 1.5963462591171265,
+      "learning_rate": 0.0001720378378378378,
+      "loss": 14.8757,
+      "step": 26500
+    },
+    {
+      "epoch": 14.31266846361186,
+      "grad_norm": 1.4833232164382935,
+      "learning_rate": 0.000171227027027027,
+      "loss": 14.9005,
+      "step": 26550
+    },
+    {
+      "epoch": 14.339622641509434,
+      "grad_norm": 1.5324000120162964,
+      "learning_rate": 0.0001704162162162162,
+      "loss": 14.8942,
+      "step": 26600
+    },
+    {
+      "epoch": 14.366576819407008,
+      "grad_norm": 1.5217413902282715,
+      "learning_rate": 0.0001696054054054054,
+      "loss": 14.9365,
+      "step": 26650
+    },
+    {
+      "epoch": 14.393530997304582,
+      "grad_norm": 1.6108272075653076,
+      "learning_rate": 0.00016879459459459456,
+      "loss": 14.9405,
+      "step": 26700
+    },
+    {
+      "epoch": 14.420485175202156,
+      "grad_norm": 1.646351933479309,
+      "learning_rate": 0.00016798378378378377,
+      "loss": 14.9517,
+      "step": 26750
+    },
+    {
+      "epoch": 14.44743935309973,
+      "grad_norm": 1.5736359357833862,
+      "learning_rate": 0.00016717297297297296,
+      "loss": 14.9074,
+      "step": 26800
+    },
+    {
+      "epoch": 14.474393530997304,
+      "grad_norm": 1.5562492609024048,
+      "learning_rate": 0.00016636216216216215,
+      "loss": 14.8835,
+      "step": 26850
+    },
+    {
+      "epoch": 14.501347708894878,
+      "grad_norm": 1.602283239364624,
+      "learning_rate": 0.00016555135135135133,
+      "loss": 14.8741,
+      "step": 26900
+    },
+    {
+      "epoch": 14.528301886792454,
+      "grad_norm": 1.5761126279830933,
+      "learning_rate": 0.00016474054054054052,
+      "loss": 14.9874,
+      "step": 26950
+    },
+    {
+      "epoch": 14.555256064690028,
+      "grad_norm": 1.5599428415298462,
+      "learning_rate": 0.0001639297297297297,
+      "loss": 14.9491,
+      "step": 27000
+    },
+    {
+      "epoch": 14.555256064690028,
+      "eval_accuracy": 0.3961279564439102,
+      "eval_loss": 3.270308256149292,
+      "eval_runtime": 183.109,
+      "eval_samples_per_second": 98.368,
+      "eval_steps_per_second": 6.149,
+      "step": 27000
+    },
+    {
+      "epoch": 14.582210242587601,
+      "grad_norm": 1.5091289281845093,
+      "learning_rate": 0.00016311891891891892,
+      "loss": 14.9643,
+      "step": 27050
+    },
+    {
+      "epoch": 14.609164420485175,
+      "grad_norm": 1.554219126701355,
+      "learning_rate": 0.00016232432432432433,
+      "loss": 14.947,
+      "step": 27100
+    },
+    {
+      "epoch": 14.63611859838275,
+      "grad_norm": 1.587387204170227,
+      "learning_rate": 0.00016151351351351351,
+      "loss": 14.9556,
+      "step": 27150
+    },
+    {
+      "epoch": 14.663072776280323,
+      "grad_norm": 1.4709123373031616,
+      "learning_rate": 0.00016070270270270267,
+      "loss": 14.9691,
+      "step": 27200
+    },
+    {
+      "epoch": 14.690026954177897,
+      "grad_norm": 1.5789484977722168,
+      "learning_rate": 0.0001598918918918919,
+      "loss": 15.0007,
+      "step": 27250
+    },
+    {
+      "epoch": 14.716981132075471,
+      "grad_norm": 1.5269670486450195,
+      "learning_rate": 0.00015908108108108108,
+      "loss": 14.9889,
+      "step": 27300
+    },
+    {
+      "epoch": 14.743935309973045,
+      "grad_norm": 1.4882417917251587,
+      "learning_rate": 0.00015827027027027026,
+      "loss": 14.9787,
+      "step": 27350
+    },
+    {
+      "epoch": 14.77088948787062,
+      "grad_norm": 1.5664522647857666,
+      "learning_rate": 0.00015745945945945945,
+      "loss": 14.9789,
+      "step": 27400
+    },
+    {
+      "epoch": 14.797843665768195,
+      "grad_norm": 1.5020064115524292,
+      "learning_rate": 0.00015664864864864864,
+      "loss": 14.9879,
+      "step": 27450
+    },
+    {
+      "epoch": 14.824797843665769,
+      "grad_norm": 1.5945509672164917,
+      "learning_rate": 0.00015583783783783782,
+      "loss": 15.0118,
+      "step": 27500
+    },
+    {
+      "epoch": 14.851752021563343,
+      "grad_norm": 1.5526251792907715,
+      "learning_rate": 0.00015502702702702704,
+      "loss": 15.0042,
+      "step": 27550
+    },
+    {
+      "epoch": 14.878706199460916,
+      "grad_norm": 1.5196871757507324,
+      "learning_rate": 0.0001542162162162162,
+      "loss": 15.0268,
+      "step": 27600
+    },
+    {
+      "epoch": 14.90566037735849,
+      "grad_norm": 1.4666787385940552,
+      "learning_rate": 0.00015340540540540538,
+      "loss": 15.0046,
+      "step": 27650
+    },
+    {
+      "epoch": 14.932614555256064,
+      "grad_norm": 1.455553412437439,
+      "learning_rate": 0.0001525945945945946,
+      "loss": 14.9991,
+      "step": 27700
+    },
+    {
+      "epoch": 14.959568733153638,
+      "grad_norm": 1.4995278120040894,
+      "learning_rate": 0.00015178378378378379,
+      "loss": 15.0387,
+      "step": 27750
+    },
+    {
+      "epoch": 14.986522911051212,
+      "grad_norm": 1.6195017099380493,
+      "learning_rate": 0.00015097297297297295,
+      "loss": 15.0153,
+      "step": 27800
+    },
+    {
+      "epoch": 15.013477088948788,
+      "grad_norm": 1.591896653175354,
+      "learning_rate": 0.00015016216216216216,
+      "loss": 14.8525,
+      "step": 27850
+    },
+    {
+      "epoch": 15.040431266846362,
+      "grad_norm": 1.5427701473236084,
+      "learning_rate": 0.00014935135135135135,
+      "loss": 14.6664,
+      "step": 27900
+    },
+    {
+      "epoch": 15.067385444743936,
+      "grad_norm": 1.6039804220199585,
+      "learning_rate": 0.00014854054054054053,
+      "loss": 14.7522,
+      "step": 27950
+    },
+    {
+      "epoch": 15.09433962264151,
+      "grad_norm": 1.5312916040420532,
+      "learning_rate": 0.00014772972972972972,
+      "loss": 14.7585,
+      "step": 28000
+    },
+    {
+      "epoch": 15.09433962264151,
+      "eval_accuracy": 0.3963390572842719,
+      "eval_loss": 3.2742018699645996,
+      "eval_runtime": 183.4262,
+      "eval_samples_per_second": 98.198,
+      "eval_steps_per_second": 6.139,
+      "step": 28000
+    },
+    {
+      "epoch": 15.121293800539084,
+      "grad_norm": 1.5751817226409912,
+      "learning_rate": 0.0001469189189189189,
+      "loss": 14.7446,
+      "step": 28050
+    },
+    {
+      "epoch": 15.148247978436657,
+      "grad_norm": 1.5158833265304565,
+      "learning_rate": 0.0001461081081081081,
+      "loss": 14.7897,
+      "step": 28100
+    },
+    {
+      "epoch": 15.175202156334231,
+      "grad_norm": 1.548401951789856,
+      "learning_rate": 0.00014529729729729728,
+      "loss": 14.7723,
+      "step": 28150
+    },
+    {
+      "epoch": 15.202156334231805,
+      "grad_norm": 1.5945206880569458,
+      "learning_rate": 0.00014448648648648647,
+      "loss": 14.7545,
+      "step": 28200
+    },
+    {
+      "epoch": 15.22911051212938,
+      "grad_norm": 1.6147938966751099,
+      "learning_rate": 0.00014367567567567566,
+      "loss": 14.7821,
+      "step": 28250
+    },
+    {
+      "epoch": 15.256064690026955,
+      "grad_norm": 1.600216269493103,
+      "learning_rate": 0.00014286486486486487,
+      "loss": 14.8026,
+      "step": 28300
+    },
+    {
+      "epoch": 15.283018867924529,
+      "grad_norm": 1.5041214227676392,
+      "learning_rate": 0.00014205405405405403,
+      "loss": 14.7822,
+      "step": 28350
+    },
+    {
+      "epoch": 15.309973045822103,
+      "grad_norm": 1.5731170177459717,
+      "learning_rate": 0.00014124324324324325,
+      "loss": 14.8014,
+      "step": 28400
+    },
+    {
+      "epoch": 15.336927223719677,
+      "grad_norm": 1.62449312210083,
+      "learning_rate": 0.0001404324324324324,
+      "loss": 14.8132,
+      "step": 28450
+    },
+    {
+      "epoch": 15.36388140161725,
+      "grad_norm": 1.5179616212844849,
+      "learning_rate": 0.00013962162162162162,
+      "loss": 14.8027,
+      "step": 28500
+    },
+    {
+      "epoch": 15.390835579514825,
+      "grad_norm": 1.5233399868011475,
+      "learning_rate": 0.0001388108108108108,
+      "loss": 14.8032,
+      "step": 28550
+    },
+    {
+      "epoch": 15.417789757412399,
+      "grad_norm": 1.5631145238876343,
+      "learning_rate": 0.000138,
+      "loss": 14.8132,
+      "step": 28600
+    },
+    {
+      "epoch": 15.444743935309972,
+      "grad_norm": 1.4974826574325562,
+      "learning_rate": 0.00013718918918918918,
+      "loss": 14.8246,
+      "step": 28650
+    },
+    {
+      "epoch": 15.471698113207546,
+      "grad_norm": 1.56992769241333,
+      "learning_rate": 0.00013637837837837837,
+      "loss": 14.8093,
+      "step": 28700
+    },
+    {
+      "epoch": 15.498652291105122,
+      "grad_norm": 1.5067832469940186,
+      "learning_rate": 0.00013556756756756755,
+      "loss": 14.8734,
+      "step": 28750
+    },
+    {
+      "epoch": 15.525606469002696,
+      "grad_norm": 1.5053755044937134,
+      "learning_rate": 0.00013475675675675674,
+      "loss": 14.8705,
+      "step": 28800
+    },
+    {
+      "epoch": 15.55256064690027,
+      "grad_norm": 1.4808902740478516,
+      "learning_rate": 0.00013394594594594593,
+      "loss": 14.8348,
+      "step": 28850
+    },
+    {
+      "epoch": 15.579514824797844,
+      "grad_norm": 1.5790108442306519,
+      "learning_rate": 0.00013313513513513512,
+      "loss": 14.8726,
+      "step": 28900
+    },
+    {
+      "epoch": 15.606469002695418,
+      "grad_norm": 1.5821738243103027,
+      "learning_rate": 0.0001323243243243243,
+      "loss": 14.8672,
+      "step": 28950
+    },
+    {
+      "epoch": 15.633423180592992,
+      "grad_norm": 1.5624059438705444,
+      "learning_rate": 0.00013151351351351352,
+      "loss": 14.8196,
+      "step": 29000
+    },
+    {
+      "epoch": 15.633423180592992,
+      "eval_accuracy": 0.3969098878634074,
+      "eval_loss": 3.2676687240600586,
+      "eval_runtime": 182.4618,
+      "eval_samples_per_second": 98.717,
+      "eval_steps_per_second": 6.171,
+      "step": 29000
+    },
+    {
+      "epoch": 15.660377358490566,
+      "grad_norm": 1.6494839191436768,
+      "learning_rate": 0.00013070270270270268,
+      "loss": 14.8677,
+      "step": 29050
+    },
+    {
+      "epoch": 15.68733153638814,
+      "grad_norm": 1.5024958848953247,
+      "learning_rate": 0.0001298918918918919,
+      "loss": 14.884,
+      "step": 29100
+    },
+    {
+      "epoch": 15.714285714285714,
+      "grad_norm": 1.5223402976989746,
+      "learning_rate": 0.00012908108108108108,
+      "loss": 14.8573,
+      "step": 29150
+    },
+    {
+      "epoch": 15.74123989218329,
+      "grad_norm": 1.4910407066345215,
+      "learning_rate": 0.00012827027027027027,
+      "loss": 14.8876,
+      "step": 29200
+    },
+    {
+      "epoch": 15.768194070080863,
+      "grad_norm": 1.6085453033447266,
+      "learning_rate": 0.00012745945945945945,
+      "loss": 14.8961,
+      "step": 29250
+    },
+    {
+      "epoch": 15.795148247978437,
+      "grad_norm": 1.5478426218032837,
+      "learning_rate": 0.00012664864864864864,
+      "loss": 14.8845,
+      "step": 29300
+    },
+    {
+      "epoch": 15.822102425876011,
+      "grad_norm": 1.5788655281066895,
+      "learning_rate": 0.00012583783783783783,
+      "loss": 14.9066,
+      "step": 29350
+    },
+    {
+      "epoch": 15.849056603773585,
+      "grad_norm": 1.5746803283691406,
+      "learning_rate": 0.000125027027027027,
+      "loss": 14.8779,
+      "step": 29400
+    },
+    {
+      "epoch": 15.876010781671159,
+      "grad_norm": 1.4738293886184692,
+      "learning_rate": 0.0001242162162162162,
+      "loss": 14.9064,
+      "step": 29450
+    },
+    {
+      "epoch": 15.902964959568733,
+      "grad_norm": 1.6745882034301758,
+      "learning_rate": 0.0001234054054054054,
+      "loss": 14.9054,
+      "step": 29500
+    },
+    {
+      "epoch": 15.929919137466307,
+      "grad_norm": 1.5883915424346924,
+      "learning_rate": 0.00012259459459459457,
+      "loss": 14.9191,
+      "step": 29550
+    },
+    {
+      "epoch": 15.95687331536388,
+      "grad_norm": 1.6155942678451538,
+      "learning_rate": 0.00012178378378378378,
+      "loss": 14.8815,
+      "step": 29600
+    },
+    {
+      "epoch": 15.983827493261456,
+      "grad_norm": 1.5333513021469116,
+      "learning_rate": 0.00012098918918918918,
+      "loss": 14.9168,
+      "step": 29650
+    },
+    {
+      "epoch": 16.01078167115903,
+      "grad_norm": 1.5406397581100464,
+      "learning_rate": 0.00012017837837837838,
+      "loss": 14.7804,
+      "step": 29700
+    },
+    {
+      "epoch": 16.037735849056602,
+      "grad_norm": 1.56869637966156,
+      "learning_rate": 0.00011936756756756755,
+      "loss": 14.621,
+      "step": 29750
+    },
+    {
+      "epoch": 16.064690026954178,
+      "grad_norm": 1.6393052339553833,
+      "learning_rate": 0.00011855675675675675,
+      "loss": 14.5929,
+      "step": 29800
+    },
+    {
+      "epoch": 16.09164420485175,
+      "grad_norm": 1.5278005599975586,
+      "learning_rate": 0.00011774594594594594,
+      "loss": 14.6855,
+      "step": 29850
+    },
+    {
+      "epoch": 16.118598382749326,
+      "grad_norm": 1.549673080444336,
+      "learning_rate": 0.00011693513513513513,
+      "loss": 14.6718,
+      "step": 29900
+    },
+    {
+      "epoch": 16.1455525606469,
+      "grad_norm": 1.5019139051437378,
+      "learning_rate": 0.00011612432432432432,
+      "loss": 14.6548,
+      "step": 29950
+    },
+    {
+      "epoch": 16.172506738544474,
+      "grad_norm": 1.6385548114776611,
+      "learning_rate": 0.00011531351351351352,
+      "loss": 14.6783,
+      "step": 30000
+    },
+    {
+      "epoch": 16.172506738544474,
+      "eval_accuracy": 0.39686979717370413,
+      "eval_loss": 3.2710256576538086,
+      "eval_runtime": 182.5158,
+      "eval_samples_per_second": 98.687,
+      "eval_steps_per_second": 6.169,
+      "step": 30000
+    },
+    {
+      "epoch": 16.19946091644205,
+      "grad_norm": 1.5987632274627686,
+      "learning_rate": 0.00011450270270270269,
+      "loss": 14.6573,
+      "step": 30050
+    },
+    {
+      "epoch": 16.22641509433962,
+      "grad_norm": 1.6917481422424316,
+      "learning_rate": 0.00011369189189189189,
+      "loss": 14.664,
+      "step": 30100
+    },
+    {
+      "epoch": 16.253369272237197,
+      "grad_norm": 1.5871925354003906,
+      "learning_rate": 0.00011288108108108108,
+      "loss": 14.6668,
+      "step": 30150
+    },
+    {
+      "epoch": 16.28032345013477,
+      "grad_norm": 1.5470457077026367,
+      "learning_rate": 0.00011207027027027026,
+      "loss": 14.7022,
+      "step": 30200
+    },
+    {
+      "epoch": 16.307277628032345,
+      "grad_norm": 1.6101864576339722,
+      "learning_rate": 0.00011125945945945945,
+      "loss": 14.702,
+      "step": 30250
+    },
+    {
+      "epoch": 16.334231805929917,
+      "grad_norm": 1.5155388116836548,
+      "learning_rate": 0.00011044864864864865,
+      "loss": 14.7447,
+      "step": 30300
+    },
+    {
+      "epoch": 16.361185983827493,
+      "grad_norm": 1.5606392621994019,
+      "learning_rate": 0.00010963783783783783,
+      "loss": 14.7262,
+      "step": 30350
+    },
+    {
+      "epoch": 16.38814016172507,
+      "grad_norm": 1.6367034912109375,
+      "learning_rate": 0.00010882702702702703,
+      "loss": 14.7292,
+      "step": 30400
+    },
+    {
+      "epoch": 16.41509433962264,
+      "grad_norm": 1.5408399105072021,
+      "learning_rate": 0.0001080162162162162,
+      "loss": 14.7541,
+      "step": 30450
+    },
+    {
+      "epoch": 16.442048517520217,
+      "grad_norm": 1.5572620630264282,
+      "learning_rate": 0.0001072054054054054,
+      "loss": 14.7008,
+      "step": 30500
+    },
+    {
+      "epoch": 16.46900269541779,
+      "grad_norm": 1.5497262477874756,
+      "learning_rate": 0.00010639459459459459,
+      "loss": 14.7683,
+      "step": 30550
+    },
+    {
+      "epoch": 16.495956873315365,
+      "grad_norm": 1.5502400398254395,
+      "learning_rate": 0.00010558378378378379,
+      "loss": 14.7466,
+      "step": 30600
+    },
+    {
+      "epoch": 16.522911051212937,
+      "grad_norm": 1.6329463720321655,
+      "learning_rate": 0.00010477297297297296,
+      "loss": 14.7568,
+      "step": 30650
+    },
+    {
+      "epoch": 16.549865229110512,
+      "grad_norm": 1.5791313648223877,
+      "learning_rate": 0.00010396216216216216,
+      "loss": 14.7672,
+      "step": 30700
+    },
+    {
+      "epoch": 16.576819407008085,
+      "grad_norm": 1.6621166467666626,
+      "learning_rate": 0.00010315135135135134,
+      "loss": 14.7865,
+      "step": 30750
+    },
+    {
+      "epoch": 16.60377358490566,
+      "grad_norm": 1.5358580350875854,
+      "learning_rate": 0.00010234054054054054,
+      "loss": 14.7586,
+      "step": 30800
+    },
+    {
+      "epoch": 16.630727762803236,
+      "grad_norm": 1.5281518697738647,
+      "learning_rate": 0.00010152972972972972,
+      "loss": 14.7566,
+      "step": 30850
+    },
+    {
+      "epoch": 16.657681940700808,
+      "grad_norm": 1.5487334728240967,
+      "learning_rate": 0.00010071891891891891,
+      "loss": 14.8048,
+      "step": 30900
+    },
+    {
+      "epoch": 16.684636118598384,
+      "grad_norm": 1.5345453023910522,
+      "learning_rate": 9.99081081081081e-05,
+      "loss": 14.7579,
+      "step": 30950
+    },
+    {
+      "epoch": 16.711590296495956,
+      "grad_norm": 1.5981355905532837,
+      "learning_rate": 9.90972972972973e-05,
+      "loss": 14.7716,
+      "step": 31000
+    },
+    {
+      "epoch": 16.711590296495956,
+      "eval_accuracy": 0.39735642643977725,
+      "eval_loss": 3.2666032314300537,
+      "eval_runtime": 182.2937,
+      "eval_samples_per_second": 98.808,
+      "eval_steps_per_second": 6.177,
+      "step": 31000
+    },
+    {
+      "epoch": 16.73854447439353,
+      "grad_norm": 1.5877584218978882,
+      "learning_rate": 9.828648648648647e-05,
+      "loss": 14.7698,
+      "step": 31050
+    },
+    {
+      "epoch": 16.765498652291104,
+      "grad_norm": 1.5693833827972412,
+      "learning_rate": 9.747567567567567e-05,
+      "loss": 14.8035,
+      "step": 31100
+    },
+    {
+      "epoch": 16.79245283018868,
+      "grad_norm": 1.564299464225769,
+      "learning_rate": 9.666486486486486e-05,
+      "loss": 14.8016,
+      "step": 31150
+    },
+    {
+      "epoch": 16.81940700808625,
+      "grad_norm": 1.6126294136047363,
+      "learning_rate": 9.585405405405405e-05,
+      "loss": 14.7674,
+      "step": 31200
+    },
+    {
+      "epoch": 16.846361185983827,
+      "grad_norm": 1.5660778284072876,
+      "learning_rate": 9.504324324324323e-05,
+      "loss": 14.7822,
+      "step": 31250
+    },
+    {
+      "epoch": 16.873315363881403,
+      "grad_norm": 1.559880018234253,
+      "learning_rate": 9.423243243243243e-05,
+      "loss": 14.7547,
+      "step": 31300
+    },
+    {
+      "epoch": 16.900269541778975,
+      "grad_norm": 1.5699021816253662,
+      "learning_rate": 9.342162162162161e-05,
+      "loss": 14.7827,
+      "step": 31350
+    },
+    {
+      "epoch": 16.92722371967655,
+      "grad_norm": 1.5942411422729492,
+      "learning_rate": 9.261081081081081e-05,
+      "loss": 14.8039,
+      "step": 31400
+    },
+    {
+      "epoch": 16.954177897574123,
+      "grad_norm": 1.544398307800293,
+      "learning_rate": 9.18e-05,
+      "loss": 14.7749,
+      "step": 31450
+    },
+    {
+      "epoch": 16.9811320754717,
+      "grad_norm": 1.6123785972595215,
+      "learning_rate": 9.098918918918918e-05,
+      "loss": 14.7925,
+      "step": 31500
+    },
+    {
+      "epoch": 17.00808625336927,
+      "grad_norm": 1.5455232858657837,
+      "learning_rate": 9.017837837837837e-05,
+      "loss": 14.689,
+      "step": 31550
+    },
+    {
+      "epoch": 17.035040431266847,
+      "grad_norm": 1.584010362625122,
+      "learning_rate": 8.936756756756757e-05,
+      "loss": 14.5752,
+      "step": 31600
+    },
+    {
+      "epoch": 17.06199460916442,
+      "grad_norm": 1.6022430658340454,
+      "learning_rate": 8.855675675675674e-05,
+      "loss": 14.5822,
+      "step": 31650
+    },
+    {
+      "epoch": 17.088948787061994,
+      "grad_norm": 1.4971483945846558,
+      "learning_rate": 8.774594594594594e-05,
+      "loss": 14.5793,
+      "step": 31700
+    },
+    {
+      "epoch": 17.11590296495957,
+      "grad_norm": 1.5255413055419922,
+      "learning_rate": 8.693513513513513e-05,
+      "loss": 14.5842,
+      "step": 31750
+    },
+    {
+      "epoch": 17.142857142857142,
+      "grad_norm": 1.5918787717819214,
+      "learning_rate": 8.612432432432432e-05,
+      "loss": 14.5711,
+      "step": 31800
+    },
+    {
+      "epoch": 17.169811320754718,
+      "grad_norm": 1.5889939069747925,
+      "learning_rate": 8.53135135135135e-05,
+      "loss": 14.6055,
+      "step": 31850
+    },
+    {
+      "epoch": 17.19676549865229,
+      "grad_norm": 1.6539576053619385,
+      "learning_rate": 8.45027027027027e-05,
+      "loss": 14.6048,
+      "step": 31900
+    },
+    {
+      "epoch": 17.223719676549866,
+      "grad_norm": 1.5412706136703491,
+      "learning_rate": 8.369189189189188e-05,
+      "loss": 14.5751,
+      "step": 31950
+    },
+    {
+      "epoch": 17.250673854447438,
+      "grad_norm": 1.5189770460128784,
+      "learning_rate": 8.288108108108108e-05,
+      "loss": 14.6427,
+      "step": 32000
+    },
+    {
+      "epoch": 17.250673854447438,
+      "eval_accuracy": 0.3972496265807574,
+      "eval_loss": 3.2697131633758545,
+      "eval_runtime": 179.3647,
+      "eval_samples_per_second": 100.421,
+      "eval_steps_per_second": 6.278,
+      "step": 32000
+    },
+    {
+      "epoch": 17.277628032345014,
+      "grad_norm": 1.5873239040374756,
+      "learning_rate": 8.207027027027027e-05,
+      "loss": 14.6094,
+      "step": 32050
+    },
+    {
+      "epoch": 17.304582210242586,
+      "grad_norm": 1.645750641822815,
+      "learning_rate": 8.125945945945945e-05,
+      "loss": 14.6167,
+      "step": 32100
+    },
+    {
+      "epoch": 17.33153638814016,
+      "grad_norm": 1.5786947011947632,
+      "learning_rate": 8.044864864864864e-05,
+      "loss": 14.6306,
+      "step": 32150
+    },
+    {
+      "epoch": 17.358490566037737,
+      "grad_norm": 1.5392863750457764,
+      "learning_rate": 7.963783783783784e-05,
+      "loss": 14.622,
+      "step": 32200
+    },
+    {
+      "epoch": 17.38544474393531,
+      "grad_norm": 1.5654475688934326,
+      "learning_rate": 7.882702702702702e-05,
+      "loss": 14.5784,
+      "step": 32250
+    },
+    {
+      "epoch": 17.412398921832885,
+      "grad_norm": 1.5552312135696411,
+      "learning_rate": 7.801621621621622e-05,
+      "loss": 14.6352,
+      "step": 32300
+    },
+    {
+      "epoch": 17.439353099730457,
+      "grad_norm": 1.5792616605758667,
+      "learning_rate": 7.722162162162162e-05,
+      "loss": 14.6571,
+      "step": 32350
+    },
+    {
+      "epoch": 17.466307277628033,
+      "grad_norm": 1.5422114133834839,
+      "learning_rate": 7.64108108108108e-05,
+      "loss": 14.6243,
+      "step": 32400
+    },
+    {
+      "epoch": 17.493261455525605,
+      "grad_norm": 1.5832560062408447,
+      "learning_rate": 7.56e-05,
+      "loss": 14.6572,
+      "step": 32450
+    },
+    {
+      "epoch": 17.52021563342318,
+      "grad_norm": 1.5866824388504028,
+      "learning_rate": 7.478918918918918e-05,
+      "loss": 14.6522,
+      "step": 32500
+    },
+    {
+      "epoch": 17.547169811320753,
+      "grad_norm": 1.533219337463379,
+      "learning_rate": 7.397837837837837e-05,
+      "loss": 14.6272,
+      "step": 32550
+    },
+    {
+      "epoch": 17.57412398921833,
+      "grad_norm": 1.584763526916504,
+      "learning_rate": 7.316756756756756e-05,
+      "loss": 14.6932,
+      "step": 32600
+    },
+    {
+      "epoch": 17.601078167115904,
+      "grad_norm": 1.5591111183166504,
+      "learning_rate": 7.235675675675676e-05,
+      "loss": 14.6659,
+      "step": 32650
+    },
+    {
+      "epoch": 17.628032345013477,
+      "grad_norm": 1.5653057098388672,
+      "learning_rate": 7.154594594594594e-05,
+      "loss": 14.6659,
+      "step": 32700
+    },
+    {
+      "epoch": 17.654986522911052,
+      "grad_norm": 1.4933594465255737,
+      "learning_rate": 7.073513513513513e-05,
+      "loss": 14.6884,
+      "step": 32750
+    },
+    {
+      "epoch": 17.681940700808624,
+      "grad_norm": 1.5485045909881592,
+      "learning_rate": 6.992432432432432e-05,
+      "loss": 14.6379,
+      "step": 32800
+    },
+    {
+      "epoch": 17.7088948787062,
+      "grad_norm": 1.5517247915267944,
+      "learning_rate": 6.91135135135135e-05,
+      "loss": 14.6793,
+      "step": 32850
+    },
+    {
+      "epoch": 17.735849056603772,
+      "grad_norm": 1.654420256614685,
+      "learning_rate": 6.830270270270269e-05,
+      "loss": 14.6845,
+      "step": 32900
+    },
+    {
+      "epoch": 17.762803234501348,
+      "grad_norm": 1.5648823976516724,
+      "learning_rate": 6.749189189189189e-05,
+      "loss": 14.6606,
+      "step": 32950
+    },
+    {
+      "epoch": 17.78975741239892,
+      "grad_norm": 1.5435584783554077,
+      "learning_rate": 6.668108108108108e-05,
+      "loss": 14.6409,
+      "step": 33000
+    },
+    {
+      "epoch": 17.78975741239892,
+      "eval_accuracy": 0.39795865596017094,
+      "eval_loss": 3.263080358505249,
+      "eval_runtime": 180.8635,
+      "eval_samples_per_second": 99.589,
+      "eval_steps_per_second": 6.226,
+      "step": 33000
+    },
+    {
+      "epoch": 17.816711590296496,
+      "grad_norm": 1.5425926446914673,
+      "learning_rate": 6.587027027027027e-05,
+      "loss": 14.7093,
+      "step": 33050
+    },
+    {
+      "epoch": 17.84366576819407,
+      "grad_norm": 1.5393097400665283,
+      "learning_rate": 6.505945945945945e-05,
+      "loss": 14.6421,
+      "step": 33100
+    },
+    {
+      "epoch": 17.870619946091644,
+      "grad_norm": 1.5586802959442139,
+      "learning_rate": 6.424864864864864e-05,
+      "loss": 14.6901,
+      "step": 33150
+    },
+    {
+      "epoch": 17.89757412398922,
+      "grad_norm": 1.6345175504684448,
+      "learning_rate": 6.343783783783783e-05,
+      "loss": 14.6739,
+      "step": 33200
+    },
+    {
+      "epoch": 17.92452830188679,
+      "grad_norm": 1.6852161884307861,
+      "learning_rate": 6.262702702702703e-05,
+      "loss": 14.7087,
+      "step": 33250
+    },
+    {
+      "epoch": 17.951482479784367,
+      "grad_norm": 1.5203602313995361,
+      "learning_rate": 6.181621621621622e-05,
+      "loss": 14.6883,
+      "step": 33300
+    },
+    {
+      "epoch": 17.97843665768194,
+      "grad_norm": 1.5950666666030884,
+      "learning_rate": 6.10054054054054e-05,
+      "loss": 14.6901,
+      "step": 33350
+    },
+    {
+      "epoch": 18.005390835579515,
+      "grad_norm": 1.5396767854690552,
+      "learning_rate": 6.019459459459459e-05,
+      "loss": 14.6374,
+      "step": 33400
+    },
+    {
+      "epoch": 18.032345013477087,
+      "grad_norm": 1.512335181236267,
+      "learning_rate": 5.9383783783783776e-05,
+      "loss": 14.4547,
+      "step": 33450
+    },
+    {
+      "epoch": 18.059299191374663,
+      "grad_norm": 1.6069568395614624,
+      "learning_rate": 5.857297297297297e-05,
+      "loss": 14.5278,
+      "step": 33500
+    },
+    {
+      "epoch": 18.08625336927224,
+      "grad_norm": 1.5221787691116333,
+      "learning_rate": 5.776216216216216e-05,
+      "loss": 14.5299,
+      "step": 33550
+    },
+    {
+      "epoch": 18.11320754716981,
+      "grad_norm": 1.5961333513259888,
+      "learning_rate": 5.6951351351351344e-05,
+      "loss": 14.5246,
+      "step": 33600
+    },
+    {
+      "epoch": 18.140161725067387,
+      "grad_norm": 1.5239356756210327,
+      "learning_rate": 5.614054054054054e-05,
+      "loss": 14.5502,
+      "step": 33650
+    },
+    {
+      "epoch": 18.16711590296496,
+      "grad_norm": 1.6545807123184204,
+      "learning_rate": 5.5329729729729725e-05,
+      "loss": 14.5051,
+      "step": 33700
+    },
+    {
+      "epoch": 18.194070080862534,
+      "grad_norm": 1.5053555965423584,
+      "learning_rate": 5.451891891891891e-05,
+      "loss": 14.5202,
+      "step": 33750
+    },
+    {
+      "epoch": 18.221024258760107,
+      "grad_norm": 1.5080546140670776,
+      "learning_rate": 5.37081081081081e-05,
+      "loss": 14.5393,
+      "step": 33800
+    },
+    {
+      "epoch": 18.247978436657682,
+      "grad_norm": 1.6003968715667725,
+      "learning_rate": 5.289729729729729e-05,
+      "loss": 14.5344,
+      "step": 33850
+    },
+    {
+      "epoch": 18.274932614555254,
+      "grad_norm": 1.5318918228149414,
+      "learning_rate": 5.208648648648648e-05,
+      "loss": 14.5318,
+      "step": 33900
+    },
+    {
+      "epoch": 18.30188679245283,
+      "grad_norm": 1.556462049484253,
+      "learning_rate": 5.127567567567567e-05,
+      "loss": 14.5708,
+      "step": 33950
+    },
+    {
+      "epoch": 18.328840970350406,
+      "grad_norm": 1.5578322410583496,
+      "learning_rate": 5.046486486486486e-05,
+      "loss": 14.5329,
+      "step": 34000
+    },
+    {
+      "epoch": 18.328840970350406,
+      "eval_accuracy": 0.39798331879638404,
+      "eval_loss": 3.2663168907165527,
+      "eval_runtime": 181.0745,
+      "eval_samples_per_second": 99.473,
+      "eval_steps_per_second": 6.218,
+      "step": 34000
+    },
+    {
+      "epoch": 18.355795148247978,
+      "grad_norm": 1.5834003686904907,
+      "learning_rate": 4.965405405405405e-05,
+      "loss": 14.5322,
+      "step": 34050
+    },
+    {
+      "epoch": 18.382749326145554,
+      "grad_norm": 1.5400707721710205,
+      "learning_rate": 4.8843243243243235e-05,
+      "loss": 14.56,
+      "step": 34100
+    },
+    {
+      "epoch": 18.409703504043126,
+      "grad_norm": 1.5586832761764526,
+      "learning_rate": 4.803243243243243e-05,
+      "loss": 14.5639,
+      "step": 34150
+    },
+    {
+      "epoch": 18.4366576819407,
+      "grad_norm": 1.521950364112854,
+      "learning_rate": 4.7221621621621616e-05,
+      "loss": 14.5583,
+      "step": 34200
+    },
+    {
+      "epoch": 18.463611859838274,
+      "grad_norm": 1.5423915386199951,
+      "learning_rate": 4.64108108108108e-05,
+      "loss": 14.5764,
+      "step": 34250
+    },
+    {
+      "epoch": 18.49056603773585,
+      "grad_norm": 1.524101734161377,
+      "learning_rate": 4.56e-05,
+      "loss": 14.511,
+      "step": 34300
+    },
+    {
+      "epoch": 18.51752021563342,
+      "grad_norm": 1.5166869163513184,
+      "learning_rate": 4.4789189189189184e-05,
+      "loss": 14.5173,
+      "step": 34350
+    },
+    {
+      "epoch": 18.544474393530997,
+      "grad_norm": 1.61070716381073,
+      "learning_rate": 4.397837837837837e-05,
+      "loss": 14.5917,
+      "step": 34400
+    },
+    {
+      "epoch": 18.571428571428573,
+      "grad_norm": 1.5960681438446045,
+      "learning_rate": 4.3167567567567565e-05,
+      "loss": 14.5383,
+      "step": 34450
+    },
+    {
+      "epoch": 18.598382749326145,
+      "grad_norm": 1.4851629734039307,
+      "learning_rate": 4.235675675675675e-05,
+      "loss": 14.5736,
+      "step": 34500
+    },
+    {
+      "epoch": 18.62533692722372,
+      "grad_norm": 1.5458905696868896,
+      "learning_rate": 4.154594594594594e-05,
+      "loss": 14.563,
+      "step": 34550
+    },
+    {
+      "epoch": 18.652291105121293,
+      "grad_norm": 1.500968098640442,
+      "learning_rate": 4.073513513513513e-05,
+      "loss": 14.5723,
+      "step": 34600
+    },
+    {
+      "epoch": 18.67924528301887,
+      "grad_norm": 1.5670329332351685,
+      "learning_rate": 3.992432432432432e-05,
+      "loss": 14.5681,
+      "step": 34650
+    },
+    {
+      "epoch": 18.70619946091644,
+      "grad_norm": 1.5421439409255981,
+      "learning_rate": 3.911351351351351e-05,
+      "loss": 14.5949,
+      "step": 34700
+    },
+    {
+      "epoch": 18.733153638814017,
+      "grad_norm": 1.541306734085083,
+      "learning_rate": 3.83027027027027e-05,
+      "loss": 14.5743,
+      "step": 34750
+    },
+    {
+      "epoch": 18.76010781671159,
+      "grad_norm": 1.5344128608703613,
+      "learning_rate": 3.749189189189189e-05,
+      "loss": 14.5806,
+      "step": 34800
+    },
+    {
+      "epoch": 18.787061994609164,
+      "grad_norm": 1.51657235622406,
+      "learning_rate": 3.6681081081081075e-05,
+      "loss": 14.5895,
+      "step": 34850
+    },
+    {
+      "epoch": 18.81401617250674,
+      "grad_norm": 1.5216045379638672,
+      "learning_rate": 3.587027027027026e-05,
+      "loss": 14.5862,
+      "step": 34900
+    },
+    {
+      "epoch": 18.840970350404312,
+      "grad_norm": 1.561399221420288,
+      "learning_rate": 3.5059459459459456e-05,
+      "loss": 14.588,
+      "step": 34950
+    },
+    {
+      "epoch": 18.867924528301888,
+      "grad_norm": 1.6107219457626343,
+      "learning_rate": 3.424864864864864e-05,
+      "loss": 14.5702,
+      "step": 35000
+    },
+    {
+      "epoch": 18.867924528301888,
+      "eval_accuracy": 0.3983890061550617,
+      "eval_loss": 3.261690855026245,
+      "eval_runtime": 180.7732,
+      "eval_samples_per_second": 99.639,
+      "eval_steps_per_second": 6.229,
+      "step": 35000
+    },
+    {
+      "epoch": 18.89487870619946,
+      "grad_norm": 1.5193698406219482,
+      "learning_rate": 3.343783783783783e-05,
+      "loss": 14.5882,
+      "step": 35050
+    },
+    {
+      "epoch": 18.921832884097036,
+      "grad_norm": 1.5151571035385132,
+      "learning_rate": 3.2627027027027024e-05,
+      "loss": 14.5474,
+      "step": 35100
+    },
+    {
+      "epoch": 18.948787061994608,
+      "grad_norm": 1.5188689231872559,
+      "learning_rate": 3.181621621621621e-05,
+      "loss": 14.5617,
+      "step": 35150
+    },
+    {
+      "epoch": 18.975741239892184,
+      "grad_norm": 1.5117980241775513,
+      "learning_rate": 3.10054054054054e-05,
+      "loss": 14.5587,
+      "step": 35200
+    },
+    {
+      "epoch": 19.002695417789756,
+      "grad_norm": 1.5636892318725586,
+      "learning_rate": 3.019459459459459e-05,
+      "loss": 14.5439,
+      "step": 35250
+    },
+    {
+      "epoch": 19.02964959568733,
+      "grad_norm": 1.5492565631866455,
+      "learning_rate": 2.938378378378378e-05,
+      "loss": 14.4387,
+      "step": 35300
+    },
+    {
+      "epoch": 19.056603773584907,
+      "grad_norm": 1.5516005754470825,
+      "learning_rate": 2.857297297297297e-05,
+      "loss": 14.4445,
+      "step": 35350
+    },
+    {
+      "epoch": 19.08355795148248,
+      "grad_norm": 1.533972978591919,
+      "learning_rate": 2.7762162162162163e-05,
+      "loss": 14.4359,
+      "step": 35400
+    },
+    {
+      "epoch": 19.110512129380055,
+      "grad_norm": 1.507752537727356,
+      "learning_rate": 2.695135135135135e-05,
+      "loss": 14.4215,
+      "step": 35450
+    },
+    {
+      "epoch": 19.137466307277627,
+      "grad_norm": 1.4865872859954834,
+      "learning_rate": 2.614054054054054e-05,
+      "loss": 14.4713,
+      "step": 35500
+    },
+    {
+      "epoch": 19.164420485175203,
+      "grad_norm": 1.5423104763031006,
+      "learning_rate": 2.5329729729729728e-05,
+      "loss": 14.4748,
+      "step": 35550
+    },
+    {
+      "epoch": 19.191374663072775,
+      "grad_norm": 1.5166065692901611,
+      "learning_rate": 2.451891891891892e-05,
+      "loss": 14.5064,
+      "step": 35600
+    },
+    {
+      "epoch": 19.21832884097035,
+      "grad_norm": 1.5488274097442627,
+      "learning_rate": 2.370810810810811e-05,
+      "loss": 14.497,
+      "step": 35650
+    },
+    {
+      "epoch": 19.245283018867923,
+      "grad_norm": 1.519723892211914,
+      "learning_rate": 2.2897297297297296e-05,
+      "loss": 14.4739,
+      "step": 35700
+    },
+    {
+      "epoch": 19.2722371967655,
+      "grad_norm": 1.5654048919677734,
+      "learning_rate": 2.2086486486486486e-05,
+      "loss": 14.4639,
+      "step": 35750
+    },
+    {
+      "epoch": 19.299191374663074,
+      "grad_norm": 1.5737123489379883,
+      "learning_rate": 2.1275675675675677e-05,
+      "loss": 14.4619,
+      "step": 35800
+    },
+    {
+      "epoch": 19.326145552560646,
+      "grad_norm": 1.5194604396820068,
+      "learning_rate": 2.0464864864864864e-05,
+      "loss": 14.4379,
+      "step": 35850
+    },
+    {
+      "epoch": 19.353099730458222,
+      "grad_norm": 1.4980615377426147,
+      "learning_rate": 1.9654054054054054e-05,
+      "loss": 14.4757,
+      "step": 35900
+    },
+    {
+      "epoch": 19.380053908355794,
+      "grad_norm": 1.5187228918075562,
+      "learning_rate": 1.8843243243243245e-05,
+      "loss": 14.4836,
+      "step": 35950
+    },
+    {
+      "epoch": 19.40700808625337,
+      "grad_norm": 1.5278230905532837,
+      "learning_rate": 1.8032432432432432e-05,
+      "loss": 14.4615,
+      "step": 36000
+    },
+    {
+      "epoch": 19.40700808625337,
+      "eval_accuracy": 0.39837216589244917,
+      "eval_loss": 3.264392614364624,
+      "eval_runtime": 180.6409,
+      "eval_samples_per_second": 99.712,
+      "eval_steps_per_second": 6.233,
+      "step": 36000
+    },
+    {
+      "epoch": 19.433962264150942,
+      "grad_norm": 1.5677344799041748,
+      "learning_rate": 1.722162162162162e-05,
+      "loss": 14.4584,
+      "step": 36050
+    },
+    {
+      "epoch": 19.460916442048518,
+      "grad_norm": 1.4985554218292236,
+      "learning_rate": 1.641081081081081e-05,
+      "loss": 14.4745,
+      "step": 36100
+    },
+    {
+      "epoch": 19.48787061994609,
+      "grad_norm": 1.5670796632766724,
+      "learning_rate": 1.5599999999999996e-05,
+      "loss": 14.4912,
+      "step": 36150
+    },
+    {
+      "epoch": 19.514824797843666,
+      "grad_norm": 1.5281639099121094,
+      "learning_rate": 1.4789189189189187e-05,
+      "loss": 14.4717,
+      "step": 36200
+    },
+    {
+      "epoch": 19.54177897574124,
+      "grad_norm": 1.5401979684829712,
+      "learning_rate": 1.3978378378378376e-05,
+      "loss": 14.4799,
+      "step": 36250
+    },
+    {
+      "epoch": 19.568733153638814,
+      "grad_norm": 1.5297380685806274,
+      "learning_rate": 1.3167567567567566e-05,
+      "loss": 14.4374,
+      "step": 36300
+    },
+    {
+      "epoch": 19.59568733153639,
+      "grad_norm": 1.5379348993301392,
+      "learning_rate": 1.2372972972972972e-05,
+      "loss": 14.4731,
+      "step": 36350
+    },
+    {
+      "epoch": 19.62264150943396,
+      "grad_norm": 1.5073282718658447,
+      "learning_rate": 1.156216216216216e-05,
+      "loss": 14.444,
+      "step": 36400
+    },
+    {
+      "epoch": 19.649595687331537,
+      "grad_norm": 1.4987374544143677,
+      "learning_rate": 1.0751351351351351e-05,
+      "loss": 14.5121,
+      "step": 36450
+    },
+    {
+      "epoch": 19.67654986522911,
+      "grad_norm": 1.527362585067749,
+      "learning_rate": 9.94054054054054e-06,
+      "loss": 14.5012,
+      "step": 36500
+    },
+    {
+      "epoch": 19.703504043126685,
+      "grad_norm": 1.567726492881775,
+      "learning_rate": 9.129729729729729e-06,
+      "loss": 14.4594,
+      "step": 36550
+    },
+    {
+      "epoch": 19.730458221024257,
+      "grad_norm": 1.5044466257095337,
+      "learning_rate": 8.318918918918918e-06,
+      "loss": 14.4537,
+      "step": 36600
+    },
+    {
+      "epoch": 19.757412398921833,
+      "grad_norm": 1.486252784729004,
+      "learning_rate": 7.508108108108107e-06,
+      "loss": 14.5124,
+      "step": 36650
+    },
+    {
+      "epoch": 19.78436657681941,
+      "grad_norm": 1.5210126638412476,
+      "learning_rate": 6.697297297297297e-06,
+      "loss": 14.4657,
+      "step": 36700
+    },
+    {
+      "epoch": 19.81132075471698,
+      "grad_norm": 1.5458662509918213,
+      "learning_rate": 5.8864864864864855e-06,
+      "loss": 14.4687,
+      "step": 36750
+    },
+    {
+      "epoch": 19.838274932614556,
+      "grad_norm": 1.511208415031433,
+      "learning_rate": 5.075675675675675e-06,
+      "loss": 14.4791,
+      "step": 36800
+    },
+    {
+      "epoch": 19.86522911051213,
+      "grad_norm": 1.5122674703598022,
+      "learning_rate": 4.264864864864865e-06,
+      "loss": 14.4729,
+      "step": 36850
+    },
+    {
+      "epoch": 19.892183288409704,
+      "grad_norm": 1.492714762687683,
+      "learning_rate": 3.454054054054054e-06,
+      "loss": 14.4866,
+      "step": 36900
+    },
+    {
+      "epoch": 19.919137466307276,
+      "grad_norm": 1.4897505044937134,
+      "learning_rate": 2.643243243243243e-06,
+      "loss": 14.4836,
+      "step": 36950
+    },
+    {
+      "epoch": 19.946091644204852,
+      "grad_norm": 1.5580779314041138,
+      "learning_rate": 1.8324324324324325e-06,
+      "loss": 14.4647,
+      "step": 37000
+    },
+    {
+      "epoch": 19.946091644204852,
+      "eval_accuracy": 0.3985308989484288,
+      "eval_loss": 3.2621402740478516,
+      "eval_runtime": 180.8079,
+      "eval_samples_per_second": 99.62,
+      "eval_steps_per_second": 6.228,
+      "step": 37000
+    },
+    {
+      "epoch": 19.973045822102424,
+      "grad_norm": 1.5771269798278809,
+      "learning_rate": 1.0216216216216215e-06,
+      "loss": 14.4453,
+      "step": 37050
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 2.5758163928985596,
+      "learning_rate": 2.108108108108108e-07,
+      "loss": 14.4773,
+      "step": 37100
+    },
+    {
+      "epoch": 20.0,
+      "step": 37100,
+      "total_flos": 1.55087795257344e+18,
+      "train_loss": 16.0859583358559,
+      "train_runtime": 127733.4325,
+      "train_samples_per_second": 46.467,
+      "train_steps_per_second": 0.29
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 37100,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 10000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.55087795257344e+18,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}