diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,7259 @@
+{
+  "best_global_step": 50000,
+  "best_metric": 0.5550713539123535,
+  "best_model_checkpoint": "checkpoints/mla_baseline/checkpoint-50000",
+  "epoch": 7.105300554213443,
+  "eval_steps": 2000,
+  "global_step": 50000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0071053005542134435,
+      "grad_norm": 0.3619440197944641,
+      "learning_rate": 4.9e-06,
+      "loss": 10.3021,
+      "step": 50
+    },
+    {
+      "epoch": 0.014210601108426887,
+      "grad_norm": 0.6546474099159241,
+      "learning_rate": 9.900000000000002e-06,
+      "loss": 10.2243,
+      "step": 100
+    },
+    {
+      "epoch": 0.02131590166264033,
+      "grad_norm": 2.2018182277679443,
+      "learning_rate": 1.49e-05,
+      "loss": 9.8459,
+      "step": 150
+    },
+    {
+      "epoch": 0.028421202216853774,
+      "grad_norm": 3.6677067279815674,
+      "learning_rate": 1.9900000000000003e-05,
+      "loss": 8.5751,
+      "step": 200
+    },
+    {
+      "epoch": 0.03552650277106722,
+      "grad_norm": 1.577499508857727,
+      "learning_rate": 2.49e-05,
+      "loss": 7.543,
+      "step": 250
+    },
+    {
+      "epoch": 0.04263180332528066,
+      "grad_norm": 1.397471308708191,
+      "learning_rate": 2.9900000000000002e-05,
+      "loss": 7.2528,
+      "step": 300
+    },
+    {
+      "epoch": 0.0497371038794941,
+      "grad_norm": 1.4008537530899048,
+      "learning_rate": 3.49e-05,
+      "loss": 7.1711,
+      "step": 350
+    },
+    {
+      "epoch": 0.05684240443370755,
+      "grad_norm": 1.5214428901672363,
+      "learning_rate": 3.99e-05,
+      "loss": 7.1438,
+      "step": 400
+    },
+    {
+      "epoch": 0.06394770498792099,
+      "grad_norm": 1.3703827857971191,
+      "learning_rate": 4.49e-05,
+      "loss": 7.1052,
+      "step": 450
+    },
+    {
+      "epoch": 0.07105300554213444,
+      "grad_norm": 1.3568859100341797,
+      "learning_rate": 4.99e-05,
+      "loss": 7.1017,
+      "step": 500
+    },
+    {
+      "epoch": 0.07815830609634787,
+      "grad_norm": 1.3110448122024536,
+      "learning_rate": 5.49e-05,
+      "loss": 7.0499,
+      "step": 550
+    },
+    {
+      "epoch": 0.08526360665056132,
+      "grad_norm": 1.4580223560333252,
+      "learning_rate": 5.9900000000000006e-05,
+      "loss": 7.0325,
+      "step": 600
+    },
+    {
+      "epoch": 0.09236890720477477,
+      "grad_norm": 1.3007315397262573,
+      "learning_rate": 6.49e-05,
+      "loss": 7.0092,
+      "step": 650
+    },
+    {
+      "epoch": 0.0994742077589882,
+      "grad_norm": 1.3965587615966797,
+      "learning_rate": 6.99e-05,
+      "loss": 6.9465,
+      "step": 700
+    },
+    {
+      "epoch": 0.10657950831320165,
+      "grad_norm": 1.5621792078018188,
+      "learning_rate": 7.489999999999999e-05,
+      "loss": 6.8977,
+      "step": 750
+    },
+    {
+      "epoch": 0.1136848088674151,
+      "grad_norm": 1.5563201904296875,
+      "learning_rate": 7.99e-05,
+      "loss": 6.8554,
+      "step": 800
+    },
+    {
+      "epoch": 0.12079010942162853,
+      "grad_norm": 1.4112974405288696,
+      "learning_rate": 8.49e-05,
+      "loss": 6.8084,
+      "step": 850
+    },
+    {
+      "epoch": 0.12789540997584198,
+      "grad_norm": 1.6433038711547852,
+      "learning_rate": 8.989999999999999e-05,
+      "loss": 6.7257,
+      "step": 900
+    },
+    {
+      "epoch": 0.1350007105300554,
+      "grad_norm": 1.9007697105407715,
+      "learning_rate": 9.49e-05,
+      "loss": 6.6945,
+      "step": 950
+    },
+    {
+      "epoch": 0.14210601108426887,
+      "grad_norm": 2.0347580909729004,
+      "learning_rate": 9.99e-05,
+      "loss": 6.6074,
+      "step": 1000
+    },
+    {
+      "epoch": 0.1492113116384823,
+      "grad_norm": 2.3306286334991455,
+      "learning_rate": 0.0001049,
+      "loss": 6.5402,
+      "step": 1050
+    },
+    {
+      "epoch": 0.15631661219269574,
+      "grad_norm": 3.961554527282715,
+      "learning_rate": 0.0001099,
+      "loss": 6.462,
+      "step": 1100
+    },
+    {
+      "epoch": 0.1634219127469092,
+      "grad_norm": 2.189784288406372,
+      "learning_rate": 0.0001149,
+      "loss": 6.3737,
+      "step": 1150
+    },
+    {
+      "epoch": 0.17052721330112264,
+      "grad_norm": 2.473756790161133,
+      "learning_rate": 0.00011990000000000001,
+      "loss": 6.2955,
+      "step": 1200
+    },
+    {
+      "epoch": 0.17763251385533607,
+      "grad_norm": 3.5680646896362305,
+      "learning_rate": 0.0001249,
+      "loss": 6.2441,
+      "step": 1250
+    },
+    {
+      "epoch": 0.18473781440954953,
+      "grad_norm": 3.6250619888305664,
+      "learning_rate": 0.00012989999999999999,
+      "loss": 6.1729,
+      "step": 1300
+    },
+    {
+      "epoch": 0.19184311496376297,
+      "grad_norm": 4.737584590911865,
+      "learning_rate": 0.0001349,
+      "loss": 6.1345,
+      "step": 1350
+    },
+    {
+      "epoch": 0.1989484155179764,
+      "grad_norm": 3.0449159145355225,
+      "learning_rate": 0.0001399,
+      "loss": 6.0541,
+      "step": 1400
+    },
+    {
+      "epoch": 0.20605371607218986,
+      "grad_norm": 3.516432285308838,
+      "learning_rate": 0.0001449,
+      "loss": 6.0235,
+      "step": 1450
+    },
+    {
+      "epoch": 0.2131590166264033,
+      "grad_norm": 3.8344614505767822,
+      "learning_rate": 0.0001499,
+      "loss": 5.9358,
+      "step": 1500
+    },
+    {
+      "epoch": 0.22026431718061673,
+      "grad_norm": 3.620025873184204,
+      "learning_rate": 0.00015490000000000002,
+      "loss": 5.9013,
+      "step": 1550
+    },
+    {
+      "epoch": 0.2273696177348302,
+      "grad_norm": 3.103908061981201,
+      "learning_rate": 0.00015989999999999998,
+      "loss": 5.8423,
+      "step": 1600
+    },
+    {
+      "epoch": 0.23447491828904363,
+      "grad_norm": 3.2771809101104736,
+      "learning_rate": 0.0001649,
+      "loss": 5.8007,
+      "step": 1650
+    },
+    {
+      "epoch": 0.24158021884325706,
+      "grad_norm": 3.067811965942383,
+      "learning_rate": 0.0001699,
+      "loss": 5.7556,
+      "step": 1700
+    },
+    {
+      "epoch": 0.24868551939747052,
+      "grad_norm": 3.275484085083008,
+      "learning_rate": 0.0001749,
+      "loss": 5.7074,
+      "step": 1750
+    },
+    {
+      "epoch": 0.25579081995168396,
+      "grad_norm": 3.4723703861236572,
+      "learning_rate": 0.0001799,
+      "loss": 5.6587,
+      "step": 1800
+    },
+    {
+      "epoch": 0.2628961205058974,
+      "grad_norm": 4.599445343017578,
+      "learning_rate": 0.00018490000000000002,
+      "loss": 5.6147,
+      "step": 1850
+    },
+    {
+      "epoch": 0.2700014210601108,
+      "grad_norm": 3.8192200660705566,
+      "learning_rate": 0.0001899,
+      "loss": 5.5743,
+      "step": 1900
+    },
+    {
+      "epoch": 0.27710672161432426,
+      "grad_norm": 4.365124702453613,
+      "learning_rate": 0.0001949,
+      "loss": 5.5159,
+      "step": 1950
+    },
+    {
+      "epoch": 0.28421202216853775,
+      "grad_norm": 4.356955051422119,
+      "learning_rate": 0.0001999,
+      "loss": 5.4936,
+      "step": 2000
+    },
+    {
+      "epoch": 0.28421202216853775,
+      "eval_accuracy": 0.2797648310661316,
+      "eval_loss": 5.373836994171143,
+      "eval_runtime": 1.4619,
+      "eval_samples_per_second": 2572.026,
+      "eval_steps_per_second": 40.359,
+      "step": 2000
+    },
+    {
+      "epoch": 0.2913173227227512,
+      "grad_norm": 3.7968027591705322,
+      "learning_rate": 0.0002049,
+      "loss": 5.4659,
+      "step": 2050
+    },
+    {
+      "epoch": 0.2984226232769646,
+      "grad_norm": 3.9787704944610596,
+      "learning_rate": 0.0002099,
+      "loss": 5.3956,
+      "step": 2100
+    },
+    {
+      "epoch": 0.30552792383117805,
+      "grad_norm": 4.154063701629639,
+      "learning_rate": 0.00021490000000000002,
+      "loss": 5.3716,
+      "step": 2150
+    },
+    {
+      "epoch": 0.3126332243853915,
+      "grad_norm": 3.5421087741851807,
+      "learning_rate": 0.0002199,
+      "loss": 5.3506,
+      "step": 2200
+    },
+    {
+      "epoch": 0.3197385249396049,
+      "grad_norm": 4.011205673217773,
+      "learning_rate": 0.0002249,
+      "loss": 5.295,
+      "step": 2250
+    },
+    {
+      "epoch": 0.3268438254938184,
+      "grad_norm": 3.6677510738372803,
+      "learning_rate": 0.0002299,
+      "loss": 5.2688,
+      "step": 2300
+    },
+    {
+      "epoch": 0.33394912604803184,
+      "grad_norm": 3.733154773712158,
+      "learning_rate": 0.0002349,
+      "loss": 5.2771,
+      "step": 2350
+    },
+    {
+      "epoch": 0.3410544266022453,
+      "grad_norm": 3.485593795776367,
+      "learning_rate": 0.0002399,
+      "loss": 5.2569,
+      "step": 2400
+    },
+    {
+      "epoch": 0.3481597271564587,
+      "grad_norm": 3.414398193359375,
+      "learning_rate": 0.0002449,
+      "loss": 5.1798,
+      "step": 2450
+    },
+    {
+      "epoch": 0.35526502771067214,
+      "grad_norm": 4.8961896896362305,
+      "learning_rate": 0.0002499,
+      "loss": 5.1564,
+      "step": 2500
+    },
+    {
+      "epoch": 0.3623703282648856,
+      "grad_norm": 3.7125649452209473,
+      "learning_rate": 0.0002549,
+      "loss": 5.1338,
+      "step": 2550
+    },
+    {
+      "epoch": 0.36947562881909907,
+      "grad_norm": 3.6601812839508057,
+      "learning_rate": 0.00025990000000000003,
+      "loss": 5.1297,
+      "step": 2600
+    },
+    {
+      "epoch": 0.3765809293733125,
+      "grad_norm": 3.997366189956665,
+      "learning_rate": 0.00026490000000000004,
+      "loss": 5.0859,
+      "step": 2650
+    },
+    {
+      "epoch": 0.38368622992752593,
+      "grad_norm": 3.54914927482605,
+      "learning_rate": 0.0002699,
+      "loss": 5.0545,
+      "step": 2700
+    },
+    {
+      "epoch": 0.39079153048173937,
+      "grad_norm": 3.6549172401428223,
+      "learning_rate": 0.00027489999999999996,
+      "loss": 5.0494,
+      "step": 2750
+    },
+    {
+      "epoch": 0.3978968310359528,
+      "grad_norm": 3.403808355331421,
+      "learning_rate": 0.0002799,
+      "loss": 5.0164,
+      "step": 2800
+    },
+    {
+      "epoch": 0.40500213159016624,
+      "grad_norm": 3.342942476272583,
+      "learning_rate": 0.0002849,
+      "loss": 4.9945,
+      "step": 2850
+    },
+    {
+      "epoch": 0.4121074321443797,
+      "grad_norm": 3.8801138401031494,
+      "learning_rate": 0.0002899,
+      "loss": 4.9666,
+      "step": 2900
+    },
+    {
+      "epoch": 0.41921273269859316,
+      "grad_norm": 3.295564651489258,
+      "learning_rate": 0.0002949,
+      "loss": 4.9704,
+      "step": 2950
+    },
+    {
+      "epoch": 0.4263180332528066,
+      "grad_norm": 3.265148639678955,
+      "learning_rate": 0.0002999,
+      "loss": 4.9451,
+      "step": 3000
+    },
+    {
+      "epoch": 0.43342333380702003,
+      "grad_norm": 3.1508865356445312,
+      "learning_rate": 0.0003049,
+      "loss": 4.9264,
+      "step": 3050
+    },
+    {
+      "epoch": 0.44052863436123346,
+      "grad_norm": 3.8297386169433594,
+      "learning_rate": 0.0003099,
+      "loss": 4.9052,
+      "step": 3100
+    },
+    {
+      "epoch": 0.4476339349154469,
+      "grad_norm": 3.190356969833374,
+      "learning_rate": 0.0003149,
+      "loss": 4.8845,
+      "step": 3150
+    },
+    {
+      "epoch": 0.4547392354696604,
+      "grad_norm": 3.5795695781707764,
+      "learning_rate": 0.0003199,
+      "loss": 4.8459,
+      "step": 3200
+    },
+    {
+      "epoch": 0.4618445360238738,
+      "grad_norm": 3.614764451980591,
+      "learning_rate": 0.00032490000000000004,
+      "loss": 4.8369,
+      "step": 3250
+    },
+    {
+      "epoch": 0.46894983657808725,
+      "grad_norm": 3.8342933654785156,
+      "learning_rate": 0.00032990000000000005,
+      "loss": 4.7909,
+      "step": 3300
+    },
+    {
+      "epoch": 0.4760551371323007,
+      "grad_norm": 3.1796512603759766,
+      "learning_rate": 0.0003349,
+      "loss": 4.7868,
+      "step": 3350
+    },
+    {
+      "epoch": 0.4831604376865141,
+      "grad_norm": 3.308342695236206,
+      "learning_rate": 0.00033989999999999997,
+      "loss": 4.7647,
+      "step": 3400
+    },
+    {
+      "epoch": 0.49026573824072756,
+      "grad_norm": 3.8211753368377686,
+      "learning_rate": 0.0003449,
+      "loss": 4.7579,
+      "step": 3450
+    },
+    {
+      "epoch": 0.49737103879494104,
+      "grad_norm": 3.686267375946045,
+      "learning_rate": 0.0003499,
+      "loss": 4.7682,
+      "step": 3500
+    },
+    {
+      "epoch": 0.5044763393491545,
+      "grad_norm": 3.1786656379699707,
+      "learning_rate": 0.0003549,
+      "loss": 4.7018,
+      "step": 3550
+    },
+    {
+      "epoch": 0.5115816399033679,
+      "grad_norm": 3.0715014934539795,
+      "learning_rate": 0.0003599,
+      "loss": 4.7151,
+      "step": 3600
+    },
+    {
+      "epoch": 0.5186869404575813,
+      "grad_norm": 2.9246482849121094,
+      "learning_rate": 0.00036490000000000003,
+      "loss": 4.6897,
+      "step": 3650
+    },
+    {
+      "epoch": 0.5257922410117948,
+      "grad_norm": 3.4063315391540527,
+      "learning_rate": 0.0003699,
+      "loss": 4.6572,
+      "step": 3700
+    },
+    {
+      "epoch": 0.5328975415660082,
+      "grad_norm": 2.8954086303710938,
+      "learning_rate": 0.0003749,
+      "loss": 4.6345,
+      "step": 3750
+    },
+    {
+      "epoch": 0.5400028421202216,
+      "grad_norm": 3.2454769611358643,
+      "learning_rate": 0.0003799,
+      "loss": 4.6397,
+      "step": 3800
+    },
+    {
+      "epoch": 0.5471081426744351,
+      "grad_norm": 3.0364902019500732,
+      "learning_rate": 0.00038490000000000003,
+      "loss": 4.6329,
+      "step": 3850
+    },
+    {
+      "epoch": 0.5542134432286485,
+      "grad_norm": 2.843865156173706,
+      "learning_rate": 0.00038990000000000004,
+      "loss": 4.6107,
+      "step": 3900
+    },
+    {
+      "epoch": 0.5613187437828621,
+      "grad_norm": 3.196857213973999,
+      "learning_rate": 0.0003949,
+      "loss": 4.5821,
+      "step": 3950
+    },
+    {
+      "epoch": 0.5684240443370755,
+      "grad_norm": 3.3867523670196533,
+      "learning_rate": 0.00039989999999999996,
+      "loss": 4.5726,
+      "step": 4000
+    },
+    {
+      "epoch": 0.5684240443370755,
+      "eval_accuracy": 0.3473648726940155,
+      "eval_loss": 4.424046516418457,
+      "eval_runtime": 1.2247,
+      "eval_samples_per_second": 3070.052,
+      "eval_steps_per_second": 48.174,
+      "step": 4000
+    },
+    {
+      "epoch": 0.5755293448912889,
+      "grad_norm": 2.6906516551971436,
+      "learning_rate": 0.0004049,
+      "loss": 4.5637,
+      "step": 4050
+    },
+    {
+      "epoch": 0.5826346454455024,
+      "grad_norm": 2.7520558834075928,
+      "learning_rate": 0.0004099,
+      "loss": 4.5297,
+      "step": 4100
+    },
+    {
+      "epoch": 0.5897399459997158,
+      "grad_norm": 2.7561872005462646,
+      "learning_rate": 0.0004149,
+      "loss": 4.5373,
+      "step": 4150
+    },
+    {
+      "epoch": 0.5968452465539292,
+      "grad_norm": 2.703505754470825,
+      "learning_rate": 0.0004199,
+      "loss": 4.5301,
+      "step": 4200
+    },
+    {
+      "epoch": 0.6039505471081427,
+      "grad_norm": 2.954202175140381,
+      "learning_rate": 0.00042490000000000003,
+      "loss": 4.4973,
+      "step": 4250
+    },
+    {
+      "epoch": 0.6110558476623561,
+      "grad_norm": 2.657067060470581,
+      "learning_rate": 0.0004299,
+      "loss": 4.4914,
+      "step": 4300
+    },
+    {
+      "epoch": 0.6181611482165695,
+      "grad_norm": 2.738647222518921,
+      "learning_rate": 0.0004349,
+      "loss": 4.4614,
+      "step": 4350
+    },
+    {
+      "epoch": 0.625266448770783,
+      "grad_norm": 2.5784761905670166,
+      "learning_rate": 0.0004399,
+      "loss": 4.43,
+      "step": 4400
+    },
+    {
+      "epoch": 0.6323717493249964,
+      "grad_norm": 2.204275608062744,
+      "learning_rate": 0.0004449,
+      "loss": 4.4255,
+      "step": 4450
+    },
+    {
+      "epoch": 0.6394770498792098,
+      "grad_norm": 2.524068832397461,
+      "learning_rate": 0.00044990000000000004,
+      "loss": 4.4287,
+      "step": 4500
+    },
+    {
+      "epoch": 0.6465823504334234,
+      "grad_norm": 2.4962570667266846,
+      "learning_rate": 0.00045490000000000005,
+      "loss": 4.3569,
+      "step": 4550
+    },
+    {
+      "epoch": 0.6536876509876368,
+      "grad_norm": 2.712149143218994,
+      "learning_rate": 0.0004599,
+      "loss": 4.3769,
+      "step": 4600
+    },
+    {
+      "epoch": 0.6607929515418502,
+      "grad_norm": 2.447850465774536,
+      "learning_rate": 0.00046489999999999997,
+      "loss": 4.3919,
+      "step": 4650
+    },
+    {
+      "epoch": 0.6678982520960637,
+      "grad_norm": 1.9662154912948608,
+      "learning_rate": 0.0004699,
+      "loss": 4.3945,
+      "step": 4700
+    },
+    {
+      "epoch": 0.6750035526502771,
+      "grad_norm": 2.773207426071167,
+      "learning_rate": 0.0004749,
+      "loss": 4.3456,
+      "step": 4750
+    },
+    {
+      "epoch": 0.6821088532044906,
+      "grad_norm": 2.235621690750122,
+      "learning_rate": 0.0004799,
+      "loss": 4.3292,
+      "step": 4800
+    },
+    {
+      "epoch": 0.689214153758704,
+      "grad_norm": 2.2144429683685303,
+      "learning_rate": 0.0004849,
+      "loss": 4.3334,
+      "step": 4850
+    },
+    {
+      "epoch": 0.6963194543129174,
+      "grad_norm": 2.3465662002563477,
+      "learning_rate": 0.0004899,
+      "loss": 4.3216,
+      "step": 4900
+    },
+    {
+      "epoch": 0.7034247548671309,
+      "grad_norm": 2.3879144191741943,
+      "learning_rate": 0.0004949,
+      "loss": 4.293,
+      "step": 4950
+    },
+    {
+      "epoch": 0.7105300554213443,
+      "grad_norm": 2.186339855194092,
+      "learning_rate": 0.0004999000000000001,
+      "loss": 4.2757,
+      "step": 5000
+    },
+    {
+      "epoch": 0.7176353559755577,
+      "grad_norm": 2.113415241241455,
+      "learning_rate": 0.0004994555555555555,
+      "loss": 4.2609,
+      "step": 5050
+    },
+    {
+      "epoch": 0.7247406565297712,
+      "grad_norm": 2.384539842605591,
+      "learning_rate": 0.0004989,
+      "loss": 4.2621,
+      "step": 5100
+    },
+    {
+      "epoch": 0.7318459570839847,
+      "grad_norm": 2.57897686958313,
+      "learning_rate": 0.0004983444444444444,
+      "loss": 4.2507,
+      "step": 5150
+    },
+    {
+      "epoch": 0.7389512576381981,
+      "grad_norm": 2.3281402587890625,
+      "learning_rate": 0.0004977888888888889,
+      "loss": 4.2517,
+      "step": 5200
+    },
+    {
+      "epoch": 0.7460565581924116,
+      "grad_norm": 2.055142879486084,
+      "learning_rate": 0.0004972333333333334,
+      "loss": 4.2112,
+      "step": 5250
+    },
+    {
+      "epoch": 0.753161858746625,
+      "grad_norm": 2.339130163192749,
+      "learning_rate": 0.0004966777777777778,
+      "loss": 4.205,
+      "step": 5300
+    },
+    {
+      "epoch": 0.7602671593008384,
+      "grad_norm": 2.2506520748138428,
+      "learning_rate": 0.0004961222222222223,
+      "loss": 4.2257,
+      "step": 5350
+    },
+    {
+      "epoch": 0.7673724598550519,
+      "grad_norm": 2.1256449222564697,
+      "learning_rate": 0.0004955666666666667,
+      "loss": 4.1923,
+      "step": 5400
+    },
+    {
+      "epoch": 0.7744777604092653,
+      "grad_norm": 1.9763379096984863,
+      "learning_rate": 0.0004950111111111112,
+      "loss": 4.154,
+      "step": 5450
+    },
+    {
+      "epoch": 0.7815830609634787,
+      "grad_norm": 2.272706985473633,
+      "learning_rate": 0.0004944555555555555,
+      "loss": 4.1438,
+      "step": 5500
+    },
+    {
+      "epoch": 0.7886883615176922,
+      "grad_norm": 2.2818517684936523,
+      "learning_rate": 0.0004939,
+      "loss": 4.1481,
+      "step": 5550
+    },
+    {
+      "epoch": 0.7957936620719056,
+      "grad_norm": 2.158761739730835,
+      "learning_rate": 0.0004933444444444444,
+      "loss": 4.1347,
+      "step": 5600
+    },
+    {
+      "epoch": 0.802898962626119,
+      "grad_norm": 2.0649828910827637,
+      "learning_rate": 0.0004927888888888889,
+      "loss": 4.1106,
+      "step": 5650
+    },
+    {
+      "epoch": 0.8100042631803325,
+      "grad_norm": 2.074234962463379,
+      "learning_rate": 0.0004922333333333334,
+      "loss": 4.1174,
+      "step": 5700
+    },
+    {
+      "epoch": 0.817109563734546,
+      "grad_norm": 2.0552144050598145,
+      "learning_rate": 0.0004916777777777778,
+      "loss": 4.0816,
+      "step": 5750
+    },
+    {
+      "epoch": 0.8242148642887595,
+      "grad_norm": 2.1459226608276367,
+      "learning_rate": 0.0004911222222222223,
+      "loss": 4.1048,
+      "step": 5800
+    },
+    {
+      "epoch": 0.8313201648429729,
+      "grad_norm": 2.0078556537628174,
+      "learning_rate": 0.0004905666666666666,
+      "loss": 4.0741,
+      "step": 5850
+    },
+    {
+      "epoch": 0.8384254653971863,
+      "grad_norm": 1.962790608406067,
+      "learning_rate": 0.0004900111111111111,
+      "loss": 4.0615,
+      "step": 5900
+    },
+    {
+      "epoch": 0.8455307659513998,
+      "grad_norm": 2.1868464946746826,
+      "learning_rate": 0.0004894555555555555,
+      "loss": 4.0513,
+      "step": 5950
+    },
+    {
+      "epoch": 0.8526360665056132,
+      "grad_norm": 2.160015344619751,
+      "learning_rate": 0.0004889,
+      "loss": 4.0401,
+      "step": 6000
+    },
+    {
+      "epoch": 0.8526360665056132,
+      "eval_accuracy": 0.389207124710083,
+      "eval_loss": 3.9373788833618164,
+      "eval_runtime": 1.4022,
+      "eval_samples_per_second": 2681.497,
+      "eval_steps_per_second": 42.077,
+      "step": 6000
+    },
+    {
+      "epoch": 0.8597413670598266,
+      "grad_norm": 1.8545076847076416,
+      "learning_rate": 0.0004883444444444445,
+      "loss": 4.0452,
+      "step": 6050
+    },
+    {
+      "epoch": 0.8668466676140401,
+      "grad_norm": 2.2014081478118896,
+      "learning_rate": 0.0004877888888888889,
+      "loss": 4.0081,
+      "step": 6100
+    },
+    {
+      "epoch": 0.8739519681682535,
+      "grad_norm": 1.886440396308899,
+      "learning_rate": 0.0004872333333333334,
+      "loss": 4.0104,
+      "step": 6150
+    },
+    {
+      "epoch": 0.8810572687224669,
+      "grad_norm": 2.234833240509033,
+      "learning_rate": 0.00048667777777777776,
+      "loss": 3.9821,
+      "step": 6200
+    },
+    {
+      "epoch": 0.8881625692766804,
+      "grad_norm": 2.0455291271209717,
+      "learning_rate": 0.00048612222222222225,
+      "loss": 3.9822,
+      "step": 6250
+    },
+    {
+      "epoch": 0.8952678698308938,
+      "grad_norm": 2.097593069076538,
+      "learning_rate": 0.00048556666666666663,
+      "loss": 3.9991,
+      "step": 6300
+    },
+    {
+      "epoch": 0.9023731703851073,
+      "grad_norm": 2.0400683879852295,
+      "learning_rate": 0.0004850111111111111,
+      "loss": 3.9679,
+      "step": 6350
+    },
+    {
+      "epoch": 0.9094784709393208,
+      "grad_norm": 2.0201427936553955,
+      "learning_rate": 0.00048445555555555556,
+      "loss": 3.9712,
+      "step": 6400
+    },
+    {
+      "epoch": 0.9165837714935342,
+      "grad_norm": 2.175518035888672,
+      "learning_rate": 0.0004839,
+      "loss": 3.9371,
+      "step": 6450
+    },
+    {
+      "epoch": 0.9236890720477476,
+      "grad_norm": 1.8984272480010986,
+      "learning_rate": 0.0004833444444444445,
+      "loss": 3.9372,
+      "step": 6500
+    },
+    {
+      "epoch": 0.9307943726019611,
+      "grad_norm": 2.0253663063049316,
+      "learning_rate": 0.00048278888888888887,
+      "loss": 3.9369,
+      "step": 6550
+    },
+    {
+      "epoch": 0.9378996731561745,
+      "grad_norm": 1.9045063257217407,
+      "learning_rate": 0.00048223333333333336,
+      "loss": 3.9284,
+      "step": 6600
+    },
+    {
+      "epoch": 0.9450049737103879,
+      "grad_norm": 2.129824161529541,
+      "learning_rate": 0.00048167777777777775,
+      "loss": 3.8851,
+      "step": 6650
+    },
+    {
+      "epoch": 0.9521102742646014,
+      "grad_norm": 2.022346258163452,
+      "learning_rate": 0.00048112222222222224,
+      "loss": 3.9011,
+      "step": 6700
+    },
+    {
+      "epoch": 0.9592155748188148,
+      "grad_norm": 1.9612109661102295,
+      "learning_rate": 0.0004805666666666667,
+      "loss": 3.8745,
+      "step": 6750
+    },
+    {
+      "epoch": 0.9663208753730282,
+      "grad_norm": 1.9245905876159668,
+      "learning_rate": 0.0004800111111111111,
+      "loss": 3.8801,
+      "step": 6800
+    },
+    {
+      "epoch": 0.9734261759272417,
+      "grad_norm": 2.065843343734741,
+      "learning_rate": 0.0004794555555555556,
+      "loss": 3.8506,
+      "step": 6850
+    },
+    {
+      "epoch": 0.9805314764814551,
+      "grad_norm": 2.1017391681671143,
+      "learning_rate": 0.0004789,
+      "loss": 3.8554,
+      "step": 6900
+    },
+    {
+      "epoch": 0.9876367770356687,
+      "grad_norm": 1.9719597101211548,
+      "learning_rate": 0.0004783444444444445,
+      "loss": 3.8459,
+      "step": 6950
+    },
+    {
+      "epoch": 0.9947420775898821,
+      "grad_norm": 2.016110897064209,
+      "learning_rate": 0.00047778888888888886,
+      "loss": 3.8261,
+      "step": 7000
+    },
+    {
+      "epoch": 1.0018473781440955,
+      "grad_norm": 1.8949023485183716,
+      "learning_rate": 0.00047723333333333335,
+      "loss": 3.8615,
+      "step": 7050
+    },
+    {
+      "epoch": 1.008952678698309,
+      "grad_norm": 1.8229880332946777,
+      "learning_rate": 0.0004766777777777778,
+      "loss": 3.8052,
+      "step": 7100
+    },
+    {
+      "epoch": 1.0160579792525224,
+      "grad_norm": 2.2516708374023438,
+      "learning_rate": 0.0004761222222222222,
+      "loss": 3.8019,
+      "step": 7150
+    },
+    {
+      "epoch": 1.0231632798067358,
+      "grad_norm": 2.0374722480773926,
+      "learning_rate": 0.0004755666666666667,
+      "loss": 3.8016,
+      "step": 7200
+    },
+    {
+      "epoch": 1.0302685803609493,
+      "grad_norm": 2.0370278358459473,
+      "learning_rate": 0.0004750111111111111,
+      "loss": 3.7872,
+      "step": 7250
+    },
+    {
+      "epoch": 1.0373738809151627,
+      "grad_norm": 2.0572760105133057,
+      "learning_rate": 0.0004744555555555556,
+      "loss": 3.7898,
+      "step": 7300
+    },
+    {
+      "epoch": 1.0444791814693761,
+      "grad_norm": 2.3242228031158447,
+      "learning_rate": 0.00047389999999999997,
+      "loss": 3.7669,
+      "step": 7350
+    },
+    {
+      "epoch": 1.0515844820235896,
+      "grad_norm": 2.0401973724365234,
+      "learning_rate": 0.00047334444444444446,
+      "loss": 3.7725,
+      "step": 7400
+    },
+    {
+      "epoch": 1.058689782577803,
+      "grad_norm": 1.954636812210083,
+      "learning_rate": 0.0004727888888888889,
+      "loss": 3.7939,
+      "step": 7450
+    },
+    {
+      "epoch": 1.0657950831320164,
+      "grad_norm": 1.949588418006897,
+      "learning_rate": 0.00047223333333333334,
+      "loss": 3.7626,
+      "step": 7500
+    },
+    {
+      "epoch": 1.0729003836862299,
+      "grad_norm": 1.9143036603927612,
+      "learning_rate": 0.0004716777777777778,
+      "loss": 3.7593,
+      "step": 7550
+    },
+    {
+      "epoch": 1.0800056842404433,
+      "grad_norm": 1.8144210577011108,
+      "learning_rate": 0.0004711222222222222,
+      "loss": 3.7563,
+      "step": 7600
+    },
+    {
+      "epoch": 1.0871109847946567,
+      "grad_norm": 1.971367597579956,
+      "learning_rate": 0.0004705666666666667,
+      "loss": 3.7496,
+      "step": 7650
+    },
+    {
+      "epoch": 1.0942162853488702,
+      "grad_norm": 2.0770599842071533,
+      "learning_rate": 0.0004700111111111111,
+      "loss": 3.7137,
+      "step": 7700
+    },
+    {
+      "epoch": 1.1013215859030836,
+      "grad_norm": 1.8662265539169312,
+      "learning_rate": 0.0004694555555555556,
+      "loss": 3.7118,
+      "step": 7750
+    },
+    {
+      "epoch": 1.108426886457297,
+      "grad_norm": 1.9397633075714111,
+      "learning_rate": 0.0004689,
+      "loss": 3.7241,
+      "step": 7800
+    },
+    {
+      "epoch": 1.1155321870115107,
+      "grad_norm": 2.0750577449798584,
+      "learning_rate": 0.00046834444444444445,
+      "loss": 3.6847,
+      "step": 7850
+    },
+    {
+      "epoch": 1.1226374875657241,
+      "grad_norm": 2.2893929481506348,
+      "learning_rate": 0.0004677888888888889,
+      "loss": 3.7101,
+      "step": 7900
+    },
+    {
+      "epoch": 1.1297427881199376,
+      "grad_norm": 1.8849507570266724,
+      "learning_rate": 0.0004672333333333333,
+      "loss": 3.6931,
+      "step": 7950
+    },
+    {
+      "epoch": 1.136848088674151,
+      "grad_norm": 2.108226776123047,
+      "learning_rate": 0.0004666777777777778,
+      "loss": 3.6962,
+      "step": 8000
+    },
+    {
+      "epoch": 1.136848088674151,
+      "eval_accuracy": 0.41970664262771606,
+      "eval_loss": 3.6095504760742188,
+      "eval_runtime": 1.3165,
+      "eval_samples_per_second": 2855.969,
+      "eval_steps_per_second": 44.814,
+      "step": 8000
+    },
+    {
+      "epoch": 1.1439533892283644,
+      "grad_norm": 1.8215097188949585,
+      "learning_rate": 0.0004661222222222222,
+      "loss": 3.6758,
+      "step": 8050
+    },
+    {
+      "epoch": 1.1510586897825779,
+      "grad_norm": 2.1105434894561768,
+      "learning_rate": 0.0004655666666666667,
+      "loss": 3.6832,
+      "step": 8100
+    },
+    {
+      "epoch": 1.1581639903367913,
+      "grad_norm": 1.933396339416504,
+      "learning_rate": 0.0004650111111111111,
+      "loss": 3.6691,
+      "step": 8150
+    },
+    {
+      "epoch": 1.1652692908910047,
+      "grad_norm": 1.8069454431533813,
+      "learning_rate": 0.00046445555555555556,
+      "loss": 3.632,
+      "step": 8200
+    },
+    {
+      "epoch": 1.1723745914452182,
+      "grad_norm": 1.9175587892532349,
+      "learning_rate": 0.0004639,
+      "loss": 3.6569,
+      "step": 8250
+    },
+    {
+      "epoch": 1.1794798919994316,
+      "grad_norm": 1.936651587486267,
+      "learning_rate": 0.00046334444444444444,
+      "loss": 3.6549,
+      "step": 8300
+    },
+    {
+      "epoch": 1.186585192553645,
+      "grad_norm": 1.8092012405395508,
+      "learning_rate": 0.00046278888888888893,
+      "loss": 3.6326,
+      "step": 8350
+    },
+    {
+      "epoch": 1.1936904931078585,
+      "grad_norm": 1.741718053817749,
+      "learning_rate": 0.00046223333333333337,
+      "loss": 3.6232,
+      "step": 8400
+    },
+    {
+      "epoch": 1.200795793662072,
+      "grad_norm": 1.9609713554382324,
+      "learning_rate": 0.0004616777777777778,
+      "loss": 3.6275,
+      "step": 8450
+    },
+    {
+      "epoch": 1.2079010942162853,
+      "grad_norm": 1.6691536903381348,
+      "learning_rate": 0.00046112222222222224,
+      "loss": 3.6359,
+      "step": 8500
+    },
+    {
+      "epoch": 1.2150063947704988,
+      "grad_norm": 1.8362239599227905,
+      "learning_rate": 0.0004605666666666667,
+      "loss": 3.6195,
+      "step": 8550
+    },
+    {
+      "epoch": 1.2221116953247122,
+      "grad_norm": 1.9224746227264404,
+      "learning_rate": 0.0004600111111111111,
+      "loss": 3.6203,
+      "step": 8600
+    },
+    {
+      "epoch": 1.2292169958789256,
+      "grad_norm": 1.791446566581726,
+      "learning_rate": 0.00045945555555555555,
+      "loss": 3.605,
+      "step": 8650
+    },
+    {
+      "epoch": 1.236322296433139,
+      "grad_norm": 2.0857369899749756,
+      "learning_rate": 0.0004589,
+      "loss": 3.599,
+      "step": 8700
+    },
+    {
+      "epoch": 1.2434275969873525,
+      "grad_norm": 2.1251144409179688,
+      "learning_rate": 0.0004583444444444445,
+      "loss": 3.6082,
+      "step": 8750
+    },
+    {
+      "epoch": 1.250532897541566,
+      "grad_norm": 1.901963710784912,
+      "learning_rate": 0.0004577888888888889,
+      "loss": 3.6067,
+      "step": 8800
+    },
+    {
+      "epoch": 1.2576381980957794,
+      "grad_norm": 1.745914340019226,
+      "learning_rate": 0.00045723333333333335,
+      "loss": 3.6024,
+      "step": 8850
+    },
+    {
+      "epoch": 1.264743498649993,
+      "grad_norm": 1.7997791767120361,
+      "learning_rate": 0.0004566777777777778,
+      "loss": 3.5741,
+      "step": 8900
+    },
+    {
+      "epoch": 1.2718487992042062,
+      "grad_norm": 1.8684380054473877,
+      "learning_rate": 0.0004561222222222222,
+      "loss": 3.5767,
+      "step": 8950
+    },
+    {
+      "epoch": 1.27895409975842,
+      "grad_norm": 1.9908355474472046,
+      "learning_rate": 0.00045556666666666666,
+      "loss": 3.5529,
+      "step": 9000
+    },
+    {
+      "epoch": 1.286059400312633,
+      "grad_norm": 1.8683586120605469,
+      "learning_rate": 0.0004550111111111111,
+      "loss": 3.5614,
+      "step": 9050
+    },
+    {
+      "epoch": 1.2931647008668468,
+      "grad_norm": 1.766913890838623,
+      "learning_rate": 0.0004544555555555556,
+      "loss": 3.5393,
+      "step": 9100
+    },
+    {
+      "epoch": 1.30027000142106,
+      "grad_norm": 2.004033088684082,
+      "learning_rate": 0.00045390000000000003,
+      "loss": 3.5424,
+      "step": 9150
+    },
+    {
+      "epoch": 1.3073753019752736,
+      "grad_norm": 1.9604741334915161,
+      "learning_rate": 0.00045334444444444447,
+      "loss": 3.5526,
+      "step": 9200
+    },
+    {
+      "epoch": 1.314480602529487,
+      "grad_norm": 1.9911085367202759,
+      "learning_rate": 0.0004527888888888889,
+      "loss": 3.5261,
+      "step": 9250
+    },
+    {
+      "epoch": 1.3215859030837005,
+      "grad_norm": 1.9417774677276611,
+      "learning_rate": 0.00045223333333333334,
+      "loss": 3.545,
+      "step": 9300
+    },
+    {
+      "epoch": 1.328691203637914,
+      "grad_norm": 1.9904927015304565,
+      "learning_rate": 0.0004516777777777778,
+      "loss": 3.5167,
+      "step": 9350
+    },
+    {
+      "epoch": 1.3357965041921274,
+      "grad_norm": 1.8322969675064087,
+      "learning_rate": 0.0004511222222222222,
+      "loss": 3.498,
+      "step": 9400
+    },
+    {
+      "epoch": 1.3429018047463408,
+      "grad_norm": 1.797074794769287,
+      "learning_rate": 0.0004505666666666667,
+      "loss": 3.5084,
+      "step": 9450
+    },
+    {
+      "epoch": 1.3500071053005542,
+      "grad_norm": 1.8438552618026733,
+      "learning_rate": 0.0004500111111111111,
+      "loss": 3.5053,
+      "step": 9500
+    },
+    {
+      "epoch": 1.3571124058547677,
+      "grad_norm": 1.8281489610671997,
+      "learning_rate": 0.0004494555555555556,
+      "loss": 3.4848,
+      "step": 9550
+    },
+    {
+      "epoch": 1.364217706408981,
+      "grad_norm": 1.9461642503738403,
+      "learning_rate": 0.0004489,
+      "loss": 3.4721,
+      "step": 9600
+    },
+    {
+      "epoch": 1.3713230069631945,
+      "grad_norm": 1.9854313135147095,
+      "learning_rate": 0.00044834444444444445,
+      "loss": 3.4813,
+      "step": 9650
+    },
+    {
+      "epoch": 1.378428307517408,
+      "grad_norm": 1.9185129404067993,
+      "learning_rate": 0.0004477888888888889,
+      "loss": 3.5082,
+      "step": 9700
+    },
+    {
+      "epoch": 1.3855336080716214,
+      "grad_norm": 1.9525820016860962,
+      "learning_rate": 0.0004472333333333333,
+      "loss": 3.4965,
+      "step": 9750
+    },
+    {
+      "epoch": 1.3926389086258348,
+      "grad_norm": 1.9303693771362305,
+      "learning_rate": 0.0004466777777777778,
+      "loss": 3.5032,
+      "step": 9800
+    },
+    {
+      "epoch": 1.3997442091800483,
+      "grad_norm": 1.8870890140533447,
+      "learning_rate": 0.0004461222222222222,
+      "loss": 3.4689,
+      "step": 9850
+    },
+    {
+      "epoch": 1.4068495097342617,
+      "grad_norm": 1.9007459878921509,
+      "learning_rate": 0.0004455666666666667,
+      "loss": 3.4519,
+      "step": 9900
+    },
+    {
+      "epoch": 1.4139548102884751,
+      "grad_norm": 2.0525524616241455,
+      "learning_rate": 0.00044501111111111113,
+      "loss": 3.4673,
+      "step": 9950
+    },
+    {
+      "epoch": 1.4210601108426886,
+      "grad_norm": 1.9106584787368774,
+      "learning_rate": 0.00044445555555555557,
+      "loss": 3.4609,
+      "step": 10000
+    },
+    {
+      "epoch": 1.4210601108426886,
+      "eval_accuracy": 0.44234737753868103,
+      "eval_loss": 3.3924427032470703,
+      "eval_runtime": 1.3159,
+      "eval_samples_per_second": 2857.31,
+      "eval_steps_per_second": 44.835,
+      "step": 10000
+    },
+    {
+      "epoch": 1.428165411396902,
+      "grad_norm": 1.6311852931976318,
+      "learning_rate": 0.0004439,
+      "loss": 3.4179,
+      "step": 10050
+    },
+    {
+      "epoch": 1.4352707119511154,
+      "grad_norm": 1.6985563039779663,
+      "learning_rate": 0.00044334444444444444,
+      "loss": 3.4466,
+      "step": 10100
+    },
+    {
+      "epoch": 1.442376012505329,
+      "grad_norm": 1.7384531497955322,
+      "learning_rate": 0.00044278888888888893,
+      "loss": 3.4444,
+      "step": 10150
+    },
+    {
+      "epoch": 1.4494813130595423,
+      "grad_norm": 1.8844335079193115,
+      "learning_rate": 0.0004422333333333333,
+      "loss": 3.4162,
+      "step": 10200
+    },
+    {
+      "epoch": 1.456586613613756,
+      "grad_norm": 1.8525067567825317,
+      "learning_rate": 0.0004416777777777778,
+      "loss": 3.4217,
+      "step": 10250
+    },
+    {
+      "epoch": 1.4636919141679692,
+      "grad_norm": 1.924688458442688,
+      "learning_rate": 0.00044112222222222224,
+      "loss": 3.441,
+      "step": 10300
+    },
+    {
+      "epoch": 1.4707972147221828,
+      "grad_norm": 1.8880536556243896,
+      "learning_rate": 0.0004405666666666667,
+      "loss": 3.4166,
+      "step": 10350
+    },
+    {
+      "epoch": 1.4779025152763963,
+      "grad_norm": 1.8569647073745728,
+      "learning_rate": 0.0004400111111111111,
+      "loss": 3.4239,
+      "step": 10400
+    },
+    {
+      "epoch": 1.4850078158306097,
+      "grad_norm": 1.7077707052230835,
+      "learning_rate": 0.00043945555555555555,
+      "loss": 3.4355,
+      "step": 10450
+    },
+    {
+      "epoch": 1.4921131163848231,
+      "grad_norm": 2.164099931716919,
+      "learning_rate": 0.00043890000000000004,
+      "loss": 3.4213,
+      "step": 10500
+    },
+    {
+      "epoch": 1.4992184169390366,
+      "grad_norm": 1.7151411771774292,
+      "learning_rate": 0.0004383444444444444,
+      "loss": 3.4066,
+      "step": 10550
+    },
+    {
+      "epoch": 1.50632371749325,
+      "grad_norm": 1.7368758916854858,
+      "learning_rate": 0.0004377888888888889,
+      "loss": 3.3885,
+      "step": 10600
+    },
+    {
+      "epoch": 1.5134290180474634,
+      "grad_norm": 1.9382424354553223,
+      "learning_rate": 0.0004372333333333333,
+      "loss": 3.4183,
+      "step": 10650
+    },
+    {
+      "epoch": 1.5205343186016769,
+      "grad_norm": 1.9375344514846802,
+      "learning_rate": 0.0004366777777777778,
+      "loss": 3.4014,
+      "step": 10700
+    },
+    {
+      "epoch": 1.5276396191558903,
+      "grad_norm": 1.8492896556854248,
+      "learning_rate": 0.00043612222222222223,
+      "loss": 3.3994,
+      "step": 10750
+    },
+    {
+      "epoch": 1.5347449197101037,
+      "grad_norm": 1.8383281230926514,
+      "learning_rate": 0.00043556666666666666,
+      "loss": 3.3695,
+      "step": 10800
+    },
+    {
+      "epoch": 1.5418502202643172,
+      "grad_norm": 1.785221815109253,
+      "learning_rate": 0.00043501111111111116,
+      "loss": 3.4025,
+      "step": 10850
+    },
+    {
+      "epoch": 1.5489555208185306,
+      "grad_norm": 1.8311967849731445,
+      "learning_rate": 0.00043445555555555554,
+      "loss": 3.3803,
+      "step": 10900
+    },
+    {
+      "epoch": 1.556060821372744,
+      "grad_norm": 2.0591444969177246,
+      "learning_rate": 0.00043390000000000003,
+      "loss": 3.3792,
+      "step": 10950
+    },
+    {
+      "epoch": 1.5631661219269575,
+      "grad_norm": 1.7114486694335938,
+      "learning_rate": 0.0004333444444444444,
+      "loss": 3.3562,
+      "step": 11000
+    },
+    {
+      "epoch": 1.570271422481171,
+      "grad_norm": 1.8185391426086426,
+      "learning_rate": 0.0004327888888888889,
+      "loss": 3.3397,
+      "step": 11050
+    },
+    {
+      "epoch": 1.5773767230353843,
+      "grad_norm": 2.1638777256011963,
+      "learning_rate": 0.00043223333333333334,
+      "loss": 3.3304,
+      "step": 11100
+    },
+    {
+      "epoch": 1.5844820235895978,
+      "grad_norm": 1.8578804731369019,
+      "learning_rate": 0.0004316777777777778,
+      "loss": 3.3724,
+      "step": 11150
+    },
+    {
+      "epoch": 1.5915873241438114,
+      "grad_norm": 1.8585854768753052,
+      "learning_rate": 0.00043112222222222227,
+      "loss": 3.3607,
+      "step": 11200
+    },
+    {
+      "epoch": 1.5986926246980246,
+      "grad_norm": 1.7061985731124878,
+      "learning_rate": 0.00043056666666666665,
+      "loss": 3.3526,
+      "step": 11250
+    },
+    {
+      "epoch": 1.6057979252522383,
+      "grad_norm": 1.9890382289886475,
+      "learning_rate": 0.00043001111111111114,
+      "loss": 3.3264,
+      "step": 11300
+    },
+    {
+      "epoch": 1.6129032258064515,
+      "grad_norm": 1.9284510612487793,
+      "learning_rate": 0.0004294555555555555,
+      "loss": 3.322,
+      "step": 11350
+    },
+    {
+      "epoch": 1.6200085263606652,
+      "grad_norm": 1.802822470664978,
+      "learning_rate": 0.0004289,
+      "loss": 3.356,
+      "step": 11400
+    },
+    {
+      "epoch": 1.6271138269148784,
+      "grad_norm": 1.6525734663009644,
+      "learning_rate": 0.0004283444444444445,
+      "loss": 3.343,
+      "step": 11450
+    },
+    {
+      "epoch": 1.634219127469092,
+      "grad_norm": 1.9781888723373413,
+      "learning_rate": 0.0004277888888888889,
+      "loss": 3.3424,
+      "step": 11500
+    },
+    {
+      "epoch": 1.6413244280233052,
+      "grad_norm": 1.78948175907135,
+      "learning_rate": 0.0004272333333333334,
+      "loss": 3.3334,
+      "step": 11550
+    },
+    {
+      "epoch": 1.648429728577519,
+      "grad_norm": 1.9074255228042603,
+      "learning_rate": 0.00042667777777777776,
+      "loss": 3.3221,
+      "step": 11600
+    },
+    {
+      "epoch": 1.6555350291317321,
+      "grad_norm": 1.723588466644287,
+      "learning_rate": 0.00042612222222222226,
+      "loss": 3.3214,
+      "step": 11650
+    },
+    {
+      "epoch": 1.6626403296859458,
+      "grad_norm": 1.7900450229644775,
+      "learning_rate": 0.00042556666666666664,
+      "loss": 3.3557,
+      "step": 11700
+    },
+    {
+      "epoch": 1.6697456302401592,
+      "grad_norm": 1.8456897735595703,
+      "learning_rate": 0.00042501111111111113,
+      "loss": 3.3195,
+      "step": 11750
+    },
+    {
+      "epoch": 1.6768509307943726,
+      "grad_norm": 1.9056233167648315,
+      "learning_rate": 0.0004244555555555555,
+      "loss": 3.3365,
+      "step": 11800
+    },
+    {
+      "epoch": 1.683956231348586,
+      "grad_norm": 1.88231360912323,
+      "learning_rate": 0.0004239,
+      "loss": 3.3063,
+      "step": 11850
+    },
+    {
+      "epoch": 1.6910615319027995,
+      "grad_norm": 1.961814045906067,
+      "learning_rate": 0.0004233444444444445,
+      "loss": 3.3011,
+      "step": 11900
+    },
+    {
+      "epoch": 1.698166832457013,
+      "grad_norm": 1.6388925313949585,
+      "learning_rate": 0.0004227888888888889,
+      "loss": 3.29,
+      "step": 11950
+    },
+    {
+      "epoch": 1.7052721330112264,
+      "grad_norm": 1.8381247520446777,
+      "learning_rate": 0.00042223333333333337,
+      "loss": 3.3135,
+      "step": 12000
+    },
+    {
+      "epoch": 1.7052721330112264,
+      "eval_accuracy": 0.4627563953399658,
+      "eval_loss": 3.2176170349121094,
+      "eval_runtime": 1.4396,
+      "eval_samples_per_second": 2611.836,
+      "eval_steps_per_second": 40.984,
+      "step": 12000
+    },
+    {
+      "epoch": 1.7123774335654398,
+      "grad_norm": 1.9075206518173218,
+      "learning_rate": 0.00042167777777777775,
+      "loss": 3.2829,
+      "step": 12050
+    },
+    {
+      "epoch": 1.7194827341196532,
+      "grad_norm": 1.8718541860580444,
+      "learning_rate": 0.00042112222222222224,
+      "loss": 3.3016,
+      "step": 12100
+    },
+    {
+      "epoch": 1.7265880346738667,
+      "grad_norm": 1.821109414100647,
+      "learning_rate": 0.0004205666666666667,
+      "loss": 3.2981,
+      "step": 12150
+    },
+    {
+      "epoch": 1.7336933352280801,
+      "grad_norm": 1.821947455406189,
+      "learning_rate": 0.0004200111111111111,
+      "loss": 3.2915,
+      "step": 12200
+    },
+    {
+      "epoch": 1.7407986357822935,
+      "grad_norm": 1.940075397491455,
+      "learning_rate": 0.0004194555555555556,
+      "loss": 3.2839,
+      "step": 12250
+    },
+    {
+      "epoch": 1.747903936336507,
+      "grad_norm": 1.779321312904358,
+      "learning_rate": 0.0004189,
+      "loss": 3.2556,
+      "step": 12300
+    },
+    {
+      "epoch": 1.7550092368907206,
+      "grad_norm": 1.8860771656036377,
+      "learning_rate": 0.0004183444444444445,
+      "loss": 3.29,
+      "step": 12350
+    },
+    {
+      "epoch": 1.7621145374449338,
+      "grad_norm": 1.7763786315917969,
+      "learning_rate": 0.00041778888888888886,
+      "loss": 3.284,
+      "step": 12400
+    },
+    {
+      "epoch": 1.7692198379991475,
+      "grad_norm": 1.822303295135498,
+      "learning_rate": 0.00041723333333333336,
+      "loss": 3.2684,
+      "step": 12450
+    },
+    {
+      "epoch": 1.7763251385533607,
+      "grad_norm": 1.8546795845031738,
+      "learning_rate": 0.0004166777777777778,
+      "loss": 3.2722,
+      "step": 12500
+    },
+    {
+      "epoch": 1.7834304391075744,
+      "grad_norm": 1.8313467502593994,
+      "learning_rate": 0.00041612222222222223,
+      "loss": 3.2599,
+      "step": 12550
+    },
+    {
+      "epoch": 1.7905357396617876,
+      "grad_norm": 1.8988237380981445,
+      "learning_rate": 0.00041556666666666667,
+      "loss": 3.2451,
+      "step": 12600
+    },
+    {
+      "epoch": 1.7976410402160012,
+      "grad_norm": 1.7995522022247314,
+      "learning_rate": 0.0004150111111111111,
+      "loss": 3.2431,
+      "step": 12650
+    },
+    {
+      "epoch": 1.8047463407702145,
+      "grad_norm": 1.8105956315994263,
+      "learning_rate": 0.0004144555555555556,
+      "loss": 3.2426,
+      "step": 12700
+    },
+    {
+      "epoch": 1.811851641324428,
+      "grad_norm": 1.672658920288086,
+      "learning_rate": 0.0004139,
+      "loss": 3.2177,
+      "step": 12750
+    },
+    {
+      "epoch": 1.8189569418786413,
+      "grad_norm": 1.6561784744262695,
+      "learning_rate": 0.00041334444444444447,
+      "loss": 3.2395,
+      "step": 12800
+    },
+    {
+      "epoch": 1.826062242432855,
+      "grad_norm": 1.965090274810791,
+      "learning_rate": 0.0004127888888888889,
+      "loss": 3.2797,
+      "step": 12850
+    },
+    {
+      "epoch": 1.8331675429870682,
+      "grad_norm": 1.803202509880066,
+      "learning_rate": 0.00041223333333333334,
+      "loss": 3.2293,
+      "step": 12900
+    },
+    {
+      "epoch": 1.8402728435412818,
+      "grad_norm": 1.8006333112716675,
+      "learning_rate": 0.0004116777777777778,
+      "loss": 3.2415,
+      "step": 12950
+    },
+    {
+      "epoch": 1.8473781440954953,
+      "grad_norm": 1.754680871963501,
+      "learning_rate": 0.0004111222222222222,
+      "loss": 3.2471,
+      "step": 13000
+    },
+    {
+      "epoch": 1.8544834446497087,
+      "grad_norm": 1.9389216899871826,
+      "learning_rate": 0.0004105666666666667,
+      "loss": 3.2215,
+      "step": 13050
+    },
+    {
+      "epoch": 1.8615887452039221,
+      "grad_norm": 1.6737974882125854,
+      "learning_rate": 0.0004100111111111111,
+      "loss": 3.2347,
+      "step": 13100
+    },
+    {
+      "epoch": 1.8686940457581356,
+      "grad_norm": 1.608807921409607,
+      "learning_rate": 0.0004094555555555556,
+      "loss": 3.2219,
+      "step": 13150
+    },
+    {
+      "epoch": 1.875799346312349,
+      "grad_norm": 1.9688620567321777,
+      "learning_rate": 0.0004089,
+      "loss": 3.2489,
+      "step": 13200
+    },
+    {
+      "epoch": 1.8829046468665624,
+      "grad_norm": 1.6064422130584717,
+      "learning_rate": 0.00040834444444444446,
+      "loss": 3.2206,
+      "step": 13250
+    },
+    {
+      "epoch": 1.8900099474207759,
+      "grad_norm": 1.7799787521362305,
+      "learning_rate": 0.0004077888888888889,
+      "loss": 3.2219,
+      "step": 13300
+    },
+    {
+      "epoch": 1.8971152479749893,
+      "grad_norm": 1.880999207496643,
+      "learning_rate": 0.00040723333333333333,
+      "loss": 3.2213,
+      "step": 13350
+    },
+    {
+      "epoch": 1.9042205485292028,
+      "grad_norm": 1.9010646343231201,
+      "learning_rate": 0.0004066777777777778,
+      "loss": 3.2001,
+      "step": 13400
+    },
+    {
+      "epoch": 1.9113258490834162,
+      "grad_norm": 1.7856796979904175,
+      "learning_rate": 0.0004061222222222222,
+      "loss": 3.2293,
+      "step": 13450
+    },
+    {
+      "epoch": 1.9184311496376296,
+      "grad_norm": 1.7832741737365723,
+      "learning_rate": 0.0004055666666666667,
+      "loss": 3.2087,
+      "step": 13500
+    },
+    {
+      "epoch": 1.925536450191843,
+      "grad_norm": 1.817251205444336,
+      "learning_rate": 0.00040501111111111113,
+      "loss": 3.2004,
+      "step": 13550
+    },
+    {
+      "epoch": 1.9326417507460567,
+      "grad_norm": 1.5597301721572876,
+      "learning_rate": 0.00040445555555555557,
+      "loss": 3.1885,
+      "step": 13600
+    },
+    {
+      "epoch": 1.93974705130027,
+      "grad_norm": 1.6983693838119507,
+      "learning_rate": 0.0004039,
+      "loss": 3.1858,
+      "step": 13650
+    },
+    {
+      "epoch": 1.9468523518544836,
+      "grad_norm": 1.6369034051895142,
+      "learning_rate": 0.00040334444444444444,
+      "loss": 3.1833,
+      "step": 13700
+    },
+    {
+      "epoch": 1.9539576524086968,
+      "grad_norm": 1.6948421001434326,
+      "learning_rate": 0.0004027888888888889,
+      "loss": 3.1958,
+      "step": 13750
+    },
+    {
+      "epoch": 1.9610629529629104,
+      "grad_norm": 1.742189645767212,
+      "learning_rate": 0.0004022333333333333,
+      "loss": 3.2171,
+      "step": 13800
+    },
+    {
+      "epoch": 1.9681682535171237,
+      "grad_norm": 1.7551114559173584,
+      "learning_rate": 0.0004016777777777778,
+      "loss": 3.1991,
+      "step": 13850
+    },
+    {
+      "epoch": 1.9752735540713373,
+      "grad_norm": 1.8701705932617188,
+      "learning_rate": 0.00040112222222222224,
+      "loss": 3.1838,
+      "step": 13900
+    },
+    {
+      "epoch": 1.9823788546255505,
+      "grad_norm": 1.7148219347000122,
+      "learning_rate": 0.0004005666666666667,
+      "loss": 3.1522,
+      "step": 13950
+    },
+    {
+      "epoch": 1.9894841551797642,
+      "grad_norm": 1.7814823389053345,
+      "learning_rate": 0.0004000111111111111,
+      "loss": 3.1841,
+      "step": 14000
+    },
+    {
+      "epoch": 1.9894841551797642,
+      "eval_accuracy": 0.471990168094635,
+      "eval_loss": 3.1252591609954834,
+      "eval_runtime": 1.4365,
+      "eval_samples_per_second": 2617.479,
+      "eval_steps_per_second": 41.072,
+      "step": 14000
+    },
+    {
+      "epoch": 1.9965894557339774,
+      "grad_norm": 1.8632270097732544,
+      "learning_rate": 0.00039945555555555556,
+      "loss": 3.1887,
+      "step": 14050
+    },
+    {
+      "epoch": 2.003694756288191,
+      "grad_norm": 1.8517438173294067,
+      "learning_rate": 0.0003989,
+      "loss": 3.1434,
+      "step": 14100
+    },
+    {
+      "epoch": 2.0108000568424043,
+      "grad_norm": 1.9222501516342163,
+      "learning_rate": 0.00039834444444444443,
+      "loss": 3.1432,
+      "step": 14150
+    },
+    {
+      "epoch": 2.017905357396618,
+      "grad_norm": 1.9569604396820068,
+      "learning_rate": 0.0003977888888888889,
+      "loss": 3.1752,
+      "step": 14200
+    },
+    {
+      "epoch": 2.025010657950831,
+      "grad_norm": 1.9165287017822266,
+      "learning_rate": 0.00039723333333333336,
+      "loss": 3.1438,
+      "step": 14250
+    },
+    {
+      "epoch": 2.032115958505045,
+      "grad_norm": 1.8013373613357544,
+      "learning_rate": 0.0003966777777777778,
+      "loss": 3.1602,
+      "step": 14300
+    },
+    {
+      "epoch": 2.039221259059258,
+      "grad_norm": 1.7178173065185547,
+      "learning_rate": 0.00039612222222222223,
+      "loss": 3.1763,
+      "step": 14350
+    },
+    {
+      "epoch": 2.0463265596134717,
+      "grad_norm": 1.762027382850647,
+      "learning_rate": 0.00039556666666666667,
+      "loss": 3.1565,
+      "step": 14400
+    },
+    {
+      "epoch": 2.0534318601676853,
+      "grad_norm": 1.7229429483413696,
+      "learning_rate": 0.0003950111111111111,
+      "loss": 3.1485,
+      "step": 14450
+    },
+    {
+      "epoch": 2.0605371607218985,
+      "grad_norm": 1.9075498580932617,
+      "learning_rate": 0.00039445555555555554,
+      "loss": 3.156,
+      "step": 14500
+    },
+    {
+      "epoch": 2.067642461276112,
+      "grad_norm": 1.7284672260284424,
+      "learning_rate": 0.0003939,
+      "loss": 3.1324,
+      "step": 14550
+    },
+    {
+      "epoch": 2.0747477618303254,
+      "grad_norm": 1.9364721775054932,
+      "learning_rate": 0.00039334444444444447,
+      "loss": 3.1528,
+      "step": 14600
+    },
+    {
+      "epoch": 2.081853062384539,
+      "grad_norm": 1.8764890432357788,
+      "learning_rate": 0.0003927888888888889,
+      "loss": 3.1491,
+      "step": 14650
+    },
+    {
+      "epoch": 2.0889583629387523,
+      "grad_norm": 1.8724685907363892,
+      "learning_rate": 0.00039223333333333334,
+      "loss": 3.1342,
+      "step": 14700
+    },
+    {
+      "epoch": 2.096063663492966,
+      "grad_norm": 1.7195576429367065,
+      "learning_rate": 0.0003916777777777778,
+      "loss": 3.1256,
+      "step": 14750
+    },
+    {
+      "epoch": 2.103168964047179,
+      "grad_norm": 1.8319681882858276,
+      "learning_rate": 0.0003911222222222222,
+      "loss": 3.1422,
+      "step": 14800
+    },
+    {
+      "epoch": 2.110274264601393,
+      "grad_norm": 1.840844750404358,
+      "learning_rate": 0.00039056666666666666,
+      "loss": 3.1501,
+      "step": 14850
+    },
+    {
+      "epoch": 2.117379565155606,
+      "grad_norm": 1.756561517715454,
+      "learning_rate": 0.0003900111111111111,
+      "loss": 3.124,
+      "step": 14900
+    },
+    {
+      "epoch": 2.1244848657098196,
+      "grad_norm": 1.6923362016677856,
+      "learning_rate": 0.0003894555555555556,
+      "loss": 3.1231,
+      "step": 14950
+    },
+    {
+      "epoch": 2.131590166264033,
+      "grad_norm": 1.7511463165283203,
+      "learning_rate": 0.0003889,
+      "loss": 3.105,
+      "step": 15000
+    },
+    {
+      "epoch": 2.1386954668182465,
+      "grad_norm": 1.9665418863296509,
+      "learning_rate": 0.00038834444444444446,
+      "loss": 3.1074,
+      "step": 15050
+    },
+    {
+      "epoch": 2.1458007673724597,
+      "grad_norm": 1.7199262380599976,
+      "learning_rate": 0.0003877888888888889,
+      "loss": 3.1231,
+      "step": 15100
+    },
+    {
+      "epoch": 2.1529060679266734,
+      "grad_norm": 2.058184862136841,
+      "learning_rate": 0.00038723333333333333,
+      "loss": 3.096,
+      "step": 15150
+    },
+    {
+      "epoch": 2.1600113684808866,
+      "grad_norm": 1.7477974891662598,
+      "learning_rate": 0.00038667777777777777,
+      "loss": 3.117,
+      "step": 15200
+    },
+    {
+      "epoch": 2.1671166690351003,
+      "grad_norm": 1.8791770935058594,
+      "learning_rate": 0.0003861222222222222,
+      "loss": 3.1073,
+      "step": 15250
+    },
+    {
+      "epoch": 2.1742219695893135,
+      "grad_norm": 1.924623727798462,
+      "learning_rate": 0.0003855666666666667,
+      "loss": 3.1212,
+      "step": 15300
+    },
+    {
+      "epoch": 2.181327270143527,
+      "grad_norm": 1.8112562894821167,
+      "learning_rate": 0.00038501111111111113,
+      "loss": 3.092,
+      "step": 15350
+    },
+    {
+      "epoch": 2.1884325706977403,
+      "grad_norm": 1.9288897514343262,
+      "learning_rate": 0.00038445555555555557,
+      "loss": 3.0911,
+      "step": 15400
+    },
+    {
+      "epoch": 2.195537871251954,
+      "grad_norm": 1.8161592483520508,
+      "learning_rate": 0.0003839,
+      "loss": 3.0984,
+      "step": 15450
+    },
+    {
+      "epoch": 2.202643171806167,
+      "grad_norm": 1.7706139087677002,
+      "learning_rate": 0.00038334444444444444,
+      "loss": 3.0908,
+      "step": 15500
+    },
+    {
+      "epoch": 2.209748472360381,
+      "grad_norm": 2.003417491912842,
+      "learning_rate": 0.00038278888888888894,
+      "loss": 3.092,
+      "step": 15550
+    },
+    {
+      "epoch": 2.216853772914594,
+      "grad_norm": 1.8415908813476562,
+      "learning_rate": 0.0003822333333333333,
+      "loss": 3.1011,
+      "step": 15600
+    },
+    {
+      "epoch": 2.2239590734688077,
+      "grad_norm": 1.7236992120742798,
+      "learning_rate": 0.0003816777777777778,
+      "loss": 3.1022,
+      "step": 15650
+    },
+    {
+      "epoch": 2.2310643740230214,
+      "grad_norm": 1.8835231065750122,
+      "learning_rate": 0.0003811222222222222,
+      "loss": 3.0866,
+      "step": 15700
+    },
+    {
+      "epoch": 2.2381696745772346,
+      "grad_norm": 1.805284023284912,
+      "learning_rate": 0.0003805666666666667,
+      "loss": 3.1058,
+      "step": 15750
+    },
+    {
+      "epoch": 2.2452749751314482,
+      "grad_norm": 1.9442837238311768,
+      "learning_rate": 0.0003800111111111111,
+      "loss": 3.0923,
+      "step": 15800
+    },
+    {
+      "epoch": 2.2523802756856615,
+      "grad_norm": 1.8718860149383545,
+      "learning_rate": 0.00037945555555555556,
+      "loss": 3.1035,
+      "step": 15850
+    },
+    {
+      "epoch": 2.259485576239875,
+      "grad_norm": 1.8578605651855469,
+      "learning_rate": 0.00037890000000000005,
+      "loss": 3.0904,
+      "step": 15900
+    },
+    {
+      "epoch": 2.2665908767940883,
+      "grad_norm": 1.7682795524597168,
+      "learning_rate": 0.00037834444444444443,
+      "loss": 3.1211,
+      "step": 15950
+    },
+    {
+      "epoch": 2.273696177348302,
+      "grad_norm": 1.7548738718032837,
+      "learning_rate": 0.0003777888888888889,
+      "loss": 3.0719,
+      "step": 16000
+    },
+    {
+      "epoch": 2.273696177348302,
+      "eval_accuracy": 0.47854650020599365,
+      "eval_loss": 3.0652825832366943,
+      "eval_runtime": 1.4549,
+      "eval_samples_per_second": 2584.303,
+      "eval_steps_per_second": 40.552,
+      "step": 16000
+    },
+    {
+      "epoch": 2.280801477902515,
+      "grad_norm": 1.7292568683624268,
+      "learning_rate": 0.0003772333333333333,
+      "loss": 3.0797,
+      "step": 16050
+    },
+    {
+      "epoch": 2.287906778456729,
+      "grad_norm": 1.8499785661697388,
+      "learning_rate": 0.0003766777777777778,
+      "loss": 3.0689,
+      "step": 16100
+    },
+    {
+      "epoch": 2.295012079010942,
+      "grad_norm": 1.7851368188858032,
+      "learning_rate": 0.00037612222222222223,
+      "loss": 3.07,
+      "step": 16150
+    },
+    {
+      "epoch": 2.3021173795651557,
+      "grad_norm": 1.7935354709625244,
+      "learning_rate": 0.00037556666666666667,
+      "loss": 3.0801,
+      "step": 16200
+    },
+    {
+      "epoch": 2.309222680119369,
+      "grad_norm": 1.778581142425537,
+      "learning_rate": 0.00037501111111111116,
+      "loss": 3.0556,
+      "step": 16250
+    },
+    {
+      "epoch": 2.3163279806735826,
+      "grad_norm": 1.8388044834136963,
+      "learning_rate": 0.00037445555555555554,
+      "loss": 3.0656,
+      "step": 16300
+    },
+    {
+      "epoch": 2.323433281227796,
+      "grad_norm": 1.9298747777938843,
+      "learning_rate": 0.00037390000000000004,
+      "loss": 3.054,
+      "step": 16350
+    },
+    {
+      "epoch": 2.3305385817820095,
+      "grad_norm": 1.7658896446228027,
+      "learning_rate": 0.0003733444444444444,
+      "loss": 3.0465,
+      "step": 16400
+    },
+    {
+      "epoch": 2.3376438823362227,
+      "grad_norm": 1.7524223327636719,
+      "learning_rate": 0.0003727888888888889,
+      "loss": 3.0571,
+      "step": 16450
+    },
+    {
+      "epoch": 2.3447491828904363,
+      "grad_norm": 1.7193357944488525,
+      "learning_rate": 0.00037223333333333335,
+      "loss": 3.0535,
+      "step": 16500
+    },
+    {
+      "epoch": 2.3518544834446495,
+      "grad_norm": 2.003408193588257,
+      "learning_rate": 0.0003716777777777778,
+      "loss": 3.0622,
+      "step": 16550
+    },
+    {
+      "epoch": 2.358959783998863,
+      "grad_norm": 2.0907719135284424,
+      "learning_rate": 0.0003711222222222223,
+      "loss": 3.0641,
+      "step": 16600
+    },
+    {
+      "epoch": 2.3660650845530764,
+      "grad_norm": 1.819555401802063,
+      "learning_rate": 0.00037056666666666666,
+      "loss": 3.0539,
+      "step": 16650
+    },
+    {
+      "epoch": 2.37317038510729,
+      "grad_norm": 1.6507291793823242,
+      "learning_rate": 0.00037001111111111115,
+      "loss": 3.0291,
+      "step": 16700
+    },
+    {
+      "epoch": 2.3802756856615037,
+      "grad_norm": 1.763790249824524,
+      "learning_rate": 0.00036945555555555553,
+      "loss": 3.0576,
+      "step": 16750
+    },
+    {
+      "epoch": 2.387380986215717,
+      "grad_norm": 1.8909801244735718,
+      "learning_rate": 0.0003689,
+      "loss": 3.0673,
+      "step": 16800
+    },
+    {
+      "epoch": 2.39448628676993,
+      "grad_norm": 1.701228380203247,
+      "learning_rate": 0.0003683444444444444,
+      "loss": 3.0441,
+      "step": 16850
+    },
+    {
+      "epoch": 2.401591587324144,
+      "grad_norm": 1.7287544012069702,
+      "learning_rate": 0.0003677888888888889,
+      "loss": 3.0622,
+      "step": 16900
+    },
+    {
+      "epoch": 2.4086968878783575,
+      "grad_norm": 1.7561144828796387,
+      "learning_rate": 0.0003672333333333334,
+      "loss": 3.0488,
+      "step": 16950
+    },
+    {
+      "epoch": 2.4158021884325707,
+      "grad_norm": 1.8787589073181152,
+      "learning_rate": 0.00036667777777777777,
+      "loss": 3.057,
+      "step": 17000
+    },
+    {
+      "epoch": 2.4229074889867843,
+      "grad_norm": 1.7158374786376953,
+      "learning_rate": 0.00036612222222222226,
+      "loss": 3.0378,
+      "step": 17050
+    },
+    {
+      "epoch": 2.4300127895409975,
+      "grad_norm": 1.8142306804656982,
+      "learning_rate": 0.00036556666666666664,
+      "loss": 3.0515,
+      "step": 17100
+    },
+    {
+      "epoch": 2.437118090095211,
+      "grad_norm": 1.5873689651489258,
+      "learning_rate": 0.00036501111111111114,
+      "loss": 3.0495,
+      "step": 17150
+    },
+    {
+      "epoch": 2.4442233906494244,
+      "grad_norm": 1.8045600652694702,
+      "learning_rate": 0.0003644555555555555,
+      "loss": 3.025,
+      "step": 17200
+    },
+    {
+      "epoch": 2.451328691203638,
+      "grad_norm": 1.6360293626785278,
+      "learning_rate": 0.0003639,
+      "loss": 3.0391,
+      "step": 17250
+    },
+    {
+      "epoch": 2.4584339917578513,
+      "grad_norm": 1.805924892425537,
+      "learning_rate": 0.0003633444444444445,
+      "loss": 3.028,
+      "step": 17300
+    },
+    {
+      "epoch": 2.465539292312065,
+      "grad_norm": 1.8847789764404297,
+      "learning_rate": 0.0003627888888888889,
+      "loss": 3.0413,
+      "step": 17350
+    },
+    {
+      "epoch": 2.472644592866278,
+      "grad_norm": 1.900485873222351,
+      "learning_rate": 0.0003622333333333334,
+      "loss": 3.023,
+      "step": 17400
+    },
+    {
+      "epoch": 2.479749893420492,
+      "grad_norm": 1.7782105207443237,
+      "learning_rate": 0.00036167777777777776,
+      "loss": 3.0128,
+      "step": 17450
+    },
+    {
+      "epoch": 2.486855193974705,
+      "grad_norm": 1.69002103805542,
+      "learning_rate": 0.00036112222222222225,
+      "loss": 3.0246,
+      "step": 17500
+    },
+    {
+      "epoch": 2.4939604945289187,
+      "grad_norm": 1.8271868228912354,
+      "learning_rate": 0.00036056666666666663,
+      "loss": 3.0155,
+      "step": 17550
+    },
+    {
+      "epoch": 2.501065795083132,
+      "grad_norm": 1.7090950012207031,
+      "learning_rate": 0.0003600111111111111,
+      "loss": 3.0339,
+      "step": 17600
+    },
+    {
+      "epoch": 2.5081710956373455,
+      "grad_norm": 1.7666364908218384,
+      "learning_rate": 0.00035945555555555556,
+      "loss": 3.0362,
+      "step": 17650
+    },
+    {
+      "epoch": 2.5152763961915587,
+      "grad_norm": 1.8835707902908325,
+      "learning_rate": 0.0003589,
+      "loss": 3.0236,
+      "step": 17700
+    },
+    {
+      "epoch": 2.5223816967457724,
+      "grad_norm": 1.8882673978805542,
+      "learning_rate": 0.0003583444444444445,
+      "loss": 3.0096,
+      "step": 17750
+    },
+    {
+      "epoch": 2.529486997299986,
+      "grad_norm": 1.7961220741271973,
+      "learning_rate": 0.00035778888888888887,
+      "loss": 3.0135,
+      "step": 17800
+    },
+    {
+      "epoch": 2.5365922978541993,
+      "grad_norm": 1.7673027515411377,
+      "learning_rate": 0.00035723333333333336,
+      "loss": 3.0151,
+      "step": 17850
+    },
+    {
+      "epoch": 2.5436975984084125,
+      "grad_norm": 1.7343908548355103,
+      "learning_rate": 0.00035667777777777774,
+      "loss": 3.0174,
+      "step": 17900
+    },
+    {
+      "epoch": 2.550802898962626,
+      "grad_norm": 1.7693558931350708,
+      "learning_rate": 0.00035612222222222223,
+      "loss": 3.0008,
+      "step": 17950
+    },
+    {
+      "epoch": 2.55790819951684,
+      "grad_norm": 1.762675404548645,
+      "learning_rate": 0.00035556666666666667,
+      "loss": 2.986,
+      "step": 18000
+    },
+    {
+      "epoch": 2.55790819951684,
+      "eval_accuracy": 0.4936477839946747,
+      "eval_loss": 2.9375457763671875,
+      "eval_runtime": 1.4066,
+      "eval_samples_per_second": 2673.043,
+      "eval_steps_per_second": 41.944,
+      "step": 18000
+    },
+    {
+      "epoch": 2.565013500071053,
+      "grad_norm": 1.7804518938064575,
+      "learning_rate": 0.0003550111111111111,
+      "loss": 3.0187,
+      "step": 18050
+    },
+    {
+      "epoch": 2.572118800625266,
+      "grad_norm": 1.7693209648132324,
+      "learning_rate": 0.0003544555555555556,
+      "loss": 3.011,
+      "step": 18100
+    },
+    {
+      "epoch": 2.57922410117948,
+      "grad_norm": 1.8292006254196167,
+      "learning_rate": 0.0003539,
+      "loss": 3.0151,
+      "step": 18150
+    },
+    {
+      "epoch": 2.5863294017336935,
+      "grad_norm": 1.659195065498352,
+      "learning_rate": 0.0003533444444444445,
+      "loss": 2.9828,
+      "step": 18200
+    },
+    {
+      "epoch": 2.5934347022879067,
+      "grad_norm": 1.8265076875686646,
+      "learning_rate": 0.00035278888888888886,
+      "loss": 3.0014,
+      "step": 18250
+    },
+    {
+      "epoch": 2.60054000284212,
+      "grad_norm": 1.7613290548324585,
+      "learning_rate": 0.00035223333333333335,
+      "loss": 2.9924,
+      "step": 18300
+    },
+    {
+      "epoch": 2.6076453033963336,
+      "grad_norm": 1.8945201635360718,
+      "learning_rate": 0.0003516777777777778,
+      "loss": 2.9883,
+      "step": 18350
+    },
+    {
+      "epoch": 2.6147506039505473,
+      "grad_norm": 1.6011463403701782,
+      "learning_rate": 0.0003511222222222222,
+      "loss": 2.9815,
+      "step": 18400
+    },
+    {
+      "epoch": 2.6218559045047605,
+      "grad_norm": 1.730685830116272,
+      "learning_rate": 0.0003505666666666667,
+      "loss": 3.0054,
+      "step": 18450
+    },
+    {
+      "epoch": 2.628961205058974,
+      "grad_norm": 1.7386492490768433,
+      "learning_rate": 0.0003500111111111111,
+      "loss": 2.9788,
+      "step": 18500
+    },
+    {
+      "epoch": 2.6360665056131873,
+      "grad_norm": 1.8171436786651611,
+      "learning_rate": 0.0003494555555555556,
+      "loss": 2.9997,
+      "step": 18550
+    },
+    {
+      "epoch": 2.643171806167401,
+      "grad_norm": 2.123448610305786,
+      "learning_rate": 0.00034889999999999997,
+      "loss": 3.014,
+      "step": 18600
+    },
+    {
+      "epoch": 2.650277106721614,
+      "grad_norm": 1.7145969867706299,
+      "learning_rate": 0.00034834444444444446,
+      "loss": 2.9728,
+      "step": 18650
+    },
+    {
+      "epoch": 2.657382407275828,
+      "grad_norm": 1.7777656316757202,
+      "learning_rate": 0.0003477888888888889,
+      "loss": 2.9886,
+      "step": 18700
+    },
+    {
+      "epoch": 2.664487707830041,
+      "grad_norm": 1.7456960678100586,
+      "learning_rate": 0.00034723333333333333,
+      "loss": 2.9896,
+      "step": 18750
+    },
+    {
+      "epoch": 2.6715930083842547,
+      "grad_norm": 1.6129354238510132,
+      "learning_rate": 0.00034667777777777777,
+      "loss": 2.9732,
+      "step": 18800
+    },
+    {
+      "epoch": 2.678698308938468,
+      "grad_norm": 1.6911518573760986,
+      "learning_rate": 0.0003461222222222222,
+      "loss": 2.9867,
+      "step": 18850
+    },
+    {
+      "epoch": 2.6858036094926816,
+      "grad_norm": 1.7017191648483276,
+      "learning_rate": 0.0003455666666666667,
+      "loss": 2.97,
+      "step": 18900
+    },
+    {
+      "epoch": 2.692908910046895,
+      "grad_norm": 1.6771681308746338,
+      "learning_rate": 0.0003450111111111111,
+      "loss": 2.9767,
+      "step": 18950
+    },
+    {
+      "epoch": 2.7000142106011085,
+      "grad_norm": 1.8211736679077148,
+      "learning_rate": 0.0003444555555555556,
+      "loss": 3.0055,
+      "step": 19000
+    },
+    {
+      "epoch": 2.707119511155322,
+      "grad_norm": 1.8175971508026123,
+      "learning_rate": 0.0003439,
+      "loss": 2.9567,
+      "step": 19050
+    },
+    {
+      "epoch": 2.7142248117095353,
+      "grad_norm": 1.8108701705932617,
+      "learning_rate": 0.00034334444444444445,
+      "loss": 2.9767,
+      "step": 19100
+    },
+    {
+      "epoch": 2.7213301122637485,
+      "grad_norm": 1.737069845199585,
+      "learning_rate": 0.0003427888888888889,
+      "loss": 2.9864,
+      "step": 19150
+    },
+    {
+      "epoch": 2.728435412817962,
+      "grad_norm": 1.6564186811447144,
+      "learning_rate": 0.0003422333333333333,
+      "loss": 2.9684,
+      "step": 19200
+    },
+    {
+      "epoch": 2.735540713372176,
+      "grad_norm": 1.7465981245040894,
+      "learning_rate": 0.0003416777777777778,
+      "loss": 2.9774,
+      "step": 19250
+    },
+    {
+      "epoch": 2.742646013926389,
+      "grad_norm": 1.7666462659835815,
+      "learning_rate": 0.0003411222222222222,
+      "loss": 2.9737,
+      "step": 19300
+    },
+    {
+      "epoch": 2.7497513144806023,
+      "grad_norm": 1.7104542255401611,
+      "learning_rate": 0.0003405666666666667,
+      "loss": 2.9636,
+      "step": 19350
+    },
+    {
+      "epoch": 2.756856615034816,
+      "grad_norm": 1.6974040269851685,
+      "learning_rate": 0.0003400111111111111,
+      "loss": 2.96,
+      "step": 19400
+    },
+    {
+      "epoch": 2.7639619155890296,
+      "grad_norm": 1.7548801898956299,
+      "learning_rate": 0.00033945555555555556,
+      "loss": 2.9621,
+      "step": 19450
+    },
+    {
+      "epoch": 2.771067216143243,
+      "grad_norm": 1.6098867654800415,
+      "learning_rate": 0.0003389,
+      "loss": 2.9714,
+      "step": 19500
+    },
+    {
+      "epoch": 2.7781725166974565,
+      "grad_norm": 1.7277166843414307,
+      "learning_rate": 0.00033834444444444443,
+      "loss": 2.9735,
+      "step": 19550
+    },
+    {
+      "epoch": 2.7852778172516697,
+      "grad_norm": 1.7835525274276733,
+      "learning_rate": 0.0003377888888888889,
+      "loss": 2.9882,
+      "step": 19600
+    },
+    {
+      "epoch": 2.7923831178058833,
+      "grad_norm": 1.6993199586868286,
+      "learning_rate": 0.00033723333333333336,
+      "loss": 2.9597,
+      "step": 19650
+    },
+    {
+      "epoch": 2.7994884183600965,
+      "grad_norm": 1.764560580253601,
+      "learning_rate": 0.0003366777777777778,
+      "loss": 2.9507,
+      "step": 19700
+    },
+    {
+      "epoch": 2.80659371891431,
+      "grad_norm": 1.816872000694275,
+      "learning_rate": 0.00033612222222222224,
+      "loss": 2.9655,
+      "step": 19750
+    },
+    {
+      "epoch": 2.8136990194685234,
+      "grad_norm": 1.8321980237960815,
+      "learning_rate": 0.0003355666666666667,
+      "loss": 2.9932,
+      "step": 19800
+    },
+    {
+      "epoch": 2.820804320022737,
+      "grad_norm": 1.8136756420135498,
+      "learning_rate": 0.0003350111111111111,
+      "loss": 2.9514,
+      "step": 19850
+    },
+    {
+      "epoch": 2.8279096205769503,
+      "grad_norm": 1.7299060821533203,
+      "learning_rate": 0.00033445555555555555,
+      "loss": 2.9607,
+      "step": 19900
+    },
+    {
+      "epoch": 2.835014921131164,
+      "grad_norm": 1.6662367582321167,
+      "learning_rate": 0.0003339,
+      "loss": 2.941,
+      "step": 19950
+    },
+    {
+      "epoch": 2.842120221685377,
+      "grad_norm": 1.646530032157898,
+      "learning_rate": 0.0003333444444444445,
+      "loss": 2.9477,
+      "step": 20000
+    },
+    {
+      "epoch": 2.842120221685377,
+      "eval_accuracy": 0.49664175510406494,
+      "eval_loss": 2.882197141647339,
+      "eval_runtime": 1.406,
+      "eval_samples_per_second": 2674.307,
+      "eval_steps_per_second": 41.964,
+      "step": 20000
+    },
+    {
+      "epoch": 2.849225522239591,
+      "grad_norm": 1.721617579460144,
+      "learning_rate": 0.0003327888888888889,
+      "loss": 2.9421,
+      "step": 20050
+    },
+    {
+      "epoch": 2.856330822793804,
+      "grad_norm": 1.8738772869110107,
+      "learning_rate": 0.00033223333333333335,
+      "loss": 2.9624,
+      "step": 20100
+    },
+    {
+      "epoch": 2.8634361233480177,
+      "grad_norm": 1.7513505220413208,
+      "learning_rate": 0.0003316777777777778,
+      "loss": 2.9429,
+      "step": 20150
+    },
+    {
+      "epoch": 2.870541423902231,
+      "grad_norm": 1.9003950357437134,
+      "learning_rate": 0.0003311222222222222,
+      "loss": 2.9538,
+      "step": 20200
+    },
+    {
+      "epoch": 2.8776467244564445,
+      "grad_norm": 1.6538949012756348,
+      "learning_rate": 0.00033056666666666666,
+      "loss": 2.9437,
+      "step": 20250
+    },
+    {
+      "epoch": 2.884752025010658,
+      "grad_norm": 1.7895572185516357,
+      "learning_rate": 0.0003300111111111111,
+      "loss": 2.9445,
+      "step": 20300
+    },
+    {
+      "epoch": 2.8918573255648714,
+      "grad_norm": 1.6757142543792725,
+      "learning_rate": 0.0003294555555555556,
+      "loss": 2.9393,
+      "step": 20350
+    },
+    {
+      "epoch": 2.8989626261190846,
+      "grad_norm": 1.717271327972412,
+      "learning_rate": 0.0003289,
+      "loss": 2.9467,
+      "step": 20400
+    },
+    {
+      "epoch": 2.9060679266732983,
+      "grad_norm": 1.7182637453079224,
+      "learning_rate": 0.00032834444444444446,
+      "loss": 2.9295,
+      "step": 20450
+    },
+    {
+      "epoch": 2.913173227227512,
+      "grad_norm": 1.770296573638916,
+      "learning_rate": 0.0003277888888888889,
+      "loss": 2.9355,
+      "step": 20500
+    },
+    {
+      "epoch": 2.920278527781725,
+      "grad_norm": 1.6502301692962646,
+      "learning_rate": 0.00032723333333333334,
+      "loss": 2.9241,
+      "step": 20550
+    },
+    {
+      "epoch": 2.9273838283359384,
+      "grad_norm": 1.689746618270874,
+      "learning_rate": 0.0003266777777777778,
+      "loss": 2.9195,
+      "step": 20600
+    },
+    {
+      "epoch": 2.934489128890152,
+      "grad_norm": 1.8485779762268066,
+      "learning_rate": 0.0003261222222222222,
+      "loss": 2.934,
+      "step": 20650
+    },
+    {
+      "epoch": 2.9415944294443657,
+      "grad_norm": 1.8438777923583984,
+      "learning_rate": 0.0003255666666666667,
+      "loss": 2.9208,
+      "step": 20700
+    },
+    {
+      "epoch": 2.948699729998579,
+      "grad_norm": 1.8864026069641113,
+      "learning_rate": 0.0003250111111111111,
+      "loss": 2.9299,
+      "step": 20750
+    },
+    {
+      "epoch": 2.9558050305527925,
+      "grad_norm": 1.799882411956787,
+      "learning_rate": 0.0003244555555555556,
+      "loss": 2.9306,
+      "step": 20800
+    },
+    {
+      "epoch": 2.9629103311070057,
+      "grad_norm": 1.7453547716140747,
+      "learning_rate": 0.0003239,
+      "loss": 2.9329,
+      "step": 20850
+    },
+    {
+      "epoch": 2.9700156316612194,
+      "grad_norm": 1.6310656070709229,
+      "learning_rate": 0.00032334444444444445,
+      "loss": 2.9027,
+      "step": 20900
+    },
+    {
+      "epoch": 2.9771209322154326,
+      "grad_norm": 1.8118422031402588,
+      "learning_rate": 0.0003227888888888889,
+      "loss": 2.925,
+      "step": 20950
+    },
+    {
+      "epoch": 2.9842262327696463,
+      "grad_norm": 1.7698137760162354,
+      "learning_rate": 0.0003222333333333333,
+      "loss": 2.9347,
+      "step": 21000
+    },
+    {
+      "epoch": 2.9913315333238595,
+      "grad_norm": 1.7710407972335815,
+      "learning_rate": 0.0003216777777777778,
+      "loss": 2.9159,
+      "step": 21050
+    },
+    {
+      "epoch": 2.998436833878073,
+      "grad_norm": 1.613924264907837,
+      "learning_rate": 0.0003211222222222222,
+      "loss": 2.9188,
+      "step": 21100
+    },
+    {
+      "epoch": 3.0055421344322863,
+      "grad_norm": 1.8230890035629272,
+      "learning_rate": 0.0003205666666666667,
+      "loss": 2.8801,
+      "step": 21150
+    },
+    {
+      "epoch": 3.0126474349865,
+      "grad_norm": 1.745085597038269,
+      "learning_rate": 0.0003200111111111111,
+      "loss": 2.8948,
+      "step": 21200
+    },
+    {
+      "epoch": 3.019752735540713,
+      "grad_norm": 1.6332448720932007,
+      "learning_rate": 0.00031945555555555556,
+      "loss": 2.8926,
+      "step": 21250
+    },
+    {
+      "epoch": 3.026858036094927,
+      "grad_norm": 1.6312452554702759,
+      "learning_rate": 0.0003189,
+      "loss": 2.9057,
+      "step": 21300
+    },
+    {
+      "epoch": 3.03396333664914,
+      "grad_norm": 1.7664523124694824,
+      "learning_rate": 0.00031834444444444444,
+      "loss": 2.9123,
+      "step": 21350
+    },
+    {
+      "epoch": 3.0410686372033537,
+      "grad_norm": 1.8374122381210327,
+      "learning_rate": 0.00031778888888888893,
+      "loss": 2.9046,
+      "step": 21400
+    },
+    {
+      "epoch": 3.048173937757567,
+      "grad_norm": 1.686972737312317,
+      "learning_rate": 0.0003172333333333333,
+      "loss": 2.9136,
+      "step": 21450
+    },
+    {
+      "epoch": 3.0552792383117806,
+      "grad_norm": 1.7806686162948608,
+      "learning_rate": 0.0003166777777777778,
+      "loss": 2.867,
+      "step": 21500
+    },
+    {
+      "epoch": 3.062384538865994,
+      "grad_norm": 1.7213020324707031,
+      "learning_rate": 0.00031612222222222224,
+      "loss": 2.8885,
+      "step": 21550
+    },
+    {
+      "epoch": 3.0694898394202075,
+      "grad_norm": 1.659408688545227,
+      "learning_rate": 0.0003155666666666667,
+      "loss": 2.8824,
+      "step": 21600
+    },
+    {
+      "epoch": 3.0765951399744207,
+      "grad_norm": 1.71113121509552,
+      "learning_rate": 0.0003150111111111111,
+      "loss": 2.9071,
+      "step": 21650
+    },
+    {
+      "epoch": 3.0837004405286343,
+      "grad_norm": 1.6978799104690552,
+      "learning_rate": 0.00031445555555555555,
+      "loss": 2.9053,
+      "step": 21700
+    },
+    {
+      "epoch": 3.090805741082848,
+      "grad_norm": 1.7844059467315674,
+      "learning_rate": 0.00031390000000000004,
+      "loss": 2.8661,
+      "step": 21750
+    },
+    {
+      "epoch": 3.097911041637061,
+      "grad_norm": 1.7128864526748657,
+      "learning_rate": 0.0003133444444444444,
+      "loss": 2.8892,
+      "step": 21800
+    },
+    {
+      "epoch": 3.105016342191275,
+      "grad_norm": 1.7532225847244263,
+      "learning_rate": 0.0003127888888888889,
+      "loss": 2.9181,
+      "step": 21850
+    },
+    {
+      "epoch": 3.112121642745488,
+      "grad_norm": 1.7945367097854614,
+      "learning_rate": 0.0003122333333333333,
+      "loss": 2.8686,
+      "step": 21900
+    },
+    {
+      "epoch": 3.1192269432997017,
+      "grad_norm": 1.6655150651931763,
+      "learning_rate": 0.0003116777777777778,
+      "loss": 2.8604,
+      "step": 21950
+    },
+    {
+      "epoch": 3.126332243853915,
+      "grad_norm": 1.6884666681289673,
+      "learning_rate": 0.0003111222222222222,
+      "loss": 2.9021,
+      "step": 22000
+    },
+    {
+      "epoch": 3.126332243853915,
+      "eval_accuracy": 0.5131886601448059,
+      "eval_loss": 2.801339864730835,
+      "eval_runtime": 1.3759,
+      "eval_samples_per_second": 2732.73,
+      "eval_steps_per_second": 42.881,
+      "step": 22000
+    },
+    {
+      "epoch": 3.1334375444081286,
+      "grad_norm": 1.801597237586975,
+      "learning_rate": 0.00031056666666666666,
+      "loss": 2.889,
+      "step": 22050
+    },
+    {
+      "epoch": 3.140542844962342,
+      "grad_norm": 1.6757820844650269,
+      "learning_rate": 0.00031001111111111115,
+      "loss": 2.8707,
+      "step": 22100
+    },
+    {
+      "epoch": 3.1476481455165555,
+      "grad_norm": 1.6857764720916748,
+      "learning_rate": 0.00030945555555555554,
+      "loss": 2.877,
+      "step": 22150
+    },
+    {
+      "epoch": 3.1547534460707687,
+      "grad_norm": 1.7177927494049072,
+      "learning_rate": 0.00030890000000000003,
+      "loss": 2.8937,
+      "step": 22200
+    },
+    {
+      "epoch": 3.1618587466249823,
+      "grad_norm": 1.7618646621704102,
+      "learning_rate": 0.0003083444444444444,
+      "loss": 2.8927,
+      "step": 22250
+    },
+    {
+      "epoch": 3.1689640471791956,
+      "grad_norm": 1.862821102142334,
+      "learning_rate": 0.0003077888888888889,
+      "loss": 2.8737,
+      "step": 22300
+    },
+    {
+      "epoch": 3.176069347733409,
+      "grad_norm": 1.6600791215896606,
+      "learning_rate": 0.00030723333333333334,
+      "loss": 2.8844,
+      "step": 22350
+    },
+    {
+      "epoch": 3.1831746482876224,
+      "grad_norm": 1.715598702430725,
+      "learning_rate": 0.0003066777777777778,
+      "loss": 2.8733,
+      "step": 22400
+    },
+    {
+      "epoch": 3.190279948841836,
+      "grad_norm": 1.615591049194336,
+      "learning_rate": 0.00030612222222222227,
+      "loss": 2.8793,
+      "step": 22450
+    },
+    {
+      "epoch": 3.1973852493960493,
+      "grad_norm": 1.6799874305725098,
+      "learning_rate": 0.00030556666666666665,
+      "loss": 2.8833,
+      "step": 22500
+    },
+    {
+      "epoch": 3.204490549950263,
+      "grad_norm": 1.8471604585647583,
+      "learning_rate": 0.00030501111111111114,
+      "loss": 2.8819,
+      "step": 22550
+    },
+    {
+      "epoch": 3.211595850504476,
+      "grad_norm": 1.721903681755066,
+      "learning_rate": 0.0003044555555555555,
+      "loss": 2.8663,
+      "step": 22600
+    },
+    {
+      "epoch": 3.21870115105869,
+      "grad_norm": 1.7450604438781738,
+      "learning_rate": 0.0003039,
+      "loss": 2.8662,
+      "step": 22650
+    },
+    {
+      "epoch": 3.225806451612903,
+      "grad_norm": 1.7081820964813232,
+      "learning_rate": 0.0003033444444444445,
+      "loss": 2.8641,
+      "step": 22700
+    },
+    {
+      "epoch": 3.2329117521671167,
+      "grad_norm": 1.834999918937683,
+      "learning_rate": 0.0003027888888888889,
+      "loss": 2.862,
+      "step": 22750
+    },
+    {
+      "epoch": 3.2400170527213303,
+      "grad_norm": 1.6457868814468384,
+      "learning_rate": 0.0003022333333333334,
+      "loss": 2.847,
+      "step": 22800
+    },
+    {
+      "epoch": 3.2471223532755435,
+      "grad_norm": 1.613499641418457,
+      "learning_rate": 0.00030167777777777776,
+      "loss": 2.8755,
+      "step": 22850
+    },
+    {
+      "epoch": 3.2542276538297568,
+      "grad_norm": 1.6897544860839844,
+      "learning_rate": 0.00030112222222222225,
+      "loss": 2.8551,
+      "step": 22900
+    },
+    {
+      "epoch": 3.2613329543839704,
+      "grad_norm": 1.7230019569396973,
+      "learning_rate": 0.00030056666666666664,
+      "loss": 2.8636,
+      "step": 22950
+    },
+    {
+      "epoch": 3.268438254938184,
+      "grad_norm": 1.8159016370773315,
+      "learning_rate": 0.00030001111111111113,
+      "loss": 2.8669,
+      "step": 23000
+    },
+    {
+      "epoch": 3.2755435554923973,
+      "grad_norm": 1.6577616930007935,
+      "learning_rate": 0.0002994555555555555,
+      "loss": 2.8526,
+      "step": 23050
+    },
+    {
+      "epoch": 3.282648856046611,
+      "grad_norm": 1.6508777141571045,
+      "learning_rate": 0.0002989,
+      "loss": 2.8717,
+      "step": 23100
+    },
+    {
+      "epoch": 3.289754156600824,
+      "grad_norm": 1.630266785621643,
+      "learning_rate": 0.0002983444444444445,
+      "loss": 2.8593,
+      "step": 23150
+    },
+    {
+      "epoch": 3.296859457155038,
+      "grad_norm": 1.7836819887161255,
+      "learning_rate": 0.0002977888888888889,
+      "loss": 2.8409,
+      "step": 23200
+    },
+    {
+      "epoch": 3.303964757709251,
+      "grad_norm": 1.7919524908065796,
+      "learning_rate": 0.00029723333333333337,
+      "loss": 2.8644,
+      "step": 23250
+    },
+    {
+      "epoch": 3.3110700582634647,
+      "grad_norm": 1.8404020071029663,
+      "learning_rate": 0.00029667777777777775,
+      "loss": 2.856,
+      "step": 23300
+    },
+    {
+      "epoch": 3.318175358817678,
+      "grad_norm": 1.913402795791626,
+      "learning_rate": 0.00029612222222222224,
+      "loss": 2.8724,
+      "step": 23350
+    },
+    {
+      "epoch": 3.3252806593718915,
+      "grad_norm": 1.5555598735809326,
+      "learning_rate": 0.0002955666666666667,
+      "loss": 2.834,
+      "step": 23400
+    },
+    {
+      "epoch": 3.3323859599261048,
+      "grad_norm": 1.6552711725234985,
+      "learning_rate": 0.0002950111111111111,
+      "loss": 2.8363,
+      "step": 23450
+    },
+    {
+      "epoch": 3.3394912604803184,
+      "grad_norm": 1.7023571729660034,
+      "learning_rate": 0.0002944555555555556,
+      "loss": 2.8586,
+      "step": 23500
+    },
+    {
+      "epoch": 3.3465965610345316,
+      "grad_norm": 1.8556574583053589,
+      "learning_rate": 0.0002939,
+      "loss": 2.8384,
+      "step": 23550
+    },
+    {
+      "epoch": 3.3537018615887453,
+      "grad_norm": 1.8642303943634033,
+      "learning_rate": 0.0002933444444444445,
+      "loss": 2.8554,
+      "step": 23600
+    },
+    {
+      "epoch": 3.3608071621429585,
+      "grad_norm": 1.8594276905059814,
+      "learning_rate": 0.00029278888888888886,
+      "loss": 2.8562,
+      "step": 23650
+    },
+    {
+      "epoch": 3.367912462697172,
+      "grad_norm": 1.7641949653625488,
+      "learning_rate": 0.00029223333333333335,
+      "loss": 2.8509,
+      "step": 23700
+    },
+    {
+      "epoch": 3.3750177632513854,
+      "grad_norm": 1.780263900756836,
+      "learning_rate": 0.0002916777777777778,
+      "loss": 2.8511,
+      "step": 23750
+    },
+    {
+      "epoch": 3.382123063805599,
+      "grad_norm": 1.7801567316055298,
+      "learning_rate": 0.00029112222222222223,
+      "loss": 2.8701,
+      "step": 23800
+    },
+    {
+      "epoch": 3.3892283643598122,
+      "grad_norm": 1.7215375900268555,
+      "learning_rate": 0.00029056666666666666,
+      "loss": 2.8586,
+      "step": 23850
+    },
+    {
+      "epoch": 3.396333664914026,
+      "grad_norm": 1.547910451889038,
+      "learning_rate": 0.0002900111111111111,
+      "loss": 2.843,
+      "step": 23900
+    },
+    {
+      "epoch": 3.403438965468239,
+      "grad_norm": 1.6910938024520874,
+      "learning_rate": 0.0002894555555555556,
+      "loss": 2.8203,
+      "step": 23950
+    },
+    {
+      "epoch": 3.4105442660224528,
+      "grad_norm": 1.791414499282837,
+      "learning_rate": 0.0002889,
+      "loss": 2.823,
+      "step": 24000
+    },
+    {
+      "epoch": 3.4105442660224528,
+      "eval_accuracy": 0.5083815455436707,
+      "eval_loss": 2.793198823928833,
+      "eval_runtime": 1.4287,
+      "eval_samples_per_second": 2631.689,
+      "eval_steps_per_second": 41.295,
+      "step": 24000
+    },
+    {
+      "epoch": 3.4176495665766664,
+      "grad_norm": 1.8300690650939941,
+      "learning_rate": 0.00028834444444444447,
+      "loss": 2.8477,
+      "step": 24050
+    },
+    {
+      "epoch": 3.4247548671308796,
+      "grad_norm": 1.8223072290420532,
+      "learning_rate": 0.0002877888888888889,
+      "loss": 2.7996,
+      "step": 24100
+    },
+    {
+      "epoch": 3.431860167685093,
+      "grad_norm": 1.781424880027771,
+      "learning_rate": 0.00028723333333333334,
+      "loss": 2.8303,
+      "step": 24150
+    },
+    {
+      "epoch": 3.4389654682393065,
+      "grad_norm": 1.6363815069198608,
+      "learning_rate": 0.0002866777777777778,
+      "loss": 2.852,
+      "step": 24200
+    },
+    {
+      "epoch": 3.44607076879352,
+      "grad_norm": 1.838799238204956,
+      "learning_rate": 0.0002861222222222222,
+      "loss": 2.8356,
+      "step": 24250
+    },
+    {
+      "epoch": 3.4531760693477334,
+      "grad_norm": 1.6546505689620972,
+      "learning_rate": 0.0002855666666666667,
+      "loss": 2.8423,
+      "step": 24300
+    },
+    {
+      "epoch": 3.460281369901947,
+      "grad_norm": 1.6957730054855347,
+      "learning_rate": 0.0002850111111111111,
+      "loss": 2.8331,
+      "step": 24350
+    },
+    {
+      "epoch": 3.4673866704561602,
+      "grad_norm": 1.7324293851852417,
+      "learning_rate": 0.0002844555555555556,
+      "loss": 2.8258,
+      "step": 24400
+    },
+    {
+      "epoch": 3.474491971010374,
+      "grad_norm": 1.7163538932800293,
+      "learning_rate": 0.0002839,
+      "loss": 2.8265,
+      "step": 24450
+    },
+    {
+      "epoch": 3.481597271564587,
+      "grad_norm": 1.7319365739822388,
+      "learning_rate": 0.00028334444444444445,
+      "loss": 2.8256,
+      "step": 24500
+    },
+    {
+      "epoch": 3.4887025721188007,
+      "grad_norm": 2.031334161758423,
+      "learning_rate": 0.0002827888888888889,
+      "loss": 2.8359,
+      "step": 24550
+    },
+    {
+      "epoch": 3.495807872673014,
+      "grad_norm": 1.750105857849121,
+      "learning_rate": 0.0002822333333333333,
+      "loss": 2.8361,
+      "step": 24600
+    },
+    {
+      "epoch": 3.5029131732272276,
+      "grad_norm": 1.8582334518432617,
+      "learning_rate": 0.0002816777777777778,
+      "loss": 2.8611,
+      "step": 24650
+    },
+    {
+      "epoch": 3.510018473781441,
+      "grad_norm": 1.8061821460723877,
+      "learning_rate": 0.0002811222222222222,
+      "loss": 2.8137,
+      "step": 24700
+    },
+    {
+      "epoch": 3.5171237743356545,
+      "grad_norm": 1.7302175760269165,
+      "learning_rate": 0.0002805666666666667,
+      "loss": 2.8224,
+      "step": 24750
+    },
+    {
+      "epoch": 3.5242290748898677,
+      "grad_norm": 1.7751331329345703,
+      "learning_rate": 0.00028001111111111113,
+      "loss": 2.8508,
+      "step": 24800
+    },
+    {
+      "epoch": 3.5313343754440814,
+      "grad_norm": 1.7023547887802124,
+      "learning_rate": 0.00027945555555555557,
+      "loss": 2.8283,
+      "step": 24850
+    },
+    {
+      "epoch": 3.5384396759982946,
+      "grad_norm": 1.6634979248046875,
+      "learning_rate": 0.0002789,
+      "loss": 2.8343,
+      "step": 24900
+    },
+    {
+      "epoch": 3.545544976552508,
+      "grad_norm": 1.8010696172714233,
+      "learning_rate": 0.00027834444444444444,
+      "loss": 2.8175,
+      "step": 24950
+    },
+    {
+      "epoch": 3.5526502771067214,
+      "grad_norm": 1.7478526830673218,
+      "learning_rate": 0.0002777888888888889,
+      "loss": 2.832,
+      "step": 25000
+    },
+    {
+      "epoch": 3.559755577660935,
+      "grad_norm": 1.8158820867538452,
+      "learning_rate": 0.0002772333333333333,
+      "loss": 2.8119,
+      "step": 25050
+    },
+    {
+      "epoch": 3.5668608782151487,
+      "grad_norm": 1.8094425201416016,
+      "learning_rate": 0.0002766777777777778,
+      "loss": 2.8173,
+      "step": 25100
+    },
+    {
+      "epoch": 3.573966178769362,
+      "grad_norm": 1.6954437494277954,
+      "learning_rate": 0.00027612222222222224,
+      "loss": 2.8067,
+      "step": 25150
+    },
+    {
+      "epoch": 3.581071479323575,
+      "grad_norm": 1.9532020092010498,
+      "learning_rate": 0.0002755666666666667,
+      "loss": 2.8207,
+      "step": 25200
+    },
+    {
+      "epoch": 3.588176779877789,
+      "grad_norm": 1.652669072151184,
+      "learning_rate": 0.0002750111111111111,
+      "loss": 2.7992,
+      "step": 25250
+    },
+    {
+      "epoch": 3.5952820804320025,
+      "grad_norm": 1.7369633913040161,
+      "learning_rate": 0.00027445555555555555,
+      "loss": 2.8382,
+      "step": 25300
+    },
+    {
+      "epoch": 3.6023873809862157,
+      "grad_norm": 1.5968304872512817,
+      "learning_rate": 0.0002739,
+      "loss": 2.8245,
+      "step": 25350
+    },
+    {
+      "epoch": 3.609492681540429,
+      "grad_norm": 1.791428565979004,
+      "learning_rate": 0.0002733444444444444,
+      "loss": 2.8243,
+      "step": 25400
+    },
+    {
+      "epoch": 3.6165979820946426,
+      "grad_norm": 1.7652702331542969,
+      "learning_rate": 0.0002727888888888889,
+      "loss": 2.8047,
+      "step": 25450
+    },
+    {
+      "epoch": 3.623703282648856,
+      "grad_norm": 1.7601053714752197,
+      "learning_rate": 0.00027223333333333335,
+      "loss": 2.8247,
+      "step": 25500
+    },
+    {
+      "epoch": 3.6308085832030694,
+      "grad_norm": 1.7859609127044678,
+      "learning_rate": 0.0002716777777777778,
+      "loss": 2.8192,
+      "step": 25550
+    },
+    {
+      "epoch": 3.637913883757283,
+      "grad_norm": 1.5694724321365356,
+      "learning_rate": 0.00027112222222222223,
+      "loss": 2.8291,
+      "step": 25600
+    },
+    {
+      "epoch": 3.6450191843114963,
+      "grad_norm": 1.839003324508667,
+      "learning_rate": 0.00027056666666666667,
+      "loss": 2.8119,
+      "step": 25650
+    },
+    {
+      "epoch": 3.65212448486571,
+      "grad_norm": 1.8402965068817139,
+      "learning_rate": 0.0002700111111111111,
+      "loss": 2.8155,
+      "step": 25700
+    },
+    {
+      "epoch": 3.659229785419923,
+      "grad_norm": 1.7180988788604736,
+      "learning_rate": 0.00026945555555555554,
+      "loss": 2.7935,
+      "step": 25750
+    },
+    {
+      "epoch": 3.666335085974137,
+      "grad_norm": 1.6569797992706299,
+      "learning_rate": 0.0002689,
+      "loss": 2.8104,
+      "step": 25800
+    },
+    {
+      "epoch": 3.67344038652835,
+      "grad_norm": 1.7790822982788086,
+      "learning_rate": 0.00026834444444444447,
+      "loss": 2.7944,
+      "step": 25850
+    },
+    {
+      "epoch": 3.6805456870825637,
+      "grad_norm": 1.7186955213546753,
+      "learning_rate": 0.0002677888888888889,
+      "loss": 2.7895,
+      "step": 25900
+    },
+    {
+      "epoch": 3.687650987636777,
+      "grad_norm": 1.7662361860275269,
+      "learning_rate": 0.00026723333333333334,
+      "loss": 2.8101,
+      "step": 25950
+    },
+    {
+      "epoch": 3.6947562881909906,
+      "grad_norm": 1.7198424339294434,
+      "learning_rate": 0.0002666777777777778,
+      "loss": 2.7967,
+      "step": 26000
+    },
+    {
+      "epoch": 3.6947562881909906,
+      "eval_accuracy": 0.5110318660736084,
+      "eval_loss": 2.814082384109497,
+      "eval_runtime": 1.4763,
+      "eval_samples_per_second": 2546.889,
+      "eval_steps_per_second": 39.964,
+      "step": 26000
+    },
+    {
+      "epoch": 3.7018615887452038,
+      "grad_norm": 1.8493636846542358,
+      "learning_rate": 0.0002661222222222222,
+      "loss": 2.7912,
+      "step": 26050
+    },
+    {
+      "epoch": 3.7089668892994174,
+      "grad_norm": 1.9021114110946655,
+      "learning_rate": 0.00026556666666666665,
+      "loss": 2.7961,
+      "step": 26100
+    },
+    {
+      "epoch": 3.716072189853631,
+      "grad_norm": 1.756255030632019,
+      "learning_rate": 0.0002650111111111111,
+      "loss": 2.7835,
+      "step": 26150
+    },
+    {
+      "epoch": 3.7231774904078443,
+      "grad_norm": 1.7606886625289917,
+      "learning_rate": 0.0002644555555555556,
+      "loss": 2.8018,
+      "step": 26200
+    },
+    {
+      "epoch": 3.7302827909620575,
+      "grad_norm": 1.6332037448883057,
+      "learning_rate": 0.0002639,
+      "loss": 2.8003,
+      "step": 26250
+    },
+    {
+      "epoch": 3.737388091516271,
+      "grad_norm": 1.7682067155838013,
+      "learning_rate": 0.00026334444444444445,
+      "loss": 2.7961,
+      "step": 26300
+    },
+    {
+      "epoch": 3.744493392070485,
+      "grad_norm": 1.7425826787948608,
+      "learning_rate": 0.0002627888888888889,
+      "loss": 2.8068,
+      "step": 26350
+    },
+    {
+      "epoch": 3.751598692624698,
+      "grad_norm": 1.7267491817474365,
+      "learning_rate": 0.00026223333333333333,
+      "loss": 2.7612,
+      "step": 26400
+    },
+    {
+      "epoch": 3.7587039931789112,
+      "grad_norm": 1.5135743618011475,
+      "learning_rate": 0.00026167777777777777,
+      "loss": 2.7745,
+      "step": 26450
+    },
+    {
+      "epoch": 3.765809293733125,
+      "grad_norm": 1.8557426929473877,
+      "learning_rate": 0.0002611222222222222,
+      "loss": 2.7918,
+      "step": 26500
+    },
+    {
+      "epoch": 3.7729145942873386,
+      "grad_norm": 1.60994553565979,
+      "learning_rate": 0.0002605666666666667,
+      "loss": 2.7807,
+      "step": 26550
+    },
+    {
+      "epoch": 3.7800198948415518,
+      "grad_norm": 1.8855258226394653,
+      "learning_rate": 0.00026001111111111113,
+      "loss": 2.8044,
+      "step": 26600
+    },
+    {
+      "epoch": 3.787125195395765,
+      "grad_norm": 1.6651372909545898,
+      "learning_rate": 0.00025945555555555557,
+      "loss": 2.7805,
+      "step": 26650
+    },
+    {
+      "epoch": 3.7942304959499786,
+      "grad_norm": 1.8007607460021973,
+      "learning_rate": 0.0002589,
+      "loss": 2.7983,
+      "step": 26700
+    },
+    {
+      "epoch": 3.8013357965041923,
+      "grad_norm": 1.6439241170883179,
+      "learning_rate": 0.00025834444444444444,
+      "loss": 2.7896,
+      "step": 26750
+    },
+    {
+      "epoch": 3.8084410970584055,
+      "grad_norm": 1.8518551588058472,
+      "learning_rate": 0.00025778888888888893,
+      "loss": 2.756,
+      "step": 26800
+    },
+    {
+      "epoch": 3.815546397612619,
+      "grad_norm": 1.7910232543945312,
+      "learning_rate": 0.0002572333333333333,
+      "loss": 2.7825,
+      "step": 26850
+    },
+    {
+      "epoch": 3.8226516981668324,
+      "grad_norm": 1.678464651107788,
+      "learning_rate": 0.0002566777777777778,
+      "loss": 2.7798,
+      "step": 26900
+    },
+    {
+      "epoch": 3.829756998721046,
+      "grad_norm": 1.7954827547073364,
+      "learning_rate": 0.0002561222222222222,
+      "loss": 2.771,
+      "step": 26950
+    },
+    {
+      "epoch": 3.8368622992752592,
+      "grad_norm": 1.6391098499298096,
+      "learning_rate": 0.0002555666666666667,
+      "loss": 2.771,
+      "step": 27000
+    },
+    {
+      "epoch": 3.843967599829473,
+      "grad_norm": 1.8526887893676758,
+      "learning_rate": 0.0002550111111111111,
+      "loss": 2.7749,
+      "step": 27050
+    },
+    {
+      "epoch": 3.851072900383686,
+      "grad_norm": 1.9011675119400024,
+      "learning_rate": 0.00025445555555555555,
+      "loss": 2.7898,
+      "step": 27100
+    },
+    {
+      "epoch": 3.8581782009378998,
+      "grad_norm": 1.711075782775879,
+      "learning_rate": 0.00025390000000000005,
+      "loss": 2.7933,
+      "step": 27150
+    },
+    {
+      "epoch": 3.865283501492113,
+      "grad_norm": 1.9099894762039185,
+      "learning_rate": 0.00025334444444444443,
+      "loss": 2.7883,
+      "step": 27200
+    },
+    {
+      "epoch": 3.8723888020463266,
+      "grad_norm": 1.6617141962051392,
+      "learning_rate": 0.0002527888888888889,
+      "loss": 2.7768,
+      "step": 27250
+    },
+    {
+      "epoch": 3.87949410260054,
+      "grad_norm": 1.7626140117645264,
+      "learning_rate": 0.0002522333333333333,
+      "loss": 2.7703,
+      "step": 27300
+    },
+    {
+      "epoch": 3.8865994031547535,
+      "grad_norm": 1.8242555856704712,
+      "learning_rate": 0.0002516777777777778,
+      "loss": 2.7734,
+      "step": 27350
+    },
+    {
+      "epoch": 3.893704703708967,
+      "grad_norm": 1.8651511669158936,
+      "learning_rate": 0.00025112222222222223,
+      "loss": 2.7654,
+      "step": 27400
+    },
+    {
+      "epoch": 3.9008100042631804,
+      "grad_norm": 1.8385076522827148,
+      "learning_rate": 0.00025056666666666667,
+      "loss": 2.7626,
+      "step": 27450
+    },
+    {
+      "epoch": 3.9079153048173936,
+      "grad_norm": 1.7049630880355835,
+      "learning_rate": 0.00025001111111111116,
+      "loss": 2.7816,
+      "step": 27500
+    },
+    {
+      "epoch": 3.9150206053716072,
+      "grad_norm": 1.6070621013641357,
+      "learning_rate": 0.0002494555555555556,
+      "loss": 2.7825,
+      "step": 27550
+    },
+    {
+      "epoch": 3.922125905925821,
+      "grad_norm": 1.6320933103561401,
+      "learning_rate": 0.00024890000000000003,
+      "loss": 2.756,
+      "step": 27600
+    },
+    {
+      "epoch": 3.929231206480034,
+      "grad_norm": 1.8926113843917847,
+      "learning_rate": 0.00024834444444444447,
+      "loss": 2.779,
+      "step": 27650
+    },
+    {
+      "epoch": 3.9363365070342473,
+      "grad_norm": 1.6653364896774292,
+      "learning_rate": 0.0002477888888888889,
+      "loss": 2.8109,
+      "step": 27700
+    },
+    {
+      "epoch": 3.943441807588461,
+      "grad_norm": 1.8551247119903564,
+      "learning_rate": 0.00024723333333333334,
+      "loss": 2.75,
+      "step": 27750
+    },
+    {
+      "epoch": 3.9505471081426746,
+      "grad_norm": 1.8100675344467163,
+      "learning_rate": 0.0002466777777777778,
+      "loss": 2.7643,
+      "step": 27800
+    },
+    {
+      "epoch": 3.957652408696888,
+      "grad_norm": 1.6926288604736328,
+      "learning_rate": 0.0002461222222222222,
+      "loss": 2.7711,
+      "step": 27850
+    },
+    {
+      "epoch": 3.964757709251101,
+      "grad_norm": 1.6786293983459473,
+      "learning_rate": 0.00024556666666666665,
+      "loss": 2.7595,
+      "step": 27900
+    },
+    {
+      "epoch": 3.9718630098053147,
+      "grad_norm": 1.569153070449829,
+      "learning_rate": 0.00024501111111111115,
+      "loss": 2.7817,
+      "step": 27950
+    },
+    {
+      "epoch": 3.9789683103595284,
+      "grad_norm": 1.7269905805587769,
+      "learning_rate": 0.0002444555555555556,
+      "loss": 2.7554,
+      "step": 28000
+    },
+    {
+      "epoch": 3.9789683103595284,
+      "eval_accuracy": 0.5206505656242371,
+      "eval_loss": 2.7269434928894043,
+      "eval_runtime": 1.313,
+      "eval_samples_per_second": 2863.767,
+      "eval_steps_per_second": 44.937,
+      "step": 28000
+    },
+    {
+      "epoch": 3.9860736109137416,
+      "grad_norm": 1.8113017082214355,
+      "learning_rate": 0.00024390000000000002,
+      "loss": 2.7633,
+      "step": 28050
+    },
+    {
+      "epoch": 3.9931789114679552,
+      "grad_norm": 1.878679871559143,
+      "learning_rate": 0.00024334444444444446,
+      "loss": 2.7855,
+      "step": 28100
+    },
+    {
+      "epoch": 4.000284212022168,
+      "grad_norm": 1.683408260345459,
+      "learning_rate": 0.0002427888888888889,
+      "loss": 2.751,
+      "step": 28150
+    },
+    {
+      "epoch": 4.007389512576382,
+      "grad_norm": 1.6192328929901123,
+      "learning_rate": 0.00024223333333333333,
+      "loss": 2.7363,
+      "step": 28200
+    },
+    {
+      "epoch": 4.014494813130596,
+      "grad_norm": 1.7787748575210571,
+      "learning_rate": 0.00024167777777777777,
+      "loss": 2.7267,
+      "step": 28250
+    },
+    {
+      "epoch": 4.0216001136848085,
+      "grad_norm": 1.8885560035705566,
+      "learning_rate": 0.0002411222222222222,
+      "loss": 2.7409,
+      "step": 28300
+    },
+    {
+      "epoch": 4.028705414239022,
+      "grad_norm": 1.8589296340942383,
+      "learning_rate": 0.0002405666666666667,
+      "loss": 2.7602,
+      "step": 28350
+    },
+    {
+      "epoch": 4.035810714793236,
+      "grad_norm": 1.9907305240631104,
+      "learning_rate": 0.00024001111111111113,
+      "loss": 2.7435,
+      "step": 28400
+    },
+    {
+      "epoch": 4.0429160153474495,
+      "grad_norm": 1.7080715894699097,
+      "learning_rate": 0.00023945555555555557,
+      "loss": 2.7307,
+      "step": 28450
+    },
+    {
+      "epoch": 4.050021315901662,
+      "grad_norm": 1.65744948387146,
+      "learning_rate": 0.0002389,
+      "loss": 2.7272,
+      "step": 28500
+    },
+    {
+      "epoch": 4.057126616455876,
+      "grad_norm": 1.6736458539962769,
+      "learning_rate": 0.00023834444444444444,
+      "loss": 2.737,
+      "step": 28550
+    },
+    {
+      "epoch": 4.06423191701009,
+      "grad_norm": 1.7677834033966064,
+      "learning_rate": 0.00023778888888888888,
+      "loss": 2.7256,
+      "step": 28600
+    },
+    {
+      "epoch": 4.071337217564303,
+      "grad_norm": 1.7060825824737549,
+      "learning_rate": 0.00023723333333333332,
+      "loss": 2.7578,
+      "step": 28650
+    },
+    {
+      "epoch": 4.078442518118516,
+      "grad_norm": 1.5596199035644531,
+      "learning_rate": 0.00023667777777777778,
+      "loss": 2.7446,
+      "step": 28700
+    },
+    {
+      "epoch": 4.08554781867273,
+      "grad_norm": 1.7688475847244263,
+      "learning_rate": 0.00023612222222222225,
+      "loss": 2.7451,
+      "step": 28750
+    },
+    {
+      "epoch": 4.092653119226943,
+      "grad_norm": 1.6317684650421143,
+      "learning_rate": 0.00023556666666666668,
+      "loss": 2.7385,
+      "step": 28800
+    },
+    {
+      "epoch": 4.099758419781157,
+      "grad_norm": 1.7691974639892578,
+      "learning_rate": 0.00023501111111111112,
+      "loss": 2.7098,
+      "step": 28850
+    },
+    {
+      "epoch": 4.106863720335371,
+      "grad_norm": 1.7797774076461792,
+      "learning_rate": 0.00023445555555555556,
+      "loss": 2.709,
+      "step": 28900
+    },
+    {
+      "epoch": 4.113969020889583,
+      "grad_norm": 1.9547299146652222,
+      "learning_rate": 0.0002339,
+      "loss": 2.7476,
+      "step": 28950
+    },
+    {
+      "epoch": 4.121074321443797,
+      "grad_norm": 1.8123749494552612,
+      "learning_rate": 0.00023334444444444443,
+      "loss": 2.726,
+      "step": 29000
+    },
+    {
+      "epoch": 4.128179621998011,
+      "grad_norm": 1.8036295175552368,
+      "learning_rate": 0.0002327888888888889,
+      "loss": 2.7269,
+      "step": 29050
+    },
+    {
+      "epoch": 4.135284922552224,
+      "grad_norm": 1.7694047689437866,
+      "learning_rate": 0.00023223333333333336,
+      "loss": 2.728,
+      "step": 29100
+    },
+    {
+      "epoch": 4.142390223106437,
+      "grad_norm": 1.6515389680862427,
+      "learning_rate": 0.0002316777777777778,
+      "loss": 2.7279,
+      "step": 29150
+    },
+    {
+      "epoch": 4.149495523660651,
+      "grad_norm": 1.6699292659759521,
+      "learning_rate": 0.00023112222222222223,
+      "loss": 2.7281,
+      "step": 29200
+    },
+    {
+      "epoch": 4.156600824214864,
+      "grad_norm": 1.878513216972351,
+      "learning_rate": 0.00023056666666666667,
+      "loss": 2.7259,
+      "step": 29250
+    },
+    {
+      "epoch": 4.163706124769078,
+      "grad_norm": 1.5836261510849,
+      "learning_rate": 0.0002300111111111111,
+      "loss": 2.7269,
+      "step": 29300
+    },
+    {
+      "epoch": 4.170811425323291,
+      "grad_norm": 1.7325093746185303,
+      "learning_rate": 0.00022945555555555554,
+      "loss": 2.7192,
+      "step": 29350
+    },
+    {
+      "epoch": 4.1779167258775045,
+      "grad_norm": 1.793212652206421,
+      "learning_rate": 0.0002289,
+      "loss": 2.7374,
+      "step": 29400
+    },
+    {
+      "epoch": 4.185022026431718,
+      "grad_norm": 1.658182978630066,
+      "learning_rate": 0.00022834444444444444,
+      "loss": 2.7541,
+      "step": 29450
+    },
+    {
+      "epoch": 4.192127326985932,
+      "grad_norm": 1.9298884868621826,
+      "learning_rate": 0.0002277888888888889,
+      "loss": 2.7343,
+      "step": 29500
+    },
+    {
+      "epoch": 4.199232627540145,
+      "grad_norm": 1.6460927724838257,
+      "learning_rate": 0.00022723333333333335,
+      "loss": 2.7191,
+      "step": 29550
+    },
+    {
+      "epoch": 4.206337928094358,
+      "grad_norm": 1.6624342203140259,
+      "learning_rate": 0.00022667777777777778,
+      "loss": 2.7415,
+      "step": 29600
+    },
+    {
+      "epoch": 4.213443228648572,
+      "grad_norm": 1.710582971572876,
+      "learning_rate": 0.00022612222222222222,
+      "loss": 2.7037,
+      "step": 29650
+    },
+    {
+      "epoch": 4.220548529202786,
+      "grad_norm": 1.7348077297210693,
+      "learning_rate": 0.00022556666666666668,
+      "loss": 2.7315,
+      "step": 29700
+    },
+    {
+      "epoch": 4.227653829756998,
+      "grad_norm": 1.7268335819244385,
+      "learning_rate": 0.00022501111111111112,
+      "loss": 2.7443,
+      "step": 29750
+    },
+    {
+      "epoch": 4.234759130311212,
+      "grad_norm": 1.8017711639404297,
+      "learning_rate": 0.00022445555555555556,
+      "loss": 2.735,
+      "step": 29800
+    },
+    {
+      "epoch": 4.241864430865426,
+      "grad_norm": 1.776839017868042,
+      "learning_rate": 0.0002239,
+      "loss": 2.7009,
+      "step": 29850
+    },
+    {
+      "epoch": 4.248969731419639,
+      "grad_norm": 1.7648807764053345,
+      "learning_rate": 0.00022334444444444446,
+      "loss": 2.7173,
+      "step": 29900
+    },
+    {
+      "epoch": 4.256075031973852,
+      "grad_norm": 1.6997913122177124,
+      "learning_rate": 0.0002227888888888889,
+      "loss": 2.7113,
+      "step": 29950
+    },
+    {
+      "epoch": 4.263180332528066,
+      "grad_norm": 1.723713994026184,
+      "learning_rate": 0.00022223333333333333,
+      "loss": 2.7182,
+      "step": 30000
+    },
+    {
+      "epoch": 4.263180332528066,
+      "eval_accuracy": 0.529339075088501,
+      "eval_loss": 2.672091245651245,
+      "eval_runtime": 1.5216,
+      "eval_samples_per_second": 2471.011,
+      "eval_steps_per_second": 38.774,
+      "step": 30000
+    },
+    {
+      "epoch": 4.270285633082279,
+      "grad_norm": 1.6741389036178589,
+      "learning_rate": 0.0002216777777777778,
+      "loss": 2.6956,
+      "step": 30050
+    },
+    {
+      "epoch": 4.277390933636493,
+      "grad_norm": 1.7774298191070557,
+      "learning_rate": 0.00022112222222222223,
+      "loss": 2.6983,
+      "step": 30100
+    },
+    {
+      "epoch": 4.284496234190707,
+      "grad_norm": 1.742165207862854,
+      "learning_rate": 0.00022056666666666667,
+      "loss": 2.7294,
+      "step": 30150
+    },
+    {
+      "epoch": 4.2916015347449195,
+      "grad_norm": 1.7675608396530151,
+      "learning_rate": 0.0002200111111111111,
+      "loss": 2.6987,
+      "step": 30200
+    },
+    {
+      "epoch": 4.298706835299133,
+      "grad_norm": 1.7561299800872803,
+      "learning_rate": 0.00021945555555555554,
+      "loss": 2.7197,
+      "step": 30250
+    },
+    {
+      "epoch": 4.305812135853347,
+      "grad_norm": 1.7825928926467896,
+      "learning_rate": 0.0002189,
+      "loss": 2.7139,
+      "step": 30300
+    },
+    {
+      "epoch": 4.31291743640756,
+      "grad_norm": 2.1004021167755127,
+      "learning_rate": 0.00021834444444444445,
+      "loss": 2.7166,
+      "step": 30350
+    },
+    {
+      "epoch": 4.320022736961773,
+      "grad_norm": 1.8277966976165771,
+      "learning_rate": 0.0002177888888888889,
+      "loss": 2.6808,
+      "step": 30400
+    },
+    {
+      "epoch": 4.327128037515987,
+      "grad_norm": 1.7409828901290894,
+      "learning_rate": 0.00021723333333333335,
+      "loss": 2.7069,
+      "step": 30450
+    },
+    {
+      "epoch": 4.3342333380702005,
+      "grad_norm": 1.7453832626342773,
+      "learning_rate": 0.00021667777777777778,
+      "loss": 2.7218,
+      "step": 30500
+    },
+    {
+      "epoch": 4.341338638624414,
+      "grad_norm": 1.8581887483596802,
+      "learning_rate": 0.00021612222222222222,
+      "loss": 2.7126,
+      "step": 30550
+    },
+    {
+      "epoch": 4.348443939178627,
+      "grad_norm": 1.8236676454544067,
+      "learning_rate": 0.00021556666666666666,
+      "loss": 2.7147,
+      "step": 30600
+    },
+    {
+      "epoch": 4.355549239732841,
+      "grad_norm": 1.944787621498108,
+      "learning_rate": 0.0002150111111111111,
+      "loss": 2.724,
+      "step": 30650
+    },
+    {
+      "epoch": 4.362654540287054,
+      "grad_norm": 1.663387417793274,
+      "learning_rate": 0.00021445555555555556,
+      "loss": 2.7077,
+      "step": 30700
+    },
+    {
+      "epoch": 4.369759840841268,
+      "grad_norm": 1.8352930545806885,
+      "learning_rate": 0.00021390000000000002,
+      "loss": 2.6954,
+      "step": 30750
+    },
+    {
+      "epoch": 4.376865141395481,
+      "grad_norm": 1.635725736618042,
+      "learning_rate": 0.00021334444444444446,
+      "loss": 2.7108,
+      "step": 30800
+    },
+    {
+      "epoch": 4.383970441949694,
+      "grad_norm": 1.6727386713027954,
+      "learning_rate": 0.0002127888888888889,
+      "loss": 2.7002,
+      "step": 30850
+    },
+    {
+      "epoch": 4.391075742503908,
+      "grad_norm": 1.6804115772247314,
+      "learning_rate": 0.00021223333333333333,
+      "loss": 2.6961,
+      "step": 30900
+    },
+    {
+      "epoch": 4.398181043058122,
+      "grad_norm": 1.7196424007415771,
+      "learning_rate": 0.00021167777777777777,
+      "loss": 2.7234,
+      "step": 30950
+    },
+    {
+      "epoch": 4.405286343612334,
+      "grad_norm": 1.8474704027175903,
+      "learning_rate": 0.0002111222222222222,
+      "loss": 2.7161,
+      "step": 31000
+    },
+    {
+      "epoch": 4.412391644166548,
+      "grad_norm": 1.6778532266616821,
+      "learning_rate": 0.00021056666666666667,
+      "loss": 2.6654,
+      "step": 31050
+    },
+    {
+      "epoch": 4.419496944720762,
+      "grad_norm": 1.7179423570632935,
+      "learning_rate": 0.00021001111111111114,
+      "loss": 2.7152,
+      "step": 31100
+    },
+    {
+      "epoch": 4.426602245274975,
+      "grad_norm": 1.6991947889328003,
+      "learning_rate": 0.00020945555555555557,
+      "loss": 2.723,
+      "step": 31150
+    },
+    {
+      "epoch": 4.433707545829188,
+      "grad_norm": 1.705942153930664,
+      "learning_rate": 0.0002089,
+      "loss": 2.7024,
+      "step": 31200
+    },
+    {
+      "epoch": 4.440812846383402,
+      "grad_norm": 1.7053892612457275,
+      "learning_rate": 0.00020834444444444445,
+      "loss": 2.7086,
+      "step": 31250
+    },
+    {
+      "epoch": 4.4479181469376154,
+      "grad_norm": 1.735185146331787,
+      "learning_rate": 0.00020778888888888888,
+      "loss": 2.6899,
+      "step": 31300
+    },
+    {
+      "epoch": 4.455023447491829,
+      "grad_norm": 1.7392066717147827,
+      "learning_rate": 0.00020723333333333332,
+      "loss": 2.7156,
+      "step": 31350
+    },
+    {
+      "epoch": 4.462128748046043,
+      "grad_norm": 1.7509199380874634,
+      "learning_rate": 0.00020667777777777776,
+      "loss": 2.6833,
+      "step": 31400
+    },
+    {
+      "epoch": 4.4692340486002555,
+      "grad_norm": 1.877554178237915,
+      "learning_rate": 0.00020612222222222225,
+      "loss": 2.702,
+      "step": 31450
+    },
+    {
+      "epoch": 4.476339349154469,
+      "grad_norm": 1.859157681465149,
+      "learning_rate": 0.00020556666666666669,
+      "loss": 2.7088,
+      "step": 31500
+    },
+    {
+      "epoch": 4.483444649708683,
+      "grad_norm": 1.9033279418945312,
+      "learning_rate": 0.00020501111111111112,
+      "loss": 2.6754,
+      "step": 31550
+    },
+    {
+      "epoch": 4.4905499502628965,
+      "grad_norm": 1.8347678184509277,
+      "learning_rate": 0.00020445555555555556,
+      "loss": 2.6952,
+      "step": 31600
+    },
+    {
+      "epoch": 4.497655250817109,
+      "grad_norm": 1.8469839096069336,
+      "learning_rate": 0.0002039,
+      "loss": 2.6879,
+      "step": 31650
+    },
+    {
+      "epoch": 4.504760551371323,
+      "grad_norm": 1.740691065788269,
+      "learning_rate": 0.00020334444444444443,
+      "loss": 2.7068,
+      "step": 31700
+    },
+    {
+      "epoch": 4.511865851925537,
+      "grad_norm": 1.6777381896972656,
+      "learning_rate": 0.0002027888888888889,
+      "loss": 2.664,
+      "step": 31750
+    },
+    {
+      "epoch": 4.51897115247975,
+      "grad_norm": 1.709246039390564,
+      "learning_rate": 0.00020223333333333333,
+      "loss": 2.7154,
+      "step": 31800
+    },
+    {
+      "epoch": 4.526076453033963,
+      "grad_norm": 1.7748068571090698,
+      "learning_rate": 0.0002016777777777778,
+      "loss": 2.6943,
+      "step": 31850
+    },
+    {
+      "epoch": 4.533181753588177,
+      "grad_norm": 1.613324522972107,
+      "learning_rate": 0.00020112222222222223,
+      "loss": 2.6844,
+      "step": 31900
+    },
+    {
+      "epoch": 4.54028705414239,
+      "grad_norm": 1.6655322313308716,
+      "learning_rate": 0.00020056666666666667,
+      "loss": 2.6986,
+      "step": 31950
+    },
+    {
+      "epoch": 4.547392354696604,
+      "grad_norm": 1.8054392337799072,
+      "learning_rate": 0.0002000111111111111,
+      "loss": 2.6879,
+      "step": 32000
+    },
+    {
+      "epoch": 4.547392354696604,
+      "eval_accuracy": 0.5284602046012878,
+      "eval_loss": 2.64477801322937,
+      "eval_runtime": 1.3859,
+      "eval_samples_per_second": 2713.044,
+      "eval_steps_per_second": 42.572,
+      "step": 32000
+    },
+    {
+      "epoch": 4.554497655250817,
+      "grad_norm": 1.7321134805679321,
+      "learning_rate": 0.00019945555555555555,
+      "loss": 2.6896,
+      "step": 32050
+    },
+    {
+      "epoch": 4.56160295580503,
+      "grad_norm": 1.9145094156265259,
+      "learning_rate": 0.0001989,
+      "loss": 2.6834,
+      "step": 32100
+    },
+    {
+      "epoch": 4.568708256359244,
+      "grad_norm": 1.8805677890777588,
+      "learning_rate": 0.00019834444444444445,
+      "loss": 2.6949,
+      "step": 32150
+    },
+    {
+      "epoch": 4.575813556913458,
+      "grad_norm": 1.7424559593200684,
+      "learning_rate": 0.00019778888888888888,
+      "loss": 2.6867,
+      "step": 32200
+    },
+    {
+      "epoch": 4.582918857467671,
+      "grad_norm": 1.9168509244918823,
+      "learning_rate": 0.00019723333333333335,
+      "loss": 2.6895,
+      "step": 32250
+    },
+    {
+      "epoch": 4.590024158021884,
+      "grad_norm": 1.6305192708969116,
+      "learning_rate": 0.00019667777777777778,
+      "loss": 2.6829,
+      "step": 32300
+    },
+    {
+      "epoch": 4.597129458576098,
+      "grad_norm": 1.5996636152267456,
+      "learning_rate": 0.00019612222222222222,
+      "loss": 2.6894,
+      "step": 32350
+    },
+    {
+      "epoch": 4.604234759130311,
+      "grad_norm": 1.7526198625564575,
+      "learning_rate": 0.00019556666666666666,
+      "loss": 2.6857,
+      "step": 32400
+    },
+    {
+      "epoch": 4.611340059684524,
+      "grad_norm": 1.8293813467025757,
+      "learning_rate": 0.00019501111111111112,
+      "loss": 2.6659,
+      "step": 32450
+    },
+    {
+      "epoch": 4.618445360238738,
+      "grad_norm": 1.8178389072418213,
+      "learning_rate": 0.00019445555555555556,
+      "loss": 2.6877,
+      "step": 32500
+    },
+    {
+      "epoch": 4.6255506607929515,
+      "grad_norm": 1.628410816192627,
+      "learning_rate": 0.0001939,
+      "loss": 2.699,
+      "step": 32550
+    },
+    {
+      "epoch": 4.632655961347165,
+      "grad_norm": 1.7101975679397583,
+      "learning_rate": 0.00019334444444444446,
+      "loss": 2.6754,
+      "step": 32600
+    },
+    {
+      "epoch": 4.639761261901379,
+      "grad_norm": 1.6504610776901245,
+      "learning_rate": 0.0001927888888888889,
+      "loss": 2.6874,
+      "step": 32650
+    },
+    {
+      "epoch": 4.646866562455592,
+      "grad_norm": 1.7311458587646484,
+      "learning_rate": 0.00019223333333333333,
+      "loss": 2.6898,
+      "step": 32700
+    },
+    {
+      "epoch": 4.653971863009805,
+      "grad_norm": 1.9511997699737549,
+      "learning_rate": 0.00019167777777777777,
+      "loss": 2.6902,
+      "step": 32750
+    },
+    {
+      "epoch": 4.661077163564019,
+      "grad_norm": 1.7774189710617065,
+      "learning_rate": 0.00019112222222222224,
+      "loss": 2.6501,
+      "step": 32800
+    },
+    {
+      "epoch": 4.668182464118233,
+      "grad_norm": 1.8208081722259521,
+      "learning_rate": 0.00019056666666666667,
+      "loss": 2.6812,
+      "step": 32850
+    },
+    {
+      "epoch": 4.675287764672445,
+      "grad_norm": 1.7172284126281738,
+      "learning_rate": 0.0001900111111111111,
+      "loss": 2.6661,
+      "step": 32900
+    },
+    {
+      "epoch": 4.682393065226659,
+      "grad_norm": 1.98916494846344,
+      "learning_rate": 0.00018945555555555555,
+      "loss": 2.6739,
+      "step": 32950
+    },
+    {
+      "epoch": 4.689498365780873,
+      "grad_norm": 1.7716188430786133,
+      "learning_rate": 0.0001889,
+      "loss": 2.6826,
+      "step": 33000
+    },
+    {
+      "epoch": 4.696603666335086,
+      "grad_norm": 1.7136895656585693,
+      "learning_rate": 0.00018834444444444445,
+      "loss": 2.6791,
+      "step": 33050
+    },
+    {
+      "epoch": 4.703708966889299,
+      "grad_norm": 1.8110146522521973,
+      "learning_rate": 0.00018778888888888888,
+      "loss": 2.6942,
+      "step": 33100
+    },
+    {
+      "epoch": 4.710814267443513,
+      "grad_norm": 1.9212117195129395,
+      "learning_rate": 0.00018723333333333335,
+      "loss": 2.6818,
+      "step": 33150
+    },
+    {
+      "epoch": 4.717919567997726,
+      "grad_norm": 1.7386603355407715,
+      "learning_rate": 0.00018667777777777779,
+      "loss": 2.6818,
+      "step": 33200
+    },
+    {
+      "epoch": 4.72502486855194,
+      "grad_norm": 1.7467000484466553,
+      "learning_rate": 0.00018612222222222222,
+      "loss": 2.684,
+      "step": 33250
+    },
+    {
+      "epoch": 4.732130169106153,
+      "grad_norm": 1.8801017999649048,
+      "learning_rate": 0.00018556666666666666,
+      "loss": 2.668,
+      "step": 33300
+    },
+    {
+      "epoch": 4.7392354696603665,
+      "grad_norm": 1.751801609992981,
+      "learning_rate": 0.0001850111111111111,
+      "loss": 2.66,
+      "step": 33350
+    },
+    {
+      "epoch": 4.74634077021458,
+      "grad_norm": 1.9448400735855103,
+      "learning_rate": 0.00018445555555555556,
+      "loss": 2.6685,
+      "step": 33400
+    },
+    {
+      "epoch": 4.753446070768794,
+      "grad_norm": 1.7627147436141968,
+      "learning_rate": 0.00018390000000000002,
+      "loss": 2.6803,
+      "step": 33450
+    },
+    {
+      "epoch": 4.760551371323007,
+      "grad_norm": 1.7280786037445068,
+      "learning_rate": 0.00018334444444444446,
+      "loss": 2.6735,
+      "step": 33500
+    },
+    {
+      "epoch": 4.76765667187722,
+      "grad_norm": 1.8190041780471802,
+      "learning_rate": 0.0001827888888888889,
+      "loss": 2.6678,
+      "step": 33550
+    },
+    {
+      "epoch": 4.774761972431434,
+      "grad_norm": 1.735298991203308,
+      "learning_rate": 0.00018223333333333334,
+      "loss": 2.6547,
+      "step": 33600
+    },
+    {
+      "epoch": 4.7818672729856475,
+      "grad_norm": 1.7868086099624634,
+      "learning_rate": 0.00018167777777777777,
+      "loss": 2.6876,
+      "step": 33650
+    },
+    {
+      "epoch": 4.78897257353986,
+      "grad_norm": 1.749411940574646,
+      "learning_rate": 0.0001811222222222222,
+      "loss": 2.6575,
+      "step": 33700
+    },
+    {
+      "epoch": 4.796077874094074,
+      "grad_norm": 1.6397782564163208,
+      "learning_rate": 0.00018056666666666665,
+      "loss": 2.6762,
+      "step": 33750
+    },
+    {
+      "epoch": 4.803183174648288,
+      "grad_norm": 1.6440680027008057,
+      "learning_rate": 0.00018001111111111114,
+      "loss": 2.6445,
+      "step": 33800
+    },
+    {
+      "epoch": 4.810288475202501,
+      "grad_norm": 1.664414405822754,
+      "learning_rate": 0.00017945555555555557,
+      "loss": 2.6687,
+      "step": 33850
+    },
+    {
+      "epoch": 4.817393775756715,
+      "grad_norm": 1.6941622495651245,
+      "learning_rate": 0.0001789,
+      "loss": 2.6661,
+      "step": 33900
+    },
+    {
+      "epoch": 4.824499076310928,
+      "grad_norm": 1.8345458507537842,
+      "learning_rate": 0.00017834444444444445,
+      "loss": 2.6758,
+      "step": 33950
+    },
+    {
+      "epoch": 4.831604376865141,
+      "grad_norm": 1.7914514541625977,
+      "learning_rate": 0.00017778888888888889,
+      "loss": 2.6659,
+      "step": 34000
+    },
+    {
+      "epoch": 4.831604376865141,
+      "eval_accuracy": 0.5343063473701477,
+      "eval_loss": 2.6134769916534424,
+      "eval_runtime": 1.4838,
+      "eval_samples_per_second": 2534.091,
+      "eval_steps_per_second": 39.764,
+      "step": 34000
+    },
+    {
+      "epoch": 4.838709677419355,
+      "grad_norm": 1.7636709213256836,
+      "learning_rate": 0.00017723333333333332,
+      "loss": 2.6714,
+      "step": 34050
+    },
+    {
+      "epoch": 4.845814977973569,
+      "grad_norm": 1.712996244430542,
+      "learning_rate": 0.00017667777777777776,
+      "loss": 2.6592,
+      "step": 34100
+    },
+    {
+      "epoch": 4.852920278527781,
+      "grad_norm": 1.8048306703567505,
+      "learning_rate": 0.00017612222222222225,
+      "loss": 2.6626,
+      "step": 34150
+    },
+    {
+      "epoch": 4.860025579081995,
+      "grad_norm": 1.6417750120162964,
+      "learning_rate": 0.0001755666666666667,
+      "loss": 2.6569,
+      "step": 34200
+    },
+    {
+      "epoch": 4.867130879636209,
+      "grad_norm": 1.678472638130188,
+      "learning_rate": 0.00017501111111111112,
+      "loss": 2.6508,
+      "step": 34250
+    },
+    {
+      "epoch": 4.874236180190422,
+      "grad_norm": 1.9373923540115356,
+      "learning_rate": 0.00017445555555555556,
+      "loss": 2.6428,
+      "step": 34300
+    },
+    {
+      "epoch": 4.881341480744635,
+      "grad_norm": 1.6744999885559082,
+      "learning_rate": 0.0001739,
+      "loss": 2.6595,
+      "step": 34350
+    },
+    {
+      "epoch": 4.888446781298849,
+      "grad_norm": 1.796247959136963,
+      "learning_rate": 0.00017334444444444444,
+      "loss": 2.6739,
+      "step": 34400
+    },
+    {
+      "epoch": 4.8955520818530625,
+      "grad_norm": 1.8660869598388672,
+      "learning_rate": 0.00017278888888888887,
+      "loss": 2.6538,
+      "step": 34450
+    },
+    {
+      "epoch": 4.902657382407276,
+      "grad_norm": 1.6597950458526611,
+      "learning_rate": 0.00017223333333333334,
+      "loss": 2.6417,
+      "step": 34500
+    },
+    {
+      "epoch": 4.909762682961489,
+      "grad_norm": 1.6842671632766724,
+      "learning_rate": 0.0001716777777777778,
+      "loss": 2.6521,
+      "step": 34550
+    },
+    {
+      "epoch": 4.9168679835157025,
+      "grad_norm": 1.8126052618026733,
+      "learning_rate": 0.00017112222222222224,
+      "loss": 2.6465,
+      "step": 34600
+    },
+    {
+      "epoch": 4.923973284069916,
+      "grad_norm": 1.8387752771377563,
+      "learning_rate": 0.00017056666666666667,
+      "loss": 2.6434,
+      "step": 34650
+    },
+    {
+      "epoch": 4.93107858462413,
+      "grad_norm": 1.7043359279632568,
+      "learning_rate": 0.0001700111111111111,
+      "loss": 2.6521,
+      "step": 34700
+    },
+    {
+      "epoch": 4.9381838851783435,
+      "grad_norm": 1.7130815982818604,
+      "learning_rate": 0.00016945555555555555,
+      "loss": 2.6609,
+      "step": 34750
+    },
+    {
+      "epoch": 4.945289185732556,
+      "grad_norm": 1.6739017963409424,
+      "learning_rate": 0.00016889999999999999,
+      "loss": 2.6231,
+      "step": 34800
+    },
+    {
+      "epoch": 4.95239448628677,
+      "grad_norm": 1.6555219888687134,
+      "learning_rate": 0.00016834444444444445,
+      "loss": 2.6672,
+      "step": 34850
+    },
+    {
+      "epoch": 4.959499786840984,
+      "grad_norm": 1.7497371435165405,
+      "learning_rate": 0.0001677888888888889,
+      "loss": 2.6369,
+      "step": 34900
+    },
+    {
+      "epoch": 4.966605087395196,
+      "grad_norm": 1.7139822244644165,
+      "learning_rate": 0.00016723333333333335,
+      "loss": 2.6625,
+      "step": 34950
+    },
+    {
+      "epoch": 4.97371038794941,
+      "grad_norm": 1.7247073650360107,
+      "learning_rate": 0.0001666777777777778,
+      "loss": 2.6793,
+      "step": 35000
+    },
+    {
+      "epoch": 4.980815688503624,
+      "grad_norm": 1.792324185371399,
+      "learning_rate": 0.00016612222222222222,
+      "loss": 2.6511,
+      "step": 35050
+    },
+    {
+      "epoch": 4.987920989057837,
+      "grad_norm": 1.7975879907608032,
+      "learning_rate": 0.00016556666666666666,
+      "loss": 2.6459,
+      "step": 35100
+    },
+    {
+      "epoch": 4.995026289612051,
+      "grad_norm": 1.6427263021469116,
+      "learning_rate": 0.0001650111111111111,
+      "loss": 2.6298,
+      "step": 35150
+    },
+    {
+      "epoch": 5.002131590166264,
+      "grad_norm": 1.7684862613677979,
+      "learning_rate": 0.00016445555555555556,
+      "loss": 2.6456,
+      "step": 35200
+    },
+    {
+      "epoch": 5.009236890720477,
+      "grad_norm": 1.6923253536224365,
+      "learning_rate": 0.0001639,
+      "loss": 2.6107,
+      "step": 35250
+    },
+    {
+      "epoch": 5.016342191274691,
+      "grad_norm": 1.7871248722076416,
+      "learning_rate": 0.00016334444444444444,
+      "loss": 2.6249,
+      "step": 35300
+    },
+    {
+      "epoch": 5.023447491828905,
+      "grad_norm": 1.738742709159851,
+      "learning_rate": 0.0001627888888888889,
+      "loss": 2.6468,
+      "step": 35350
+    },
+    {
+      "epoch": 5.0305527923831175,
+      "grad_norm": 1.688292384147644,
+      "learning_rate": 0.00016223333333333334,
+      "loss": 2.632,
+      "step": 35400
+    },
+    {
+      "epoch": 5.037658092937331,
+      "grad_norm": 1.7152246236801147,
+      "learning_rate": 0.00016167777777777777,
+      "loss": 2.6373,
+      "step": 35450
+    },
+    {
+      "epoch": 5.044763393491545,
+      "grad_norm": 1.6816190481185913,
+      "learning_rate": 0.00016112222222222224,
+      "loss": 2.631,
+      "step": 35500
+    },
+    {
+      "epoch": 5.0518686940457584,
+      "grad_norm": 1.9319465160369873,
+      "learning_rate": 0.00016056666666666668,
+      "loss": 2.5949,
+      "step": 35550
+    },
+    {
+      "epoch": 5.058973994599971,
+      "grad_norm": 1.7346752882003784,
+      "learning_rate": 0.0001600111111111111,
+      "loss": 2.6062,
+      "step": 35600
+    },
+    {
+      "epoch": 5.066079295154185,
+      "grad_norm": 1.95259428024292,
+      "learning_rate": 0.00015945555555555555,
+      "loss": 2.6281,
+      "step": 35650
+    },
+    {
+      "epoch": 5.0731845957083985,
+      "grad_norm": 1.9597879648208618,
+      "learning_rate": 0.0001589,
+      "loss": 2.6211,
+      "step": 35700
+    },
+    {
+      "epoch": 5.080289896262612,
+      "grad_norm": 1.7877590656280518,
+      "learning_rate": 0.00015834444444444445,
+      "loss": 2.6229,
+      "step": 35750
+    },
+    {
+      "epoch": 5.087395196816825,
+      "grad_norm": 1.8384203910827637,
+      "learning_rate": 0.0001577888888888889,
+      "loss": 2.6415,
+      "step": 35800
+    },
+    {
+      "epoch": 5.094500497371039,
+      "grad_norm": 1.7532262802124023,
+      "learning_rate": 0.00015723333333333335,
+      "loss": 2.6277,
+      "step": 35850
+    },
+    {
+      "epoch": 5.101605797925252,
+      "grad_norm": 1.8662482500076294,
+      "learning_rate": 0.0001566777777777778,
+      "loss": 2.6251,
+      "step": 35900
+    },
+    {
+      "epoch": 5.108711098479466,
+      "grad_norm": 1.817358374595642,
+      "learning_rate": 0.00015612222222222223,
+      "loss": 2.6221,
+      "step": 35950
+    },
+    {
+      "epoch": 5.11581639903368,
+      "grad_norm": 1.6843544244766235,
+      "learning_rate": 0.00015556666666666666,
+      "loss": 2.6143,
+      "step": 36000
+    },
+    {
+      "epoch": 5.11581639903368,
+      "eval_accuracy": 0.5344390869140625,
+      "eval_loss": 2.5895192623138428,
+      "eval_runtime": 1.3175,
+      "eval_samples_per_second": 2853.919,
+      "eval_steps_per_second": 44.782,
+      "step": 36000
+    },
+    {
+      "epoch": 5.122921699587892,
+      "grad_norm": 1.6144895553588867,
+      "learning_rate": 0.0001550111111111111,
+      "loss": 2.628,
+      "step": 36050
+    },
+    {
+      "epoch": 5.130027000142106,
+      "grad_norm": 1.7719066143035889,
+      "learning_rate": 0.00015445555555555556,
+      "loss": 2.622,
+      "step": 36100
+    },
+    {
+      "epoch": 5.13713230069632,
+      "grad_norm": 1.775813102722168,
+      "learning_rate": 0.0001539,
+      "loss": 2.629,
+      "step": 36150
+    },
+    {
+      "epoch": 5.144237601250533,
+      "grad_norm": 1.7993078231811523,
+      "learning_rate": 0.00015334444444444446,
+      "loss": 2.6059,
+      "step": 36200
+    },
+    {
+      "epoch": 5.151342901804746,
+      "grad_norm": 1.6277464628219604,
+      "learning_rate": 0.0001527888888888889,
+      "loss": 2.6393,
+      "step": 36250
+    },
+    {
+      "epoch": 5.15844820235896,
+      "grad_norm": 1.6855401992797852,
+      "learning_rate": 0.00015223333333333334,
+      "loss": 2.616,
+      "step": 36300
+    },
+    {
+      "epoch": 5.165553502913173,
+      "grad_norm": 1.7239676713943481,
+      "learning_rate": 0.00015167777777777778,
+      "loss": 2.6438,
+      "step": 36350
+    },
+    {
+      "epoch": 5.172658803467387,
+      "grad_norm": 1.7113441228866577,
+      "learning_rate": 0.0001511222222222222,
+      "loss": 2.5931,
+      "step": 36400
+    },
+    {
+      "epoch": 5.1797641040216,
+      "grad_norm": 1.7422834634780884,
+      "learning_rate": 0.00015056666666666665,
+      "loss": 2.6281,
+      "step": 36450
+    },
+    {
+      "epoch": 5.1868694045758135,
+      "grad_norm": 1.7386971712112427,
+      "learning_rate": 0.0001500111111111111,
+      "loss": 2.6301,
+      "step": 36500
+    },
+    {
+      "epoch": 5.193974705130027,
+      "grad_norm": 1.8851107358932495,
+      "learning_rate": 0.00014945555555555558,
+      "loss": 2.623,
+      "step": 36550
+    },
+    {
+      "epoch": 5.201080005684241,
+      "grad_norm": 1.901906967163086,
+      "learning_rate": 0.00014890000000000001,
+      "loss": 2.5805,
+      "step": 36600
+    },
+    {
+      "epoch": 5.2081853062384535,
+      "grad_norm": 1.6902124881744385,
+      "learning_rate": 0.00014834444444444445,
+      "loss": 2.6301,
+      "step": 36650
+    },
+    {
+      "epoch": 5.215290606792667,
+      "grad_norm": 1.8009217977523804,
+      "learning_rate": 0.0001477888888888889,
+      "loss": 2.5984,
+      "step": 36700
+    },
+    {
+      "epoch": 5.222395907346881,
+      "grad_norm": 1.7949461936950684,
+      "learning_rate": 0.00014723333333333333,
+      "loss": 2.612,
+      "step": 36750
+    },
+    {
+      "epoch": 5.2295012079010945,
+      "grad_norm": 1.8531763553619385,
+      "learning_rate": 0.00014667777777777776,
+      "loss": 2.6009,
+      "step": 36800
+    },
+    {
+      "epoch": 5.236606508455307,
+      "grad_norm": 1.9773792028427124,
+      "learning_rate": 0.0001461222222222222,
+      "loss": 2.6264,
+      "step": 36850
+    },
+    {
+      "epoch": 5.243711809009521,
+      "grad_norm": 1.7207331657409668,
+      "learning_rate": 0.0001455666666666667,
+      "loss": 2.6109,
+      "step": 36900
+    },
+    {
+      "epoch": 5.250817109563735,
+      "grad_norm": 1.6260844469070435,
+      "learning_rate": 0.00014501111111111113,
+      "loss": 2.6107,
+      "step": 36950
+    },
+    {
+      "epoch": 5.257922410117948,
+      "grad_norm": 1.8471879959106445,
+      "learning_rate": 0.00014445555555555556,
+      "loss": 2.6236,
+      "step": 37000
+    },
+    {
+      "epoch": 5.265027710672161,
+      "grad_norm": 1.7721649408340454,
+      "learning_rate": 0.0001439,
+      "loss": 2.6253,
+      "step": 37050
+    },
+    {
+      "epoch": 5.272133011226375,
+      "grad_norm": 1.9177438020706177,
+      "learning_rate": 0.00014334444444444444,
+      "loss": 2.6143,
+      "step": 37100
+    },
+    {
+      "epoch": 5.279238311780588,
+      "grad_norm": 1.777573823928833,
+      "learning_rate": 0.00014278888888888888,
+      "loss": 2.6128,
+      "step": 37150
+    },
+    {
+      "epoch": 5.286343612334802,
+      "grad_norm": 1.6359736919403076,
+      "learning_rate": 0.00014223333333333334,
+      "loss": 2.602,
+      "step": 37200
+    },
+    {
+      "epoch": 5.293448912889016,
+      "grad_norm": 1.793562412261963,
+      "learning_rate": 0.0001416777777777778,
+      "loss": 2.6356,
+      "step": 37250
+    },
+    {
+      "epoch": 5.300554213443228,
+      "grad_norm": 1.7116947174072266,
+      "learning_rate": 0.00014112222222222224,
+      "loss": 2.6169,
+      "step": 37300
+    },
+    {
+      "epoch": 5.307659513997442,
+      "grad_norm": 1.9178073406219482,
+      "learning_rate": 0.00014056666666666668,
+      "loss": 2.6071,
+      "step": 37350
+    },
+    {
+      "epoch": 5.314764814551656,
+      "grad_norm": 1.7767285108566284,
+      "learning_rate": 0.00014001111111111111,
+      "loss": 2.6182,
+      "step": 37400
+    },
+    {
+      "epoch": 5.321870115105869,
+      "grad_norm": 1.8449881076812744,
+      "learning_rate": 0.00013945555555555555,
+      "loss": 2.6065,
+      "step": 37450
+    },
+    {
+      "epoch": 5.328975415660082,
+      "grad_norm": 1.814612865447998,
+      "learning_rate": 0.0001389,
+      "loss": 2.6139,
+      "step": 37500
+    },
+    {
+      "epoch": 5.336080716214296,
+      "grad_norm": 1.7973238229751587,
+      "learning_rate": 0.00013834444444444445,
+      "loss": 2.6266,
+      "step": 37550
+    },
+    {
+      "epoch": 5.3431860167685095,
+      "grad_norm": 1.525099277496338,
+      "learning_rate": 0.0001377888888888889,
+      "loss": 2.5997,
+      "step": 37600
+    },
+    {
+      "epoch": 5.350291317322723,
+      "grad_norm": 1.7055003643035889,
+      "learning_rate": 0.00013723333333333335,
+      "loss": 2.6203,
+      "step": 37650
+    },
+    {
+      "epoch": 5.357396617876936,
+      "grad_norm": 1.6953563690185547,
+      "learning_rate": 0.0001366777777777778,
+      "loss": 2.6076,
+      "step": 37700
+    },
+    {
+      "epoch": 5.3645019184311495,
+      "grad_norm": 1.8244370222091675,
+      "learning_rate": 0.00013612222222222223,
+      "loss": 2.5991,
+      "step": 37750
+    },
+    {
+      "epoch": 5.371607218985363,
+      "grad_norm": 1.956551194190979,
+      "learning_rate": 0.00013556666666666666,
+      "loss": 2.6084,
+      "step": 37800
+    },
+    {
+      "epoch": 5.378712519539577,
+      "grad_norm": 1.725568413734436,
+      "learning_rate": 0.0001350111111111111,
+      "loss": 2.5965,
+      "step": 37850
+    },
+    {
+      "epoch": 5.38581782009379,
+      "grad_norm": 1.644600749015808,
+      "learning_rate": 0.00013445555555555557,
+      "loss": 2.5927,
+      "step": 37900
+    },
+    {
+      "epoch": 5.392923120648003,
+      "grad_norm": 1.8263912200927734,
+      "learning_rate": 0.0001339,
+      "loss": 2.5936,
+      "step": 37950
+    },
+    {
+      "epoch": 5.400028421202217,
+      "grad_norm": 1.664300799369812,
+      "learning_rate": 0.00013334444444444444,
+      "loss": 2.6121,
+      "step": 38000
+    },
+    {
+      "epoch": 5.400028421202217,
+      "eval_accuracy": 0.5335931181907654,
+      "eval_loss": 2.589524030685425,
+      "eval_runtime": 1.4723,
+      "eval_samples_per_second": 2553.85,
+      "eval_steps_per_second": 40.074,
+      "step": 38000
+    },
+    {
+      "epoch": 5.407133721756431,
+      "grad_norm": 1.7988592386245728,
+      "learning_rate": 0.0001327888888888889,
+      "loss": 2.5998,
+      "step": 38050
+    },
+    {
+      "epoch": 5.414239022310643,
+      "grad_norm": 1.78681480884552,
+      "learning_rate": 0.00013223333333333334,
+      "loss": 2.5973,
+      "step": 38100
+    },
+    {
+      "epoch": 5.421344322864857,
+      "grad_norm": 1.8385677337646484,
+      "learning_rate": 0.00013167777777777778,
+      "loss": 2.5859,
+      "step": 38150
+    },
+    {
+      "epoch": 5.428449623419071,
+      "grad_norm": 1.695863962173462,
+      "learning_rate": 0.00013112222222222221,
+      "loss": 2.5974,
+      "step": 38200
+    },
+    {
+      "epoch": 5.435554923973284,
+      "grad_norm": 1.8921480178833008,
+      "learning_rate": 0.00013056666666666668,
+      "loss": 2.5927,
+      "step": 38250
+    },
+    {
+      "epoch": 5.442660224527497,
+      "grad_norm": 1.7668559551239014,
+      "learning_rate": 0.00013001111111111112,
+      "loss": 2.5875,
+      "step": 38300
+    },
+    {
+      "epoch": 5.449765525081711,
+      "grad_norm": 1.7700510025024414,
+      "learning_rate": 0.00012945555555555555,
+      "loss": 2.6111,
+      "step": 38350
+    },
+    {
+      "epoch": 5.456870825635924,
+      "grad_norm": 1.6881382465362549,
+      "learning_rate": 0.0001289,
+      "loss": 2.6026,
+      "step": 38400
+    },
+    {
+      "epoch": 5.463976126190138,
+      "grad_norm": 1.8298293352127075,
+      "learning_rate": 0.00012834444444444445,
+      "loss": 2.6006,
+      "step": 38450
+    },
+    {
+      "epoch": 5.471081426744352,
+      "grad_norm": 1.6942826509475708,
+      "learning_rate": 0.0001277888888888889,
+      "loss": 2.6035,
+      "step": 38500
+    },
+    {
+      "epoch": 5.4781867272985645,
+      "grad_norm": 1.8194513320922852,
+      "learning_rate": 0.00012723333333333333,
+      "loss": 2.5884,
+      "step": 38550
+    },
+    {
+      "epoch": 5.485292027852778,
+      "grad_norm": 1.8685965538024902,
+      "learning_rate": 0.0001266777777777778,
+      "loss": 2.5953,
+      "step": 38600
+    },
+    {
+      "epoch": 5.492397328406992,
+      "grad_norm": 1.7005841732025146,
+      "learning_rate": 0.00012612222222222223,
+      "loss": 2.5994,
+      "step": 38650
+    },
+    {
+      "epoch": 5.4995026289612055,
+      "grad_norm": 1.833150029182434,
+      "learning_rate": 0.00012556666666666666,
+      "loss": 2.5881,
+      "step": 38700
+    },
+    {
+      "epoch": 5.506607929515418,
+      "grad_norm": 1.691675066947937,
+      "learning_rate": 0.0001250111111111111,
+      "loss": 2.6069,
+      "step": 38750
+    },
+    {
+      "epoch": 5.513713230069632,
+      "grad_norm": 1.769320011138916,
+      "learning_rate": 0.00012445555555555557,
+      "loss": 2.5756,
+      "step": 38800
+    },
+    {
+      "epoch": 5.5208185306238455,
+      "grad_norm": 1.6686408519744873,
+      "learning_rate": 0.0001239,
+      "loss": 2.5906,
+      "step": 38850
+    },
+    {
+      "epoch": 5.527923831178059,
+      "grad_norm": 1.6487681865692139,
+      "learning_rate": 0.00012334444444444447,
+      "loss": 2.5795,
+      "step": 38900
+    },
+    {
+      "epoch": 5.535029131732272,
+      "grad_norm": 1.5991772413253784,
+      "learning_rate": 0.0001227888888888889,
+      "loss": 2.5886,
+      "step": 38950
+    },
+    {
+      "epoch": 5.542134432286486,
+      "grad_norm": 1.8373521566390991,
+      "learning_rate": 0.00012223333333333334,
+      "loss": 2.5726,
+      "step": 39000
+    },
+    {
+      "epoch": 5.549239732840699,
+      "grad_norm": 1.832866907119751,
+      "learning_rate": 0.00012167777777777778,
+      "loss": 2.5869,
+      "step": 39050
+    },
+    {
+      "epoch": 5.556345033394913,
+      "grad_norm": 1.6868762969970703,
+      "learning_rate": 0.00012112222222222223,
+      "loss": 2.5834,
+      "step": 39100
+    },
+    {
+      "epoch": 5.563450333949126,
+      "grad_norm": 1.7114180326461792,
+      "learning_rate": 0.00012056666666666667,
+      "loss": 2.5955,
+      "step": 39150
+    },
+    {
+      "epoch": 5.570555634503339,
+      "grad_norm": 1.8619048595428467,
+      "learning_rate": 0.00012001111111111112,
+      "loss": 2.618,
+      "step": 39200
+    },
+    {
+      "epoch": 5.577660935057553,
+      "grad_norm": 1.9599003791809082,
+      "learning_rate": 0.00011945555555555555,
+      "loss": 2.5879,
+      "step": 39250
+    },
+    {
+      "epoch": 5.584766235611767,
+      "grad_norm": 1.8127872943878174,
+      "learning_rate": 0.0001189,
+      "loss": 2.5773,
+      "step": 39300
+    },
+    {
+      "epoch": 5.59187153616598,
+      "grad_norm": 1.6214098930358887,
+      "learning_rate": 0.00011834444444444445,
+      "loss": 2.5677,
+      "step": 39350
+    },
+    {
+      "epoch": 5.598976836720193,
+      "grad_norm": 1.8787380456924438,
+      "learning_rate": 0.00011778888888888889,
+      "loss": 2.5936,
+      "step": 39400
+    },
+    {
+      "epoch": 5.606082137274407,
+      "grad_norm": 1.7826387882232666,
+      "learning_rate": 0.00011723333333333333,
+      "loss": 2.5734,
+      "step": 39450
+    },
+    {
+      "epoch": 5.61318743782862,
+      "grad_norm": 1.6517889499664307,
+      "learning_rate": 0.00011667777777777779,
+      "loss": 2.5787,
+      "step": 39500
+    },
+    {
+      "epoch": 5.620292738382833,
+      "grad_norm": 1.9160776138305664,
+      "learning_rate": 0.00011612222222222223,
+      "loss": 2.5911,
+      "step": 39550
+    },
+    {
+      "epoch": 5.627398038937047,
+      "grad_norm": 1.7249836921691895,
+      "learning_rate": 0.00011556666666666667,
+      "loss": 2.5813,
+      "step": 39600
+    },
+    {
+      "epoch": 5.6345033394912605,
+      "grad_norm": 1.815263032913208,
+      "learning_rate": 0.0001150111111111111,
+      "loss": 2.5825,
+      "step": 39650
+    },
+    {
+      "epoch": 5.641608640045474,
+      "grad_norm": 1.912611722946167,
+      "learning_rate": 0.00011445555555555557,
+      "loss": 2.5846,
+      "step": 39700
+    },
+    {
+      "epoch": 5.648713940599688,
+      "grad_norm": 1.7393444776535034,
+      "learning_rate": 0.0001139,
+      "loss": 2.5919,
+      "step": 39750
+    },
+    {
+      "epoch": 5.655819241153901,
+      "grad_norm": 1.740699291229248,
+      "learning_rate": 0.00011334444444444444,
+      "loss": 2.5997,
+      "step": 39800
+    },
+    {
+      "epoch": 5.662924541708114,
+      "grad_norm": 1.7837730646133423,
+      "learning_rate": 0.00011278888888888889,
+      "loss": 2.5815,
+      "step": 39850
+    },
+    {
+      "epoch": 5.670029842262328,
+      "grad_norm": 1.9134184122085571,
+      "learning_rate": 0.00011223333333333334,
+      "loss": 2.5646,
+      "step": 39900
+    },
+    {
+      "epoch": 5.6771351428165415,
+      "grad_norm": 1.7678228616714478,
+      "learning_rate": 0.00011167777777777778,
+      "loss": 2.5827,
+      "step": 39950
+    },
+    {
+      "epoch": 5.684240443370754,
+      "grad_norm": 1.7933886051177979,
+      "learning_rate": 0.00011112222222222222,
+      "loss": 2.5639,
+      "step": 40000
+    },
+    {
+      "epoch": 5.684240443370754,
+      "eval_accuracy": 0.5388505458831787,
+      "eval_loss": 2.571995496749878,
+      "eval_runtime": 1.3688,
+      "eval_samples_per_second": 2747.025,
+      "eval_steps_per_second": 43.105,
+      "step": 40000
+    },
+    {
+      "epoch": 5.691345743924968,
+      "grad_norm": 1.7596800327301025,
+      "learning_rate": 0.00011056666666666667,
+      "loss": 2.5873,
+      "step": 40050
+    },
+    {
+      "epoch": 5.698451044479182,
+      "grad_norm": 1.852180004119873,
+      "learning_rate": 0.00011001111111111112,
+      "loss": 2.5909,
+      "step": 40100
+    },
+    {
+      "epoch": 5.705556345033395,
+      "grad_norm": 1.6055049896240234,
+      "learning_rate": 0.00010945555555555555,
+      "loss": 2.568,
+      "step": 40150
+    },
+    {
+      "epoch": 5.712661645587608,
+      "grad_norm": 1.9267030954360962,
+      "learning_rate": 0.0001089,
+      "loss": 2.5681,
+      "step": 40200
+    },
+    {
+      "epoch": 5.719766946141822,
+      "grad_norm": 1.663442850112915,
+      "learning_rate": 0.00010834444444444444,
+      "loss": 2.5927,
+      "step": 40250
+    },
+    {
+      "epoch": 5.726872246696035,
+      "grad_norm": 1.6687923669815063,
+      "learning_rate": 0.00010778888888888889,
+      "loss": 2.5585,
+      "step": 40300
+    },
+    {
+      "epoch": 5.733977547250249,
+      "grad_norm": 1.64219331741333,
+      "learning_rate": 0.00010723333333333334,
+      "loss": 2.6002,
+      "step": 40350
+    },
+    {
+      "epoch": 5.741082847804462,
+      "grad_norm": 1.7011926174163818,
+      "learning_rate": 0.00010667777777777778,
+      "loss": 2.5828,
+      "step": 40400
+    },
+    {
+      "epoch": 5.748188148358675,
+      "grad_norm": 1.7362624406814575,
+      "learning_rate": 0.00010612222222222223,
+      "loss": 2.5832,
+      "step": 40450
+    },
+    {
+      "epoch": 5.755293448912889,
+      "grad_norm": 1.7169482707977295,
+      "learning_rate": 0.00010556666666666667,
+      "loss": 2.5808,
+      "step": 40500
+    },
+    {
+      "epoch": 5.762398749467103,
+      "grad_norm": 1.7829616069793701,
+      "learning_rate": 0.00010501111111111112,
+      "loss": 2.566,
+      "step": 40550
+    },
+    {
+      "epoch": 5.769504050021316,
+      "grad_norm": 1.8037691116333008,
+      "learning_rate": 0.00010445555555555555,
+      "loss": 2.5777,
+      "step": 40600
+    },
+    {
+      "epoch": 5.776609350575529,
+      "grad_norm": 1.8303767442703247,
+      "learning_rate": 0.0001039,
+      "loss": 2.5691,
+      "step": 40650
+    },
+    {
+      "epoch": 5.783714651129743,
+      "grad_norm": 1.6861969232559204,
+      "learning_rate": 0.00010334444444444446,
+      "loss": 2.5929,
+      "step": 40700
+    },
+    {
+      "epoch": 5.7908199516839565,
+      "grad_norm": 2.0748815536499023,
+      "learning_rate": 0.00010278888888888889,
+      "loss": 2.5749,
+      "step": 40750
+    },
+    {
+      "epoch": 5.797925252238169,
+      "grad_norm": 1.96970534324646,
+      "learning_rate": 0.00010223333333333333,
+      "loss": 2.5705,
+      "step": 40800
+    },
+    {
+      "epoch": 5.805030552792383,
+      "grad_norm": 1.6883175373077393,
+      "learning_rate": 0.00010167777777777778,
+      "loss": 2.5644,
+      "step": 40850
+    },
+    {
+      "epoch": 5.8121358533465965,
+      "grad_norm": 1.8087552785873413,
+      "learning_rate": 0.00010112222222222223,
+      "loss": 2.5855,
+      "step": 40900
+    },
+    {
+      "epoch": 5.81924115390081,
+      "grad_norm": 1.771265983581543,
+      "learning_rate": 0.00010056666666666667,
+      "loss": 2.5527,
+      "step": 40950
+    },
+    {
+      "epoch": 5.826346454455024,
+      "grad_norm": 1.7859134674072266,
+      "learning_rate": 0.0001000111111111111,
+      "loss": 2.5787,
+      "step": 41000
+    },
+    {
+      "epoch": 5.833451755009237,
+      "grad_norm": 1.8071932792663574,
+      "learning_rate": 9.945555555555557e-05,
+      "loss": 2.5371,
+      "step": 41050
+    },
+    {
+      "epoch": 5.84055705556345,
+      "grad_norm": 1.6997110843658447,
+      "learning_rate": 9.89e-05,
+      "loss": 2.5671,
+      "step": 41100
+    },
+    {
+      "epoch": 5.847662356117664,
+      "grad_norm": 1.623996615409851,
+      "learning_rate": 9.834444444444444e-05,
+      "loss": 2.5568,
+      "step": 41150
+    },
+    {
+      "epoch": 5.854767656671878,
+      "grad_norm": 1.720940351486206,
+      "learning_rate": 9.778888888888888e-05,
+      "loss": 2.5701,
+      "step": 41200
+    },
+    {
+      "epoch": 5.86187295722609,
+      "grad_norm": 1.5865821838378906,
+      "learning_rate": 9.723333333333334e-05,
+      "loss": 2.5884,
+      "step": 41250
+    },
+    {
+      "epoch": 5.868978257780304,
+      "grad_norm": 1.7232975959777832,
+      "learning_rate": 9.667777777777778e-05,
+      "loss": 2.5765,
+      "step": 41300
+    },
+    {
+      "epoch": 5.876083558334518,
+      "grad_norm": 1.879902958869934,
+      "learning_rate": 9.612222222222222e-05,
+      "loss": 2.5735,
+      "step": 41350
+    },
+    {
+      "epoch": 5.883188858888731,
+      "grad_norm": 1.7776896953582764,
+      "learning_rate": 9.556666666666667e-05,
+      "loss": 2.5701,
+      "step": 41400
+    },
+    {
+      "epoch": 5.890294159442944,
+      "grad_norm": 1.6992663145065308,
+      "learning_rate": 9.501111111111112e-05,
+      "loss": 2.5841,
+      "step": 41450
+    },
+    {
+      "epoch": 5.897399459997158,
+      "grad_norm": 1.8483794927597046,
+      "learning_rate": 9.445555555555556e-05,
+      "loss": 2.5563,
+      "step": 41500
+    },
+    {
+      "epoch": 5.904504760551371,
+      "grad_norm": 1.7901180982589722,
+      "learning_rate": 9.39e-05,
+      "loss": 2.5739,
+      "step": 41550
+    },
+    {
+      "epoch": 5.911610061105585,
+      "grad_norm": 1.8169100284576416,
+      "learning_rate": 9.334444444444444e-05,
+      "loss": 2.5597,
+      "step": 41600
+    },
+    {
+      "epoch": 5.918715361659798,
+      "grad_norm": 1.8494465351104736,
+      "learning_rate": 9.278888888888889e-05,
+      "loss": 2.5641,
+      "step": 41650
+    },
+    {
+      "epoch": 5.9258206622140115,
+      "grad_norm": 1.711912989616394,
+      "learning_rate": 9.223333333333333e-05,
+      "loss": 2.558,
+      "step": 41700
+    },
+    {
+      "epoch": 5.932925962768225,
+      "grad_norm": 1.789607048034668,
+      "learning_rate": 9.167777777777778e-05,
+      "loss": 2.5553,
+      "step": 41750
+    },
+    {
+      "epoch": 5.940031263322439,
+      "grad_norm": 1.6762250661849976,
+      "learning_rate": 9.112222222222222e-05,
+      "loss": 2.5429,
+      "step": 41800
+    },
+    {
+      "epoch": 5.9471365638766525,
+      "grad_norm": 1.8096888065338135,
+      "learning_rate": 9.056666666666667e-05,
+      "loss": 2.5565,
+      "step": 41850
+    },
+    {
+      "epoch": 5.954241864430865,
+      "grad_norm": 1.6990909576416016,
+      "learning_rate": 9.001111111111112e-05,
+      "loss": 2.5566,
+      "step": 41900
+    },
+    {
+      "epoch": 5.961347164985079,
+      "grad_norm": 1.7819483280181885,
+      "learning_rate": 8.945555555555556e-05,
+      "loss": 2.5609,
+      "step": 41950
+    },
+    {
+      "epoch": 5.9684524655392925,
+      "grad_norm": 1.7949304580688477,
+      "learning_rate": 8.89e-05,
+      "loss": 2.5517,
+      "step": 42000
+    },
+    {
+      "epoch": 5.9684524655392925,
+      "eval_accuracy": 0.5439066290855408,
+      "eval_loss": 2.5534567832946777,
+      "eval_runtime": 1.4415,
+      "eval_samples_per_second": 2608.377,
+      "eval_steps_per_second": 40.929,
+      "step": 42000
+    },
+    {
+      "epoch": 5.975557766093505,
+      "grad_norm": 1.8308558464050293,
+      "learning_rate": 8.834444444444444e-05,
+      "loss": 2.5442,
+      "step": 42050
+    },
+    {
+      "epoch": 5.982663066647719,
+      "grad_norm": 1.9043349027633667,
+      "learning_rate": 8.77888888888889e-05,
+      "loss": 2.5432,
+      "step": 42100
+    },
+    {
+      "epoch": 5.989768367201933,
+      "grad_norm": 1.8522542715072632,
+      "learning_rate": 8.723333333333333e-05,
+      "loss": 2.5459,
+      "step": 42150
+    },
+    {
+      "epoch": 5.996873667756146,
+      "grad_norm": 1.8050007820129395,
+      "learning_rate": 8.667777777777778e-05,
+      "loss": 2.5387,
+      "step": 42200
+    },
+    {
+      "epoch": 6.00397896831036,
+      "grad_norm": 1.7550387382507324,
+      "learning_rate": 8.612222222222223e-05,
+      "loss": 2.5406,
+      "step": 42250
+    },
+    {
+      "epoch": 6.011084268864573,
+      "grad_norm": 1.7701274156570435,
+      "learning_rate": 8.556666666666667e-05,
+      "loss": 2.5472,
+      "step": 42300
+    },
+    {
+      "epoch": 6.018189569418786,
+      "grad_norm": 1.6725575923919678,
+      "learning_rate": 8.50111111111111e-05,
+      "loss": 2.537,
+      "step": 42350
+    },
+    {
+      "epoch": 6.025294869973,
+      "grad_norm": 1.7032172679901123,
+      "learning_rate": 8.445555555555557e-05,
+      "loss": 2.5599,
+      "step": 42400
+    },
+    {
+      "epoch": 6.032400170527214,
+      "grad_norm": 1.8186277151107788,
+      "learning_rate": 8.39e-05,
+      "loss": 2.5486,
+      "step": 42450
+    },
+    {
+      "epoch": 6.039505471081426,
+      "grad_norm": 1.6973793506622314,
+      "learning_rate": 8.334444444444444e-05,
+      "loss": 2.5327,
+      "step": 42500
+    },
+    {
+      "epoch": 6.04661077163564,
+      "grad_norm": 1.7290997505187988,
+      "learning_rate": 8.278888888888888e-05,
+      "loss": 2.5486,
+      "step": 42550
+    },
+    {
+      "epoch": 6.053716072189854,
+      "grad_norm": 1.72194504737854,
+      "learning_rate": 8.223333333333334e-05,
+      "loss": 2.5563,
+      "step": 42600
+    },
+    {
+      "epoch": 6.060821372744067,
+      "grad_norm": 1.6357437372207642,
+      "learning_rate": 8.167777777777778e-05,
+      "loss": 2.5448,
+      "step": 42650
+    },
+    {
+      "epoch": 6.06792667329828,
+      "grad_norm": 1.7466579675674438,
+      "learning_rate": 8.112222222222222e-05,
+      "loss": 2.5516,
+      "step": 42700
+    },
+    {
+      "epoch": 6.075031973852494,
+      "grad_norm": 1.7971217632293701,
+      "learning_rate": 8.056666666666667e-05,
+      "loss": 2.5372,
+      "step": 42750
+    },
+    {
+      "epoch": 6.0821372744067075,
+      "grad_norm": 1.7370117902755737,
+      "learning_rate": 8.001111111111112e-05,
+      "loss": 2.5451,
+      "step": 42800
+    },
+    {
+      "epoch": 6.089242574960921,
+      "grad_norm": 1.9375853538513184,
+      "learning_rate": 7.945555555555556e-05,
+      "loss": 2.5472,
+      "step": 42850
+    },
+    {
+      "epoch": 6.096347875515134,
+      "grad_norm": 1.8141741752624512,
+      "learning_rate": 7.89e-05,
+      "loss": 2.5328,
+      "step": 42900
+    },
+    {
+      "epoch": 6.103453176069348,
+      "grad_norm": 1.9149723052978516,
+      "learning_rate": 7.834444444444444e-05,
+      "loss": 2.5162,
+      "step": 42950
+    },
+    {
+      "epoch": 6.110558476623561,
+      "grad_norm": 1.7717311382293701,
+      "learning_rate": 7.77888888888889e-05,
+      "loss": 2.5438,
+      "step": 43000
+    },
+    {
+      "epoch": 6.117663777177775,
+      "grad_norm": 1.7755837440490723,
+      "learning_rate": 7.723333333333333e-05,
+      "loss": 2.5316,
+      "step": 43050
+    },
+    {
+      "epoch": 6.124769077731988,
+      "grad_norm": 1.7218981981277466,
+      "learning_rate": 7.667777777777778e-05,
+      "loss": 2.5293,
+      "step": 43100
+    },
+    {
+      "epoch": 6.131874378286201,
+      "grad_norm": 1.824884295463562,
+      "learning_rate": 7.612222222222222e-05,
+      "loss": 2.5248,
+      "step": 43150
+    },
+    {
+      "epoch": 6.138979678840415,
+      "grad_norm": 1.7106846570968628,
+      "learning_rate": 7.556666666666667e-05,
+      "loss": 2.5321,
+      "step": 43200
+    },
+    {
+      "epoch": 6.146084979394629,
+      "grad_norm": 1.805311679840088,
+      "learning_rate": 7.501111111111112e-05,
+      "loss": 2.5245,
+      "step": 43250
+    },
+    {
+      "epoch": 6.153190279948841,
+      "grad_norm": 1.7311850786209106,
+      "learning_rate": 7.445555555555556e-05,
+      "loss": 2.5471,
+      "step": 43300
+    },
+    {
+      "epoch": 6.160295580503055,
+      "grad_norm": 1.7861510515213013,
+      "learning_rate": 7.39e-05,
+      "loss": 2.5501,
+      "step": 43350
+    },
+    {
+      "epoch": 6.167400881057269,
+      "grad_norm": 1.9243968725204468,
+      "learning_rate": 7.334444444444444e-05,
+      "loss": 2.5437,
+      "step": 43400
+    },
+    {
+      "epoch": 6.174506181611482,
+      "grad_norm": 1.706551432609558,
+      "learning_rate": 7.27888888888889e-05,
+      "loss": 2.535,
+      "step": 43450
+    },
+    {
+      "epoch": 6.181611482165696,
+      "grad_norm": 1.8230974674224854,
+      "learning_rate": 7.223333333333333e-05,
+      "loss": 2.5183,
+      "step": 43500
+    },
+    {
+      "epoch": 6.188716782719909,
+      "grad_norm": 1.8202252388000488,
+      "learning_rate": 7.167777777777778e-05,
+      "loss": 2.5224,
+      "step": 43550
+    },
+    {
+      "epoch": 6.195822083274122,
+      "grad_norm": 1.7891016006469727,
+      "learning_rate": 7.112222222222223e-05,
+      "loss": 2.5142,
+      "step": 43600
+    },
+    {
+      "epoch": 6.202927383828336,
+      "grad_norm": 1.6762447357177734,
+      "learning_rate": 7.056666666666667e-05,
+      "loss": 2.5286,
+      "step": 43650
+    },
+    {
+      "epoch": 6.21003268438255,
+      "grad_norm": 1.7952409982681274,
+      "learning_rate": 7.00111111111111e-05,
+      "loss": 2.5212,
+      "step": 43700
+    },
+    {
+      "epoch": 6.2171379849367625,
+      "grad_norm": 1.9008454084396362,
+      "learning_rate": 6.945555555555556e-05,
+      "loss": 2.5084,
+      "step": 43750
+    },
+    {
+      "epoch": 6.224243285490976,
+      "grad_norm": 1.9572248458862305,
+      "learning_rate": 6.890000000000001e-05,
+      "loss": 2.542,
+      "step": 43800
+    },
+    {
+      "epoch": 6.23134858604519,
+      "grad_norm": 1.8827017545700073,
+      "learning_rate": 6.834444444444444e-05,
+      "loss": 2.5489,
+      "step": 43850
+    },
+    {
+      "epoch": 6.2384538865994035,
+      "grad_norm": 1.8700237274169922,
+      "learning_rate": 6.778888888888888e-05,
+      "loss": 2.5443,
+      "step": 43900
+    },
+    {
+      "epoch": 6.245559187153616,
+      "grad_norm": 2.1006288528442383,
+      "learning_rate": 6.723333333333335e-05,
+      "loss": 2.5255,
+      "step": 43950
+    },
+    {
+      "epoch": 6.25266448770783,
+      "grad_norm": 1.7663155794143677,
+      "learning_rate": 6.667777777777778e-05,
+      "loss": 2.5377,
+      "step": 44000
+    },
+    {
+      "epoch": 6.25266448770783,
+      "eval_accuracy": 0.5429770946502686,
+      "eval_loss": 2.5491862297058105,
+      "eval_runtime": 1.3913,
+      "eval_samples_per_second": 2702.566,
+      "eval_steps_per_second": 42.407,
+      "step": 44000
+    },
+    {
+      "epoch": 6.2597697882620436,
+      "grad_norm": 1.747223973274231,
+      "learning_rate": 6.612222222222222e-05,
+      "loss": 2.5047,
+      "step": 44050
+    },
+    {
+      "epoch": 6.266875088816257,
+      "grad_norm": 1.837746500968933,
+      "learning_rate": 6.556666666666666e-05,
+      "loss": 2.5486,
+      "step": 44100
+    },
+    {
+      "epoch": 6.27398038937047,
+      "grad_norm": 1.8067456483840942,
+      "learning_rate": 6.501111111111112e-05,
+      "loss": 2.5246,
+      "step": 44150
+    },
+    {
+      "epoch": 6.281085689924684,
+      "grad_norm": 1.7894113063812256,
+      "learning_rate": 6.445555555555556e-05,
+      "loss": 2.5244,
+      "step": 44200
+    },
+    {
+      "epoch": 6.288190990478897,
+      "grad_norm": 1.8320538997650146,
+      "learning_rate": 6.39e-05,
+      "loss": 2.5486,
+      "step": 44250
+    },
+    {
+      "epoch": 6.295296291033111,
+      "grad_norm": 1.9445518255233765,
+      "learning_rate": 6.334444444444445e-05,
+      "loss": 2.5449,
+      "step": 44300
+    },
+    {
+      "epoch": 6.302401591587325,
+      "grad_norm": 1.8100392818450928,
+      "learning_rate": 6.27888888888889e-05,
+      "loss": 2.516,
+      "step": 44350
+    },
+    {
+      "epoch": 6.309506892141537,
+      "grad_norm": 1.7949355840682983,
+      "learning_rate": 6.223333333333333e-05,
+      "loss": 2.5361,
+      "step": 44400
+    },
+    {
+      "epoch": 6.316612192695751,
+      "grad_norm": 1.7824511528015137,
+      "learning_rate": 6.167777777777778e-05,
+      "loss": 2.5252,
+      "step": 44450
+    },
+    {
+      "epoch": 6.323717493249965,
+      "grad_norm": 1.8611977100372314,
+      "learning_rate": 6.112222222222222e-05,
+      "loss": 2.5231,
+      "step": 44500
+    },
+    {
+      "epoch": 6.3308227938041775,
+      "grad_norm": 1.7862845659255981,
+      "learning_rate": 6.0566666666666664e-05,
+      "loss": 2.5104,
+      "step": 44550
+    },
+    {
+      "epoch": 6.337928094358391,
+      "grad_norm": 1.7541511058807373,
+      "learning_rate": 6.0011111111111114e-05,
+      "loss": 2.511,
+      "step": 44600
+    },
+    {
+      "epoch": 6.345033394912605,
+      "grad_norm": 1.7871416807174683,
+      "learning_rate": 5.945555555555555e-05,
+      "loss": 2.5121,
+      "step": 44650
+    },
+    {
+      "epoch": 6.352138695466818,
+      "grad_norm": 1.8592162132263184,
+      "learning_rate": 5.89e-05,
+      "loss": 2.5114,
+      "step": 44700
+    },
+    {
+      "epoch": 6.359243996021032,
+      "grad_norm": 1.7054407596588135,
+      "learning_rate": 5.8344444444444446e-05,
+      "loss": 2.53,
+      "step": 44750
+    },
+    {
+      "epoch": 6.366349296575245,
+      "grad_norm": 1.788488745689392,
+      "learning_rate": 5.778888888888889e-05,
+      "loss": 2.5508,
+      "step": 44800
+    },
+    {
+      "epoch": 6.3734545971294585,
+      "grad_norm": 1.8061481714248657,
+      "learning_rate": 5.723333333333333e-05,
+      "loss": 2.5408,
+      "step": 44850
+    },
+    {
+      "epoch": 6.380559897683672,
+      "grad_norm": 1.7002582550048828,
+      "learning_rate": 5.667777777777778e-05,
+      "loss": 2.5298,
+      "step": 44900
+    },
+    {
+      "epoch": 6.387665198237886,
+      "grad_norm": 1.707021951675415,
+      "learning_rate": 5.612222222222222e-05,
+      "loss": 2.522,
+      "step": 44950
+    },
+    {
+      "epoch": 6.394770498792099,
+      "grad_norm": 1.855074405670166,
+      "learning_rate": 5.556666666666667e-05,
+      "loss": 2.5345,
+      "step": 45000
+    },
+    {
+      "epoch": 6.401875799346312,
+      "grad_norm": 1.866926670074463,
+      "learning_rate": 5.501111111111111e-05,
+      "loss": 2.5368,
+      "step": 45050
+    },
+    {
+      "epoch": 6.408981099900526,
+      "grad_norm": 1.7042498588562012,
+      "learning_rate": 5.445555555555556e-05,
+      "loss": 2.5188,
+      "step": 45100
+    },
+    {
+      "epoch": 6.4160864004547395,
+      "grad_norm": 1.8467471599578857,
+      "learning_rate": 5.39e-05,
+      "loss": 2.5368,
+      "step": 45150
+    },
+    {
+      "epoch": 6.423191701008952,
+      "grad_norm": 1.7979286909103394,
+      "learning_rate": 5.3344444444444446e-05,
+      "loss": 2.518,
+      "step": 45200
+    },
+    {
+      "epoch": 6.430297001563166,
+      "grad_norm": 1.772163987159729,
+      "learning_rate": 5.2788888888888897e-05,
+      "loss": 2.5179,
+      "step": 45250
+    },
+    {
+      "epoch": 6.43740230211738,
+      "grad_norm": 1.9672173261642456,
+      "learning_rate": 5.2233333333333334e-05,
+      "loss": 2.5017,
+      "step": 45300
+    },
+    {
+      "epoch": 6.444507602671593,
+      "grad_norm": 1.8143894672393799,
+      "learning_rate": 5.1677777777777784e-05,
+      "loss": 2.5096,
+      "step": 45350
+    },
+    {
+      "epoch": 6.451612903225806,
+      "grad_norm": 1.794852614402771,
+      "learning_rate": 5.112222222222222e-05,
+      "loss": 2.5268,
+      "step": 45400
+    },
+    {
+      "epoch": 6.45871820378002,
+      "grad_norm": 2.0142173767089844,
+      "learning_rate": 5.056666666666667e-05,
+      "loss": 2.523,
+      "step": 45450
+    },
+    {
+      "epoch": 6.465823504334233,
+      "grad_norm": 1.9065548181533813,
+      "learning_rate": 5.001111111111111e-05,
+      "loss": 2.5207,
+      "step": 45500
+    },
+    {
+      "epoch": 6.472928804888447,
+      "grad_norm": 1.9128227233886719,
+      "learning_rate": 4.945555555555556e-05,
+      "loss": 2.4899,
+      "step": 45550
+    },
+    {
+      "epoch": 6.480034105442661,
+      "grad_norm": 1.729750633239746,
+      "learning_rate": 4.89e-05,
+      "loss": 2.538,
+      "step": 45600
+    },
+    {
+      "epoch": 6.487139405996873,
+      "grad_norm": 1.7502912282943726,
+      "learning_rate": 4.8344444444444447e-05,
+      "loss": 2.5154,
+      "step": 45650
+    },
+    {
+      "epoch": 6.494244706551087,
+      "grad_norm": 1.7217427492141724,
+      "learning_rate": 4.778888888888889e-05,
+      "loss": 2.5228,
+      "step": 45700
+    },
+    {
+      "epoch": 6.501350007105301,
+      "grad_norm": 1.852655053138733,
+      "learning_rate": 4.7233333333333334e-05,
+      "loss": 2.5175,
+      "step": 45750
+    },
+    {
+      "epoch": 6.5084553076595135,
+      "grad_norm": 1.8434449434280396,
+      "learning_rate": 4.667777777777778e-05,
+      "loss": 2.5029,
+      "step": 45800
+    },
+    {
+      "epoch": 6.515560608213727,
+      "grad_norm": 1.6561399698257446,
+      "learning_rate": 4.612222222222223e-05,
+      "loss": 2.5213,
+      "step": 45850
+    },
+    {
+      "epoch": 6.522665908767941,
+      "grad_norm": 1.8549339771270752,
+      "learning_rate": 4.5566666666666665e-05,
+      "loss": 2.5187,
+      "step": 45900
+    },
+    {
+      "epoch": 6.5297712093221545,
+      "grad_norm": 1.7742987871170044,
+      "learning_rate": 4.5011111111111116e-05,
+      "loss": 2.5192,
+      "step": 45950
+    },
+    {
+      "epoch": 6.536876509876368,
+      "grad_norm": 1.772049903869629,
+      "learning_rate": 4.445555555555555e-05,
+      "loss": 2.5169,
+      "step": 46000
+    },
+    {
+      "epoch": 6.536876509876368,
+      "eval_accuracy": 0.5479844808578491,
+      "eval_loss": 2.523486614227295,
+      "eval_runtime": 1.4553,
+      "eval_samples_per_second": 2583.605,
+      "eval_steps_per_second": 40.541,
+      "step": 46000
+    },
+    {
+      "epoch": 6.543981810430581,
+      "grad_norm": 1.9687917232513428,
+      "learning_rate": 4.39e-05,
+      "loss": 2.5184,
+      "step": 46050
+    },
+    {
+      "epoch": 6.551087110984795,
+      "grad_norm": 1.6084198951721191,
+      "learning_rate": 4.334444444444444e-05,
+      "loss": 2.5259,
+      "step": 46100
+    },
+    {
+      "epoch": 6.558192411539008,
+      "grad_norm": 1.8368006944656372,
+      "learning_rate": 4.278888888888889e-05,
+      "loss": 2.5118,
+      "step": 46150
+    },
+    {
+      "epoch": 6.565297712093222,
+      "grad_norm": 1.8750320672988892,
+      "learning_rate": 4.2233333333333334e-05,
+      "loss": 2.5157,
+      "step": 46200
+    },
+    {
+      "epoch": 6.572403012647435,
+      "grad_norm": 1.7846745252609253,
+      "learning_rate": 4.167777777777778e-05,
+      "loss": 2.5076,
+      "step": 46250
+    },
+    {
+      "epoch": 6.579508313201648,
+      "grad_norm": 1.8737770318984985,
+      "learning_rate": 4.112222222222222e-05,
+      "loss": 2.5257,
+      "step": 46300
+    },
+    {
+      "epoch": 6.586613613755862,
+      "grad_norm": 1.870085597038269,
+      "learning_rate": 4.0566666666666666e-05,
+      "loss": 2.5151,
+      "step": 46350
+    },
+    {
+      "epoch": 6.593718914310076,
+      "grad_norm": 1.7165669202804565,
+      "learning_rate": 4.001111111111111e-05,
+      "loss": 2.5106,
+      "step": 46400
+    },
+    {
+      "epoch": 6.600824214864288,
+      "grad_norm": 2.036309242248535,
+      "learning_rate": 3.945555555555556e-05,
+      "loss": 2.5137,
+      "step": 46450
+    },
+    {
+      "epoch": 6.607929515418502,
+      "grad_norm": 2.068466901779175,
+      "learning_rate": 3.89e-05,
+      "loss": 2.5248,
+      "step": 46500
+    },
+    {
+      "epoch": 6.615034815972716,
+      "grad_norm": 1.8142298460006714,
+      "learning_rate": 3.834444444444445e-05,
+      "loss": 2.5043,
+      "step": 46550
+    },
+    {
+      "epoch": 6.622140116526929,
+      "grad_norm": 1.8824352025985718,
+      "learning_rate": 3.7788888888888884e-05,
+      "loss": 2.5186,
+      "step": 46600
+    },
+    {
+      "epoch": 6.629245417081142,
+      "grad_norm": 1.865993618965149,
+      "learning_rate": 3.7233333333333335e-05,
+      "loss": 2.5131,
+      "step": 46650
+    },
+    {
+      "epoch": 6.636350717635356,
+      "grad_norm": 1.7249752283096313,
+      "learning_rate": 3.667777777777777e-05,
+      "loss": 2.508,
+      "step": 46700
+    },
+    {
+      "epoch": 6.643456018189569,
+      "grad_norm": 1.8466631174087524,
+      "learning_rate": 3.612222222222222e-05,
+      "loss": 2.5059,
+      "step": 46750
+    },
+    {
+      "epoch": 6.650561318743783,
+      "grad_norm": 1.7760319709777832,
+      "learning_rate": 3.556666666666667e-05,
+      "loss": 2.5165,
+      "step": 46800
+    },
+    {
+      "epoch": 6.657666619297997,
+      "grad_norm": 1.7560021877288818,
+      "learning_rate": 3.501111111111111e-05,
+      "loss": 2.4964,
+      "step": 46850
+    },
+    {
+      "epoch": 6.6647719198522095,
+      "grad_norm": 1.7918329238891602,
+      "learning_rate": 3.445555555555556e-05,
+      "loss": 2.4828,
+      "step": 46900
+    },
+    {
+      "epoch": 6.671877220406423,
+      "grad_norm": 1.89692223072052,
+      "learning_rate": 3.39e-05,
+      "loss": 2.5003,
+      "step": 46950
+    },
+    {
+      "epoch": 6.678982520960637,
+      "grad_norm": 1.7761543989181519,
+      "learning_rate": 3.334444444444445e-05,
+      "loss": 2.4997,
+      "step": 47000
+    },
+    {
+      "epoch": 6.68608782151485,
+      "grad_norm": 1.837147831916809,
+      "learning_rate": 3.278888888888889e-05,
+      "loss": 2.4969,
+      "step": 47050
+    },
+    {
+      "epoch": 6.693193122069063,
+      "grad_norm": 1.7900179624557495,
+      "learning_rate": 3.2233333333333335e-05,
+      "loss": 2.5142,
+      "step": 47100
+    },
+    {
+      "epoch": 6.700298422623277,
+      "grad_norm": 1.8336573839187622,
+      "learning_rate": 3.167777777777778e-05,
+      "loss": 2.4998,
+      "step": 47150
+    },
+    {
+      "epoch": 6.707403723177491,
+      "grad_norm": 1.9420949220657349,
+      "learning_rate": 3.112222222222222e-05,
+      "loss": 2.4862,
+      "step": 47200
+    },
+    {
+      "epoch": 6.714509023731704,
+      "grad_norm": 1.8640563488006592,
+      "learning_rate": 3.0566666666666667e-05,
+      "loss": 2.4828,
+      "step": 47250
+    },
+    {
+      "epoch": 6.721614324285917,
+      "grad_norm": 1.7071493864059448,
+      "learning_rate": 3.001111111111111e-05,
+      "loss": 2.4917,
+      "step": 47300
+    },
+    {
+      "epoch": 6.728719624840131,
+      "grad_norm": 1.986015796661377,
+      "learning_rate": 2.9455555555555554e-05,
+      "loss": 2.5003,
+      "step": 47350
+    },
+    {
+      "epoch": 6.735824925394344,
+      "grad_norm": 1.985974907875061,
+      "learning_rate": 2.8899999999999998e-05,
+      "loss": 2.5244,
+      "step": 47400
+    },
+    {
+      "epoch": 6.742930225948558,
+      "grad_norm": 1.7473855018615723,
+      "learning_rate": 2.8344444444444445e-05,
+      "loss": 2.4982,
+      "step": 47450
+    },
+    {
+      "epoch": 6.750035526502771,
+      "grad_norm": 1.6116970777511597,
+      "learning_rate": 2.778888888888889e-05,
+      "loss": 2.4841,
+      "step": 47500
+    },
+    {
+      "epoch": 6.757140827056984,
+      "grad_norm": 1.7973964214324951,
+      "learning_rate": 2.7233333333333332e-05,
+      "loss": 2.5168,
+      "step": 47550
+    },
+    {
+      "epoch": 6.764246127611198,
+      "grad_norm": 1.7002062797546387,
+      "learning_rate": 2.667777777777778e-05,
+      "loss": 2.4778,
+      "step": 47600
+    },
+    {
+      "epoch": 6.771351428165412,
+      "grad_norm": 1.6538755893707275,
+      "learning_rate": 2.6122222222222223e-05,
+      "loss": 2.5103,
+      "step": 47650
+    },
+    {
+      "epoch": 6.7784567287196245,
+      "grad_norm": 1.9194822311401367,
+      "learning_rate": 2.5566666666666667e-05,
+      "loss": 2.5176,
+      "step": 47700
+    },
+    {
+      "epoch": 6.785562029273838,
+      "grad_norm": 1.9157203435897827,
+      "learning_rate": 2.5011111111111114e-05,
+      "loss": 2.4771,
+      "step": 47750
+    },
+    {
+      "epoch": 6.792667329828052,
+      "grad_norm": 1.752124547958374,
+      "learning_rate": 2.4455555555555558e-05,
+      "loss": 2.4962,
+      "step": 47800
+    },
+    {
+      "epoch": 6.799772630382265,
+      "grad_norm": 1.8622421026229858,
+      "learning_rate": 2.39e-05,
+      "loss": 2.5011,
+      "step": 47850
+    },
+    {
+      "epoch": 6.806877930936478,
+      "grad_norm": 1.9002290964126587,
+      "learning_rate": 2.3344444444444445e-05,
+      "loss": 2.501,
+      "step": 47900
+    },
+    {
+      "epoch": 6.813983231490692,
+      "grad_norm": 1.779293179512024,
+      "learning_rate": 2.278888888888889e-05,
+      "loss": 2.5198,
+      "step": 47950
+    },
+    {
+      "epoch": 6.8210885320449055,
+      "grad_norm": 1.7831950187683105,
+      "learning_rate": 2.2233333333333336e-05,
+      "loss": 2.5213,
+      "step": 48000
+    },
+    {
+      "epoch": 6.8210885320449055,
+      "eval_accuracy": 0.5504963397979736,
+      "eval_loss": 2.474808692932129,
+      "eval_runtime": 1.4017,
+      "eval_samples_per_second": 2682.379,
+      "eval_steps_per_second": 42.091,
+      "step": 48000
+    },
+    {
+      "epoch": 6.828193832599119,
+      "grad_norm": 1.9210829734802246,
+      "learning_rate": 2.167777777777778e-05,
+      "loss": 2.5188,
+      "step": 48050
+    },
+    {
+      "epoch": 6.835299133153333,
+      "grad_norm": 1.8451883792877197,
+      "learning_rate": 2.1122222222222224e-05,
+      "loss": 2.4747,
+      "step": 48100
+    },
+    {
+      "epoch": 6.842404433707546,
+      "grad_norm": 1.761411428451538,
+      "learning_rate": 2.0566666666666667e-05,
+      "loss": 2.5166,
+      "step": 48150
+    },
+    {
+      "epoch": 6.849509734261759,
+      "grad_norm": 1.6441208124160767,
+      "learning_rate": 2.001111111111111e-05,
+      "loss": 2.5105,
+      "step": 48200
+    },
+    {
+      "epoch": 6.856615034815973,
+      "grad_norm": 1.8028146028518677,
+      "learning_rate": 1.9455555555555555e-05,
+      "loss": 2.507,
+      "step": 48250
+    },
+    {
+      "epoch": 6.863720335370186,
+      "grad_norm": 1.8731715679168701,
+      "learning_rate": 1.8900000000000002e-05,
+      "loss": 2.5134,
+      "step": 48300
+    },
+    {
+      "epoch": 6.870825635924399,
+      "grad_norm": 1.891373634338379,
+      "learning_rate": 1.8344444444444446e-05,
+      "loss": 2.498,
+      "step": 48350
+    },
+    {
+      "epoch": 6.877930936478613,
+      "grad_norm": 1.762410044670105,
+      "learning_rate": 1.778888888888889e-05,
+      "loss": 2.5021,
+      "step": 48400
+    },
+    {
+      "epoch": 6.885036237032827,
+      "grad_norm": 1.608912706375122,
+      "learning_rate": 1.7233333333333333e-05,
+      "loss": 2.4846,
+      "step": 48450
+    },
+    {
+      "epoch": 6.89214153758704,
+      "grad_norm": 1.6867483854293823,
+      "learning_rate": 1.6677777777777777e-05,
+      "loss": 2.5007,
+      "step": 48500
+    },
+    {
+      "epoch": 6.899246838141253,
+      "grad_norm": 1.753810167312622,
+      "learning_rate": 1.612222222222222e-05,
+      "loss": 2.4897,
+      "step": 48550
+    },
+    {
+      "epoch": 6.906352138695467,
+      "grad_norm": 1.7778904438018799,
+      "learning_rate": 1.5566666666666668e-05,
+      "loss": 2.516,
+      "step": 48600
+    },
+    {
+      "epoch": 6.91345743924968,
+      "grad_norm": 1.8375523090362549,
+      "learning_rate": 1.5011111111111112e-05,
+      "loss": 2.4946,
+      "step": 48650
+    },
+    {
+      "epoch": 6.920562739803894,
+      "grad_norm": 1.7306774854660034,
+      "learning_rate": 1.4455555555555555e-05,
+      "loss": 2.4534,
+      "step": 48700
+    },
+    {
+      "epoch": 6.927668040358107,
+      "grad_norm": 1.8564552068710327,
+      "learning_rate": 1.3899999999999999e-05,
+      "loss": 2.494,
+      "step": 48750
+    },
+    {
+      "epoch": 6.9347733409123204,
+      "grad_norm": 1.687759518623352,
+      "learning_rate": 1.3344444444444446e-05,
+      "loss": 2.5194,
+      "step": 48800
+    },
+    {
+      "epoch": 6.941878641466534,
+      "grad_norm": 1.7956315279006958,
+      "learning_rate": 1.278888888888889e-05,
+      "loss": 2.4794,
+      "step": 48850
+    },
+    {
+      "epoch": 6.948983942020748,
+      "grad_norm": 1.8576797246932983,
+      "learning_rate": 1.2233333333333334e-05,
+      "loss": 2.4653,
+      "step": 48900
+    },
+    {
+      "epoch": 6.956089242574961,
+      "grad_norm": 1.7479002475738525,
+      "learning_rate": 1.1677777777777779e-05,
+      "loss": 2.5062,
+      "step": 48950
+    },
+    {
+      "epoch": 6.963194543129174,
+      "grad_norm": 1.786080002784729,
+      "learning_rate": 1.1122222222222223e-05,
+      "loss": 2.4969,
+      "step": 49000
+    },
+    {
+      "epoch": 6.970299843683388,
+      "grad_norm": 1.8831062316894531,
+      "learning_rate": 1.0566666666666667e-05,
+      "loss": 2.4833,
+      "step": 49050
+    },
+    {
+      "epoch": 6.9774051442376015,
+      "grad_norm": 1.7100664377212524,
+      "learning_rate": 1.0011111111111112e-05,
+      "loss": 2.5084,
+      "step": 49100
+    },
+    {
+      "epoch": 6.984510444791814,
+      "grad_norm": 1.8145115375518799,
+      "learning_rate": 9.455555555555556e-06,
+      "loss": 2.5161,
+      "step": 49150
+    },
+    {
+      "epoch": 6.991615745346028,
+      "grad_norm": 1.7452220916748047,
+      "learning_rate": 8.9e-06,
+      "loss": 2.5122,
+      "step": 49200
+    },
+    {
+      "epoch": 6.998721045900242,
+      "grad_norm": 1.7257003784179688,
+      "learning_rate": 8.344444444444445e-06,
+      "loss": 2.507,
+      "step": 49250
+    },
+    {
+      "epoch": 7.005826346454455,
+      "grad_norm": 1.9227982759475708,
+      "learning_rate": 7.788888888888889e-06,
+      "loss": 2.5138,
+      "step": 49300
+    },
+    {
+      "epoch": 7.012931647008669,
+      "grad_norm": 1.8527491092681885,
+      "learning_rate": 7.233333333333333e-06,
+      "loss": 2.4882,
+      "step": 49350
+    },
+    {
+      "epoch": 7.020036947562882,
+      "grad_norm": 1.9063125848770142,
+      "learning_rate": 6.677777777777779e-06,
+      "loss": 2.4867,
+      "step": 49400
+    },
+    {
+      "epoch": 7.027142248117095,
+      "grad_norm": 1.886391520500183,
+      "learning_rate": 6.1222222222222224e-06,
+      "loss": 2.4559,
+      "step": 49450
+    },
+    {
+      "epoch": 7.034247548671309,
+      "grad_norm": 1.8591769933700562,
+      "learning_rate": 5.566666666666667e-06,
+      "loss": 2.487,
+      "step": 49500
+    },
+    {
+      "epoch": 7.041352849225523,
+      "grad_norm": 1.9463618993759155,
+      "learning_rate": 5.011111111111112e-06,
+      "loss": 2.4784,
+      "step": 49550
+    },
+    {
+      "epoch": 7.048458149779735,
+      "grad_norm": 1.8545664548873901,
+      "learning_rate": 4.455555555555555e-06,
+      "loss": 2.4978,
+      "step": 49600
+    },
+    {
+      "epoch": 7.055563450333949,
+      "grad_norm": 1.77763831615448,
+      "learning_rate": 3.9e-06,
+      "loss": 2.4797,
+      "step": 49650
+    },
+    {
+      "epoch": 7.062668750888163,
+      "grad_norm": 2.1448755264282227,
+      "learning_rate": 3.3444444444444445e-06,
+      "loss": 2.5023,
+      "step": 49700
+    },
+    {
+      "epoch": 7.069774051442376,
+      "grad_norm": 1.895645022392273,
+      "learning_rate": 2.788888888888889e-06,
+      "loss": 2.4801,
+      "step": 49750
+    },
+    {
+      "epoch": 7.076879351996589,
+      "grad_norm": 1.6657183170318604,
+      "learning_rate": 2.2333333333333333e-06,
+      "loss": 2.4922,
+      "step": 49800
+    },
+    {
+      "epoch": 7.083984652550803,
+      "grad_norm": 1.82796311378479,
+      "learning_rate": 1.6777777777777779e-06,
+      "loss": 2.475,
+      "step": 49850
+    },
+    {
+      "epoch": 7.091089953105016,
+      "grad_norm": 1.904001235961914,
+      "learning_rate": 1.1222222222222222e-06,
+      "loss": 2.4825,
+      "step": 49900
+    },
+    {
+      "epoch": 7.09819525365923,
+      "grad_norm": 1.7330507040023804,
+      "learning_rate": 5.666666666666667e-07,
+      "loss": 2.4798,
+      "step": 49950
+    },
+    {
+      "epoch": 7.105300554213443,
+      "grad_norm": 1.712110161781311,
+      "learning_rate": 1.1111111111111112e-08,
+      "loss": 2.4916,
+      "step": 50000
+    },
+    {
+      "epoch": 7.105300554213443,
+      "eval_accuracy": 0.5550713539123535,
+      "eval_loss": 2.4701426029205322,
+      "eval_runtime": 1.4603,
+      "eval_samples_per_second": 2574.769,
+      "eval_steps_per_second": 40.402,
+      "step": 50000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 50000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.380157104452403e+16,
+  "train_batch_size": 256,
+  "trial_name": null,
+  "trial_params": null
+}