diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,7561 @@
+{
+  "best_global_step": 2100,
+  "best_metric": 1.0858707427978516,
+  "best_model_checkpoint": "./outputs/checkpoint-2100",
+  "epoch": 0.16188870151770657,
+  "eval_steps": 100,
+  "global_step": 2100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00015417971573114913,
+      "grad_norm": 1.2087944746017456,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 1.8689,
+      "step": 2
+    },
+    {
+      "epoch": 0.00030835943146229826,
+      "grad_norm": 1.2666666507720947,
+      "learning_rate": 6e-06,
+      "loss": 1.7785,
+      "step": 4
+    },
+    {
+      "epoch": 0.00046253914719344736,
+      "grad_norm": 0.7307026982307434,
+      "learning_rate": 1e-05,
+      "loss": 1.6809,
+      "step": 6
+    },
+    {
+      "epoch": 0.0006167188629245965,
+      "grad_norm": 1.2569252252578735,
+      "learning_rate": 1.4000000000000001e-05,
+      "loss": 1.9048,
+      "step": 8
+    },
+    {
+      "epoch": 0.0007708985786557456,
+      "grad_norm": 0.9572980403900146,
+      "learning_rate": 1.8e-05,
+      "loss": 1.7574,
+      "step": 10
+    },
+    {
+      "epoch": 0.0009250782943868947,
+      "grad_norm": 0.9918506145477295,
+      "learning_rate": 2.2000000000000003e-05,
+      "loss": 1.858,
+      "step": 12
+    },
+    {
+      "epoch": 0.0010792580101180438,
+      "grad_norm": 0.9316955208778381,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 1.8238,
+      "step": 14
+    },
+    {
+      "epoch": 0.001233437725849193,
+      "grad_norm": 0.8265096545219421,
+      "learning_rate": 3e-05,
+      "loss": 1.6852,
+      "step": 16
+    },
+    {
+      "epoch": 0.001387617441580342,
+      "grad_norm": 0.900516152381897,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 1.8227,
+      "step": 18
+    },
+    {
+      "epoch": 0.0015417971573114912,
+      "grad_norm": 0.9343056678771973,
+      "learning_rate": 3.8e-05,
+      "loss": 1.7732,
+      "step": 20
+    },
+    {
+      "epoch": 0.0016959768730426404,
+      "grad_norm": 0.8314495086669922,
+      "learning_rate": 4.2e-05,
+      "loss": 1.732,
+      "step": 22
+    },
+    {
+      "epoch": 0.0018501565887737894,
+      "grad_norm": 0.8370314240455627,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 1.6725,
+      "step": 24
+    },
+    {
+      "epoch": 0.0020043363045049384,
+      "grad_norm": 0.6678845286369324,
+      "learning_rate": 5e-05,
+      "loss": 1.5638,
+      "step": 26
+    },
+    {
+      "epoch": 0.0021585160202360876,
+      "grad_norm": 0.6469596028327942,
+      "learning_rate": 5.4000000000000005e-05,
+      "loss": 1.6414,
+      "step": 28
+    },
+    {
+      "epoch": 0.002312695735967237,
+      "grad_norm": 1.1161589622497559,
+      "learning_rate": 5.8e-05,
+      "loss": 1.6015,
+      "step": 30
+    },
+    {
+      "epoch": 0.002466875451698386,
+      "grad_norm": 0.6085391044616699,
+      "learning_rate": 6.2e-05,
+      "loss": 1.4577,
+      "step": 32
+    },
+    {
+      "epoch": 0.0026210551674295353,
+      "grad_norm": 0.7159522175788879,
+      "learning_rate": 6.6e-05,
+      "loss": 1.4667,
+      "step": 34
+    },
+    {
+      "epoch": 0.002775234883160684,
+      "grad_norm": 0.67247074842453,
+      "learning_rate": 7e-05,
+      "loss": 1.5619,
+      "step": 36
+    },
+    {
+      "epoch": 0.0029294145988918332,
+      "grad_norm": 0.6272625923156738,
+      "learning_rate": 7.4e-05,
+      "loss": 1.322,
+      "step": 38
+    },
+    {
+      "epoch": 0.0030835943146229824,
+      "grad_norm": 0.7291163206100464,
+      "learning_rate": 7.800000000000001e-05,
+      "loss": 1.3936,
+      "step": 40
+    },
+    {
+      "epoch": 0.0032377740303541317,
+      "grad_norm": 0.4980190396308899,
+      "learning_rate": 8.2e-05,
+      "loss": 1.3322,
+      "step": 42
+    },
+    {
+      "epoch": 0.003391953746085281,
+      "grad_norm": 1.032578945159912,
+      "learning_rate": 8.6e-05,
+      "loss": 1.3657,
+      "step": 44
+    },
+    {
+      "epoch": 0.0035461334618164296,
+      "grad_norm": 0.5118615031242371,
+      "learning_rate": 9e-05,
+      "loss": 1.2866,
+      "step": 46
+    },
+    {
+      "epoch": 0.003700313177547579,
+      "grad_norm": 0.5234407782554626,
+      "learning_rate": 9.4e-05,
+      "loss": 1.2806,
+      "step": 48
+    },
+    {
+      "epoch": 0.003854492893278728,
+      "grad_norm": 0.49764135479927063,
+      "learning_rate": 9.8e-05,
+      "loss": 1.2004,
+      "step": 50
+    },
+    {
+      "epoch": 0.004008672609009877,
+      "grad_norm": 0.34377485513687134,
+      "learning_rate": 0.00010200000000000001,
+      "loss": 1.1947,
+      "step": 52
+    },
+    {
+      "epoch": 0.0041628523247410265,
+      "grad_norm": 0.41426530480384827,
+      "learning_rate": 0.00010600000000000002,
+      "loss": 1.2689,
+      "step": 54
+    },
+    {
+      "epoch": 0.004317032040472175,
+      "grad_norm": 0.5027992129325867,
+      "learning_rate": 0.00011000000000000002,
+      "loss": 1.2249,
+      "step": 56
+    },
+    {
+      "epoch": 0.004471211756203325,
+      "grad_norm": 0.44335752725601196,
+      "learning_rate": 0.00011399999999999999,
+      "loss": 1.2771,
+      "step": 58
+    },
+    {
+      "epoch": 0.004625391471934474,
+      "grad_norm": 0.3176646828651428,
+      "learning_rate": 0.000118,
+      "loss": 1.1873,
+      "step": 60
+    },
+    {
+      "epoch": 0.0047795711876656224,
+      "grad_norm": 0.24802716076374054,
+      "learning_rate": 0.000122,
+      "loss": 1.1989,
+      "step": 62
+    },
+    {
+      "epoch": 0.004933750903396772,
+      "grad_norm": 0.23831751942634583,
+      "learning_rate": 0.000126,
+      "loss": 1.1093,
+      "step": 64
+    },
+    {
+      "epoch": 0.005087930619127921,
+      "grad_norm": 0.24024009704589844,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 1.2196,
+      "step": 66
+    },
+    {
+      "epoch": 0.0052421103348590705,
+      "grad_norm": 0.2745237350463867,
+      "learning_rate": 0.000134,
+      "loss": 1.1802,
+      "step": 68
+    },
+    {
+      "epoch": 0.005396290050590219,
+      "grad_norm": 0.27817806601524353,
+      "learning_rate": 0.000138,
+      "loss": 1.1939,
+      "step": 70
+    },
+    {
+      "epoch": 0.005550469766321368,
+      "grad_norm": 0.19907328486442566,
+      "learning_rate": 0.000142,
+      "loss": 1.2061,
+      "step": 72
+    },
+    {
+      "epoch": 0.005704649482052518,
+      "grad_norm": 0.18879663944244385,
+      "learning_rate": 0.000146,
+      "loss": 1.2149,
+      "step": 74
+    },
+    {
+      "epoch": 0.0058588291977836665,
+      "grad_norm": 0.21456782519817352,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.1726,
+      "step": 76
+    },
+    {
+      "epoch": 0.006013008913514816,
+      "grad_norm": 0.23913143575191498,
+      "learning_rate": 0.000154,
+      "loss": 1.148,
+      "step": 78
+    },
+    {
+      "epoch": 0.006167188629245965,
+      "grad_norm": 0.2148526906967163,
+      "learning_rate": 0.00015800000000000002,
+      "loss": 1.1925,
+      "step": 80
+    },
+    {
+      "epoch": 0.006321368344977114,
+      "grad_norm": 0.2392999231815338,
+      "learning_rate": 0.000162,
+      "loss": 1.1488,
+      "step": 82
+    },
+    {
+      "epoch": 0.006475548060708263,
+      "grad_norm": 0.16503232717514038,
+      "learning_rate": 0.000166,
+      "loss": 1.1555,
+      "step": 84
+    },
+    {
+      "epoch": 0.006629727776439412,
+      "grad_norm": 0.1844739466905594,
+      "learning_rate": 0.00017,
+      "loss": 1.1934,
+      "step": 86
+    },
+    {
+      "epoch": 0.006783907492170562,
+      "grad_norm": 0.23832857608795166,
+      "learning_rate": 0.000174,
+      "loss": 1.1129,
+      "step": 88
+    },
+    {
+      "epoch": 0.0069380872079017105,
+      "grad_norm": 0.8846365809440613,
+      "learning_rate": 0.00017800000000000002,
+      "loss": 1.1028,
+      "step": 90
+    },
+    {
+      "epoch": 0.007092266923632859,
+      "grad_norm": 0.187076598405838,
+      "learning_rate": 0.000182,
+      "loss": 1.1,
+      "step": 92
+    },
+    {
+      "epoch": 0.007246446639364009,
+      "grad_norm": 0.1795521378517151,
+      "learning_rate": 0.00018600000000000002,
+      "loss": 1.1478,
+      "step": 94
+    },
+    {
+      "epoch": 0.007400626355095158,
+      "grad_norm": 0.199871227145195,
+      "learning_rate": 0.00019,
+      "loss": 1.1223,
+      "step": 96
+    },
+    {
+      "epoch": 0.007554806070826307,
+      "grad_norm": 0.17832662165164948,
+      "learning_rate": 0.000194,
+      "loss": 1.0909,
+      "step": 98
+    },
+    {
+      "epoch": 0.007708985786557456,
+      "grad_norm": 0.17023932933807373,
+      "learning_rate": 0.00019800000000000002,
+      "loss": 1.1526,
+      "step": 100
+    },
+    {
+      "epoch": 0.007708985786557456,
+      "eval_loss": 1.1401352882385254,
+      "eval_runtime": 185.6269,
+      "eval_samples_per_second": 91.274,
+      "eval_steps_per_second": 1.428,
+      "step": 100
+    },
+    {
+      "epoch": 0.007863165502288605,
+      "grad_norm": 0.17429223656654358,
+      "learning_rate": 0.00019999484748557298,
+      "loss": 1.1597,
+      "step": 102
+    },
+    {
+      "epoch": 0.008017345218019754,
+      "grad_norm": 0.16158349812030792,
+      "learning_rate": 0.0001999845424567189,
+      "loss": 1.1297,
+      "step": 104
+    },
+    {
+      "epoch": 0.008171524933750904,
+      "grad_norm": 0.15818771719932556,
+      "learning_rate": 0.0001999742374278648,
+      "loss": 1.083,
+      "step": 106
+    },
+    {
+      "epoch": 0.008325704649482053,
+      "grad_norm": 0.1591726392507553,
+      "learning_rate": 0.00019996393239901073,
+      "loss": 1.086,
+      "step": 108
+    },
+    {
+      "epoch": 0.008479884365213202,
+      "grad_norm": 0.174184650182724,
+      "learning_rate": 0.00019995362737015664,
+      "loss": 1.0769,
+      "step": 110
+    },
+    {
+      "epoch": 0.00863406408094435,
+      "grad_norm": 0.15928815305233002,
+      "learning_rate": 0.00019994332234130258,
+      "loss": 1.1315,
+      "step": 112
+    },
+    {
+      "epoch": 0.0087882437966755,
+      "grad_norm": 0.19639264047145844,
+      "learning_rate": 0.0001999330173124485,
+      "loss": 1.1339,
+      "step": 114
+    },
+    {
+      "epoch": 0.00894242351240665,
+      "grad_norm": 0.1639835238456726,
+      "learning_rate": 0.0001999227122835944,
+      "loss": 1.0836,
+      "step": 116
+    },
+    {
+      "epoch": 0.009096603228137799,
+      "grad_norm": 0.18691964447498322,
+      "learning_rate": 0.00019991240725474033,
+      "loss": 1.2109,
+      "step": 118
+    },
+    {
+      "epoch": 0.009250782943868947,
+      "grad_norm": 0.188096821308136,
+      "learning_rate": 0.00019990210222588624,
+      "loss": 1.1778,
+      "step": 120
+    },
+    {
+      "epoch": 0.009404962659600096,
+      "grad_norm": 0.1527150571346283,
+      "learning_rate": 0.00019989179719703218,
+      "loss": 1.0977,
+      "step": 122
+    },
+    {
+      "epoch": 0.009559142375331245,
+      "grad_norm": 0.1705218255519867,
+      "learning_rate": 0.0001998814921681781,
+      "loss": 1.1333,
+      "step": 124
+    },
+    {
+      "epoch": 0.009713322091062395,
+      "grad_norm": 0.1888928860425949,
+      "learning_rate": 0.00019987118713932401,
+      "loss": 1.1843,
+      "step": 126
+    },
+    {
+      "epoch": 0.009867501806793544,
+      "grad_norm": 0.1778104603290558,
+      "learning_rate": 0.00019986088211046993,
+      "loss": 1.0766,
+      "step": 128
+    },
+    {
+      "epoch": 0.010021681522524693,
+      "grad_norm": 0.15807992219924927,
+      "learning_rate": 0.00019985057708161584,
+      "loss": 1.0449,
+      "step": 130
+    },
+    {
+      "epoch": 0.010175861238255842,
+      "grad_norm": 0.16706159710884094,
+      "learning_rate": 0.00019984027205276176,
+      "loss": 1.0644,
+      "step": 132
+    },
+    {
+      "epoch": 0.01033004095398699,
+      "grad_norm": 0.16455501317977905,
+      "learning_rate": 0.00019982996702390767,
+      "loss": 1.1479,
+      "step": 134
+    },
+    {
+      "epoch": 0.010484220669718141,
+      "grad_norm": 0.17258939146995544,
+      "learning_rate": 0.0001998196619950536,
+      "loss": 1.0614,
+      "step": 136
+    },
+    {
+      "epoch": 0.01063840038544929,
+      "grad_norm": 0.15501369535923004,
+      "learning_rate": 0.0001998093569661995,
+      "loss": 1.1045,
+      "step": 138
+    },
+    {
+      "epoch": 0.010792580101180439,
+      "grad_norm": 0.1534334272146225,
+      "learning_rate": 0.00019979905193734542,
+      "loss": 1.1035,
+      "step": 140
+    },
+    {
+      "epoch": 0.010946759816911587,
+      "grad_norm": 0.14120443165302277,
+      "learning_rate": 0.00019978874690849136,
+      "loss": 1.0618,
+      "step": 142
+    },
+    {
+      "epoch": 0.011100939532642736,
+      "grad_norm": 0.17808520793914795,
+      "learning_rate": 0.00019977844187963728,
+      "loss": 1.1687,
+      "step": 144
+    },
+    {
+      "epoch": 0.011255119248373887,
+      "grad_norm": 0.16697613894939423,
+      "learning_rate": 0.0001997681368507832,
+      "loss": 1.0979,
+      "step": 146
+    },
+    {
+      "epoch": 0.011409298964105035,
+      "grad_norm": 0.16491086781024933,
+      "learning_rate": 0.0001997578318219291,
+      "loss": 1.1219,
+      "step": 148
+    },
+    {
+      "epoch": 0.011563478679836184,
+      "grad_norm": 0.15342313051223755,
+      "learning_rate": 0.00019974752679307502,
+      "loss": 1.1169,
+      "step": 150
+    },
+    {
+      "epoch": 0.011717658395567333,
+      "grad_norm": 0.1539286971092224,
+      "learning_rate": 0.00019973722176422093,
+      "loss": 1.1288,
+      "step": 152
+    },
+    {
+      "epoch": 0.011871838111298482,
+      "grad_norm": 0.15605852007865906,
+      "learning_rate": 0.00019972691673536688,
+      "loss": 1.0445,
+      "step": 154
+    },
+    {
+      "epoch": 0.012026017827029632,
+      "grad_norm": 0.14324098825454712,
+      "learning_rate": 0.0001997166117065128,
+      "loss": 1.1309,
+      "step": 156
+    },
+    {
+      "epoch": 0.012180197542760781,
+      "grad_norm": 0.21045701205730438,
+      "learning_rate": 0.0001997063066776587,
+      "loss": 1.0946,
+      "step": 158
+    },
+    {
+      "epoch": 0.01233437725849193,
+      "grad_norm": 0.16019922494888306,
+      "learning_rate": 0.00019969600164880462,
+      "loss": 1.11,
+      "step": 160
+    },
+    {
+      "epoch": 0.012488556974223079,
+      "grad_norm": 0.15740078687667847,
+      "learning_rate": 0.00019968569661995054,
+      "loss": 1.112,
+      "step": 162
+    },
+    {
+      "epoch": 0.012642736689954227,
+      "grad_norm": 0.16974380612373352,
+      "learning_rate": 0.00019967539159109648,
+      "loss": 1.1279,
+      "step": 164
+    },
+    {
+      "epoch": 0.012796916405685378,
+      "grad_norm": 0.16405288875102997,
+      "learning_rate": 0.0001996650865622424,
+      "loss": 1.0952,
+      "step": 166
+    },
+    {
+      "epoch": 0.012951096121416527,
+      "grad_norm": 0.16120509803295135,
+      "learning_rate": 0.0001996547815333883,
+      "loss": 1.1203,
+      "step": 168
+    },
+    {
+      "epoch": 0.013105275837147675,
+      "grad_norm": 0.17402276396751404,
+      "learning_rate": 0.00019964447650453422,
+      "loss": 1.0991,
+      "step": 170
+    },
+    {
+      "epoch": 0.013259455552878824,
+      "grad_norm": 0.18349111080169678,
+      "learning_rate": 0.00019963417147568014,
+      "loss": 1.1394,
+      "step": 172
+    },
+    {
+      "epoch": 0.013413635268609973,
+      "grad_norm": 0.14613087475299835,
+      "learning_rate": 0.00019962386644682608,
+      "loss": 1.1357,
+      "step": 174
+    },
+    {
+      "epoch": 0.013567814984341123,
+      "grad_norm": 0.142988383769989,
+      "learning_rate": 0.000199613561417972,
+      "loss": 1.0169,
+      "step": 176
+    },
+    {
+      "epoch": 0.013721994700072272,
+      "grad_norm": 0.14817160367965698,
+      "learning_rate": 0.0001996032563891179,
+      "loss": 1.1238,
+      "step": 178
+    },
+    {
+      "epoch": 0.013876174415803421,
+      "grad_norm": 0.15391133725643158,
+      "learning_rate": 0.00019959295136026382,
+      "loss": 1.0712,
+      "step": 180
+    },
+    {
+      "epoch": 0.01403035413153457,
+      "grad_norm": 0.1766846477985382,
+      "learning_rate": 0.00019958264633140974,
+      "loss": 1.1422,
+      "step": 182
+    },
+    {
+      "epoch": 0.014184533847265719,
+      "grad_norm": 0.16789212822914124,
+      "learning_rate": 0.00019957234130255565,
+      "loss": 1.1266,
+      "step": 184
+    },
+    {
+      "epoch": 0.014338713562996869,
+      "grad_norm": 0.1527165323495865,
+      "learning_rate": 0.00019956203627370157,
+      "loss": 1.0667,
+      "step": 186
+    },
+    {
+      "epoch": 0.014492893278728018,
+      "grad_norm": 0.1772206574678421,
+      "learning_rate": 0.00019955173124484748,
+      "loss": 1.1182,
+      "step": 188
+    },
+    {
+      "epoch": 0.014647072994459167,
+      "grad_norm": 0.15008313953876495,
+      "learning_rate": 0.0001995414262159934,
+      "loss": 1.0382,
+      "step": 190
+    },
+    {
+      "epoch": 0.014801252710190315,
+      "grad_norm": 0.16365988552570343,
+      "learning_rate": 0.00019953112118713931,
+      "loss": 1.1262,
+      "step": 192
+    },
+    {
+      "epoch": 0.014955432425921464,
+      "grad_norm": 0.14952193200588226,
+      "learning_rate": 0.00019952081615828526,
+      "loss": 1.1245,
+      "step": 194
+    },
+    {
+      "epoch": 0.015109612141652615,
+      "grad_norm": 0.15425263345241547,
+      "learning_rate": 0.00019951051112943117,
+      "loss": 1.1452,
+      "step": 196
+    },
+    {
+      "epoch": 0.015263791857383763,
+      "grad_norm": 0.1567617654800415,
+      "learning_rate": 0.00019950020610057709,
+      "loss": 1.0392,
+      "step": 198
+    },
+    {
+      "epoch": 0.015417971573114912,
+      "grad_norm": 0.14292609691619873,
+      "learning_rate": 0.000199489901071723,
+      "loss": 1.0728,
+      "step": 200
+    },
+    {
+      "epoch": 0.015417971573114912,
+      "eval_loss": 1.1127630472183228,
+      "eval_runtime": 185.2528,
+      "eval_samples_per_second": 91.459,
+      "eval_steps_per_second": 1.43,
+      "step": 200
+    },
+    {
+      "epoch": 0.015572151288846061,
+      "grad_norm": 0.15465517342090607,
+      "learning_rate": 0.00019947959604286892,
+      "loss": 1.0596,
+      "step": 202
+    },
+    {
+      "epoch": 0.01572633100457721,
+      "grad_norm": 0.16749607026576996,
+      "learning_rate": 0.00019946929101401486,
+      "loss": 1.1005,
+      "step": 204
+    },
+    {
+      "epoch": 0.01588051072030836,
+      "grad_norm": 0.15854287147521973,
+      "learning_rate": 0.00019945898598516077,
+      "loss": 1.0963,
+      "step": 206
+    },
+    {
+      "epoch": 0.016034690436039507,
+      "grad_norm": 0.1457831859588623,
+      "learning_rate": 0.0001994486809563067,
+      "loss": 1.1149,
+      "step": 208
+    },
+    {
+      "epoch": 0.016188870151770656,
+      "grad_norm": 0.15744629502296448,
+      "learning_rate": 0.0001994383759274526,
+      "loss": 1.0789,
+      "step": 210
+    },
+    {
+      "epoch": 0.01634304986750181,
+      "grad_norm": 0.13411423563957214,
+      "learning_rate": 0.00019942807089859852,
+      "loss": 1.0641,
+      "step": 212
+    },
+    {
+      "epoch": 0.016497229583232957,
+      "grad_norm": 0.1575399488210678,
+      "learning_rate": 0.00019941776586974446,
+      "loss": 1.0888,
+      "step": 214
+    },
+    {
+      "epoch": 0.016651409298964106,
+      "grad_norm": 0.14619529247283936,
+      "learning_rate": 0.00019940746084089037,
+      "loss": 1.081,
+      "step": 216
+    },
+    {
+      "epoch": 0.016805589014695255,
+      "grad_norm": 0.15578237175941467,
+      "learning_rate": 0.0001993971558120363,
+      "loss": 1.1434,
+      "step": 218
+    },
+    {
+      "epoch": 0.016959768730426403,
+      "grad_norm": 0.1516629308462143,
+      "learning_rate": 0.0001993868507831822,
+      "loss": 1.0909,
+      "step": 220
+    },
+    {
+      "epoch": 0.017113948446157552,
+      "grad_norm": 0.15613436698913574,
+      "learning_rate": 0.00019937654575432812,
+      "loss": 1.0999,
+      "step": 222
+    },
+    {
+      "epoch": 0.0172681281618887,
+      "grad_norm": 0.14825573563575745,
+      "learning_rate": 0.00019936624072547406,
+      "loss": 1.0827,
+      "step": 224
+    },
+    {
+      "epoch": 0.01742230787761985,
+      "grad_norm": 0.1624906212091446,
+      "learning_rate": 0.00019935593569661998,
+      "loss": 1.0856,
+      "step": 226
+    },
+    {
+      "epoch": 0.017576487593351,
+      "grad_norm": 0.1380940079689026,
+      "learning_rate": 0.0001993456306677659,
+      "loss": 1.0514,
+      "step": 228
+    },
+    {
+      "epoch": 0.017730667309082147,
+      "grad_norm": 0.13712120056152344,
+      "learning_rate": 0.0001993353256389118,
+      "loss": 1.0977,
+      "step": 230
+    },
+    {
+      "epoch": 0.0178848470248133,
+      "grad_norm": 0.1448957622051239,
+      "learning_rate": 0.00019932502061005772,
+      "loss": 1.0729,
+      "step": 232
+    },
+    {
+      "epoch": 0.01803902674054445,
+      "grad_norm": 0.13421876728534698,
+      "learning_rate": 0.00019931471558120364,
+      "loss": 1.0879,
+      "step": 234
+    },
+    {
+      "epoch": 0.018193206456275597,
+      "grad_norm": 0.16884732246398926,
+      "learning_rate": 0.00019930441055234955,
+      "loss": 1.1159,
+      "step": 236
+    },
+    {
+      "epoch": 0.018347386172006746,
+      "grad_norm": 0.14634890854358673,
+      "learning_rate": 0.00019929410552349547,
+      "loss": 1.0568,
+      "step": 238
+    },
+    {
+      "epoch": 0.018501565887737895,
+      "grad_norm": 0.16796648502349854,
+      "learning_rate": 0.00019928380049464138,
+      "loss": 1.0944,
+      "step": 240
+    },
+    {
+      "epoch": 0.018655745603469043,
+      "grad_norm": 0.13724717497825623,
+      "learning_rate": 0.0001992734954657873,
+      "loss": 1.0609,
+      "step": 242
+    },
+    {
+      "epoch": 0.018809925319200192,
+      "grad_norm": 0.14133594930171967,
+      "learning_rate": 0.0001992631904369332,
+      "loss": 1.0879,
+      "step": 244
+    },
+    {
+      "epoch": 0.01896410503493134,
+      "grad_norm": 0.1611246019601822,
+      "learning_rate": 0.00019925288540807915,
+      "loss": 1.0681,
+      "step": 246
+    },
+    {
+      "epoch": 0.01911828475066249,
+      "grad_norm": 0.17420877516269684,
+      "learning_rate": 0.00019924258037922507,
+      "loss": 1.1336,
+      "step": 248
+    },
+    {
+      "epoch": 0.01927246446639364,
+      "grad_norm": 0.13766029477119446,
+      "learning_rate": 0.00019923227535037098,
+      "loss": 1.075,
+      "step": 250
+    },
+    {
+      "epoch": 0.01942664418212479,
+      "grad_norm": 0.1691662222146988,
+      "learning_rate": 0.0001992219703215169,
+      "loss": 1.1369,
+      "step": 252
+    },
+    {
+      "epoch": 0.01958082389785594,
+      "grad_norm": 0.14959432184696198,
+      "learning_rate": 0.0001992116652926628,
+      "loss": 1.1129,
+      "step": 254
+    },
+    {
+      "epoch": 0.01973500361358709,
+      "grad_norm": 0.14996406435966492,
+      "learning_rate": 0.00019920136026380875,
+      "loss": 1.0304,
+      "step": 256
+    },
+    {
+      "epoch": 0.019889183329318237,
+      "grad_norm": 0.13211801648139954,
+      "learning_rate": 0.00019919105523495467,
+      "loss": 1.0652,
+      "step": 258
+    },
+    {
+      "epoch": 0.020043363045049386,
+      "grad_norm": 0.16041967272758484,
+      "learning_rate": 0.00019918075020610058,
+      "loss": 1.077,
+      "step": 260
+    },
+    {
+      "epoch": 0.020197542760780535,
+      "grad_norm": 0.1524546593427658,
+      "learning_rate": 0.0001991704451772465,
+      "loss": 1.1176,
+      "step": 262
+    },
+    {
+      "epoch": 0.020351722476511683,
+      "grad_norm": 0.16032540798187256,
+      "learning_rate": 0.00019916014014839241,
+      "loss": 1.0736,
+      "step": 264
+    },
+    {
+      "epoch": 0.020505902192242832,
+      "grad_norm": 0.17891019582748413,
+      "learning_rate": 0.00019914983511953836,
+      "loss": 1.1435,
+      "step": 266
+    },
+    {
+      "epoch": 0.02066008190797398,
+      "grad_norm": 0.14484059810638428,
+      "learning_rate": 0.00019913953009068427,
+      "loss": 1.0356,
+      "step": 268
+    },
+    {
+      "epoch": 0.02081426162370513,
+      "grad_norm": 0.14321155846118927,
+      "learning_rate": 0.00019912922506183019,
+      "loss": 1.0536,
+      "step": 270
+    },
+    {
+      "epoch": 0.020968441339436282,
+      "grad_norm": 0.17357808351516724,
+      "learning_rate": 0.0001991189200329761,
+      "loss": 1.171,
+      "step": 272
+    },
+    {
+      "epoch": 0.02112262105516743,
+      "grad_norm": 0.13990800082683563,
+      "learning_rate": 0.00019910861500412202,
+      "loss": 1.0946,
+      "step": 274
+    },
+    {
+      "epoch": 0.02127680077089858,
+      "grad_norm": 0.16634231805801392,
+      "learning_rate": 0.00019909830997526796,
+      "loss": 1.1029,
+      "step": 276
+    },
+    {
+      "epoch": 0.02143098048662973,
+      "grad_norm": 0.16322381794452667,
+      "learning_rate": 0.00019908800494641387,
+      "loss": 1.0688,
+      "step": 278
+    },
+    {
+      "epoch": 0.021585160202360877,
+      "grad_norm": 0.1652844250202179,
+      "learning_rate": 0.0001990776999175598,
+      "loss": 1.1237,
+      "step": 280
+    },
+    {
+      "epoch": 0.021739339918092026,
+      "grad_norm": 0.14457885921001434,
+      "learning_rate": 0.0001990673948887057,
+      "loss": 1.1995,
+      "step": 282
+    },
+    {
+      "epoch": 0.021893519633823175,
+      "grad_norm": 0.15549878776073456,
+      "learning_rate": 0.00019905708985985162,
+      "loss": 1.0475,
+      "step": 284
+    },
+    {
+      "epoch": 0.022047699349554323,
+      "grad_norm": 0.15715502202510834,
+      "learning_rate": 0.00019904678483099756,
+      "loss": 1.1211,
+      "step": 286
+    },
+    {
+      "epoch": 0.022201879065285472,
+      "grad_norm": 0.14022529125213623,
+      "learning_rate": 0.00019903647980214347,
+      "loss": 1.1056,
+      "step": 288
+    },
+    {
+      "epoch": 0.02235605878101662,
+      "grad_norm": 0.13293786346912384,
+      "learning_rate": 0.0001990261747732894,
+      "loss": 1.0877,
+      "step": 290
+    },
+    {
+      "epoch": 0.022510238496747773,
+      "grad_norm": 0.14625073969364166,
+      "learning_rate": 0.0001990158697444353,
+      "loss": 1.0375,
+      "step": 292
+    },
+    {
+      "epoch": 0.022664418212478922,
+      "grad_norm": 0.1417943835258484,
+      "learning_rate": 0.0001990055647155812,
+      "loss": 1.091,
+      "step": 294
+    },
+    {
+      "epoch": 0.02281859792821007,
+      "grad_norm": 0.1519964039325714,
+      "learning_rate": 0.00019899525968672713,
+      "loss": 1.0396,
+      "step": 296
+    },
+    {
+      "epoch": 0.02297277764394122,
+      "grad_norm": 0.1676655411720276,
+      "learning_rate": 0.00019898495465787305,
+      "loss": 1.1249,
+      "step": 298
+    },
+    {
+      "epoch": 0.02312695735967237,
+      "grad_norm": 0.1487220674753189,
+      "learning_rate": 0.00019897464962901896,
+      "loss": 1.1768,
+      "step": 300
+    },
+    {
+      "epoch": 0.02312695735967237,
+      "eval_loss": 1.1061022281646729,
+      "eval_runtime": 185.239,
+      "eval_samples_per_second": 91.466,
+      "eval_steps_per_second": 1.431,
+      "step": 300
+    },
+    {
+      "epoch": 0.023281137075403517,
+      "grad_norm": 0.1399739533662796,
+      "learning_rate": 0.00019896434460016488,
+      "loss": 1.0962,
+      "step": 302
+    },
+    {
+      "epoch": 0.023435316791134666,
+      "grad_norm": 0.15282337367534637,
+      "learning_rate": 0.0001989540395713108,
+      "loss": 1.1688,
+      "step": 304
+    },
+    {
+      "epoch": 0.023589496506865815,
+      "grad_norm": 0.15459619462490082,
+      "learning_rate": 0.00019894373454245674,
+      "loss": 1.0216,
+      "step": 306
+    },
+    {
+      "epoch": 0.023743676222596963,
+      "grad_norm": 0.15799634158611298,
+      "learning_rate": 0.00019893342951360265,
+      "loss": 1.1429,
+      "step": 308
+    },
+    {
+      "epoch": 0.023897855938328112,
+      "grad_norm": 0.1343819946050644,
+      "learning_rate": 0.00019892312448474857,
+      "loss": 1.0959,
+      "step": 310
+    },
+    {
+      "epoch": 0.024052035654059264,
+      "grad_norm": 0.14791317284107208,
+      "learning_rate": 0.00019891281945589448,
+      "loss": 1.0636,
+      "step": 312
+    },
+    {
+      "epoch": 0.024206215369790413,
+      "grad_norm": 0.1442137360572815,
+      "learning_rate": 0.0001989025144270404,
+      "loss": 1.055,
+      "step": 314
+    },
+    {
+      "epoch": 0.024360395085521562,
+      "grad_norm": 0.14649145305156708,
+      "learning_rate": 0.00019889220939818634,
+      "loss": 1.0906,
+      "step": 316
+    },
+    {
+      "epoch": 0.02451457480125271,
+      "grad_norm": 0.14234665036201477,
+      "learning_rate": 0.00019888190436933225,
+      "loss": 1.0853,
+      "step": 318
+    },
+    {
+      "epoch": 0.02466875451698386,
+      "grad_norm": 0.1419668048620224,
+      "learning_rate": 0.00019887159934047817,
+      "loss": 1.0296,
+      "step": 320
+    },
+    {
+      "epoch": 0.02482293423271501,
+      "grad_norm": 0.14730845391750336,
+      "learning_rate": 0.00019886129431162408,
+      "loss": 1.0421,
+      "step": 322
+    },
+    {
+      "epoch": 0.024977113948446157,
+      "grad_norm": 0.1400081068277359,
+      "learning_rate": 0.00019885098928277,
+      "loss": 1.0291,
+      "step": 324
+    },
+    {
+      "epoch": 0.025131293664177306,
+      "grad_norm": 0.15542668104171753,
+      "learning_rate": 0.0001988406842539159,
+      "loss": 1.0597,
+      "step": 326
+    },
+    {
+      "epoch": 0.025285473379908455,
+      "grad_norm": 0.14521440863609314,
+      "learning_rate": 0.00019883037922506185,
+      "loss": 1.0491,
+      "step": 328
+    },
+    {
+      "epoch": 0.025439653095639603,
+      "grad_norm": 0.16224826872348785,
+      "learning_rate": 0.00019882007419620777,
+      "loss": 1.1031,
+      "step": 330
+    },
+    {
+      "epoch": 0.025593832811370756,
+      "grad_norm": 0.15028877556324005,
+      "learning_rate": 0.00019880976916735368,
+      "loss": 1.1154,
+      "step": 332
+    },
+    {
+      "epoch": 0.025748012527101904,
+      "grad_norm": 0.12962941825389862,
+      "learning_rate": 0.0001987994641384996,
+      "loss": 1.0363,
+      "step": 334
+    },
+    {
+      "epoch": 0.025902192242833053,
+      "grad_norm": 0.14908359944820404,
+      "learning_rate": 0.0001987891591096455,
+      "loss": 1.1513,
+      "step": 336
+    },
+    {
+      "epoch": 0.026056371958564202,
+      "grad_norm": 0.15441828966140747,
+      "learning_rate": 0.00019877885408079146,
+      "loss": 1.1303,
+      "step": 338
+    },
+    {
+      "epoch": 0.02621055167429535,
+      "grad_norm": 0.12669101357460022,
+      "learning_rate": 0.00019876854905193737,
+      "loss": 1.0875,
+      "step": 340
+    },
+    {
+      "epoch": 0.0263647313900265,
+      "grad_norm": 0.13190661370754242,
+      "learning_rate": 0.00019875824402308329,
+      "loss": 1.0778,
+      "step": 342
+    },
+    {
+      "epoch": 0.02651891110575765,
+      "grad_norm": 0.14043989777565002,
+      "learning_rate": 0.0001987479389942292,
+      "loss": 1.1011,
+      "step": 344
+    },
+    {
+      "epoch": 0.026673090821488797,
+      "grad_norm": 0.13694870471954346,
+      "learning_rate": 0.00019873763396537512,
+      "loss": 1.0532,
+      "step": 346
+    },
+    {
+      "epoch": 0.026827270537219946,
+      "grad_norm": 0.15089921653270721,
+      "learning_rate": 0.00019872732893652103,
+      "loss": 1.1292,
+      "step": 348
+    },
+    {
+      "epoch": 0.026981450252951095,
+      "grad_norm": 0.14839838445186615,
+      "learning_rate": 0.00019871702390766694,
+      "loss": 1.0275,
+      "step": 350
+    },
+    {
+      "epoch": 0.027135629968682247,
+      "grad_norm": 0.16198500990867615,
+      "learning_rate": 0.00019870671887881286,
+      "loss": 1.1453,
+      "step": 352
+    },
+    {
+      "epoch": 0.027289809684413396,
+      "grad_norm": 0.14694632589817047,
+      "learning_rate": 0.00019869641384995877,
+      "loss": 1.129,
+      "step": 354
+    },
+    {
+      "epoch": 0.027443989400144544,
+      "grad_norm": 0.16091379523277283,
+      "learning_rate": 0.0001986861088211047,
+      "loss": 1.1186,
+      "step": 356
+    },
+    {
+      "epoch": 0.027598169115875693,
+      "grad_norm": 0.144720658659935,
+      "learning_rate": 0.00019867580379225063,
+      "loss": 1.0224,
+      "step": 358
+    },
+    {
+      "epoch": 0.027752348831606842,
+      "grad_norm": 0.13851307332515717,
+      "learning_rate": 0.00019866549876339655,
+      "loss": 1.1421,
+      "step": 360
+    },
+    {
+      "epoch": 0.02790652854733799,
+      "grad_norm": 0.13124969601631165,
+      "learning_rate": 0.00019865519373454246,
+      "loss": 1.0938,
+      "step": 362
+    },
+    {
+      "epoch": 0.02806070826306914,
+      "grad_norm": 0.14723828434944153,
+      "learning_rate": 0.00019864488870568838,
+      "loss": 1.1335,
+      "step": 364
+    },
+    {
+      "epoch": 0.02821488797880029,
+      "grad_norm": 0.17669795453548431,
+      "learning_rate": 0.0001986345836768343,
+      "loss": 1.0765,
+      "step": 366
+    },
+    {
+      "epoch": 0.028369067694531437,
+      "grad_norm": 0.1457260102033615,
+      "learning_rate": 0.00019862427864798023,
+      "loss": 1.1073,
+      "step": 368
+    },
+    {
+      "epoch": 0.028523247410262586,
+      "grad_norm": 0.13594554364681244,
+      "learning_rate": 0.00019861397361912615,
+      "loss": 1.0587,
+      "step": 370
+    },
+    {
+      "epoch": 0.028677427125993738,
+      "grad_norm": 0.13798941671848297,
+      "learning_rate": 0.00019860366859027206,
+      "loss": 1.0833,
+      "step": 372
+    },
+    {
+      "epoch": 0.028831606841724887,
+      "grad_norm": 0.15587519109249115,
+      "learning_rate": 0.00019859336356141798,
+      "loss": 1.0287,
+      "step": 374
+    },
+    {
+      "epoch": 0.028985786557456036,
+      "grad_norm": 0.16585086286067963,
+      "learning_rate": 0.0001985830585325639,
+      "loss": 1.1786,
+      "step": 376
+    },
+    {
+      "epoch": 0.029139966273187184,
+      "grad_norm": 0.1444484293460846,
+      "learning_rate": 0.00019857275350370983,
+      "loss": 1.1793,
+      "step": 378
+    },
+    {
+      "epoch": 0.029294145988918333,
+      "grad_norm": 0.14413981139659882,
+      "learning_rate": 0.00019856244847485575,
+      "loss": 1.1141,
+      "step": 380
+    },
+    {
+      "epoch": 0.029448325704649482,
+      "grad_norm": 0.142032191157341,
+      "learning_rate": 0.00019855214344600166,
+      "loss": 1.1033,
+      "step": 382
+    },
+    {
+      "epoch": 0.02960250542038063,
+      "grad_norm": 0.1490195393562317,
+      "learning_rate": 0.00019854183841714758,
+      "loss": 1.1592,
+      "step": 384
+    },
+    {
+      "epoch": 0.02975668513611178,
+      "grad_norm": 0.1408643275499344,
+      "learning_rate": 0.0001985315333882935,
+      "loss": 1.1505,
+      "step": 386
+    },
+    {
+      "epoch": 0.02991086485184293,
+      "grad_norm": 0.12526237964630127,
+      "learning_rate": 0.00019852122835943944,
+      "loss": 1.1027,
+      "step": 388
+    },
+    {
+      "epoch": 0.030065044567574077,
+      "grad_norm": 0.1339711844921112,
+      "learning_rate": 0.00019851092333058535,
+      "loss": 1.1238,
+      "step": 390
+    },
+    {
+      "epoch": 0.03021922428330523,
+      "grad_norm": 0.13032345473766327,
+      "learning_rate": 0.00019850061830173127,
+      "loss": 1.1121,
+      "step": 392
+    },
+    {
+      "epoch": 0.030373403999036378,
+      "grad_norm": 0.15815846621990204,
+      "learning_rate": 0.00019849031327287718,
+      "loss": 1.168,
+      "step": 394
+    },
+    {
+      "epoch": 0.030527583714767527,
+      "grad_norm": 0.14245116710662842,
+      "learning_rate": 0.0001984800082440231,
+      "loss": 1.0436,
+      "step": 396
+    },
+    {
+      "epoch": 0.030681763430498676,
+      "grad_norm": 0.15660050511360168,
+      "learning_rate": 0.000198469703215169,
+      "loss": 1.158,
+      "step": 398
+    },
+    {
+      "epoch": 0.030835943146229824,
+      "grad_norm": 0.1654158979654312,
+      "learning_rate": 0.00019845939818631493,
+      "loss": 1.0802,
+      "step": 400
+    },
+    {
+      "epoch": 0.030835943146229824,
+      "eval_loss": 1.1026971340179443,
+      "eval_runtime": 185.7295,
+      "eval_samples_per_second": 91.224,
+      "eval_steps_per_second": 1.427,
+      "step": 400
+    },
+    {
+      "epoch": 0.030990122861960973,
+      "grad_norm": 0.13845407962799072,
+      "learning_rate": 0.00019844909315746084,
+      "loss": 1.1055,
+      "step": 402
+    },
+    {
+      "epoch": 0.031144302577692122,
+      "grad_norm": 0.14852891862392426,
+      "learning_rate": 0.00019843878812860676,
+      "loss": 1.0983,
+      "step": 404
+    },
+    {
+      "epoch": 0.031298482293423274,
+      "grad_norm": 0.13408593833446503,
+      "learning_rate": 0.00019842848309975267,
+      "loss": 1.1063,
+      "step": 406
+    },
+    {
+      "epoch": 0.03145266200915442,
+      "grad_norm": 0.14041072130203247,
+      "learning_rate": 0.00019841817807089859,
+      "loss": 1.0327,
+      "step": 408
+    },
+    {
+      "epoch": 0.03160684172488557,
+      "grad_norm": 0.16119754314422607,
+      "learning_rate": 0.00019840787304204453,
+      "loss": 1.1,
+      "step": 410
+    },
+    {
+      "epoch": 0.03176102144061672,
+      "grad_norm": 0.14471223950386047,
+      "learning_rate": 0.00019839756801319044,
+      "loss": 1.0783,
+      "step": 412
+    },
+    {
+      "epoch": 0.03191520115634787,
+      "grad_norm": 0.15591050684452057,
+      "learning_rate": 0.00019838726298433636,
+      "loss": 1.1782,
+      "step": 414
+    },
+    {
+      "epoch": 0.032069380872079015,
+      "grad_norm": 0.1766556203365326,
+      "learning_rate": 0.00019837695795548227,
+      "loss": 1.1063,
+      "step": 416
+    },
+    {
+      "epoch": 0.03222356058781017,
+      "grad_norm": 0.16078630089759827,
+      "learning_rate": 0.0001983666529266282,
+      "loss": 1.0891,
+      "step": 418
+    },
+    {
+      "epoch": 0.03237774030354131,
+      "grad_norm": 0.13378402590751648,
+      "learning_rate": 0.00019835634789777413,
+      "loss": 1.074,
+      "step": 420
+    },
+    {
+      "epoch": 0.032531920019272464,
+      "grad_norm": 0.14526261389255524,
+      "learning_rate": 0.00019834604286892004,
+      "loss": 1.108,
+      "step": 422
+    },
+    {
+      "epoch": 0.03268609973500362,
+      "grad_norm": 0.1321713775396347,
+      "learning_rate": 0.00019833573784006596,
+      "loss": 1.019,
+      "step": 424
+    },
+    {
+      "epoch": 0.03284027945073476,
+      "grad_norm": 0.12685374915599823,
+      "learning_rate": 0.00019832543281121187,
+      "loss": 1.09,
+      "step": 426
+    },
+    {
+      "epoch": 0.032994459166465914,
+      "grad_norm": 0.13825605809688568,
+      "learning_rate": 0.0001983151277823578,
+      "loss": 1.1356,
+      "step": 428
+    },
+    {
+      "epoch": 0.03314863888219706,
+      "grad_norm": 0.13683827221393585,
+      "learning_rate": 0.00019830482275350373,
+      "loss": 1.1405,
+      "step": 430
+    },
+    {
+      "epoch": 0.03330281859792821,
+      "grad_norm": 0.16707143187522888,
+      "learning_rate": 0.00019829451772464965,
+      "loss": 1.1305,
+      "step": 432
+    },
+    {
+      "epoch": 0.03345699831365936,
+      "grad_norm": 0.11735045164823532,
+      "learning_rate": 0.00019828421269579556,
+      "loss": 1.0421,
+      "step": 434
+    },
+    {
+      "epoch": 0.03361117802939051,
+      "grad_norm": 0.1337989866733551,
+      "learning_rate": 0.00019827390766694148,
+      "loss": 1.0572,
+      "step": 436
+    },
+    {
+      "epoch": 0.033765357745121655,
+      "grad_norm": 0.17111611366271973,
+      "learning_rate": 0.0001982636026380874,
+      "loss": 1.1698,
+      "step": 438
+    },
+    {
+      "epoch": 0.03391953746085281,
+      "grad_norm": 0.13785259425640106,
+      "learning_rate": 0.00019825329760923333,
+      "loss": 1.056,
+      "step": 440
+    },
+    {
+      "epoch": 0.03407371717658395,
+      "grad_norm": 0.15061460435390472,
+      "learning_rate": 0.00019824299258037925,
+      "loss": 1.0963,
+      "step": 442
+    },
+    {
+      "epoch": 0.034227896892315104,
+      "grad_norm": 0.1231001690030098,
+      "learning_rate": 0.00019823268755152516,
+      "loss": 1.1264,
+      "step": 444
+    },
+    {
+      "epoch": 0.03438207660804626,
+      "grad_norm": 0.13752298057079315,
+      "learning_rate": 0.00019822238252267108,
+      "loss": 1.0672,
+      "step": 446
+    },
+    {
+      "epoch": 0.0345362563237774,
+      "grad_norm": 0.13519813120365143,
+      "learning_rate": 0.000198212077493817,
+      "loss": 1.0882,
+      "step": 448
+    },
+    {
+      "epoch": 0.034690436039508554,
+      "grad_norm": 0.140150785446167,
+      "learning_rate": 0.0001982017724649629,
+      "loss": 1.0572,
+      "step": 450
+    },
+    {
+      "epoch": 0.0348446157552397,
+      "grad_norm": 0.13910406827926636,
+      "learning_rate": 0.00019819146743610882,
+      "loss": 1.0762,
+      "step": 452
+    },
+    {
+      "epoch": 0.03499879547097085,
+      "grad_norm": 0.14587442576885223,
+      "learning_rate": 0.00019818116240725474,
+      "loss": 1.1232,
+      "step": 454
+    },
+    {
+      "epoch": 0.035152975186702,
+      "grad_norm": 0.14476893842220306,
+      "learning_rate": 0.00019817085737840065,
+      "loss": 1.1004,
+      "step": 456
+    },
+    {
+      "epoch": 0.03530715490243315,
+      "grad_norm": 0.13861101865768433,
+      "learning_rate": 0.00019816055234954657,
+      "loss": 1.0302,
+      "step": 458
+    },
+    {
+      "epoch": 0.035461334618164295,
+      "grad_norm": 0.14342686533927917,
+      "learning_rate": 0.0001981502473206925,
+      "loss": 1.1092,
+      "step": 460
+    },
+    {
+      "epoch": 0.03561551433389545,
+      "grad_norm": 0.11709775030612946,
+      "learning_rate": 0.00019813994229183842,
+      "loss": 1.0463,
+      "step": 462
+    },
+    {
+      "epoch": 0.0357696940496266,
+      "grad_norm": 0.15154917538166046,
+      "learning_rate": 0.00019812963726298434,
+      "loss": 1.0897,
+      "step": 464
+    },
+    {
+      "epoch": 0.035923873765357744,
+      "grad_norm": 0.16716259717941284,
+      "learning_rate": 0.00019811933223413025,
+      "loss": 1.1214,
+      "step": 466
+    },
+    {
+      "epoch": 0.0360780534810889,
+      "grad_norm": 0.13513320684432983,
+      "learning_rate": 0.00019810902720527617,
+      "loss": 1.0623,
+      "step": 468
+    },
+    {
+      "epoch": 0.03623223319682004,
+      "grad_norm": 0.15930432081222534,
+      "learning_rate": 0.0001980987221764221,
+      "loss": 1.1092,
+      "step": 470
+    },
+    {
+      "epoch": 0.036386412912551194,
+      "grad_norm": 0.13990509510040283,
+      "learning_rate": 0.00019808841714756803,
+      "loss": 1.1048,
+      "step": 472
+    },
+    {
+      "epoch": 0.03654059262828234,
+      "grad_norm": 0.18784300982952118,
+      "learning_rate": 0.00019807811211871394,
+      "loss": 1.1676,
+      "step": 474
+    },
+    {
+      "epoch": 0.03669477234401349,
+      "grad_norm": 0.152045339345932,
+      "learning_rate": 0.00019806780708985986,
+      "loss": 1.1303,
+      "step": 476
+    },
+    {
+      "epoch": 0.03684895205974464,
+      "grad_norm": 0.1409967988729477,
+      "learning_rate": 0.00019805750206100577,
+      "loss": 1.0972,
+      "step": 478
+    },
+    {
+      "epoch": 0.03700313177547579,
+      "grad_norm": 0.13838854432106018,
+      "learning_rate": 0.0001980471970321517,
+      "loss": 1.101,
+      "step": 480
+    },
+    {
+      "epoch": 0.037157311491206935,
+      "grad_norm": 0.1579430103302002,
+      "learning_rate": 0.00019803689200329763,
+      "loss": 1.1077,
+      "step": 482
+    },
+    {
+      "epoch": 0.03731149120693809,
+      "grad_norm": 0.15061910450458527,
+      "learning_rate": 0.00019802658697444354,
+      "loss": 1.1239,
+      "step": 484
+    },
+    {
+      "epoch": 0.03746567092266924,
+      "grad_norm": 0.16408291459083557,
+      "learning_rate": 0.00019801628194558946,
+      "loss": 1.0961,
+      "step": 486
+    },
+    {
+      "epoch": 0.037619850638400384,
+      "grad_norm": 0.15612424910068512,
+      "learning_rate": 0.00019800597691673537,
+      "loss": 1.1299,
+      "step": 488
+    },
+    {
+      "epoch": 0.03777403035413154,
+      "grad_norm": 0.14135530591011047,
+      "learning_rate": 0.00019799567188788131,
+      "loss": 1.0489,
+      "step": 490
+    },
+    {
+      "epoch": 0.03792821006986268,
+      "grad_norm": 0.13743548095226288,
+      "learning_rate": 0.00019798536685902723,
+      "loss": 1.0837,
+      "step": 492
+    },
+    {
+      "epoch": 0.038082389785593834,
+      "grad_norm": 0.157401442527771,
+      "learning_rate": 0.00019797506183017314,
+      "loss": 1.0573,
+      "step": 494
+    },
+    {
+      "epoch": 0.03823656950132498,
+      "grad_norm": 0.14982052147388458,
+      "learning_rate": 0.00019796475680131906,
+      "loss": 1.0839,
+      "step": 496
+    },
+    {
+      "epoch": 0.03839074921705613,
+      "grad_norm": 0.1347000151872635,
+      "learning_rate": 0.00019795445177246497,
+      "loss": 1.113,
+      "step": 498
+    },
+    {
+      "epoch": 0.03854492893278728,
+      "grad_norm": 0.14478904008865356,
+      "learning_rate": 0.0001979441467436109,
+      "loss": 1.0514,
+      "step": 500
+    },
+    {
+      "epoch": 0.03854492893278728,
+      "eval_loss": 1.1000746488571167,
+      "eval_runtime": 185.5217,
+      "eval_samples_per_second": 91.326,
+      "eval_steps_per_second": 1.428,
+      "step": 500
+    },
+    {
+      "epoch": 0.03869910864851843,
+      "grad_norm": 0.14274291694164276,
+      "learning_rate": 0.00019793384171475683,
+      "loss": 1.0847,
+      "step": 502
+    },
+    {
+      "epoch": 0.03885328836424958,
+      "grad_norm": 0.14326965808868408,
+      "learning_rate": 0.00019792353668590275,
+      "loss": 1.0865,
+      "step": 504
+    },
+    {
+      "epoch": 0.03900746807998073,
+      "grad_norm": 0.1575518548488617,
+      "learning_rate": 0.00019791323165704866,
+      "loss": 1.1258,
+      "step": 506
+    },
+    {
+      "epoch": 0.03916164779571188,
+      "grad_norm": 0.14699862897396088,
+      "learning_rate": 0.00019790292662819458,
+      "loss": 1.1687,
+      "step": 508
+    },
+    {
+      "epoch": 0.039315827511443024,
+      "grad_norm": 0.1394687294960022,
+      "learning_rate": 0.0001978926215993405,
+      "loss": 1.1214,
+      "step": 510
+    },
+    {
+      "epoch": 0.03947000722717418,
+      "grad_norm": 0.14366985857486725,
+      "learning_rate": 0.0001978823165704864,
+      "loss": 1.0651,
+      "step": 512
+    },
+    {
+      "epoch": 0.03962418694290532,
+      "grad_norm": 0.14171218872070312,
+      "learning_rate": 0.00019787201154163232,
+      "loss": 1.1398,
+      "step": 514
+    },
+    {
+      "epoch": 0.039778366658636474,
+      "grad_norm": 0.13258612155914307,
+      "learning_rate": 0.00019786170651277824,
+      "loss": 1.1234,
+      "step": 516
+    },
+    {
+      "epoch": 0.03993254637436762,
+      "grad_norm": 0.17693160474300385,
+      "learning_rate": 0.00019785140148392415,
+      "loss": 1.1121,
+      "step": 518
+    },
+    {
+      "epoch": 0.04008672609009877,
+      "grad_norm": 0.143838569521904,
+      "learning_rate": 0.00019784109645507006,
+      "loss": 1.102,
+      "step": 520
+    },
+    {
+      "epoch": 0.04024090580582992,
+      "grad_norm": 0.14078038930892944,
+      "learning_rate": 0.000197830791426216,
+      "loss": 1.1044,
+      "step": 522
+    },
+    {
+      "epoch": 0.04039508552156107,
+      "grad_norm": 0.12367985397577286,
+      "learning_rate": 0.00019782048639736192,
+      "loss": 1.102,
+      "step": 524
+    },
+    {
+      "epoch": 0.04054926523729222,
+      "grad_norm": 0.136929452419281,
+      "learning_rate": 0.00019781018136850784,
+      "loss": 1.0802,
+      "step": 526
+    },
+    {
+      "epoch": 0.04070344495302337,
+      "grad_norm": 0.15831957757472992,
+      "learning_rate": 0.00019779987633965375,
+      "loss": 1.09,
+      "step": 528
+    },
+    {
+      "epoch": 0.04085762466875452,
+      "grad_norm": 0.15482452511787415,
+      "learning_rate": 0.00019778957131079967,
+      "loss": 1.0828,
+      "step": 530
+    },
+    {
+      "epoch": 0.041011804384485664,
+      "grad_norm": 0.13797122240066528,
+      "learning_rate": 0.0001977792662819456,
+      "loss": 1.1263,
+      "step": 532
+    },
+    {
+      "epoch": 0.04116598410021682,
+      "grad_norm": 0.18304814398288727,
+      "learning_rate": 0.00019776896125309152,
+      "loss": 1.0991,
+      "step": 534
+    },
+    {
+      "epoch": 0.04132016381594796,
+      "grad_norm": 0.1509987860918045,
+      "learning_rate": 0.00019775865622423744,
+      "loss": 1.0804,
+      "step": 536
+    },
+    {
+      "epoch": 0.041474343531679114,
+      "grad_norm": 0.13406258821487427,
+      "learning_rate": 0.00019774835119538335,
+      "loss": 1.0348,
+      "step": 538
+    },
+    {
+      "epoch": 0.04162852324741026,
+      "grad_norm": 0.1413736194372177,
+      "learning_rate": 0.00019773804616652927,
+      "loss": 1.066,
+      "step": 540
+    },
+    {
+      "epoch": 0.04178270296314141,
+      "grad_norm": 0.1451394259929657,
+      "learning_rate": 0.0001977277411376752,
+      "loss": 1.0485,
+      "step": 542
+    },
+    {
+      "epoch": 0.041936882678872564,
+      "grad_norm": 0.13275358080863953,
+      "learning_rate": 0.00019771743610882113,
+      "loss": 1.1164,
+      "step": 544
+    },
+    {
+      "epoch": 0.04209106239460371,
+      "grad_norm": 0.15869611501693726,
+      "learning_rate": 0.00019770713107996704,
+      "loss": 1.1361,
+      "step": 546
+    },
+    {
+      "epoch": 0.04224524211033486,
+      "grad_norm": 0.14091487228870392,
+      "learning_rate": 0.00019769682605111295,
+      "loss": 1.061,
+      "step": 548
+    },
+    {
+      "epoch": 0.04239942182606601,
+      "grad_norm": 0.13538867235183716,
+      "learning_rate": 0.00019768652102225887,
+      "loss": 1.0607,
+      "step": 550
+    },
+    {
+      "epoch": 0.04255360154179716,
+      "grad_norm": 0.15626317262649536,
+      "learning_rate": 0.0001976762159934048,
+      "loss": 1.0758,
+      "step": 552
+    },
+    {
+      "epoch": 0.042707781257528304,
+      "grad_norm": 0.1293731927871704,
+      "learning_rate": 0.00019766591096455073,
+      "loss": 1.0434,
+      "step": 554
+    },
+    {
+      "epoch": 0.04286196097325946,
+      "grad_norm": 0.13498535752296448,
+      "learning_rate": 0.00019765560593569664,
+      "loss": 1.0953,
+      "step": 556
+    },
+    {
+      "epoch": 0.0430161406889906,
+      "grad_norm": 0.14134527742862701,
+      "learning_rate": 0.00019764530090684256,
+      "loss": 1.1559,
+      "step": 558
+    },
+    {
+      "epoch": 0.043170320404721754,
+      "grad_norm": 0.13958705961704254,
+      "learning_rate": 0.00019763499587798847,
+      "loss": 1.2585,
+      "step": 560
+    },
+    {
+      "epoch": 0.0433245001204529,
+      "grad_norm": 0.2181047797203064,
+      "learning_rate": 0.0001976246908491344,
+      "loss": 1.0164,
+      "step": 562
+    },
+    {
+      "epoch": 0.04347867983618405,
+      "grad_norm": 0.1365436315536499,
+      "learning_rate": 0.0001976143858202803,
+      "loss": 1.124,
+      "step": 564
+    },
+    {
+      "epoch": 0.043632859551915204,
+      "grad_norm": 0.12809793651103973,
+      "learning_rate": 0.00019760408079142622,
+      "loss": 1.0378,
+      "step": 566
+    },
+    {
+      "epoch": 0.04378703926764635,
+      "grad_norm": 0.12341924756765366,
+      "learning_rate": 0.00019759377576257213,
+      "loss": 1.1091,
+      "step": 568
+    },
+    {
+      "epoch": 0.0439412189833775,
+      "grad_norm": 0.14291982352733612,
+      "learning_rate": 0.00019758347073371805,
+      "loss": 1.1366,
+      "step": 570
+    },
+    {
+      "epoch": 0.04409539869910865,
+      "grad_norm": 0.14486652612686157,
+      "learning_rate": 0.000197573165704864,
+      "loss": 1.0168,
+      "step": 572
+    },
+    {
+      "epoch": 0.0442495784148398,
+      "grad_norm": 0.1724916249513626,
+      "learning_rate": 0.0001975628606760099,
+      "loss": 1.1037,
+      "step": 574
+    },
+    {
+      "epoch": 0.044403758130570944,
+      "grad_norm": 0.13338427245616913,
+      "learning_rate": 0.00019755255564715582,
+      "loss": 1.0259,
+      "step": 576
+    },
+    {
+      "epoch": 0.0445579378463021,
+      "grad_norm": 0.1372508853673935,
+      "learning_rate": 0.00019754225061830173,
+      "loss": 1.0784,
+      "step": 578
+    },
+    {
+      "epoch": 0.04471211756203324,
+      "grad_norm": 0.11633725464344025,
+      "learning_rate": 0.00019753194558944765,
+      "loss": 1.0648,
+      "step": 580
+    },
+    {
+      "epoch": 0.044866297277764394,
+      "grad_norm": 0.14386776089668274,
+      "learning_rate": 0.00019752164056059356,
+      "loss": 1.0777,
+      "step": 582
+    },
+    {
+      "epoch": 0.045020476993495546,
+      "grad_norm": 0.14929193258285522,
+      "learning_rate": 0.0001975113355317395,
+      "loss": 1.1319,
+      "step": 584
+    },
+    {
+      "epoch": 0.04517465670922669,
+      "grad_norm": 0.1324220448732376,
+      "learning_rate": 0.00019750103050288542,
+      "loss": 1.0614,
+      "step": 586
+    },
+    {
+      "epoch": 0.045328836424957844,
+      "grad_norm": 0.1392926126718521,
+      "learning_rate": 0.00019749072547403133,
+      "loss": 1.142,
+      "step": 588
+    },
+    {
+      "epoch": 0.04548301614068899,
+      "grad_norm": 0.2632090151309967,
+      "learning_rate": 0.00019748042044517725,
+      "loss": 1.0159,
+      "step": 590
+    },
+    {
+      "epoch": 0.04563719585642014,
+      "grad_norm": 0.13699129223823547,
+      "learning_rate": 0.00019747011541632316,
+      "loss": 1.0778,
+      "step": 592
+    },
+    {
+      "epoch": 0.04579137557215129,
+      "grad_norm": 0.13768675923347473,
+      "learning_rate": 0.0001974598103874691,
+      "loss": 1.0719,
+      "step": 594
+    },
+    {
+      "epoch": 0.04594555528788244,
+      "grad_norm": 0.13458684086799622,
+      "learning_rate": 0.00019744950535861502,
+      "loss": 1.0145,
+      "step": 596
+    },
+    {
+      "epoch": 0.046099735003613584,
+      "grad_norm": 0.1772696077823639,
+      "learning_rate": 0.00019743920032976094,
+      "loss": 1.0629,
+      "step": 598
+    },
+    {
+      "epoch": 0.04625391471934474,
+      "grad_norm": 0.13998697698116302,
+      "learning_rate": 0.00019742889530090685,
+      "loss": 1.102,
+      "step": 600
+    },
+    {
+      "epoch": 0.04625391471934474,
+      "eval_loss": 1.098169207572937,
+      "eval_runtime": 185.5141,
+      "eval_samples_per_second": 91.33,
+      "eval_steps_per_second": 1.428,
+      "step": 600
+    },
+    {
+      "epoch": 0.04640809443507588,
+      "grad_norm": 0.13928066194057465,
+      "learning_rate": 0.00019741859027205277,
+      "loss": 1.1527,
+      "step": 602
+    },
+    {
+      "epoch": 0.046562274150807034,
+      "grad_norm": 0.13011601567268372,
+      "learning_rate": 0.0001974082852431987,
+      "loss": 1.1259,
+      "step": 604
+    },
+    {
+      "epoch": 0.046716453866538186,
+      "grad_norm": 0.1306074559688568,
+      "learning_rate": 0.00019739798021434462,
+      "loss": 1.0951,
+      "step": 606
+    },
+    {
+      "epoch": 0.04687063358226933,
+      "grad_norm": 0.14797037839889526,
+      "learning_rate": 0.00019738767518549054,
+      "loss": 1.0321,
+      "step": 608
+    },
+    {
+      "epoch": 0.047024813298000484,
+      "grad_norm": 0.14849938452243805,
+      "learning_rate": 0.00019737737015663645,
+      "loss": 1.1096,
+      "step": 610
+    },
+    {
+      "epoch": 0.04717899301373163,
+      "grad_norm": 0.12060682475566864,
+      "learning_rate": 0.00019736706512778237,
+      "loss": 1.0652,
+      "step": 612
+    },
+    {
+      "epoch": 0.04733317272946278,
+      "grad_norm": 0.12754854559898376,
+      "learning_rate": 0.00019735676009892828,
+      "loss": 1.1097,
+      "step": 614
+    },
+    {
+      "epoch": 0.04748735244519393,
+      "grad_norm": 0.12162326276302338,
+      "learning_rate": 0.0001973464550700742,
+      "loss": 1.1087,
+      "step": 616
+    },
+    {
+      "epoch": 0.04764153216092508,
+      "grad_norm": 0.175630122423172,
+      "learning_rate": 0.0001973361500412201,
+      "loss": 1.0723,
+      "step": 618
+    },
+    {
+      "epoch": 0.047795711876656224,
+      "grad_norm": 0.15365472435951233,
+      "learning_rate": 0.00019732584501236603,
+      "loss": 1.1009,
+      "step": 620
+    },
+    {
+      "epoch": 0.04794989159238738,
+      "grad_norm": 0.13359837234020233,
+      "learning_rate": 0.00019731553998351194,
+      "loss": 1.0974,
+      "step": 622
+    },
+    {
+      "epoch": 0.04810407130811853,
+      "grad_norm": 0.1482960432767868,
+      "learning_rate": 0.00019730523495465788,
+      "loss": 1.1214,
+      "step": 624
+    },
+    {
+      "epoch": 0.048258251023849674,
+      "grad_norm": 0.1309668868780136,
+      "learning_rate": 0.0001972949299258038,
+      "loss": 1.0849,
+      "step": 626
+    },
+    {
+      "epoch": 0.048412430739580826,
+      "grad_norm": 0.1544414609670639,
+      "learning_rate": 0.00019728462489694971,
+      "loss": 1.092,
+      "step": 628
+    },
+    {
+      "epoch": 0.04856661045531197,
+      "grad_norm": 0.14907146990299225,
+      "learning_rate": 0.00019727431986809563,
+      "loss": 1.0671,
+      "step": 630
+    },
+    {
+      "epoch": 0.048720790171043124,
+      "grad_norm": 0.16943813860416412,
+      "learning_rate": 0.00019726401483924154,
+      "loss": 1.1433,
+      "step": 632
+    },
+    {
+      "epoch": 0.04887496988677427,
+      "grad_norm": 0.14070230722427368,
+      "learning_rate": 0.00019725370981038749,
+      "loss": 1.1613,
+      "step": 634
+    },
+    {
+      "epoch": 0.04902914960250542,
+      "grad_norm": 0.15507204830646515,
+      "learning_rate": 0.0001972434047815334,
+      "loss": 1.1286,
+      "step": 636
+    },
+    {
+      "epoch": 0.04918332931823657,
+      "grad_norm": 0.13587893545627594,
+      "learning_rate": 0.00019723309975267932,
+      "loss": 1.1094,
+      "step": 638
+    },
+    {
+      "epoch": 0.04933750903396772,
+      "grad_norm": 0.12399852275848389,
+      "learning_rate": 0.00019722279472382523,
+      "loss": 1.058,
+      "step": 640
+    },
+    {
+      "epoch": 0.049491688749698864,
+      "grad_norm": 0.12497518211603165,
+      "learning_rate": 0.00019721248969497115,
+      "loss": 1.0716,
+      "step": 642
+    },
+    {
+      "epoch": 0.04964586846543002,
+      "grad_norm": 0.15282607078552246,
+      "learning_rate": 0.0001972021846661171,
+      "loss": 1.0912,
+      "step": 644
+    },
+    {
+      "epoch": 0.04980004818116117,
+      "grad_norm": 0.14203013479709625,
+      "learning_rate": 0.000197191879637263,
+      "loss": 1.0846,
+      "step": 646
+    },
+    {
+      "epoch": 0.049954227896892314,
+      "grad_norm": 0.12308704853057861,
+      "learning_rate": 0.00019718157460840892,
+      "loss": 1.1202,
+      "step": 648
+    },
+    {
+      "epoch": 0.050108407612623466,
+      "grad_norm": 0.15226681530475616,
+      "learning_rate": 0.00019717126957955483,
+      "loss": 1.0626,
+      "step": 650
+    },
+    {
+      "epoch": 0.05026258732835461,
+      "grad_norm": 0.12636694312095642,
+      "learning_rate": 0.00019716096455070075,
+      "loss": 1.1086,
+      "step": 652
+    },
+    {
+      "epoch": 0.050416767044085764,
+      "grad_norm": 0.14969666302204132,
+      "learning_rate": 0.0001971506595218467,
+      "loss": 1.1602,
+      "step": 654
+    },
+    {
+      "epoch": 0.05057094675981691,
+      "grad_norm": 0.130833700299263,
+      "learning_rate": 0.0001971403544929926,
+      "loss": 1.0657,
+      "step": 656
+    },
+    {
+      "epoch": 0.05072512647554806,
+      "grad_norm": 0.1283751279115677,
+      "learning_rate": 0.00019713004946413852,
+      "loss": 1.0371,
+      "step": 658
+    },
+    {
+      "epoch": 0.05087930619127921,
+      "grad_norm": 0.11827697604894638,
+      "learning_rate": 0.00019711974443528443,
+      "loss": 1.0308,
+      "step": 660
+    },
+    {
+      "epoch": 0.05103348590701036,
+      "grad_norm": 0.12265590578317642,
+      "learning_rate": 0.00019710943940643035,
+      "loss": 1.1127,
+      "step": 662
+    },
+    {
+      "epoch": 0.05118766562274151,
+      "grad_norm": 0.13979150354862213,
+      "learning_rate": 0.0001970991343775763,
+      "loss": 1.1011,
+      "step": 664
+    },
+    {
+      "epoch": 0.05134184533847266,
+      "grad_norm": 0.1368461698293686,
+      "learning_rate": 0.0001970888293487222,
+      "loss": 1.0857,
+      "step": 666
+    },
+    {
+      "epoch": 0.05149602505420381,
+      "grad_norm": 0.13669301569461823,
+      "learning_rate": 0.00019707852431986812,
+      "loss": 1.0971,
+      "step": 668
+    },
+    {
+      "epoch": 0.051650204769934954,
+      "grad_norm": 0.12659449875354767,
+      "learning_rate": 0.00019706821929101404,
+      "loss": 1.0556,
+      "step": 670
+    },
+    {
+      "epoch": 0.051804384485666106,
+      "grad_norm": 0.14103113114833832,
+      "learning_rate": 0.00019705791426215995,
+      "loss": 1.0913,
+      "step": 672
+    },
+    {
+      "epoch": 0.05195856420139725,
+      "grad_norm": 0.16134017705917358,
+      "learning_rate": 0.00019704760923330587,
+      "loss": 1.0994,
+      "step": 674
+    },
+    {
+      "epoch": 0.052112743917128404,
+      "grad_norm": 0.12725086510181427,
+      "learning_rate": 0.00019703730420445178,
+      "loss": 1.1008,
+      "step": 676
+    },
+    {
+      "epoch": 0.05226692363285955,
+      "grad_norm": 0.12865908443927765,
+      "learning_rate": 0.0001970269991755977,
+      "loss": 1.0186,
+      "step": 678
+    },
+    {
+      "epoch": 0.0524211033485907,
+      "grad_norm": 0.1661859154701233,
+      "learning_rate": 0.0001970166941467436,
+      "loss": 1.068,
+      "step": 680
+    },
+    {
+      "epoch": 0.05257528306432185,
+      "grad_norm": 0.14370663464069366,
+      "learning_rate": 0.00019700638911788953,
+      "loss": 1.102,
+      "step": 682
+    },
+    {
+      "epoch": 0.052729462780053,
+      "grad_norm": 0.13285204768180847,
+      "learning_rate": 0.00019699608408903544,
+      "loss": 1.1055,
+      "step": 684
+    },
+    {
+      "epoch": 0.05288364249578415,
+      "grad_norm": 0.17762747406959534,
+      "learning_rate": 0.00019698577906018138,
+      "loss": 1.1601,
+      "step": 686
+    },
+    {
+      "epoch": 0.0530378222115153,
+      "grad_norm": 0.12693317234516144,
+      "learning_rate": 0.0001969754740313273,
+      "loss": 1.0494,
+      "step": 688
+    },
+    {
+      "epoch": 0.05319200192724645,
+      "grad_norm": 0.1302707940340042,
+      "learning_rate": 0.0001969651690024732,
+      "loss": 1.066,
+      "step": 690
+    },
+    {
+      "epoch": 0.053346181642977594,
+      "grad_norm": 0.11844471096992493,
+      "learning_rate": 0.00019695486397361913,
+      "loss": 1.0085,
+      "step": 692
+    },
+    {
+      "epoch": 0.053500361358708746,
+      "grad_norm": 0.12299422174692154,
+      "learning_rate": 0.00019694455894476504,
+      "loss": 1.0985,
+      "step": 694
+    },
+    {
+      "epoch": 0.05365454107443989,
+      "grad_norm": 0.1222420409321785,
+      "learning_rate": 0.00019693425391591098,
+      "loss": 1.0648,
+      "step": 696
+    },
+    {
+      "epoch": 0.053808720790171044,
+      "grad_norm": 0.13273879885673523,
+      "learning_rate": 0.0001969239488870569,
+      "loss": 1.1108,
+      "step": 698
+    },
+    {
+      "epoch": 0.05396290050590219,
+      "grad_norm": 0.13202215731143951,
+      "learning_rate": 0.00019691364385820281,
+      "loss": 1.1013,
+      "step": 700
+    },
+    {
+      "epoch": 0.05396290050590219,
+      "eval_loss": 1.0964874029159546,
+      "eval_runtime": 185.3303,
+      "eval_samples_per_second": 91.421,
+      "eval_steps_per_second": 1.43,
+      "step": 700
+    },
+    {
+      "epoch": 0.05411708022163334,
+      "grad_norm": 0.13038010895252228,
+      "learning_rate": 0.00019690333882934873,
+      "loss": 1.0642,
+      "step": 702
+    },
+    {
+      "epoch": 0.054271259937364494,
+      "grad_norm": 0.18084144592285156,
+      "learning_rate": 0.00019689303380049464,
+      "loss": 1.0673,
+      "step": 704
+    },
+    {
+      "epoch": 0.05442543965309564,
+      "grad_norm": 0.18958036601543427,
+      "learning_rate": 0.00019688272877164059,
+      "loss": 1.0925,
+      "step": 706
+    },
+    {
+      "epoch": 0.05457961936882679,
+      "grad_norm": 0.13386841118335724,
+      "learning_rate": 0.0001968724237427865,
+      "loss": 1.0978,
+      "step": 708
+    },
+    {
+      "epoch": 0.05473379908455794,
+      "grad_norm": 0.1408504843711853,
+      "learning_rate": 0.00019686211871393242,
+      "loss": 1.1158,
+      "step": 710
+    },
+    {
+      "epoch": 0.05488797880028909,
+      "grad_norm": 0.12006545811891556,
+      "learning_rate": 0.00019685181368507833,
+      "loss": 1.0395,
+      "step": 712
+    },
+    {
+      "epoch": 0.055042158516020234,
+      "grad_norm": 0.13973191380500793,
+      "learning_rate": 0.00019684150865622425,
+      "loss": 1.0685,
+      "step": 714
+    },
+    {
+      "epoch": 0.055196338231751386,
+      "grad_norm": 0.14461107552051544,
+      "learning_rate": 0.0001968312036273702,
+      "loss": 1.0924,
+      "step": 716
+    },
+    {
+      "epoch": 0.05535051794748253,
+      "grad_norm": 0.13358595967292786,
+      "learning_rate": 0.0001968208985985161,
+      "loss": 1.0479,
+      "step": 718
+    },
+    {
+      "epoch": 0.055504697663213684,
+      "grad_norm": 0.13416843116283417,
+      "learning_rate": 0.00019681059356966202,
+      "loss": 1.0166,
+      "step": 720
+    },
+    {
+      "epoch": 0.05565887737894483,
+      "grad_norm": 0.15217959880828857,
+      "learning_rate": 0.00019680028854080793,
+      "loss": 1.0918,
+      "step": 722
+    },
+    {
+      "epoch": 0.05581305709467598,
+      "grad_norm": 0.13012762367725372,
+      "learning_rate": 0.00019678998351195385,
+      "loss": 1.0967,
+      "step": 724
+    },
+    {
+      "epoch": 0.055967236810407134,
+      "grad_norm": 0.13023535907268524,
+      "learning_rate": 0.00019677967848309976,
+      "loss": 1.0247,
+      "step": 726
+    },
+    {
+      "epoch": 0.05612141652613828,
+      "grad_norm": 0.13703665137290955,
+      "learning_rate": 0.00019676937345424568,
+      "loss": 1.0969,
+      "step": 728
+    },
+    {
+      "epoch": 0.05627559624186943,
+      "grad_norm": 0.12767066061496735,
+      "learning_rate": 0.0001967590684253916,
+      "loss": 1.08,
+      "step": 730
+    },
+    {
+      "epoch": 0.05642977595760058,
+      "grad_norm": 0.12238382548093796,
+      "learning_rate": 0.0001967487633965375,
+      "loss": 1.1233,
+      "step": 732
+    },
+    {
+      "epoch": 0.05658395567333173,
+      "grad_norm": 0.1356974095106125,
+      "learning_rate": 0.00019673845836768342,
+      "loss": 1.0439,
+      "step": 734
+    },
+    {
+      "epoch": 0.056738135389062874,
+      "grad_norm": 0.14199669659137726,
+      "learning_rate": 0.00019672815333882936,
+      "loss": 1.0753,
+      "step": 736
+    },
+    {
+      "epoch": 0.056892315104794026,
+      "grad_norm": 0.12904112040996552,
+      "learning_rate": 0.00019671784830997528,
+      "loss": 1.0749,
+      "step": 738
+    },
+    {
+      "epoch": 0.05704649482052517,
+      "grad_norm": 0.1235031932592392,
+      "learning_rate": 0.0001967075432811212,
+      "loss": 1.0275,
+      "step": 740
+    },
+    {
+      "epoch": 0.057200674536256324,
+      "grad_norm": 0.170023113489151,
+      "learning_rate": 0.0001966972382522671,
+      "loss": 1.1295,
+      "step": 742
+    },
+    {
+      "epoch": 0.057354854251987476,
+      "grad_norm": 0.15533532202243805,
+      "learning_rate": 0.00019668693322341302,
+      "loss": 1.0629,
+      "step": 744
+    },
+    {
+      "epoch": 0.05750903396771862,
+      "grad_norm": 0.1602126806974411,
+      "learning_rate": 0.00019667662819455897,
+      "loss": 1.1538,
+      "step": 746
+    },
+    {
+      "epoch": 0.057663213683449774,
+      "grad_norm": 0.16433580219745636,
+      "learning_rate": 0.00019666632316570488,
+      "loss": 1.1322,
+      "step": 748
+    },
+    {
+      "epoch": 0.05781739339918092,
+      "grad_norm": 0.13925233483314514,
+      "learning_rate": 0.0001966560181368508,
+      "loss": 1.083,
+      "step": 750
+    },
+    {
+      "epoch": 0.05797157311491207,
+      "grad_norm": 0.12234565615653992,
+      "learning_rate": 0.0001966457131079967,
+      "loss": 1.0113,
+      "step": 752
+    },
+    {
+      "epoch": 0.05812575283064322,
+      "grad_norm": 0.1425125002861023,
+      "learning_rate": 0.00019663540807914262,
+      "loss": 1.0762,
+      "step": 754
+    },
+    {
+      "epoch": 0.05827993254637437,
+      "grad_norm": 0.14309099316596985,
+      "learning_rate": 0.00019662510305028854,
+      "loss": 1.0633,
+      "step": 756
+    },
+    {
+      "epoch": 0.058434112262105514,
+      "grad_norm": 0.1381814330816269,
+      "learning_rate": 0.00019661479802143448,
+      "loss": 1.142,
+      "step": 758
+    },
+    {
+      "epoch": 0.058588291977836666,
+      "grad_norm": 0.15551595389842987,
+      "learning_rate": 0.0001966044929925804,
+      "loss": 1.026,
+      "step": 760
+    },
+    {
+      "epoch": 0.05874247169356781,
+      "grad_norm": 0.14606410264968872,
+      "learning_rate": 0.0001965941879637263,
+      "loss": 1.1265,
+      "step": 762
+    },
+    {
+      "epoch": 0.058896651409298964,
+      "grad_norm": 0.13017289340496063,
+      "learning_rate": 0.00019658388293487223,
+      "loss": 1.1051,
+      "step": 764
+    },
+    {
+      "epoch": 0.059050831125030116,
+      "grad_norm": 0.1500990092754364,
+      "learning_rate": 0.00019657357790601814,
+      "loss": 1.0948,
+      "step": 766
+    },
+    {
+      "epoch": 0.05920501084076126,
+      "grad_norm": 0.14307473599910736,
+      "learning_rate": 0.00019656327287716408,
+      "loss": 1.0667,
+      "step": 768
+    },
+    {
+      "epoch": 0.059359190556492414,
+      "grad_norm": 0.13513712584972382,
+      "learning_rate": 0.00019655296784831,
+      "loss": 1.0488,
+      "step": 770
+    },
+    {
+      "epoch": 0.05951337027222356,
+      "grad_norm": 0.13991938531398773,
+      "learning_rate": 0.0001965426628194559,
+      "loss": 1.0888,
+      "step": 772
+    },
+    {
+      "epoch": 0.05966754998795471,
+      "grad_norm": 0.15015999972820282,
+      "learning_rate": 0.00019653235779060183,
+      "loss": 1.0774,
+      "step": 774
+    },
+    {
+      "epoch": 0.05982172970368586,
+      "grad_norm": 0.16419099271297455,
+      "learning_rate": 0.00019652205276174774,
+      "loss": 1.0661,
+      "step": 776
+    },
+    {
+      "epoch": 0.05997590941941701,
+      "grad_norm": 0.12072901427745819,
+      "learning_rate": 0.00019651174773289366,
+      "loss": 1.0645,
+      "step": 778
+    },
+    {
+      "epoch": 0.060130089135148154,
+      "grad_norm": 0.13410696387290955,
+      "learning_rate": 0.00019650144270403957,
+      "loss": 1.0677,
+      "step": 780
+    },
+    {
+      "epoch": 0.060284268850879306,
+      "grad_norm": 0.13373896479606628,
+      "learning_rate": 0.0001964911376751855,
+      "loss": 1.0055,
+      "step": 782
+    },
+    {
+      "epoch": 0.06043844856661046,
+      "grad_norm": 0.13043928146362305,
+      "learning_rate": 0.0001964808326463314,
+      "loss": 1.0579,
+      "step": 784
+    },
+    {
+      "epoch": 0.060592628282341604,
+      "grad_norm": 0.13334155082702637,
+      "learning_rate": 0.00019647052761747732,
+      "loss": 1.0781,
+      "step": 786
+    },
+    {
+      "epoch": 0.060746807998072756,
+      "grad_norm": 0.14660002291202545,
+      "learning_rate": 0.00019646022258862326,
+      "loss": 1.1244,
+      "step": 788
+    },
+    {
+      "epoch": 0.0609009877138039,
+      "grad_norm": 0.1240791380405426,
+      "learning_rate": 0.00019644991755976917,
+      "loss": 1.0353,
+      "step": 790
+    },
+    {
+      "epoch": 0.061055167429535054,
+      "grad_norm": 0.12248943001031876,
+      "learning_rate": 0.0001964396125309151,
+      "loss": 1.1292,
+      "step": 792
+    },
+    {
+      "epoch": 0.0612093471452662,
+      "grad_norm": 0.1340823471546173,
+      "learning_rate": 0.000196429307502061,
+      "loss": 1.0764,
+      "step": 794
+    },
+    {
+      "epoch": 0.06136352686099735,
+      "grad_norm": 0.1297413557767868,
+      "learning_rate": 0.00019641900247320692,
+      "loss": 1.0998,
+      "step": 796
+    },
+    {
+      "epoch": 0.0615177065767285,
+      "grad_norm": 0.13512568175792694,
+      "learning_rate": 0.00019640869744435286,
+      "loss": 1.0349,
+      "step": 798
+    },
+    {
+      "epoch": 0.06167188629245965,
+      "grad_norm": 0.13964438438415527,
+      "learning_rate": 0.00019639839241549878,
+      "loss": 1.0543,
+      "step": 800
+    },
+    {
+      "epoch": 0.06167188629245965,
+      "eval_loss": 1.0952669382095337,
+      "eval_runtime": 185.8383,
+      "eval_samples_per_second": 91.171,
+      "eval_steps_per_second": 1.426,
+      "step": 800
+    },
+    {
+      "epoch": 0.061826066008190794,
+      "grad_norm": 0.1318446695804596,
+      "learning_rate": 0.0001963880873866447,
+      "loss": 1.1469,
+      "step": 802
+    },
+    {
+      "epoch": 0.061980245723921946,
+      "grad_norm": 0.13778544962406158,
+      "learning_rate": 0.0001963777823577906,
+      "loss": 1.0361,
+      "step": 804
+    },
+    {
+      "epoch": 0.0621344254396531,
+      "grad_norm": 0.14804169535636902,
+      "learning_rate": 0.00019636747732893652,
+      "loss": 1.0537,
+      "step": 806
+    },
+    {
+      "epoch": 0.062288605155384244,
+      "grad_norm": 0.1363479495048523,
+      "learning_rate": 0.00019635717230008246,
+      "loss": 1.0819,
+      "step": 808
+    },
+    {
+      "epoch": 0.062442784871115396,
+      "grad_norm": 0.12277363240718842,
+      "learning_rate": 0.00019634686727122838,
+      "loss": 1.0629,
+      "step": 810
+    },
+    {
+      "epoch": 0.06259696458684655,
+      "grad_norm": 0.13027344644069672,
+      "learning_rate": 0.0001963365622423743,
+      "loss": 1.0544,
+      "step": 812
+    },
+    {
+      "epoch": 0.0627511443025777,
+      "grad_norm": 0.1274079531431198,
+      "learning_rate": 0.0001963262572135202,
+      "loss": 1.0685,
+      "step": 814
+    },
+    {
+      "epoch": 0.06290532401830884,
+      "grad_norm": 0.1349189281463623,
+      "learning_rate": 0.00019631595218466612,
+      "loss": 1.0289,
+      "step": 816
+    },
+    {
+      "epoch": 0.06305950373403998,
+      "grad_norm": 0.1265273541212082,
+      "learning_rate": 0.00019630564715581206,
+      "loss": 1.0765,
+      "step": 818
+    },
+    {
+      "epoch": 0.06321368344977114,
+      "grad_norm": 0.1393941193819046,
+      "learning_rate": 0.00019629534212695798,
+      "loss": 1.0918,
+      "step": 820
+    },
+    {
+      "epoch": 0.06336786316550229,
+      "grad_norm": 0.12475106865167618,
+      "learning_rate": 0.0001962850370981039,
+      "loss": 1.027,
+      "step": 822
+    },
+    {
+      "epoch": 0.06352204288123343,
+      "grad_norm": 0.13844382762908936,
+      "learning_rate": 0.0001962747320692498,
+      "loss": 1.1482,
+      "step": 824
+    },
+    {
+      "epoch": 0.0636762225969646,
+      "grad_norm": 0.1444624364376068,
+      "learning_rate": 0.00019626442704039572,
+      "loss": 1.0659,
+      "step": 826
+    },
+    {
+      "epoch": 0.06383040231269574,
+      "grad_norm": 0.13939915597438812,
+      "learning_rate": 0.00019625412201154164,
+      "loss": 1.0392,
+      "step": 828
+    },
+    {
+      "epoch": 0.06398458202842688,
+      "grad_norm": 0.12919913232326508,
+      "learning_rate": 0.00019624381698268755,
+      "loss": 1.0566,
+      "step": 830
+    },
+    {
+      "epoch": 0.06413876174415803,
+      "grad_norm": 0.1297498196363449,
+      "learning_rate": 0.00019623351195383347,
+      "loss": 1.058,
+      "step": 832
+    },
+    {
+      "epoch": 0.06429294145988919,
+      "grad_norm": 0.16311457753181458,
+      "learning_rate": 0.00019622320692497938,
+      "loss": 1.1175,
+      "step": 834
+    },
+    {
+      "epoch": 0.06444712117562033,
+      "grad_norm": 0.14434239268302917,
+      "learning_rate": 0.0001962129018961253,
+      "loss": 1.0966,
+      "step": 836
+    },
+    {
+      "epoch": 0.06460130089135148,
+      "grad_norm": 0.13500697910785675,
+      "learning_rate": 0.00019620259686727121,
+      "loss": 1.138,
+      "step": 838
+    },
+    {
+      "epoch": 0.06475548060708262,
+      "grad_norm": 0.13175781071186066,
+      "learning_rate": 0.00019619229183841716,
+      "loss": 1.0744,
+      "step": 840
+    },
+    {
+      "epoch": 0.06490966032281378,
+      "grad_norm": 0.142098531126976,
+      "learning_rate": 0.00019618198680956307,
+      "loss": 1.0686,
+      "step": 842
+    },
+    {
+      "epoch": 0.06506384003854493,
+      "grad_norm": 0.16844119131565094,
+      "learning_rate": 0.00019617168178070899,
+      "loss": 1.0992,
+      "step": 844
+    },
+    {
+      "epoch": 0.06521801975427607,
+      "grad_norm": 0.13562923669815063,
+      "learning_rate": 0.0001961613767518549,
+      "loss": 1.0749,
+      "step": 846
+    },
+    {
+      "epoch": 0.06537219947000723,
+      "grad_norm": 0.14538466930389404,
+      "learning_rate": 0.00019615107172300082,
+      "loss": 1.123,
+      "step": 848
+    },
+    {
+      "epoch": 0.06552637918573838,
+      "grad_norm": 0.13058879971504211,
+      "learning_rate": 0.00019614076669414676,
+      "loss": 1.0835,
+      "step": 850
+    },
+    {
+      "epoch": 0.06568055890146952,
+      "grad_norm": 0.1567140519618988,
+      "learning_rate": 0.00019613046166529267,
+      "loss": 1.1157,
+      "step": 852
+    },
+    {
+      "epoch": 0.06583473861720067,
+      "grad_norm": 0.12576104700565338,
+      "learning_rate": 0.0001961201566364386,
+      "loss": 1.0143,
+      "step": 854
+    },
+    {
+      "epoch": 0.06598891833293183,
+      "grad_norm": 0.13823091983795166,
+      "learning_rate": 0.0001961098516075845,
+      "loss": 1.0797,
+      "step": 856
+    },
+    {
+      "epoch": 0.06614309804866297,
+      "grad_norm": 0.12293639779090881,
+      "learning_rate": 0.00019609954657873042,
+      "loss": 1.0808,
+      "step": 858
+    },
+    {
+      "epoch": 0.06629727776439412,
+      "grad_norm": 0.13951502740383148,
+      "learning_rate": 0.00019608924154987636,
+      "loss": 1.076,
+      "step": 860
+    },
+    {
+      "epoch": 0.06645145748012526,
+      "grad_norm": 0.13900773227214813,
+      "learning_rate": 0.00019607893652102227,
+      "loss": 1.0846,
+      "step": 862
+    },
+    {
+      "epoch": 0.06660563719585642,
+      "grad_norm": 0.14335249364376068,
+      "learning_rate": 0.0001960686314921682,
+      "loss": 1.0639,
+      "step": 864
+    },
+    {
+      "epoch": 0.06675981691158757,
+      "grad_norm": 0.1712643951177597,
+      "learning_rate": 0.0001960583264633141,
+      "loss": 1.1411,
+      "step": 866
+    },
+    {
+      "epoch": 0.06691399662731871,
+      "grad_norm": 0.12118082493543625,
+      "learning_rate": 0.00019604802143446002,
+      "loss": 1.0807,
+      "step": 868
+    },
+    {
+      "epoch": 0.06706817634304987,
+      "grad_norm": 0.141808420419693,
+      "learning_rate": 0.00019603771640560596,
+      "loss": 1.0641,
+      "step": 870
+    },
+    {
+      "epoch": 0.06722235605878102,
+      "grad_norm": 0.14798308908939362,
+      "learning_rate": 0.00019602741137675188,
+      "loss": 1.073,
+      "step": 872
+    },
+    {
+      "epoch": 0.06737653577451216,
+      "grad_norm": 0.13768306374549866,
+      "learning_rate": 0.0001960171063478978,
+      "loss": 1.0735,
+      "step": 874
+    },
+    {
+      "epoch": 0.06753071549024331,
+      "grad_norm": 0.12452355027198792,
+      "learning_rate": 0.0001960068013190437,
+      "loss": 1.0509,
+      "step": 876
+    },
+    {
+      "epoch": 0.06768489520597447,
+      "grad_norm": 0.1402217000722885,
+      "learning_rate": 0.00019599649629018962,
+      "loss": 1.1157,
+      "step": 878
+    },
+    {
+      "epoch": 0.06783907492170561,
+      "grad_norm": 0.12509870529174805,
+      "learning_rate": 0.00019598619126133556,
+      "loss": 1.0516,
+      "step": 880
+    },
+    {
+      "epoch": 0.06799325463743676,
+      "grad_norm": 0.1574297547340393,
+      "learning_rate": 0.00019597588623248148,
+      "loss": 1.0823,
+      "step": 882
+    },
+    {
+      "epoch": 0.0681474343531679,
+      "grad_norm": 0.14185413718223572,
+      "learning_rate": 0.0001959655812036274,
+      "loss": 1.0444,
+      "step": 884
+    },
+    {
+      "epoch": 0.06830161406889906,
+      "grad_norm": 0.1380462348461151,
+      "learning_rate": 0.0001959552761747733,
+      "loss": 1.1066,
+      "step": 886
+    },
+    {
+      "epoch": 0.06845579378463021,
+      "grad_norm": 0.12986746430397034,
+      "learning_rate": 0.00019594497114591922,
+      "loss": 1.1006,
+      "step": 888
+    },
+    {
+      "epoch": 0.06860997350036135,
+      "grad_norm": 0.13894346356391907,
+      "learning_rate": 0.00019593466611706514,
+      "loss": 1.0569,
+      "step": 890
+    },
+    {
+      "epoch": 0.06876415321609251,
+      "grad_norm": 0.12822435796260834,
+      "learning_rate": 0.00019592436108821105,
+      "loss": 1.0696,
+      "step": 892
+    },
+    {
+      "epoch": 0.06891833293182366,
+      "grad_norm": 0.1369408816099167,
+      "learning_rate": 0.00019591405605935697,
+      "loss": 1.0691,
+      "step": 894
+    },
+    {
+      "epoch": 0.0690725126475548,
+      "grad_norm": 0.13459660112857819,
+      "learning_rate": 0.00019590375103050288,
+      "loss": 1.0801,
+      "step": 896
+    },
+    {
+      "epoch": 0.06922669236328595,
+      "grad_norm": 0.1299123764038086,
+      "learning_rate": 0.0001958934460016488,
+      "loss": 1.0885,
+      "step": 898
+    },
+    {
+      "epoch": 0.06938087207901711,
+      "grad_norm": 0.12562230229377747,
+      "learning_rate": 0.00019588314097279474,
+      "loss": 1.183,
+      "step": 900
+    },
+    {
+      "epoch": 0.06938087207901711,
+      "eval_loss": 1.0944268703460693,
+      "eval_runtime": 185.3723,
+      "eval_samples_per_second": 91.4,
+      "eval_steps_per_second": 1.43,
+      "step": 900
+    },
+    {
+      "epoch": 0.06953505179474825,
+      "grad_norm": 0.13996927440166473,
+      "learning_rate": 0.00019587283594394065,
+      "loss": 1.0356,
+      "step": 902
+    },
+    {
+      "epoch": 0.0696892315104794,
+      "grad_norm": 0.128004252910614,
+      "learning_rate": 0.00019586253091508657,
+      "loss": 1.0343,
+      "step": 904
+    },
+    {
+      "epoch": 0.06984341122621056,
+      "grad_norm": 0.15650418400764465,
+      "learning_rate": 0.00019585222588623248,
+      "loss": 1.1138,
+      "step": 906
+    },
+    {
+      "epoch": 0.0699975909419417,
+      "grad_norm": 0.5840476751327515,
+      "learning_rate": 0.0001958419208573784,
+      "loss": 1.1785,
+      "step": 908
+    },
+    {
+      "epoch": 0.07015177065767285,
+      "grad_norm": 0.15330374240875244,
+      "learning_rate": 0.00019583161582852434,
+      "loss": 1.0243,
+      "step": 910
+    },
+    {
+      "epoch": 0.070305950373404,
+      "grad_norm": 0.1603543907403946,
+      "learning_rate": 0.00019582131079967026,
+      "loss": 1.1228,
+      "step": 912
+    },
+    {
+      "epoch": 0.07046013008913515,
+      "grad_norm": 0.14209845662117004,
+      "learning_rate": 0.00019581100577081617,
+      "loss": 1.0939,
+      "step": 914
+    },
+    {
+      "epoch": 0.0706143098048663,
+      "grad_norm": 0.16117019951343536,
+      "learning_rate": 0.00019580070074196209,
+      "loss": 1.1447,
+      "step": 916
+    },
+    {
+      "epoch": 0.07076848952059744,
+      "grad_norm": 0.14068694412708282,
+      "learning_rate": 0.000195790395713108,
+      "loss": 1.0642,
+      "step": 918
+    },
+    {
+      "epoch": 0.07092266923632859,
+      "grad_norm": 0.15248316526412964,
+      "learning_rate": 0.00019578009068425394,
+      "loss": 1.0162,
+      "step": 920
+    },
+    {
+      "epoch": 0.07107684895205975,
+      "grad_norm": 0.22734233736991882,
+      "learning_rate": 0.00019576978565539986,
+      "loss": 1.1123,
+      "step": 922
+    },
+    {
+      "epoch": 0.0712310286677909,
+      "grad_norm": 0.1393287032842636,
+      "learning_rate": 0.00019575948062654577,
+      "loss": 1.0862,
+      "step": 924
+    },
+    {
+      "epoch": 0.07138520838352204,
+      "grad_norm": 0.12911191582679749,
+      "learning_rate": 0.0001957491755976917,
+      "loss": 1.0651,
+      "step": 926
+    },
+    {
+      "epoch": 0.0715393880992532,
+      "grad_norm": 0.12298440933227539,
+      "learning_rate": 0.0001957388705688376,
+      "loss": 1.1227,
+      "step": 928
+    },
+    {
+      "epoch": 0.07169356781498434,
+      "grad_norm": 0.14941005408763885,
+      "learning_rate": 0.00019572856553998352,
+      "loss": 1.0989,
+      "step": 930
+    },
+    {
+      "epoch": 0.07184774753071549,
+      "grad_norm": 0.1411515325307846,
+      "learning_rate": 0.00019571826051112946,
+      "loss": 1.0816,
+      "step": 932
+    },
+    {
+      "epoch": 0.07200192724644663,
+      "grad_norm": 0.11999720335006714,
+      "learning_rate": 0.00019570795548227537,
+      "loss": 1.0306,
+      "step": 934
+    },
+    {
+      "epoch": 0.0721561069621778,
+      "grad_norm": 0.1500861495733261,
+      "learning_rate": 0.0001956976504534213,
+      "loss": 1.0678,
+      "step": 936
+    },
+    {
+      "epoch": 0.07231028667790894,
+      "grad_norm": 0.12102475017309189,
+      "learning_rate": 0.0001956873454245672,
+      "loss": 1.0534,
+      "step": 938
+    },
+    {
+      "epoch": 0.07246446639364008,
+      "grad_norm": 0.11554603278636932,
+      "learning_rate": 0.00019567704039571312,
+      "loss": 1.0535,
+      "step": 940
+    },
+    {
+      "epoch": 0.07261864610937123,
+      "grad_norm": 0.12290264666080475,
+      "learning_rate": 0.00019566673536685903,
+      "loss": 1.0738,
+      "step": 942
+    },
+    {
+      "epoch": 0.07277282582510239,
+      "grad_norm": 0.17740991711616516,
+      "learning_rate": 0.00019565643033800495,
+      "loss": 1.0811,
+      "step": 944
+    },
+    {
+      "epoch": 0.07292700554083353,
+      "grad_norm": 0.14767777919769287,
+      "learning_rate": 0.00019564612530915086,
+      "loss": 1.105,
+      "step": 946
+    },
+    {
+      "epoch": 0.07308118525656468,
+      "grad_norm": 0.13773177564144135,
+      "learning_rate": 0.00019563582028029678,
+      "loss": 1.0983,
+      "step": 948
+    },
+    {
+      "epoch": 0.07323536497229584,
+      "grad_norm": 0.13891370594501495,
+      "learning_rate": 0.0001956255152514427,
+      "loss": 1.1349,
+      "step": 950
+    },
+    {
+      "epoch": 0.07338954468802698,
+      "grad_norm": 0.14717017114162445,
+      "learning_rate": 0.00019561521022258863,
+      "loss": 1.134,
+      "step": 952
+    },
+    {
+      "epoch": 0.07354372440375813,
+      "grad_norm": 0.15095743536949158,
+      "learning_rate": 0.00019560490519373455,
+      "loss": 1.063,
+      "step": 954
+    },
+    {
+      "epoch": 0.07369790411948927,
+      "grad_norm": 0.12851206958293915,
+      "learning_rate": 0.00019559460016488046,
+      "loss": 1.1005,
+      "step": 956
+    },
+    {
+      "epoch": 0.07385208383522043,
+      "grad_norm": 0.13364006578922272,
+      "learning_rate": 0.00019558429513602638,
+      "loss": 1.0429,
+      "step": 958
+    },
+    {
+      "epoch": 0.07400626355095158,
+      "grad_norm": 0.1326039433479309,
+      "learning_rate": 0.0001955739901071723,
+      "loss": 1.1586,
+      "step": 960
+    },
+    {
+      "epoch": 0.07416044326668272,
+      "grad_norm": 0.13149486482143402,
+      "learning_rate": 0.00019556368507831824,
+      "loss": 1.109,
+      "step": 962
+    },
+    {
+      "epoch": 0.07431462298241387,
+      "grad_norm": 0.1189669519662857,
+      "learning_rate": 0.00019555338004946415,
+      "loss": 1.0462,
+      "step": 964
+    },
+    {
+      "epoch": 0.07446880269814503,
+      "grad_norm": 0.14341482520103455,
+      "learning_rate": 0.00019554307502061007,
+      "loss": 1.0623,
+      "step": 966
+    },
+    {
+      "epoch": 0.07462298241387617,
+      "grad_norm": 0.14133721590042114,
+      "learning_rate": 0.00019553276999175598,
+      "loss": 1.0945,
+      "step": 968
+    },
+    {
+      "epoch": 0.07477716212960732,
+      "grad_norm": 0.1351941078901291,
+      "learning_rate": 0.0001955224649629019,
+      "loss": 1.0327,
+      "step": 970
+    },
+    {
+      "epoch": 0.07493134184533848,
+      "grad_norm": 0.12836019694805145,
+      "learning_rate": 0.00019551215993404784,
+      "loss": 1.069,
+      "step": 972
+    },
+    {
+      "epoch": 0.07508552156106962,
+      "grad_norm": 0.13199055194854736,
+      "learning_rate": 0.00019550185490519375,
+      "loss": 1.0323,
+      "step": 974
+    },
+    {
+      "epoch": 0.07523970127680077,
+      "grad_norm": 0.14991353452205658,
+      "learning_rate": 0.00019549154987633967,
+      "loss": 1.0625,
+      "step": 976
+    },
+    {
+      "epoch": 0.07539388099253191,
+      "grad_norm": 0.13832435011863708,
+      "learning_rate": 0.00019548124484748558,
+      "loss": 1.1031,
+      "step": 978
+    },
+    {
+      "epoch": 0.07554806070826307,
+      "grad_norm": 0.12351599335670471,
+      "learning_rate": 0.0001954709398186315,
+      "loss": 1.0286,
+      "step": 980
+    },
+    {
+      "epoch": 0.07570224042399422,
+      "grad_norm": 0.12360050529241562,
+      "learning_rate": 0.00019546063478977744,
+      "loss": 1.0652,
+      "step": 982
+    },
+    {
+      "epoch": 0.07585642013972536,
+      "grad_norm": 0.13384872674942017,
+      "learning_rate": 0.00019545032976092335,
+      "loss": 1.1125,
+      "step": 984
+    },
+    {
+      "epoch": 0.07601059985545652,
+      "grad_norm": 0.13200527429580688,
+      "learning_rate": 0.00019544002473206927,
+      "loss": 1.0727,
+      "step": 986
+    },
+    {
+      "epoch": 0.07616477957118767,
+      "grad_norm": 0.143647700548172,
+      "learning_rate": 0.00019542971970321518,
+      "loss": 1.1207,
+      "step": 988
+    },
+    {
+      "epoch": 0.07631895928691881,
+      "grad_norm": 0.13605177402496338,
+      "learning_rate": 0.0001954194146743611,
+      "loss": 1.0225,
+      "step": 990
+    },
+    {
+      "epoch": 0.07647313900264996,
+      "grad_norm": 0.12646125257015228,
+      "learning_rate": 0.00019540910964550701,
+      "loss": 1.11,
+      "step": 992
+    },
+    {
+      "epoch": 0.07662731871838112,
+      "grad_norm": 0.132467120885849,
+      "learning_rate": 0.00019539880461665293,
+      "loss": 1.1092,
+      "step": 994
+    },
+    {
+      "epoch": 0.07678149843411226,
+      "grad_norm": 0.12461701035499573,
+      "learning_rate": 0.00019538849958779884,
+      "loss": 1.0854,
+      "step": 996
+    },
+    {
+      "epoch": 0.07693567814984341,
+      "grad_norm": 0.13430501520633698,
+      "learning_rate": 0.00019537819455894476,
+      "loss": 1.2,
+      "step": 998
+    },
+    {
+      "epoch": 0.07708985786557455,
+      "grad_norm": 0.12623916566371918,
+      "learning_rate": 0.00019536788953009067,
+      "loss": 1.0522,
+      "step": 1000
+    },
+    {
+      "epoch": 0.07708985786557455,
+      "eval_loss": 1.0930616855621338,
+      "eval_runtime": 185.4001,
+      "eval_samples_per_second": 91.386,
+      "eval_steps_per_second": 1.429,
+      "step": 1000
+    },
+    {
+      "epoch": 0.07724403758130571,
+      "grad_norm": 0.11760087311267853,
+      "learning_rate": 0.00019535758450123662,
+      "loss": 1.1566,
+      "step": 1002
+    },
+    {
+      "epoch": 0.07739821729703686,
+      "grad_norm": 0.145633727312088,
+      "learning_rate": 0.00019534727947238253,
+      "loss": 1.094,
+      "step": 1004
+    },
+    {
+      "epoch": 0.077552397012768,
+      "grad_norm": 0.1311633288860321,
+      "learning_rate": 0.00019533697444352845,
+      "loss": 1.0792,
+      "step": 1006
+    },
+    {
+      "epoch": 0.07770657672849916,
+      "grad_norm": 0.12563548982143402,
+      "learning_rate": 0.00019532666941467436,
+      "loss": 1.0601,
+      "step": 1008
+    },
+    {
+      "epoch": 0.07786075644423031,
+      "grad_norm": 0.14429886639118195,
+      "learning_rate": 0.00019531636438582028,
+      "loss": 1.0926,
+      "step": 1010
+    },
+    {
+      "epoch": 0.07801493615996145,
+      "grad_norm": 0.13131891191005707,
+      "learning_rate": 0.0001953060593569662,
+      "loss": 1.1012,
+      "step": 1012
+    },
+    {
+      "epoch": 0.0781691158756926,
+      "grad_norm": 0.14185300469398499,
+      "learning_rate": 0.00019529575432811213,
+      "loss": 1.1113,
+      "step": 1014
+    },
+    {
+      "epoch": 0.07832329559142376,
+      "grad_norm": 0.14298418164253235,
+      "learning_rate": 0.00019528544929925805,
+      "loss": 1.0909,
+      "step": 1016
+    },
+    {
+      "epoch": 0.0784774753071549,
+      "grad_norm": 0.1339821219444275,
+      "learning_rate": 0.00019527514427040396,
+      "loss": 1.0994,
+      "step": 1018
+    },
+    {
+      "epoch": 0.07863165502288605,
+      "grad_norm": 0.1252928525209427,
+      "learning_rate": 0.00019526483924154988,
+      "loss": 1.0316,
+      "step": 1020
+    },
+    {
+      "epoch": 0.0787858347386172,
+      "grad_norm": 0.1277703046798706,
+      "learning_rate": 0.0001952545342126958,
+      "loss": 1.1067,
+      "step": 1022
+    },
+    {
+      "epoch": 0.07894001445434835,
+      "grad_norm": 0.12644124031066895,
+      "learning_rate": 0.00019524422918384173,
+      "loss": 1.0176,
+      "step": 1024
+    },
+    {
+      "epoch": 0.0790941941700795,
+      "grad_norm": 0.13443627953529358,
+      "learning_rate": 0.00019523392415498765,
+      "loss": 1.0754,
+      "step": 1026
+    },
+    {
+      "epoch": 0.07924837388581064,
+      "grad_norm": 0.1895609050989151,
+      "learning_rate": 0.00019522361912613356,
+      "loss": 1.0551,
+      "step": 1028
+    },
+    {
+      "epoch": 0.0794025536015418,
+      "grad_norm": 0.1372397392988205,
+      "learning_rate": 0.00019521331409727948,
+      "loss": 1.0442,
+      "step": 1030
+    },
+    {
+      "epoch": 0.07955673331727295,
+      "grad_norm": 0.14173942804336548,
+      "learning_rate": 0.0001952030090684254,
+      "loss": 1.0692,
+      "step": 1032
+    },
+    {
+      "epoch": 0.0797109130330041,
+      "grad_norm": 0.12321804463863373,
+      "learning_rate": 0.00019519270403957134,
+      "loss": 1.0276,
+      "step": 1034
+    },
+    {
+      "epoch": 0.07986509274873524,
+      "grad_norm": 0.12327130138874054,
+      "learning_rate": 0.00019518239901071725,
+      "loss": 1.0376,
+      "step": 1036
+    },
+    {
+      "epoch": 0.0800192724644664,
+      "grad_norm": 0.12301841378211975,
+      "learning_rate": 0.00019517209398186317,
+      "loss": 1.0887,
+      "step": 1038
+    },
+    {
+      "epoch": 0.08017345218019754,
+      "grad_norm": 0.1429559886455536,
+      "learning_rate": 0.00019516178895300908,
+      "loss": 1.0321,
+      "step": 1040
+    },
+    {
+      "epoch": 0.08032763189592869,
+      "grad_norm": 0.13955366611480713,
+      "learning_rate": 0.000195151483924155,
+      "loss": 1.1081,
+      "step": 1042
+    },
+    {
+      "epoch": 0.08048181161165983,
+      "grad_norm": 0.13553303480148315,
+      "learning_rate": 0.00019514117889530094,
+      "loss": 1.0252,
+      "step": 1044
+    },
+    {
+      "epoch": 0.080635991327391,
+      "grad_norm": 0.14100225269794464,
+      "learning_rate": 0.00019513087386644685,
+      "loss": 1.1071,
+      "step": 1046
+    },
+    {
+      "epoch": 0.08079017104312214,
+      "grad_norm": 0.14522643387317657,
+      "learning_rate": 0.00019512056883759277,
+      "loss": 1.0653,
+      "step": 1048
+    },
+    {
+      "epoch": 0.08094435075885328,
+      "grad_norm": 0.14540371298789978,
+      "learning_rate": 0.00019511026380873868,
+      "loss": 1.01,
+      "step": 1050
+    },
+    {
+      "epoch": 0.08109853047458444,
+      "grad_norm": 0.1459018737077713,
+      "learning_rate": 0.0001950999587798846,
+      "loss": 1.1147,
+      "step": 1052
+    },
+    {
+      "epoch": 0.08125271019031559,
+      "grad_norm": 0.12590867280960083,
+      "learning_rate": 0.0001950896537510305,
+      "loss": 1.0685,
+      "step": 1054
+    },
+    {
+      "epoch": 0.08140688990604673,
+      "grad_norm": 0.11943504959344864,
+      "learning_rate": 0.00019507934872217643,
+      "loss": 1.0854,
+      "step": 1056
+    },
+    {
+      "epoch": 0.08156106962177788,
+      "grad_norm": 0.12039398401975632,
+      "learning_rate": 0.00019506904369332234,
+      "loss": 1.1397,
+      "step": 1058
+    },
+    {
+      "epoch": 0.08171524933750904,
+      "grad_norm": 0.1411554217338562,
+      "learning_rate": 0.00019505873866446826,
+      "loss": 1.1271,
+      "step": 1060
+    },
+    {
+      "epoch": 0.08186942905324018,
+      "grad_norm": 0.1402871012687683,
+      "learning_rate": 0.00019504843363561417,
+      "loss": 1.0425,
+      "step": 1062
+    },
+    {
+      "epoch": 0.08202360876897133,
+      "grad_norm": 0.13545840978622437,
+      "learning_rate": 0.00019503812860676011,
+      "loss": 1.0571,
+      "step": 1064
+    },
+    {
+      "epoch": 0.08217778848470249,
+      "grad_norm": 0.12789209187030792,
+      "learning_rate": 0.00019502782357790603,
+      "loss": 1.0596,
+      "step": 1066
+    },
+    {
+      "epoch": 0.08233196820043363,
+      "grad_norm": 0.13018928468227386,
+      "learning_rate": 0.00019501751854905194,
+      "loss": 1.1188,
+      "step": 1068
+    },
+    {
+      "epoch": 0.08248614791616478,
+      "grad_norm": 0.12482234835624695,
+      "learning_rate": 0.00019500721352019786,
+      "loss": 1.0831,
+      "step": 1070
+    },
+    {
+      "epoch": 0.08264032763189592,
+      "grad_norm": 0.11897309869527817,
+      "learning_rate": 0.00019499690849134377,
+      "loss": 1.0658,
+      "step": 1072
+    },
+    {
+      "epoch": 0.08279450734762708,
+      "grad_norm": 0.12954497337341309,
+      "learning_rate": 0.00019498660346248972,
+      "loss": 1.0204,
+      "step": 1074
+    },
+    {
+      "epoch": 0.08294868706335823,
+      "grad_norm": 0.14220042526721954,
+      "learning_rate": 0.00019497629843363563,
+      "loss": 1.1101,
+      "step": 1076
+    },
+    {
+      "epoch": 0.08310286677908937,
+      "grad_norm": 0.1631559580564499,
+      "learning_rate": 0.00019496599340478155,
+      "loss": 1.1352,
+      "step": 1078
+    },
+    {
+      "epoch": 0.08325704649482052,
+      "grad_norm": 0.13439539074897766,
+      "learning_rate": 0.00019495568837592746,
+      "loss": 1.0108,
+      "step": 1080
+    },
+    {
+      "epoch": 0.08341122621055168,
+      "grad_norm": 0.12389718741178513,
+      "learning_rate": 0.00019494538334707338,
+      "loss": 1.0155,
+      "step": 1082
+    },
+    {
+      "epoch": 0.08356540592628282,
+      "grad_norm": 0.1241556853055954,
+      "learning_rate": 0.00019493507831821932,
+      "loss": 1.1428,
+      "step": 1084
+    },
+    {
+      "epoch": 0.08371958564201397,
+      "grad_norm": 0.13087880611419678,
+      "learning_rate": 0.00019492477328936523,
+      "loss": 1.0876,
+      "step": 1086
+    },
+    {
+      "epoch": 0.08387376535774513,
+      "grad_norm": 0.12431449443101883,
+      "learning_rate": 0.00019491446826051115,
+      "loss": 1.0758,
+      "step": 1088
+    },
+    {
+      "epoch": 0.08402794507347627,
+      "grad_norm": 0.13807635009288788,
+      "learning_rate": 0.00019490416323165706,
+      "loss": 1.0902,
+      "step": 1090
+    },
+    {
+      "epoch": 0.08418212478920742,
+      "grad_norm": 0.12751048803329468,
+      "learning_rate": 0.00019489385820280298,
+      "loss": 1.0732,
+      "step": 1092
+    },
+    {
+      "epoch": 0.08433630450493856,
+      "grad_norm": 0.15594707429409027,
+      "learning_rate": 0.00019488355317394892,
+      "loss": 1.1115,
+      "step": 1094
+    },
+    {
+      "epoch": 0.08449048422066972,
+      "grad_norm": 0.11647301912307739,
+      "learning_rate": 0.00019487324814509483,
+      "loss": 1.1592,
+      "step": 1096
+    },
+    {
+      "epoch": 0.08464466393640087,
+      "grad_norm": 0.13609850406646729,
+      "learning_rate": 0.00019486294311624075,
+      "loss": 1.1139,
+      "step": 1098
+    },
+    {
+      "epoch": 0.08479884365213201,
+      "grad_norm": 0.1234198659658432,
+      "learning_rate": 0.00019485263808738666,
+      "loss": 1.0682,
+      "step": 1100
+    },
+    {
+      "epoch": 0.08479884365213201,
+      "eval_loss": 1.0920624732971191,
+      "eval_runtime": 185.5142,
+      "eval_samples_per_second": 91.33,
+      "eval_steps_per_second": 1.428,
+      "step": 1100
+    },
+    {
+      "epoch": 0.08495302336786316,
+      "grad_norm": 0.1375039666891098,
+      "learning_rate": 0.00019484233305853258,
+      "loss": 1.0585,
+      "step": 1102
+    },
+    {
+      "epoch": 0.08510720308359432,
+      "grad_norm": 0.14471521973609924,
+      "learning_rate": 0.0001948320280296785,
+      "loss": 1.1115,
+      "step": 1104
+    },
+    {
+      "epoch": 0.08526138279932546,
+      "grad_norm": 0.12425632029771805,
+      "learning_rate": 0.0001948217230008244,
+      "loss": 1.0501,
+      "step": 1106
+    },
+    {
+      "epoch": 0.08541556251505661,
+      "grad_norm": 0.1161596029996872,
+      "learning_rate": 0.00019481141797197032,
+      "loss": 1.0182,
+      "step": 1108
+    },
+    {
+      "epoch": 0.08556974223078777,
+      "grad_norm": 0.11700072139501572,
+      "learning_rate": 0.00019480111294311624,
+      "loss": 1.0579,
+      "step": 1110
+    },
+    {
+      "epoch": 0.08572392194651891,
+      "grad_norm": 0.14330415427684784,
+      "learning_rate": 0.00019479080791426215,
+      "loss": 1.1211,
+      "step": 1112
+    },
+    {
+      "epoch": 0.08587810166225006,
+      "grad_norm": 0.14039026200771332,
+      "learning_rate": 0.00019478050288540807,
+      "loss": 1.0826,
+      "step": 1114
+    },
+    {
+      "epoch": 0.0860322813779812,
+      "grad_norm": 0.14031362533569336,
+      "learning_rate": 0.000194770197856554,
+      "loss": 1.0871,
+      "step": 1116
+    },
+    {
+      "epoch": 0.08618646109371236,
+      "grad_norm": 0.12351037561893463,
+      "learning_rate": 0.00019475989282769993,
+      "loss": 1.001,
+      "step": 1118
+    },
+    {
+      "epoch": 0.08634064080944351,
+      "grad_norm": 0.11667052656412125,
+      "learning_rate": 0.00019474958779884584,
+      "loss": 1.0421,
+      "step": 1120
+    },
+    {
+      "epoch": 0.08649482052517465,
+      "grad_norm": 0.1489124447107315,
+      "learning_rate": 0.00019473928276999175,
+      "loss": 1.1644,
+      "step": 1122
+    },
+    {
+      "epoch": 0.0866490002409058,
+      "grad_norm": 0.1338202804327011,
+      "learning_rate": 0.00019472897774113767,
+      "loss": 1.1239,
+      "step": 1124
+    },
+    {
+      "epoch": 0.08680317995663696,
+      "grad_norm": 0.13266493380069733,
+      "learning_rate": 0.0001947186727122836,
+      "loss": 1.0839,
+      "step": 1126
+    },
+    {
+      "epoch": 0.0869573596723681,
+      "grad_norm": 0.13726286590099335,
+      "learning_rate": 0.00019470836768342953,
+      "loss": 1.1325,
+      "step": 1128
+    },
+    {
+      "epoch": 0.08711153938809925,
+      "grad_norm": 0.14077100157737732,
+      "learning_rate": 0.00019469806265457544,
+      "loss": 1.0429,
+      "step": 1130
+    },
+    {
+      "epoch": 0.08726571910383041,
+      "grad_norm": 0.1362866312265396,
+      "learning_rate": 0.00019468775762572136,
+      "loss": 1.0715,
+      "step": 1132
+    },
+    {
+      "epoch": 0.08741989881956155,
+      "grad_norm": 0.12472223490476608,
+      "learning_rate": 0.00019467745259686727,
+      "loss": 1.0503,
+      "step": 1134
+    },
+    {
+      "epoch": 0.0875740785352927,
+      "grad_norm": 0.1350635141134262,
+      "learning_rate": 0.0001946671475680132,
+      "loss": 1.0498,
+      "step": 1136
+    },
+    {
+      "epoch": 0.08772825825102384,
+      "grad_norm": 0.1424301117658615,
+      "learning_rate": 0.00019465684253915913,
+      "loss": 1.1589,
+      "step": 1138
+    },
+    {
+      "epoch": 0.087882437966755,
+      "grad_norm": 0.12365067005157471,
+      "learning_rate": 0.00019464653751030504,
+      "loss": 1.1065,
+      "step": 1140
+    },
+    {
+      "epoch": 0.08803661768248615,
+      "grad_norm": 0.16497495770454407,
+      "learning_rate": 0.00019463623248145096,
+      "loss": 1.0189,
+      "step": 1142
+    },
+    {
+      "epoch": 0.0881907973982173,
+      "grad_norm": 0.1381298303604126,
+      "learning_rate": 0.00019462592745259687,
+      "loss": 1.0426,
+      "step": 1144
+    },
+    {
+      "epoch": 0.08834497711394845,
+      "grad_norm": 0.15007291734218597,
+      "learning_rate": 0.00019461562242374282,
+      "loss": 1.1108,
+      "step": 1146
+    },
+    {
+      "epoch": 0.0884991568296796,
+      "grad_norm": 0.19384606182575226,
+      "learning_rate": 0.00019460531739488873,
+      "loss": 1.0664,
+      "step": 1148
+    },
+    {
+      "epoch": 0.08865333654541074,
+      "grad_norm": 0.12032177299261093,
+      "learning_rate": 0.00019459501236603465,
+      "loss": 1.018,
+      "step": 1150
+    },
+    {
+      "epoch": 0.08880751626114189,
+      "grad_norm": 0.1197669506072998,
+      "learning_rate": 0.00019458470733718056,
+      "loss": 1.071,
+      "step": 1152
+    },
+    {
+      "epoch": 0.08896169597687305,
+      "grad_norm": 0.12108784914016724,
+      "learning_rate": 0.00019457440230832647,
+      "loss": 1.0499,
+      "step": 1154
+    },
+    {
+      "epoch": 0.0891158756926042,
+      "grad_norm": 0.1270270049571991,
+      "learning_rate": 0.0001945640972794724,
+      "loss": 1.1172,
+      "step": 1156
+    },
+    {
+      "epoch": 0.08927005540833534,
+      "grad_norm": 0.13599786162376404,
+      "learning_rate": 0.0001945537922506183,
+      "loss": 1.103,
+      "step": 1158
+    },
+    {
+      "epoch": 0.08942423512406648,
+      "grad_norm": 0.12051045894622803,
+      "learning_rate": 0.00019454348722176422,
+      "loss": 1.0905,
+      "step": 1160
+    },
+    {
+      "epoch": 0.08957841483979764,
+      "grad_norm": 0.12117696553468704,
+      "learning_rate": 0.00019453318219291013,
+      "loss": 1.0611,
+      "step": 1162
+    },
+    {
+      "epoch": 0.08973259455552879,
+      "grad_norm": 0.13710887730121613,
+      "learning_rate": 0.00019452287716405605,
+      "loss": 1.0242,
+      "step": 1164
+    },
+    {
+      "epoch": 0.08988677427125993,
+      "grad_norm": 0.1160813644528389,
+      "learning_rate": 0.000194512572135202,
+      "loss": 1.0863,
+      "step": 1166
+    },
+    {
+      "epoch": 0.09004095398699109,
+      "grad_norm": 0.1754099279642105,
+      "learning_rate": 0.0001945022671063479,
+      "loss": 1.0938,
+      "step": 1168
+    },
+    {
+      "epoch": 0.09019513370272224,
+      "grad_norm": 0.1331128627061844,
+      "learning_rate": 0.00019449196207749382,
+      "loss": 1.0692,
+      "step": 1170
+    },
+    {
+      "epoch": 0.09034931341845338,
+      "grad_norm": 0.13422611355781555,
+      "learning_rate": 0.00019448165704863974,
+      "loss": 1.0699,
+      "step": 1172
+    },
+    {
+      "epoch": 0.09050349313418453,
+      "grad_norm": 0.12999802827835083,
+      "learning_rate": 0.00019447135201978565,
+      "loss": 1.0957,
+      "step": 1174
+    },
+    {
+      "epoch": 0.09065767284991569,
+      "grad_norm": 0.13413815200328827,
+      "learning_rate": 0.0001944610469909316,
+      "loss": 1.0869,
+      "step": 1176
+    },
+    {
+      "epoch": 0.09081185256564683,
+      "grad_norm": 0.12901006639003754,
+      "learning_rate": 0.0001944507419620775,
+      "loss": 1.0442,
+      "step": 1178
+    },
+    {
+      "epoch": 0.09096603228137798,
+      "grad_norm": 0.11824194341897964,
+      "learning_rate": 0.00019444043693322342,
+      "loss": 1.0935,
+      "step": 1180
+    },
+    {
+      "epoch": 0.09112021199710912,
+      "grad_norm": 0.14895616471767426,
+      "learning_rate": 0.00019443013190436934,
+      "loss": 1.0624,
+      "step": 1182
+    },
+    {
+      "epoch": 0.09127439171284028,
+      "grad_norm": 0.13515722751617432,
+      "learning_rate": 0.00019441982687551525,
+      "loss": 1.0797,
+      "step": 1184
+    },
+    {
+      "epoch": 0.09142857142857143,
+      "grad_norm": 0.13411575555801392,
+      "learning_rate": 0.00019440952184666117,
+      "loss": 1.0637,
+      "step": 1186
+    },
+    {
+      "epoch": 0.09158275114430257,
+      "grad_norm": 0.12519463896751404,
+      "learning_rate": 0.0001943992168178071,
+      "loss": 1.0608,
+      "step": 1188
+    },
+    {
+      "epoch": 0.09173693086003373,
+      "grad_norm": 0.1267428696155548,
+      "learning_rate": 0.00019438891178895302,
+      "loss": 1.0182,
+      "step": 1190
+    },
+    {
+      "epoch": 0.09189111057576488,
+      "grad_norm": 0.13116560876369476,
+      "learning_rate": 0.00019437860676009894,
+      "loss": 1.1139,
+      "step": 1192
+    },
+    {
+      "epoch": 0.09204529029149602,
+      "grad_norm": 0.14659713208675385,
+      "learning_rate": 0.00019436830173124485,
+      "loss": 1.1275,
+      "step": 1194
+    },
+    {
+      "epoch": 0.09219947000722717,
+      "grad_norm": 0.12913885712623596,
+      "learning_rate": 0.00019435799670239077,
+      "loss": 1.0858,
+      "step": 1196
+    },
+    {
+      "epoch": 0.09235364972295833,
+      "grad_norm": 0.12855856120586395,
+      "learning_rate": 0.0001943476916735367,
+      "loss": 1.0811,
+      "step": 1198
+    },
+    {
+      "epoch": 0.09250782943868947,
+      "grad_norm": 0.1391747146844864,
+      "learning_rate": 0.00019433738664468263,
+      "loss": 1.0146,
+      "step": 1200
+    },
+    {
+      "epoch": 0.09250782943868947,
+      "eval_loss": 1.0912913084030151,
+      "eval_runtime": 185.3661,
+      "eval_samples_per_second": 91.403,
+      "eval_steps_per_second": 1.43,
+      "step": 1200
+    },
+    {
+      "epoch": 0.09266200915442062,
+      "grad_norm": 0.13186782598495483,
+      "learning_rate": 0.00019432708161582854,
+      "loss": 1.1017,
+      "step": 1202
+    },
+    {
+      "epoch": 0.09281618887015176,
+      "grad_norm": 0.12913943827152252,
+      "learning_rate": 0.00019431677658697446,
+      "loss": 1.1027,
+      "step": 1204
+    },
+    {
+      "epoch": 0.09297036858588292,
+      "grad_norm": 0.1349743753671646,
+      "learning_rate": 0.00019430647155812037,
+      "loss": 1.1023,
+      "step": 1206
+    },
+    {
+      "epoch": 0.09312454830161407,
+      "grad_norm": 0.12534667551517487,
+      "learning_rate": 0.00019429616652926629,
+      "loss": 1.0659,
+      "step": 1208
+    },
+    {
+      "epoch": 0.09327872801734521,
+      "grad_norm": 0.11720700562000275,
+      "learning_rate": 0.0001942858615004122,
+      "loss": 1.0532,
+      "step": 1210
+    },
+    {
+      "epoch": 0.09343290773307637,
+      "grad_norm": 0.1364222913980484,
+      "learning_rate": 0.00019427555647155812,
+      "loss": 1.0575,
+      "step": 1212
+    },
+    {
+      "epoch": 0.09358708744880752,
+      "grad_norm": 0.15532977879047394,
+      "learning_rate": 0.00019426525144270403,
+      "loss": 1.1145,
+      "step": 1214
+    },
+    {
+      "epoch": 0.09374126716453866,
+      "grad_norm": 0.1377478837966919,
+      "learning_rate": 0.00019425494641384995,
+      "loss": 1.0505,
+      "step": 1216
+    },
+    {
+      "epoch": 0.09389544688026981,
+      "grad_norm": 0.1273409128189087,
+      "learning_rate": 0.0001942446413849959,
+      "loss": 1.0873,
+      "step": 1218
+    },
+    {
+      "epoch": 0.09404962659600097,
+      "grad_norm": 0.11990435421466827,
+      "learning_rate": 0.0001942343363561418,
+      "loss": 1.0829,
+      "step": 1220
+    },
+    {
+      "epoch": 0.09420380631173211,
+      "grad_norm": 0.14191892743110657,
+      "learning_rate": 0.00019422403132728772,
+      "loss": 1.0992,
+      "step": 1222
+    },
+    {
+      "epoch": 0.09435798602746326,
+      "grad_norm": 0.14520397782325745,
+      "learning_rate": 0.00019421372629843363,
+      "loss": 1.0712,
+      "step": 1224
+    },
+    {
+      "epoch": 0.09451216574319442,
+      "grad_norm": 0.13780727982521057,
+      "learning_rate": 0.00019420342126957955,
+      "loss": 0.9943,
+      "step": 1226
+    },
+    {
+      "epoch": 0.09466634545892556,
+      "grad_norm": 0.13550738990306854,
+      "learning_rate": 0.0001941931162407255,
+      "loss": 1.1264,
+      "step": 1228
+    },
+    {
+      "epoch": 0.09482052517465671,
+      "grad_norm": 0.12125276774168015,
+      "learning_rate": 0.0001941828112118714,
+      "loss": 1.1207,
+      "step": 1230
+    },
+    {
+      "epoch": 0.09497470489038785,
+      "grad_norm": 0.14529301226139069,
+      "learning_rate": 0.00019417250618301732,
+      "loss": 1.144,
+      "step": 1232
+    },
+    {
+      "epoch": 0.09512888460611901,
+      "grad_norm": 0.15477551519870758,
+      "learning_rate": 0.00019416220115416323,
+      "loss": 1.0568,
+      "step": 1234
+    },
+    {
+      "epoch": 0.09528306432185016,
+      "grad_norm": 0.1299963742494583,
+      "learning_rate": 0.00019415189612530915,
+      "loss": 1.0235,
+      "step": 1236
+    },
+    {
+      "epoch": 0.0954372440375813,
+      "grad_norm": 0.1372281014919281,
+      "learning_rate": 0.0001941415910964551,
+      "loss": 1.0764,
+      "step": 1238
+    },
+    {
+      "epoch": 0.09559142375331245,
+      "grad_norm": 0.1247306764125824,
+      "learning_rate": 0.000194131286067601,
+      "loss": 1.1345,
+      "step": 1240
+    },
+    {
+      "epoch": 0.09574560346904361,
+      "grad_norm": 0.1330571472644806,
+      "learning_rate": 0.00019412098103874692,
+      "loss": 1.1596,
+      "step": 1242
+    },
+    {
+      "epoch": 0.09589978318477475,
+      "grad_norm": 0.15787385404109955,
+      "learning_rate": 0.00019411067600989284,
+      "loss": 1.1067,
+      "step": 1244
+    },
+    {
+      "epoch": 0.0960539629005059,
+      "grad_norm": 0.12646274268627167,
+      "learning_rate": 0.00019410037098103875,
+      "loss": 1.0769,
+      "step": 1246
+    },
+    {
+      "epoch": 0.09620814261623706,
+      "grad_norm": 0.16424262523651123,
+      "learning_rate": 0.0001940900659521847,
+      "loss": 1.0459,
+      "step": 1248
+    },
+    {
+      "epoch": 0.0963623223319682,
+      "grad_norm": 0.1401062309741974,
+      "learning_rate": 0.0001940797609233306,
+      "loss": 1.1308,
+      "step": 1250
+    },
+    {
+      "epoch": 0.09651650204769935,
+      "grad_norm": 0.13971561193466187,
+      "learning_rate": 0.00019406945589447652,
+      "loss": 1.1457,
+      "step": 1252
+    },
+    {
+      "epoch": 0.0966706817634305,
+      "grad_norm": 0.13544687628746033,
+      "learning_rate": 0.00019405915086562244,
+      "loss": 1.0532,
+      "step": 1254
+    },
+    {
+      "epoch": 0.09682486147916165,
+      "grad_norm": 0.13527531921863556,
+      "learning_rate": 0.00019404884583676835,
+      "loss": 1.0376,
+      "step": 1256
+    },
+    {
+      "epoch": 0.0969790411948928,
+      "grad_norm": 0.1731848120689392,
+      "learning_rate": 0.0001940385408079143,
+      "loss": 1.2252,
+      "step": 1258
+    },
+    {
+      "epoch": 0.09713322091062394,
+      "grad_norm": 0.13142083585262299,
+      "learning_rate": 0.0001940282357790602,
+      "loss": 1.0254,
+      "step": 1260
+    },
+    {
+      "epoch": 0.09728740062635509,
+      "grad_norm": 0.13390247523784637,
+      "learning_rate": 0.00019401793075020612,
+      "loss": 1.0448,
+      "step": 1262
+    },
+    {
+      "epoch": 0.09744158034208625,
+      "grad_norm": 0.15188650786876678,
+      "learning_rate": 0.00019400762572135204,
+      "loss": 1.1019,
+      "step": 1264
+    },
+    {
+      "epoch": 0.0975957600578174,
+      "grad_norm": 0.14055617153644562,
+      "learning_rate": 0.00019399732069249795,
+      "loss": 1.0835,
+      "step": 1266
+    },
+    {
+      "epoch": 0.09774993977354854,
+      "grad_norm": 0.12209255248308182,
+      "learning_rate": 0.00019398701566364387,
+      "loss": 1.0675,
+      "step": 1268
+    },
+    {
+      "epoch": 0.0979041194892797,
+      "grad_norm": 0.14639706909656525,
+      "learning_rate": 0.00019397671063478978,
+      "loss": 1.049,
+      "step": 1270
+    },
+    {
+      "epoch": 0.09805829920501084,
+      "grad_norm": 0.13672591745853424,
+      "learning_rate": 0.0001939664056059357,
+      "loss": 1.1057,
+      "step": 1272
+    },
+    {
+      "epoch": 0.09821247892074199,
+      "grad_norm": 0.1522635966539383,
+      "learning_rate": 0.00019395610057708161,
+      "loss": 1.14,
+      "step": 1274
+    },
+    {
+      "epoch": 0.09836665863647313,
+      "grad_norm": 0.13887491822242737,
+      "learning_rate": 0.00019394579554822753,
+      "loss": 1.069,
+      "step": 1276
+    },
+    {
+      "epoch": 0.09852083835220429,
+      "grad_norm": 0.13854965567588806,
+      "learning_rate": 0.00019393549051937344,
+      "loss": 1.0704,
+      "step": 1278
+    },
+    {
+      "epoch": 0.09867501806793544,
+      "grad_norm": 0.12839765846729279,
+      "learning_rate": 0.00019392518549051939,
+      "loss": 1.0512,
+      "step": 1280
+    },
+    {
+      "epoch": 0.09882919778366658,
+      "grad_norm": 0.1270405352115631,
+      "learning_rate": 0.0001939148804616653,
+      "loss": 1.0251,
+      "step": 1282
+    },
+    {
+      "epoch": 0.09898337749939773,
+      "grad_norm": 0.1269143521785736,
+      "learning_rate": 0.00019390457543281122,
+      "loss": 1.0433,
+      "step": 1284
+    },
+    {
+      "epoch": 0.09913755721512889,
+      "grad_norm": 0.14292192459106445,
+      "learning_rate": 0.00019389427040395713,
+      "loss": 1.1507,
+      "step": 1286
+    },
+    {
+      "epoch": 0.09929173693086003,
+      "grad_norm": 0.12512263655662537,
+      "learning_rate": 0.00019388396537510305,
+      "loss": 1.0918,
+      "step": 1288
+    },
+    {
+      "epoch": 0.09944591664659118,
+      "grad_norm": 0.11927679181098938,
+      "learning_rate": 0.000193873660346249,
+      "loss": 1.0924,
+      "step": 1290
+    },
+    {
+      "epoch": 0.09960009636232234,
+      "grad_norm": 0.13639990985393524,
+      "learning_rate": 0.0001938633553173949,
+      "loss": 1.1024,
+      "step": 1292
+    },
+    {
+      "epoch": 0.09975427607805348,
+      "grad_norm": 0.142363503575325,
+      "learning_rate": 0.00019385305028854082,
+      "loss": 1.021,
+      "step": 1294
+    },
+    {
+      "epoch": 0.09990845579378463,
+      "grad_norm": 0.1389359086751938,
+      "learning_rate": 0.00019384274525968673,
+      "loss": 1.0269,
+      "step": 1296
+    },
+    {
+      "epoch": 0.10006263550951577,
+      "grad_norm": 0.15595073997974396,
+      "learning_rate": 0.00019383244023083265,
+      "loss": 1.0913,
+      "step": 1298
+    },
+    {
+      "epoch": 0.10021681522524693,
+      "grad_norm": 0.1324295848608017,
+      "learning_rate": 0.0001938221352019786,
+      "loss": 1.1001,
+      "step": 1300
+    },
+    {
+      "epoch": 0.10021681522524693,
+      "eval_loss": 1.0909266471862793,
+      "eval_runtime": 185.4116,
+      "eval_samples_per_second": 91.38,
+      "eval_steps_per_second": 1.429,
+      "step": 1300
+    },
+    {
+      "epoch": 0.10037099494097808,
+      "grad_norm": 0.139576256275177,
+      "learning_rate": 0.0001938118301731245,
+      "loss": 1.1147,
+      "step": 1302
+    },
+    {
+      "epoch": 0.10052517465670922,
+      "grad_norm": 0.12854811549186707,
+      "learning_rate": 0.00019380152514427042,
+      "loss": 1.0973,
+      "step": 1304
+    },
+    {
+      "epoch": 0.10067935437244037,
+      "grad_norm": 0.1245393380522728,
+      "learning_rate": 0.00019379122011541633,
+      "loss": 1.0485,
+      "step": 1306
+    },
+    {
+      "epoch": 0.10083353408817153,
+      "grad_norm": 0.13261497020721436,
+      "learning_rate": 0.00019378091508656225,
+      "loss": 1.156,
+      "step": 1308
+    },
+    {
+      "epoch": 0.10098771380390267,
+      "grad_norm": 0.1255144327878952,
+      "learning_rate": 0.0001937706100577082,
+      "loss": 1.0852,
+      "step": 1310
+    },
+    {
+      "epoch": 0.10114189351963382,
+      "grad_norm": 0.1412706971168518,
+      "learning_rate": 0.0001937603050288541,
+      "loss": 1.0766,
+      "step": 1312
+    },
+    {
+      "epoch": 0.10129607323536498,
+      "grad_norm": 0.1281047761440277,
+      "learning_rate": 0.00019375000000000002,
+      "loss": 1.0824,
+      "step": 1314
+    },
+    {
+      "epoch": 0.10145025295109612,
+      "grad_norm": 0.13307350873947144,
+      "learning_rate": 0.00019373969497114594,
+      "loss": 1.0887,
+      "step": 1316
+    },
+    {
+      "epoch": 0.10160443266682727,
+      "grad_norm": 0.1287691742181778,
+      "learning_rate": 0.00019372938994229185,
+      "loss": 1.0705,
+      "step": 1318
+    },
+    {
+      "epoch": 0.10175861238255841,
+      "grad_norm": 0.1303441971540451,
+      "learning_rate": 0.00019371908491343777,
+      "loss": 1.1684,
+      "step": 1320
+    },
+    {
+      "epoch": 0.10191279209828957,
+      "grad_norm": 0.13304616510868073,
+      "learning_rate": 0.00019370877988458368,
+      "loss": 1.0944,
+      "step": 1322
+    },
+    {
+      "epoch": 0.10206697181402072,
+      "grad_norm": 0.13905592262744904,
+      "learning_rate": 0.0001936984748557296,
+      "loss": 1.0915,
+      "step": 1324
+    },
+    {
+      "epoch": 0.10222115152975186,
+      "grad_norm": 0.13225632905960083,
+      "learning_rate": 0.0001936881698268755,
+      "loss": 1.0418,
+      "step": 1326
+    },
+    {
+      "epoch": 0.10237533124548302,
+      "grad_norm": 0.1267402619123459,
+      "learning_rate": 0.00019367786479802142,
+      "loss": 1.0446,
+      "step": 1328
+    },
+    {
+      "epoch": 0.10252951096121417,
+      "grad_norm": 0.1439935863018036,
+      "learning_rate": 0.00019366755976916737,
+      "loss": 1.0582,
+      "step": 1330
+    },
+    {
+      "epoch": 0.10268369067694531,
+      "grad_norm": 0.1267223060131073,
+      "learning_rate": 0.00019365725474031328,
+      "loss": 1.0176,
+      "step": 1332
+    },
+    {
+      "epoch": 0.10283787039267646,
+      "grad_norm": 0.1298942118883133,
+      "learning_rate": 0.0001936469497114592,
+      "loss": 1.0552,
+      "step": 1334
+    },
+    {
+      "epoch": 0.10299205010840762,
+      "grad_norm": 0.13010933995246887,
+      "learning_rate": 0.0001936366446826051,
+      "loss": 1.0848,
+      "step": 1336
+    },
+    {
+      "epoch": 0.10314622982413876,
+      "grad_norm": 0.13728559017181396,
+      "learning_rate": 0.00019362633965375103,
+      "loss": 1.0779,
+      "step": 1338
+    },
+    {
+      "epoch": 0.10330040953986991,
+      "grad_norm": 0.13863548636436462,
+      "learning_rate": 0.00019361603462489697,
+      "loss": 1.0326,
+      "step": 1340
+    },
+    {
+      "epoch": 0.10345458925560105,
+      "grad_norm": 0.12995532155036926,
+      "learning_rate": 0.00019360572959604288,
+      "loss": 1.1427,
+      "step": 1342
+    },
+    {
+      "epoch": 0.10360876897133221,
+      "grad_norm": 0.13650789856910706,
+      "learning_rate": 0.0001935954245671888,
+      "loss": 1.0528,
+      "step": 1344
+    },
+    {
+      "epoch": 0.10376294868706336,
+      "grad_norm": 0.1336941123008728,
+      "learning_rate": 0.0001935851195383347,
+      "loss": 1.1155,
+      "step": 1346
+    },
+    {
+      "epoch": 0.1039171284027945,
+      "grad_norm": 0.13927003741264343,
+      "learning_rate": 0.00019357481450948063,
+      "loss": 1.0551,
+      "step": 1348
+    },
+    {
+      "epoch": 0.10407130811852566,
+      "grad_norm": 0.14504994451999664,
+      "learning_rate": 0.00019356450948062657,
+      "loss": 1.1014,
+      "step": 1350
+    },
+    {
+      "epoch": 0.10422548783425681,
+      "grad_norm": 0.15796230733394623,
+      "learning_rate": 0.00019355420445177248,
+      "loss": 1.2115,
+      "step": 1352
+    },
+    {
+      "epoch": 0.10437966754998795,
+      "grad_norm": 0.1317984163761139,
+      "learning_rate": 0.0001935438994229184,
+      "loss": 1.0933,
+      "step": 1354
+    },
+    {
+      "epoch": 0.1045338472657191,
+      "grad_norm": 0.13189563155174255,
+      "learning_rate": 0.00019353359439406431,
+      "loss": 1.0664,
+      "step": 1356
+    },
+    {
+      "epoch": 0.10468802698145026,
+      "grad_norm": 0.1323234885931015,
+      "learning_rate": 0.00019352328936521023,
+      "loss": 1.0824,
+      "step": 1358
+    },
+    {
+      "epoch": 0.1048422066971814,
+      "grad_norm": 0.13659097254276276,
+      "learning_rate": 0.00019351298433635614,
+      "loss": 1.0334,
+      "step": 1360
+    },
+    {
+      "epoch": 0.10499638641291255,
+      "grad_norm": 0.11882172524929047,
+      "learning_rate": 0.0001935026793075021,
+      "loss": 1.0401,
+      "step": 1362
+    },
+    {
+      "epoch": 0.1051505661286437,
+      "grad_norm": 0.13025067746639252,
+      "learning_rate": 0.000193492374278648,
+      "loss": 1.0838,
+      "step": 1364
+    },
+    {
+      "epoch": 0.10530474584437485,
+      "grad_norm": 0.1249939501285553,
+      "learning_rate": 0.00019348206924979392,
+      "loss": 1.0349,
+      "step": 1366
+    },
+    {
+      "epoch": 0.105458925560106,
+      "grad_norm": 0.12588031589984894,
+      "learning_rate": 0.00019347176422093983,
+      "loss": 1.079,
+      "step": 1368
+    },
+    {
+      "epoch": 0.10561310527583714,
+      "grad_norm": 0.12548890709877014,
+      "learning_rate": 0.00019346145919208575,
+      "loss": 1.0062,
+      "step": 1370
+    },
+    {
+      "epoch": 0.1057672849915683,
+      "grad_norm": 0.13328798115253448,
+      "learning_rate": 0.00019345115416323166,
+      "loss": 1.1154,
+      "step": 1372
+    },
+    {
+      "epoch": 0.10592146470729945,
+      "grad_norm": 0.1443903148174286,
+      "learning_rate": 0.00019344084913437758,
+      "loss": 1.097,
+      "step": 1374
+    },
+    {
+      "epoch": 0.1060756444230306,
+      "grad_norm": 0.12835648655891418,
+      "learning_rate": 0.0001934305441055235,
+      "loss": 1.0723,
+      "step": 1376
+    },
+    {
+      "epoch": 0.10622982413876174,
+      "grad_norm": 0.13068312406539917,
+      "learning_rate": 0.0001934202390766694,
+      "loss": 1.1128,
+      "step": 1378
+    },
+    {
+      "epoch": 0.1063840038544929,
+      "grad_norm": 0.13628961145877838,
+      "learning_rate": 0.00019340993404781532,
+      "loss": 1.1146,
+      "step": 1380
+    },
+    {
+      "epoch": 0.10653818357022404,
+      "grad_norm": 0.12263484299182892,
+      "learning_rate": 0.00019339962901896126,
+      "loss": 1.0947,
+      "step": 1382
+    },
+    {
+      "epoch": 0.10669236328595519,
+      "grad_norm": 0.12684424221515656,
+      "learning_rate": 0.00019338932399010718,
+      "loss": 1.059,
+      "step": 1384
+    },
+    {
+      "epoch": 0.10684654300168633,
+      "grad_norm": 0.1421595960855484,
+      "learning_rate": 0.0001933790189612531,
+      "loss": 1.0688,
+      "step": 1386
+    },
+    {
+      "epoch": 0.10700072271741749,
+      "grad_norm": 0.12416025251150131,
+      "learning_rate": 0.000193368713932399,
+      "loss": 1.0905,
+      "step": 1388
+    },
+    {
+      "epoch": 0.10715490243314864,
+      "grad_norm": 0.1284332126379013,
+      "learning_rate": 0.00019335840890354492,
+      "loss": 1.0612,
+      "step": 1390
+    },
+    {
+      "epoch": 0.10730908214887978,
+      "grad_norm": 0.1282491385936737,
+      "learning_rate": 0.00019334810387469086,
+      "loss": 1.0851,
+      "step": 1392
+    },
+    {
+      "epoch": 0.10746326186461094,
+      "grad_norm": 0.13221289217472076,
+      "learning_rate": 0.00019333779884583678,
+      "loss": 1.0446,
+      "step": 1394
+    },
+    {
+      "epoch": 0.10761744158034209,
+      "grad_norm": 0.12401736527681351,
+      "learning_rate": 0.0001933274938169827,
+      "loss": 1.0826,
+      "step": 1396
+    },
+    {
+      "epoch": 0.10777162129607323,
+      "grad_norm": 0.14316771924495697,
+      "learning_rate": 0.0001933171887881286,
+      "loss": 1.1136,
+      "step": 1398
+    },
+    {
+      "epoch": 0.10792580101180438,
+      "grad_norm": 0.17223364114761353,
+      "learning_rate": 0.00019330688375927452,
+      "loss": 1.0752,
+      "step": 1400
+    },
+    {
+      "epoch": 0.10792580101180438,
+      "eval_loss": 1.0899540185928345,
+      "eval_runtime": 185.3818,
+      "eval_samples_per_second": 91.395,
+      "eval_steps_per_second": 1.429,
+      "step": 1400
+    },
+    {
+      "epoch": 0.10807998072753554,
+      "grad_norm": 0.15027141571044922,
+      "learning_rate": 0.00019329657873042047,
+      "loss": 1.0371,
+      "step": 1402
+    },
+    {
+      "epoch": 0.10823416044326668,
+      "grad_norm": 0.19876505434513092,
+      "learning_rate": 0.00019328627370156638,
+      "loss": 1.0312,
+      "step": 1404
+    },
+    {
+      "epoch": 0.10838834015899783,
+      "grad_norm": 0.1422131210565567,
+      "learning_rate": 0.0001932759686727123,
+      "loss": 1.0597,
+      "step": 1406
+    },
+    {
+      "epoch": 0.10854251987472899,
+      "grad_norm": 0.13597753643989563,
+      "learning_rate": 0.0001932656636438582,
+      "loss": 1.0939,
+      "step": 1408
+    },
+    {
+      "epoch": 0.10869669959046013,
+      "grad_norm": 0.16808953881263733,
+      "learning_rate": 0.00019325535861500413,
+      "loss": 1.1221,
+      "step": 1410
+    },
+    {
+      "epoch": 0.10885087930619128,
+      "grad_norm": 0.14884881675243378,
+      "learning_rate": 0.00019324505358615007,
+      "loss": 1.1114,
+      "step": 1412
+    },
+    {
+      "epoch": 0.10900505902192242,
+      "grad_norm": 0.12680503726005554,
+      "learning_rate": 0.00019323474855729598,
+      "loss": 1.1032,
+      "step": 1414
+    },
+    {
+      "epoch": 0.10915923873765358,
+      "grad_norm": 0.13997766375541687,
+      "learning_rate": 0.0001932244435284419,
+      "loss": 1.0799,
+      "step": 1416
+    },
+    {
+      "epoch": 0.10931341845338473,
+      "grad_norm": 0.1343669593334198,
+      "learning_rate": 0.0001932141384995878,
+      "loss": 1.0778,
+      "step": 1418
+    },
+    {
+      "epoch": 0.10946759816911587,
+      "grad_norm": 0.12029851973056793,
+      "learning_rate": 0.00019320383347073373,
+      "loss": 1.1021,
+      "step": 1420
+    },
+    {
+      "epoch": 0.10962177788484702,
+      "grad_norm": 0.1322990357875824,
+      "learning_rate": 0.00019319352844187967,
+      "loss": 1.1061,
+      "step": 1422
+    },
+    {
+      "epoch": 0.10977595760057818,
+      "grad_norm": 0.13710594177246094,
+      "learning_rate": 0.00019318322341302558,
+      "loss": 1.0786,
+      "step": 1424
+    },
+    {
+      "epoch": 0.10993013731630932,
+      "grad_norm": 0.11956049501895905,
+      "learning_rate": 0.0001931729183841715,
+      "loss": 1.0711,
+      "step": 1426
+    },
+    {
+      "epoch": 0.11008431703204047,
+      "grad_norm": 0.139973446726799,
+      "learning_rate": 0.00019316261335531741,
+      "loss": 1.1162,
+      "step": 1428
+    },
+    {
+      "epoch": 0.11023849674777163,
+      "grad_norm": 0.1525941640138626,
+      "learning_rate": 0.00019315230832646333,
+      "loss": 1.0572,
+      "step": 1430
+    },
+    {
+      "epoch": 0.11039267646350277,
+      "grad_norm": 0.1349973976612091,
+      "learning_rate": 0.00019314200329760924,
+      "loss": 1.1048,
+      "step": 1432
+    },
+    {
+      "epoch": 0.11054685617923392,
+      "grad_norm": 0.1305711269378662,
+      "learning_rate": 0.00019313169826875516,
+      "loss": 1.0841,
+      "step": 1434
+    },
+    {
+      "epoch": 0.11070103589496506,
+      "grad_norm": 0.16756822168827057,
+      "learning_rate": 0.00019312139323990107,
+      "loss": 1.0736,
+      "step": 1436
+    },
+    {
+      "epoch": 0.11085521561069622,
+      "grad_norm": 0.13367486000061035,
+      "learning_rate": 0.000193111088211047,
+      "loss": 1.0774,
+      "step": 1438
+    },
+    {
+      "epoch": 0.11100939532642737,
+      "grad_norm": 0.12484605610370636,
+      "learning_rate": 0.0001931007831821929,
+      "loss": 1.1196,
+      "step": 1440
+    },
+    {
+      "epoch": 0.11116357504215851,
+      "grad_norm": 0.14064739644527435,
+      "learning_rate": 0.00019309047815333885,
+      "loss": 1.1101,
+      "step": 1442
+    },
+    {
+      "epoch": 0.11131775475788966,
+      "grad_norm": 0.1366916447877884,
+      "learning_rate": 0.00019308017312448476,
+      "loss": 1.111,
+      "step": 1444
+    },
+    {
+      "epoch": 0.11147193447362082,
+      "grad_norm": 0.11520934104919434,
+      "learning_rate": 0.00019306986809563068,
+      "loss": 1.065,
+      "step": 1446
+    },
+    {
+      "epoch": 0.11162611418935196,
+      "grad_norm": 0.15567731857299805,
+      "learning_rate": 0.0001930595630667766,
+      "loss": 1.1036,
+      "step": 1448
+    },
+    {
+      "epoch": 0.11178029390508311,
+      "grad_norm": 0.13628730177879333,
+      "learning_rate": 0.0001930492580379225,
+      "loss": 1.0717,
+      "step": 1450
+    },
+    {
+      "epoch": 0.11193447362081427,
+      "grad_norm": 0.1359964907169342,
+      "learning_rate": 0.00019303895300906842,
+      "loss": 1.0986,
+      "step": 1452
+    },
+    {
+      "epoch": 0.11208865333654541,
+      "grad_norm": 0.16372162103652954,
+      "learning_rate": 0.00019302864798021436,
+      "loss": 1.0306,
+      "step": 1454
+    },
+    {
+      "epoch": 0.11224283305227656,
+      "grad_norm": 0.1724134087562561,
+      "learning_rate": 0.00019301834295136028,
+      "loss": 1.0753,
+      "step": 1456
+    },
+    {
+      "epoch": 0.1123970127680077,
+      "grad_norm": 0.13646383583545685,
+      "learning_rate": 0.0001930080379225062,
+      "loss": 1.0975,
+      "step": 1458
+    },
+    {
+      "epoch": 0.11255119248373886,
+      "grad_norm": 0.1522134691476822,
+      "learning_rate": 0.0001929977328936521,
+      "loss": 1.1031,
+      "step": 1460
+    },
+    {
+      "epoch": 0.11270537219947001,
+      "grad_norm": 0.13656160235404968,
+      "learning_rate": 0.00019298742786479802,
+      "loss": 1.0602,
+      "step": 1462
+    },
+    {
+      "epoch": 0.11285955191520115,
+      "grad_norm": 0.14140130579471588,
+      "learning_rate": 0.00019297712283594396,
+      "loss": 1.1289,
+      "step": 1464
+    },
+    {
+      "epoch": 0.1130137316309323,
+      "grad_norm": 0.1383032351732254,
+      "learning_rate": 0.00019296681780708988,
+      "loss": 1.0797,
+      "step": 1466
+    },
+    {
+      "epoch": 0.11316791134666346,
+      "grad_norm": 0.15723556280136108,
+      "learning_rate": 0.0001929565127782358,
+      "loss": 1.1156,
+      "step": 1468
+    },
+    {
+      "epoch": 0.1133220910623946,
+      "grad_norm": 0.13462230563163757,
+      "learning_rate": 0.0001929462077493817,
+      "loss": 1.0953,
+      "step": 1470
+    },
+    {
+      "epoch": 0.11347627077812575,
+      "grad_norm": 0.14101319015026093,
+      "learning_rate": 0.00019293590272052762,
+      "loss": 1.1152,
+      "step": 1472
+    },
+    {
+      "epoch": 0.11363045049385691,
+      "grad_norm": 0.13705132901668549,
+      "learning_rate": 0.00019292559769167357,
+      "loss": 1.0886,
+      "step": 1474
+    },
+    {
+      "epoch": 0.11378463020958805,
+      "grad_norm": 0.1206672340631485,
+      "learning_rate": 0.00019291529266281948,
+      "loss": 1.0995,
+      "step": 1476
+    },
+    {
+      "epoch": 0.1139388099253192,
+      "grad_norm": 0.13666383922100067,
+      "learning_rate": 0.0001929049876339654,
+      "loss": 1.058,
+      "step": 1478
+    },
+    {
+      "epoch": 0.11409298964105034,
+      "grad_norm": 0.1265423446893692,
+      "learning_rate": 0.0001928946826051113,
+      "loss": 1.0676,
+      "step": 1480
+    },
+    {
+      "epoch": 0.1142471693567815,
+      "grad_norm": 0.1528097242116928,
+      "learning_rate": 0.00019288437757625723,
+      "loss": 1.0675,
+      "step": 1482
+    },
+    {
+      "epoch": 0.11440134907251265,
+      "grad_norm": 0.16541676223278046,
+      "learning_rate": 0.00019287407254740314,
+      "loss": 1.1539,
+      "step": 1484
+    },
+    {
+      "epoch": 0.1145555287882438,
+      "grad_norm": 0.20383091270923615,
+      "learning_rate": 0.00019286376751854906,
+      "loss": 1.0472,
+      "step": 1486
+    },
+    {
+      "epoch": 0.11470970850397495,
+      "grad_norm": 0.13806484639644623,
+      "learning_rate": 0.00019285346248969497,
+      "loss": 1.0408,
+      "step": 1488
+    },
+    {
+      "epoch": 0.1148638882197061,
+      "grad_norm": 0.1251746118068695,
+      "learning_rate": 0.00019284315746084089,
+      "loss": 1.1207,
+      "step": 1490
+    },
+    {
+      "epoch": 0.11501806793543724,
+      "grad_norm": 0.13218504190444946,
+      "learning_rate": 0.0001928328524319868,
+      "loss": 1.1131,
+      "step": 1492
+    },
+    {
+      "epoch": 0.11517224765116839,
+      "grad_norm": 0.21616914868354797,
+      "learning_rate": 0.00019282254740313274,
+      "loss": 1.1103,
+      "step": 1494
+    },
+    {
+      "epoch": 0.11532642736689955,
+      "grad_norm": 0.1437305361032486,
+      "learning_rate": 0.00019281224237427866,
+      "loss": 1.1243,
+      "step": 1496
+    },
+    {
+      "epoch": 0.11548060708263069,
+      "grad_norm": 0.13094168901443481,
+      "learning_rate": 0.00019280193734542457,
+      "loss": 1.1012,
+      "step": 1498
+    },
+    {
+      "epoch": 0.11563478679836184,
+      "grad_norm": 0.12384334206581116,
+      "learning_rate": 0.0001927916323165705,
+      "loss": 1.05,
+      "step": 1500
+    },
+    {
+      "epoch": 0.11563478679836184,
+      "eval_loss": 1.0905406475067139,
+      "eval_runtime": 185.4473,
+      "eval_samples_per_second": 91.363,
+      "eval_steps_per_second": 1.429,
+      "step": 1500
+    },
+    {
+      "epoch": 0.11578896651409298,
+      "grad_norm": 0.12807106971740723,
+      "learning_rate": 0.0001927813272877164,
+      "loss": 1.0754,
+      "step": 1502
+    },
+    {
+      "epoch": 0.11594314622982414,
+      "grad_norm": 0.12517131865024567,
+      "learning_rate": 0.00019277102225886234,
+      "loss": 1.1017,
+      "step": 1504
+    },
+    {
+      "epoch": 0.11609732594555529,
+      "grad_norm": 0.1704496592283249,
+      "learning_rate": 0.00019276071723000826,
+      "loss": 1.098,
+      "step": 1506
+    },
+    {
+      "epoch": 0.11625150566128643,
+      "grad_norm": 0.12152231484651566,
+      "learning_rate": 0.00019275041220115417,
+      "loss": 1.0738,
+      "step": 1508
+    },
+    {
+      "epoch": 0.11640568537701759,
+      "grad_norm": 0.12952156364917755,
+      "learning_rate": 0.0001927401071723001,
+      "loss": 1.0479,
+      "step": 1510
+    },
+    {
+      "epoch": 0.11655986509274874,
+      "grad_norm": 0.1499640941619873,
+      "learning_rate": 0.000192729802143446,
+      "loss": 1.1046,
+      "step": 1512
+    },
+    {
+      "epoch": 0.11671404480847988,
+      "grad_norm": 0.1331593543291092,
+      "learning_rate": 0.00019271949711459195,
+      "loss": 1.1219,
+      "step": 1514
+    },
+    {
+      "epoch": 0.11686822452421103,
+      "grad_norm": 0.1368558406829834,
+      "learning_rate": 0.00019270919208573786,
+      "loss": 1.1357,
+      "step": 1516
+    },
+    {
+      "epoch": 0.11702240423994219,
+      "grad_norm": 0.12278290838003159,
+      "learning_rate": 0.00019269888705688378,
+      "loss": 1.1079,
+      "step": 1518
+    },
+    {
+      "epoch": 0.11717658395567333,
+      "grad_norm": 0.11737775802612305,
+      "learning_rate": 0.0001926885820280297,
+      "loss": 1.1224,
+      "step": 1520
+    },
+    {
+      "epoch": 0.11733076367140448,
+      "grad_norm": 0.13017341494560242,
+      "learning_rate": 0.0001926782769991756,
+      "loss": 1.0648,
+      "step": 1522
+    },
+    {
+      "epoch": 0.11748494338713562,
+      "grad_norm": 0.11939583718776703,
+      "learning_rate": 0.00019266797197032155,
+      "loss": 1.0899,
+      "step": 1524
+    },
+    {
+      "epoch": 0.11763912310286678,
+      "grad_norm": 0.12446755915880203,
+      "learning_rate": 0.00019265766694146746,
+      "loss": 1.0626,
+      "step": 1526
+    },
+    {
+      "epoch": 0.11779330281859793,
+      "grad_norm": 0.13369430601596832,
+      "learning_rate": 0.00019264736191261338,
+      "loss": 1.0526,
+      "step": 1528
+    },
+    {
+      "epoch": 0.11794748253432907,
+      "grad_norm": 0.13470736145973206,
+      "learning_rate": 0.0001926370568837593,
+      "loss": 1.0946,
+      "step": 1530
+    },
+    {
+      "epoch": 0.11810166225006023,
+      "grad_norm": 0.14193174242973328,
+      "learning_rate": 0.0001926267518549052,
+      "loss": 1.1089,
+      "step": 1532
+    },
+    {
+      "epoch": 0.11825584196579138,
+      "grad_norm": 0.14893026649951935,
+      "learning_rate": 0.00019261644682605112,
+      "loss": 1.0606,
+      "step": 1534
+    },
+    {
+      "epoch": 0.11841002168152252,
+      "grad_norm": 0.20594976842403412,
+      "learning_rate": 0.00019260614179719704,
+      "loss": 1.0375,
+      "step": 1536
+    },
+    {
+      "epoch": 0.11856420139725367,
+      "grad_norm": 0.15287873148918152,
+      "learning_rate": 0.00019259583676834295,
+      "loss": 1.1414,
+      "step": 1538
+    },
+    {
+      "epoch": 0.11871838111298483,
+      "grad_norm": 0.1275177299976349,
+      "learning_rate": 0.00019258553173948887,
+      "loss": 1.1084,
+      "step": 1540
+    },
+    {
+      "epoch": 0.11887256082871597,
+      "grad_norm": 0.20036157965660095,
+      "learning_rate": 0.00019257522671063478,
+      "loss": 1.1261,
+      "step": 1542
+    },
+    {
+      "epoch": 0.11902674054444712,
+      "grad_norm": 0.14492087066173553,
+      "learning_rate": 0.0001925649216817807,
+      "loss": 1.1137,
+      "step": 1544
+    },
+    {
+      "epoch": 0.11918092026017826,
+      "grad_norm": 0.1259312629699707,
+      "learning_rate": 0.00019255461665292664,
+      "loss": 1.0409,
+      "step": 1546
+    },
+    {
+      "epoch": 0.11933509997590942,
+      "grad_norm": 0.1296795755624771,
+      "learning_rate": 0.00019254431162407255,
+      "loss": 1.0332,
+      "step": 1548
+    },
+    {
+      "epoch": 0.11948927969164057,
+      "grad_norm": 0.13372276723384857,
+      "learning_rate": 0.00019253400659521847,
+      "loss": 1.1087,
+      "step": 1550
+    },
+    {
+      "epoch": 0.11964345940737171,
+      "grad_norm": 0.14354725182056427,
+      "learning_rate": 0.00019252370156636438,
+      "loss": 1.0398,
+      "step": 1552
+    },
+    {
+      "epoch": 0.11979763912310287,
+      "grad_norm": 0.1378318965435028,
+      "learning_rate": 0.0001925133965375103,
+      "loss": 1.0542,
+      "step": 1554
+    },
+    {
+      "epoch": 0.11995181883883402,
+      "grad_norm": 0.12171255797147751,
+      "learning_rate": 0.00019250309150865624,
+      "loss": 1.0935,
+      "step": 1556
+    },
+    {
+      "epoch": 0.12010599855456516,
+      "grad_norm": 0.11905664205551147,
+      "learning_rate": 0.00019249278647980215,
+      "loss": 1.0097,
+      "step": 1558
+    },
+    {
+      "epoch": 0.12026017827029631,
+      "grad_norm": 0.12854760885238647,
+      "learning_rate": 0.00019248248145094807,
+      "loss": 1.1517,
+      "step": 1560
+    },
+    {
+      "epoch": 0.12041435798602747,
+      "grad_norm": 0.247908353805542,
+      "learning_rate": 0.00019247217642209398,
+      "loss": 1.0876,
+      "step": 1562
+    },
+    {
+      "epoch": 0.12056853770175861,
+      "grad_norm": 0.1441553235054016,
+      "learning_rate": 0.0001924618713932399,
+      "loss": 1.1414,
+      "step": 1564
+    },
+    {
+      "epoch": 0.12072271741748976,
+      "grad_norm": 0.13307887315750122,
+      "learning_rate": 0.00019245156636438584,
+      "loss": 1.1012,
+      "step": 1566
+    },
+    {
+      "epoch": 0.12087689713322092,
+      "grad_norm": 0.14192406833171844,
+      "learning_rate": 0.00019244126133553176,
+      "loss": 1.1418,
+      "step": 1568
+    },
+    {
+      "epoch": 0.12103107684895206,
+      "grad_norm": 0.11530864983797073,
+      "learning_rate": 0.00019243095630667767,
+      "loss": 1.0776,
+      "step": 1570
+    },
+    {
+      "epoch": 0.12118525656468321,
+      "grad_norm": 0.13385196030139923,
+      "learning_rate": 0.00019242065127782359,
+      "loss": 1.1311,
+      "step": 1572
+    },
+    {
+      "epoch": 0.12133943628041435,
+      "grad_norm": 0.1308089643716812,
+      "learning_rate": 0.0001924103462489695,
+      "loss": 1.0625,
+      "step": 1574
+    },
+    {
+      "epoch": 0.12149361599614551,
+      "grad_norm": 0.11851842701435089,
+      "learning_rate": 0.00019240004122011544,
+      "loss": 1.0182,
+      "step": 1576
+    },
+    {
+      "epoch": 0.12164779571187666,
+      "grad_norm": 0.2496737688779831,
+      "learning_rate": 0.00019238973619126136,
+      "loss": 1.0746,
+      "step": 1578
+    },
+    {
+      "epoch": 0.1218019754276078,
+      "grad_norm": 0.12962055206298828,
+      "learning_rate": 0.00019237943116240727,
+      "loss": 1.0245,
+      "step": 1580
+    },
+    {
+      "epoch": 0.12195615514333895,
+      "grad_norm": 0.13170978426933289,
+      "learning_rate": 0.0001923691261335532,
+      "loss": 0.9897,
+      "step": 1582
+    },
+    {
+      "epoch": 0.12211033485907011,
+      "grad_norm": 0.13226309418678284,
+      "learning_rate": 0.0001923588211046991,
+      "loss": 1.1035,
+      "step": 1584
+    },
+    {
+      "epoch": 0.12226451457480125,
+      "grad_norm": 0.11901077628135681,
+      "learning_rate": 0.00019234851607584502,
+      "loss": 1.0084,
+      "step": 1586
+    },
+    {
+      "epoch": 0.1224186942905324,
+      "grad_norm": 0.15274369716644287,
+      "learning_rate": 0.00019233821104699093,
+      "loss": 1.1436,
+      "step": 1588
+    },
+    {
+      "epoch": 0.12257287400626356,
+      "grad_norm": 0.11832466721534729,
+      "learning_rate": 0.00019232790601813685,
+      "loss": 1.0179,
+      "step": 1590
+    },
+    {
+      "epoch": 0.1227270537219947,
+      "grad_norm": 0.13038666546344757,
+      "learning_rate": 0.00019231760098928276,
+      "loss": 1.0779,
+      "step": 1592
+    },
+    {
+      "epoch": 0.12288123343772585,
+      "grad_norm": 0.12837626039981842,
+      "learning_rate": 0.00019230729596042868,
+      "loss": 1.1404,
+      "step": 1594
+    },
+    {
+      "epoch": 0.123035413153457,
+      "grad_norm": 0.1400509923696518,
+      "learning_rate": 0.00019229699093157462,
+      "loss": 1.1132,
+      "step": 1596
+    },
+    {
+      "epoch": 0.12318959286918815,
+      "grad_norm": 0.13757595419883728,
+      "learning_rate": 0.00019228668590272053,
+      "loss": 1.0816,
+      "step": 1598
+    },
+    {
+      "epoch": 0.1233437725849193,
+      "grad_norm": 0.12403321266174316,
+      "learning_rate": 0.00019227638087386645,
+      "loss": 1.039,
+      "step": 1600
+    },
+    {
+      "epoch": 0.1233437725849193,
+      "eval_loss": 1.0888522863388062,
+      "eval_runtime": 185.2371,
+      "eval_samples_per_second": 91.467,
+      "eval_steps_per_second": 1.431,
+      "step": 1600
+    },
+    {
+      "epoch": 0.12349795230065044,
+      "grad_norm": 0.12380605190992355,
+      "learning_rate": 0.00019226607584501236,
+      "loss": 1.0903,
+      "step": 1602
+    },
+    {
+      "epoch": 0.12365213201638159,
+      "grad_norm": 0.13564443588256836,
+      "learning_rate": 0.00019225577081615828,
+      "loss": 1.0768,
+      "step": 1604
+    },
+    {
+      "epoch": 0.12380631173211275,
+      "grad_norm": 0.1533685177564621,
+      "learning_rate": 0.00019224546578730422,
+      "loss": 1.0852,
+      "step": 1606
+    },
+    {
+      "epoch": 0.12396049144784389,
+      "grad_norm": 0.1163390502333641,
+      "learning_rate": 0.00019223516075845014,
+      "loss": 1.0574,
+      "step": 1608
+    },
+    {
+      "epoch": 0.12411467116357504,
+      "grad_norm": 0.13867324590682983,
+      "learning_rate": 0.00019222485572959605,
+      "loss": 1.0992,
+      "step": 1610
+    },
+    {
+      "epoch": 0.1242688508793062,
+      "grad_norm": 0.12759087979793549,
+      "learning_rate": 0.00019221455070074197,
+      "loss": 1.0738,
+      "step": 1612
+    },
+    {
+      "epoch": 0.12442303059503734,
+      "grad_norm": 0.1237189844250679,
+      "learning_rate": 0.00019220424567188788,
+      "loss": 1.0974,
+      "step": 1614
+    },
+    {
+      "epoch": 0.12457721031076849,
+      "grad_norm": 0.13331052660942078,
+      "learning_rate": 0.00019219394064303382,
+      "loss": 1.0917,
+      "step": 1616
+    },
+    {
+      "epoch": 0.12473139002649963,
+      "grad_norm": 0.1290212869644165,
+      "learning_rate": 0.00019218363561417974,
+      "loss": 1.0696,
+      "step": 1618
+    },
+    {
+      "epoch": 0.12488556974223079,
+      "grad_norm": 0.13309410214424133,
+      "learning_rate": 0.00019217333058532565,
+      "loss": 1.043,
+      "step": 1620
+    },
+    {
+      "epoch": 0.12503974945796192,
+      "grad_norm": 0.13453248143196106,
+      "learning_rate": 0.00019216302555647157,
+      "loss": 1.0435,
+      "step": 1622
+    },
+    {
+      "epoch": 0.1251939291736931,
+      "grad_norm": 0.11639372259378433,
+      "learning_rate": 0.00019215272052761748,
+      "loss": 1.0579,
+      "step": 1624
+    },
+    {
+      "epoch": 0.12534810888942424,
+      "grad_norm": 0.13231517374515533,
+      "learning_rate": 0.0001921424154987634,
+      "loss": 1.1268,
+      "step": 1626
+    },
+    {
+      "epoch": 0.1255022886051554,
+      "grad_norm": 0.1349351406097412,
+      "learning_rate": 0.00019213211046990934,
+      "loss": 1.1599,
+      "step": 1628
+    },
+    {
+      "epoch": 0.12565646832088653,
+      "grad_norm": 0.13710346817970276,
+      "learning_rate": 0.00019212180544105525,
+      "loss": 1.0866,
+      "step": 1630
+    },
+    {
+      "epoch": 0.12581064803661768,
+      "grad_norm": 0.14535072445869446,
+      "learning_rate": 0.00019211150041220117,
+      "loss": 1.0445,
+      "step": 1632
+    },
+    {
+      "epoch": 0.12596482775234882,
+      "grad_norm": 0.11799806356430054,
+      "learning_rate": 0.00019210119538334708,
+      "loss": 1.0525,
+      "step": 1634
+    },
+    {
+      "epoch": 0.12611900746807997,
+      "grad_norm": 0.13399624824523926,
+      "learning_rate": 0.000192090890354493,
+      "loss": 1.0246,
+      "step": 1636
+    },
+    {
+      "epoch": 0.12627318718381114,
+      "grad_norm": 0.14404788613319397,
+      "learning_rate": 0.00019208058532563894,
+      "loss": 1.0582,
+      "step": 1638
+    },
+    {
+      "epoch": 0.1264273668995423,
+      "grad_norm": 0.14395713806152344,
+      "learning_rate": 0.00019207028029678486,
+      "loss": 1.0686,
+      "step": 1640
+    },
+    {
+      "epoch": 0.12658154661527343,
+      "grad_norm": 0.13249294459819794,
+      "learning_rate": 0.00019205997526793077,
+      "loss": 1.1286,
+      "step": 1642
+    },
+    {
+      "epoch": 0.12673572633100458,
+      "grad_norm": 0.12791812419891357,
+      "learning_rate": 0.00019204967023907669,
+      "loss": 1.062,
+      "step": 1644
+    },
+    {
+      "epoch": 0.12688990604673572,
+      "grad_norm": 0.12210959941148758,
+      "learning_rate": 0.0001920393652102226,
+      "loss": 1.0419,
+      "step": 1646
+    },
+    {
+      "epoch": 0.12704408576246687,
+      "grad_norm": 0.13438813388347626,
+      "learning_rate": 0.00019202906018136852,
+      "loss": 1.0589,
+      "step": 1648
+    },
+    {
+      "epoch": 0.127198265478198,
+      "grad_norm": 0.12953762710094452,
+      "learning_rate": 0.00019201875515251443,
+      "loss": 1.0128,
+      "step": 1650
+    },
+    {
+      "epoch": 0.1273524451939292,
+      "grad_norm": 0.1318603903055191,
+      "learning_rate": 0.00019200845012366035,
+      "loss": 1.073,
+      "step": 1652
+    },
+    {
+      "epoch": 0.12750662490966033,
+      "grad_norm": 0.12956051528453827,
+      "learning_rate": 0.00019199814509480626,
+      "loss": 1.0489,
+      "step": 1654
+    },
+    {
+      "epoch": 0.12766080462539148,
+      "grad_norm": 0.13501368463039398,
+      "learning_rate": 0.00019198784006595218,
+      "loss": 1.0198,
+      "step": 1656
+    },
+    {
+      "epoch": 0.12781498434112262,
+      "grad_norm": 0.13902342319488525,
+      "learning_rate": 0.00019197753503709812,
+      "loss": 1.0512,
+      "step": 1658
+    },
+    {
+      "epoch": 0.12796916405685377,
+      "grad_norm": 0.15590503811836243,
+      "learning_rate": 0.00019196723000824403,
+      "loss": 1.1782,
+      "step": 1660
+    },
+    {
+      "epoch": 0.1281233437725849,
+      "grad_norm": 0.13954932987689972,
+      "learning_rate": 0.00019195692497938995,
+      "loss": 1.0421,
+      "step": 1662
+    },
+    {
+      "epoch": 0.12827752348831606,
+      "grad_norm": 0.11550859361886978,
+      "learning_rate": 0.00019194661995053586,
+      "loss": 1.086,
+      "step": 1664
+    },
+    {
+      "epoch": 0.1284317032040472,
+      "grad_norm": 0.12175869196653366,
+      "learning_rate": 0.00019193631492168178,
+      "loss": 1.0704,
+      "step": 1666
+    },
+    {
+      "epoch": 0.12858588291977838,
+      "grad_norm": 0.13503512740135193,
+      "learning_rate": 0.00019192600989282772,
+      "loss": 1.1166,
+      "step": 1668
+    },
+    {
+      "epoch": 0.12874006263550952,
+      "grad_norm": 0.12849009037017822,
+      "learning_rate": 0.00019191570486397363,
+      "loss": 1.0315,
+      "step": 1670
+    },
+    {
+      "epoch": 0.12889424235124067,
+      "grad_norm": 0.12484319508075714,
+      "learning_rate": 0.00019190539983511955,
+      "loss": 1.0737,
+      "step": 1672
+    },
+    {
+      "epoch": 0.1290484220669718,
+      "grad_norm": 0.1364014446735382,
+      "learning_rate": 0.00019189509480626546,
+      "loss": 1.0619,
+      "step": 1674
+    },
+    {
+      "epoch": 0.12920260178270296,
+      "grad_norm": 0.12930172681808472,
+      "learning_rate": 0.00019188478977741138,
+      "loss": 1.046,
+      "step": 1676
+    },
+    {
+      "epoch": 0.1293567814984341,
+      "grad_norm": 0.13860805332660675,
+      "learning_rate": 0.00019187448474855732,
+      "loss": 1.0832,
+      "step": 1678
+    },
+    {
+      "epoch": 0.12951096121416525,
+      "grad_norm": 0.1379111111164093,
+      "learning_rate": 0.00019186417971970324,
+      "loss": 1.1406,
+      "step": 1680
+    },
+    {
+      "epoch": 0.12966514092989642,
+      "grad_norm": 0.1349123865365982,
+      "learning_rate": 0.00019185387469084915,
+      "loss": 1.1055,
+      "step": 1682
+    },
+    {
+      "epoch": 0.12981932064562757,
+      "grad_norm": 0.13304142653942108,
+      "learning_rate": 0.00019184356966199507,
+      "loss": 1.0392,
+      "step": 1684
+    },
+    {
+      "epoch": 0.1299735003613587,
+      "grad_norm": 0.12159105390310287,
+      "learning_rate": 0.00019183326463314098,
+      "loss": 1.0548,
+      "step": 1686
+    },
+    {
+      "epoch": 0.13012768007708986,
+      "grad_norm": 0.12661418318748474,
+      "learning_rate": 0.00019182295960428692,
+      "loss": 1.0588,
+      "step": 1688
+    },
+    {
+      "epoch": 0.130281859792821,
+      "grad_norm": 0.13691510260105133,
+      "learning_rate": 0.00019181265457543284,
+      "loss": 1.0854,
+      "step": 1690
+    },
+    {
+      "epoch": 0.13043603950855215,
+      "grad_norm": 0.1401318609714508,
+      "learning_rate": 0.00019180234954657875,
+      "loss": 1.0864,
+      "step": 1692
+    },
+    {
+      "epoch": 0.1305902192242833,
+      "grad_norm": 0.1355384737253189,
+      "learning_rate": 0.00019179204451772467,
+      "loss": 1.058,
+      "step": 1694
+    },
+    {
+      "epoch": 0.13074439894001447,
+      "grad_norm": 0.13987474143505096,
+      "learning_rate": 0.00019178173948887058,
+      "loss": 1.06,
+      "step": 1696
+    },
+    {
+      "epoch": 0.1308985786557456,
+      "grad_norm": 0.14350661635398865,
+      "learning_rate": 0.0001917714344600165,
+      "loss": 1.0731,
+      "step": 1698
+    },
+    {
+      "epoch": 0.13105275837147676,
+      "grad_norm": 0.12443742901086807,
+      "learning_rate": 0.0001917611294311624,
+      "loss": 1.0987,
+      "step": 1700
+    },
+    {
+      "epoch": 0.13105275837147676,
+      "eval_loss": 1.0880467891693115,
+      "eval_runtime": 185.5457,
+      "eval_samples_per_second": 91.314,
+      "eval_steps_per_second": 1.428,
+      "step": 1700
+    },
+    {
+      "epoch": 0.1312069380872079,
+      "grad_norm": 0.10956554859876633,
+      "learning_rate": 0.00019175082440230833,
+      "loss": 1.0393,
+      "step": 1702
+    },
+    {
+      "epoch": 0.13136111780293905,
+      "grad_norm": 0.11846137791872025,
+      "learning_rate": 0.00019174051937345424,
+      "loss": 1.0998,
+      "step": 1704
+    },
+    {
+      "epoch": 0.1315152975186702,
+      "grad_norm": 0.11894328892230988,
+      "learning_rate": 0.00019173021434460016,
+      "loss": 1.1007,
+      "step": 1706
+    },
+    {
+      "epoch": 0.13166947723440134,
+      "grad_norm": 0.11090514808893204,
+      "learning_rate": 0.00019171990931574607,
+      "loss": 1.0343,
+      "step": 1708
+    },
+    {
+      "epoch": 0.1318236569501325,
+      "grad_norm": 0.1276719868183136,
+      "learning_rate": 0.000191709604286892,
+      "loss": 1.0392,
+      "step": 1710
+    },
+    {
+      "epoch": 0.13197783666586366,
+      "grad_norm": 0.12342885881662369,
+      "learning_rate": 0.00019169929925803793,
+      "loss": 1.063,
+      "step": 1712
+    },
+    {
+      "epoch": 0.1321320163815948,
+      "grad_norm": 0.1237882748246193,
+      "learning_rate": 0.00019168899422918384,
+      "loss": 1.0558,
+      "step": 1714
+    },
+    {
+      "epoch": 0.13228619609732595,
+      "grad_norm": 0.12958785891532898,
+      "learning_rate": 0.00019167868920032976,
+      "loss": 1.0493,
+      "step": 1716
+    },
+    {
+      "epoch": 0.1324403758130571,
+      "grad_norm": 0.1181110367178917,
+      "learning_rate": 0.00019166838417147567,
+      "loss": 1.0668,
+      "step": 1718
+    },
+    {
+      "epoch": 0.13259455552878824,
+      "grad_norm": 0.12053950875997543,
+      "learning_rate": 0.00019165807914262162,
+      "loss": 1.0392,
+      "step": 1720
+    },
+    {
+      "epoch": 0.13274873524451938,
+      "grad_norm": 0.11725175380706787,
+      "learning_rate": 0.00019164777411376753,
+      "loss": 1.0188,
+      "step": 1722
+    },
+    {
+      "epoch": 0.13290291496025053,
+      "grad_norm": 0.12475614994764328,
+      "learning_rate": 0.00019163746908491344,
+      "loss": 1.0134,
+      "step": 1724
+    },
+    {
+      "epoch": 0.1330570946759817,
+      "grad_norm": 0.1231207475066185,
+      "learning_rate": 0.00019162716405605936,
+      "loss": 1.0309,
+      "step": 1726
+    },
+    {
+      "epoch": 0.13321127439171285,
+      "grad_norm": 0.1269765943288803,
+      "learning_rate": 0.00019161685902720527,
+      "loss": 1.0918,
+      "step": 1728
+    },
+    {
+      "epoch": 0.133365454107444,
+      "grad_norm": 0.12103556841611862,
+      "learning_rate": 0.00019160655399835122,
+      "loss": 1.0453,
+      "step": 1730
+    },
+    {
+      "epoch": 0.13351963382317514,
+      "grad_norm": 0.12427771091461182,
+      "learning_rate": 0.00019159624896949713,
+      "loss": 1.1544,
+      "step": 1732
+    },
+    {
+      "epoch": 0.13367381353890628,
+      "grad_norm": 0.13416282832622528,
+      "learning_rate": 0.00019158594394064305,
+      "loss": 1.0941,
+      "step": 1734
+    },
+    {
+      "epoch": 0.13382799325463743,
+      "grad_norm": 0.13207705318927765,
+      "learning_rate": 0.00019157563891178896,
+      "loss": 1.0998,
+      "step": 1736
+    },
+    {
+      "epoch": 0.13398217297036857,
+      "grad_norm": 0.1436687856912613,
+      "learning_rate": 0.00019156533388293488,
+      "loss": 1.0723,
+      "step": 1738
+    },
+    {
+      "epoch": 0.13413635268609975,
+      "grad_norm": 0.1206304207444191,
+      "learning_rate": 0.00019155502885408082,
+      "loss": 1.0279,
+      "step": 1740
+    },
+    {
+      "epoch": 0.1342905324018309,
+      "grad_norm": 0.12685900926589966,
+      "learning_rate": 0.00019154472382522673,
+      "loss": 1.0683,
+      "step": 1742
+    },
+    {
+      "epoch": 0.13444471211756204,
+      "grad_norm": 0.12833228707313538,
+      "learning_rate": 0.00019153441879637265,
+      "loss": 1.0904,
+      "step": 1744
+    },
+    {
+      "epoch": 0.13459889183329318,
+      "grad_norm": 0.12999312579631805,
+      "learning_rate": 0.00019152411376751856,
+      "loss": 1.0492,
+      "step": 1746
+    },
+    {
+      "epoch": 0.13475307154902433,
+      "grad_norm": 0.13486912846565247,
+      "learning_rate": 0.00019151380873866448,
+      "loss": 1.101,
+      "step": 1748
+    },
+    {
+      "epoch": 0.13490725126475547,
+      "grad_norm": 0.12793023884296417,
+      "learning_rate": 0.0001915035037098104,
+      "loss": 1.1135,
+      "step": 1750
+    },
+    {
+      "epoch": 0.13506143098048662,
+      "grad_norm": 0.12652675807476044,
+      "learning_rate": 0.0001914931986809563,
+      "loss": 1.0902,
+      "step": 1752
+    },
+    {
+      "epoch": 0.1352156106962178,
+      "grad_norm": 0.12431836873292923,
+      "learning_rate": 0.00019148289365210222,
+      "loss": 1.0922,
+      "step": 1754
+    },
+    {
+      "epoch": 0.13536979041194894,
+      "grad_norm": 0.13665209710597992,
+      "learning_rate": 0.00019147258862324814,
+      "loss": 1.0584,
+      "step": 1756
+    },
+    {
+      "epoch": 0.13552397012768008,
+      "grad_norm": 0.1355196088552475,
+      "learning_rate": 0.00019146228359439405,
+      "loss": 1.1199,
+      "step": 1758
+    },
+    {
+      "epoch": 0.13567814984341123,
+      "grad_norm": 0.14115893840789795,
+      "learning_rate": 0.00019145197856554,
+      "loss": 1.0697,
+      "step": 1760
+    },
+    {
+      "epoch": 0.13583232955914237,
+      "grad_norm": 0.13009534776210785,
+      "learning_rate": 0.0001914416735366859,
+      "loss": 1.1111,
+      "step": 1762
+    },
+    {
+      "epoch": 0.13598650927487352,
+      "grad_norm": 0.12280994653701782,
+      "learning_rate": 0.00019143136850783182,
+      "loss": 1.0341,
+      "step": 1764
+    },
+    {
+      "epoch": 0.13614068899060466,
+      "grad_norm": 0.15171582996845245,
+      "learning_rate": 0.00019142106347897774,
+      "loss": 1.1275,
+      "step": 1766
+    },
+    {
+      "epoch": 0.1362948687063358,
+      "grad_norm": 0.15258526802062988,
+      "learning_rate": 0.00019141075845012365,
+      "loss": 1.0513,
+      "step": 1768
+    },
+    {
+      "epoch": 0.13644904842206698,
+      "grad_norm": 0.132346972823143,
+      "learning_rate": 0.0001914004534212696,
+      "loss": 1.0878,
+      "step": 1770
+    },
+    {
+      "epoch": 0.13660322813779813,
+      "grad_norm": 0.13237041234970093,
+      "learning_rate": 0.0001913901483924155,
+      "loss": 1.0845,
+      "step": 1772
+    },
+    {
+      "epoch": 0.13675740785352927,
+      "grad_norm": 0.13837209343910217,
+      "learning_rate": 0.00019137984336356143,
+      "loss": 1.1221,
+      "step": 1774
+    },
+    {
+      "epoch": 0.13691158756926042,
+      "grad_norm": 0.17590375244617462,
+      "learning_rate": 0.00019136953833470734,
+      "loss": 1.1963,
+      "step": 1776
+    },
+    {
+      "epoch": 0.13706576728499156,
+      "grad_norm": 0.12898488342761993,
+      "learning_rate": 0.00019135923330585326,
+      "loss": 1.1306,
+      "step": 1778
+    },
+    {
+      "epoch": 0.1372199470007227,
+      "grad_norm": 0.12428785115480423,
+      "learning_rate": 0.0001913489282769992,
+      "loss": 1.068,
+      "step": 1780
+    },
+    {
+      "epoch": 0.13737412671645385,
+      "grad_norm": 0.12678809463977814,
+      "learning_rate": 0.0001913386232481451,
+      "loss": 1.0709,
+      "step": 1782
+    },
+    {
+      "epoch": 0.13752830643218503,
+      "grad_norm": 0.1344168782234192,
+      "learning_rate": 0.00019132831821929103,
+      "loss": 1.1073,
+      "step": 1784
+    },
+    {
+      "epoch": 0.13768248614791617,
+      "grad_norm": 0.14730733633041382,
+      "learning_rate": 0.00019131801319043694,
+      "loss": 1.0073,
+      "step": 1786
+    },
+    {
+      "epoch": 0.13783666586364732,
+      "grad_norm": 0.13661792874336243,
+      "learning_rate": 0.00019130770816158286,
+      "loss": 1.0637,
+      "step": 1788
+    },
+    {
+      "epoch": 0.13799084557937846,
+      "grad_norm": 0.1342434138059616,
+      "learning_rate": 0.0001912974031327288,
+      "loss": 1.1069,
+      "step": 1790
+    },
+    {
+      "epoch": 0.1381450252951096,
+      "grad_norm": 0.11941581219434738,
+      "learning_rate": 0.00019128709810387471,
+      "loss": 1.1023,
+      "step": 1792
+    },
+    {
+      "epoch": 0.13829920501084075,
+      "grad_norm": 0.13641759753227234,
+      "learning_rate": 0.00019127679307502063,
+      "loss": 1.0564,
+      "step": 1794
+    },
+    {
+      "epoch": 0.1384533847265719,
+      "grad_norm": 0.11148608475923538,
+      "learning_rate": 0.00019126648804616654,
+      "loss": 1.0255,
+      "step": 1796
+    },
+    {
+      "epoch": 0.13860756444230307,
+      "grad_norm": 0.1387186199426651,
+      "learning_rate": 0.00019125618301731246,
+      "loss": 1.0663,
+      "step": 1798
+    },
+    {
+      "epoch": 0.13876174415803422,
+      "grad_norm": 0.12380651384592056,
+      "learning_rate": 0.00019124587798845837,
+      "loss": 1.1222,
+      "step": 1800
+    },
+    {
+      "epoch": 0.13876174415803422,
+      "eval_loss": 1.0875153541564941,
+      "eval_runtime": 185.4605,
+      "eval_samples_per_second": 91.356,
+      "eval_steps_per_second": 1.429,
+      "step": 1800
+    },
+    {
+      "epoch": 0.13891592387376536,
+      "grad_norm": 0.13224369287490845,
+      "learning_rate": 0.00019123557295960432,
+      "loss": 1.0821,
+      "step": 1802
+    },
+    {
+      "epoch": 0.1390701035894965,
+      "grad_norm": 0.13096244633197784,
+      "learning_rate": 0.00019122526793075023,
+      "loss": 1.0097,
+      "step": 1804
+    },
+    {
+      "epoch": 0.13922428330522765,
+      "grad_norm": 0.11652527749538422,
+      "learning_rate": 0.00019121496290189615,
+      "loss": 1.0517,
+      "step": 1806
+    },
+    {
+      "epoch": 0.1393784630209588,
+      "grad_norm": 0.13449358940124512,
+      "learning_rate": 0.00019120465787304206,
+      "loss": 1.0915,
+      "step": 1808
+    },
+    {
+      "epoch": 0.13953264273668994,
+      "grad_norm": 0.11550068855285645,
+      "learning_rate": 0.00019119435284418798,
+      "loss": 1.0568,
+      "step": 1810
+    },
+    {
+      "epoch": 0.13968682245242112,
+      "grad_norm": 0.13804587721824646,
+      "learning_rate": 0.0001911840478153339,
+      "loss": 1.0933,
+      "step": 1812
+    },
+    {
+      "epoch": 0.13984100216815226,
+      "grad_norm": 0.12062159180641174,
+      "learning_rate": 0.0001911737427864798,
+      "loss": 1.0517,
+      "step": 1814
+    },
+    {
+      "epoch": 0.1399951818838834,
+      "grad_norm": 0.12154779583215714,
+      "learning_rate": 0.00019116343775762572,
+      "loss": 1.0955,
+      "step": 1816
+    },
+    {
+      "epoch": 0.14014936159961455,
+      "grad_norm": 0.11615799367427826,
+      "learning_rate": 0.00019115313272877164,
+      "loss": 0.968,
+      "step": 1818
+    },
+    {
+      "epoch": 0.1403035413153457,
+      "grad_norm": 0.1207037940621376,
+      "learning_rate": 0.00019114282769991755,
+      "loss": 1.0896,
+      "step": 1820
+    },
+    {
+      "epoch": 0.14045772103107684,
+      "grad_norm": 0.12750887870788574,
+      "learning_rate": 0.0001911325226710635,
+      "loss": 1.065,
+      "step": 1822
+    },
+    {
+      "epoch": 0.140611900746808,
+      "grad_norm": 0.16391952335834503,
+      "learning_rate": 0.0001911222176422094,
+      "loss": 1.0232,
+      "step": 1824
+    },
+    {
+      "epoch": 0.14076608046253913,
+      "grad_norm": 0.14626921713352203,
+      "learning_rate": 0.00019111191261335532,
+      "loss": 1.0375,
+      "step": 1826
+    },
+    {
+      "epoch": 0.1409202601782703,
+      "grad_norm": 0.12393996119499207,
+      "learning_rate": 0.00019110160758450124,
+      "loss": 1.0345,
+      "step": 1828
+    },
+    {
+      "epoch": 0.14107443989400145,
+      "grad_norm": 0.13275925815105438,
+      "learning_rate": 0.00019109130255564715,
+      "loss": 1.071,
+      "step": 1830
+    },
+    {
+      "epoch": 0.1412286196097326,
+      "grad_norm": 0.1255485862493515,
+      "learning_rate": 0.0001910809975267931,
+      "loss": 1.1026,
+      "step": 1832
+    },
+    {
+      "epoch": 0.14138279932546374,
+      "grad_norm": 0.13399668037891388,
+      "learning_rate": 0.000191070692497939,
+      "loss": 1.11,
+      "step": 1834
+    },
+    {
+      "epoch": 0.1415369790411949,
+      "grad_norm": 0.13084925711154938,
+      "learning_rate": 0.00019106038746908492,
+      "loss": 1.0528,
+      "step": 1836
+    },
+    {
+      "epoch": 0.14169115875692603,
+      "grad_norm": 0.15695689618587494,
+      "learning_rate": 0.00019105008244023084,
+      "loss": 1.1336,
+      "step": 1838
+    },
+    {
+      "epoch": 0.14184533847265718,
+      "grad_norm": 0.13630808889865875,
+      "learning_rate": 0.00019103977741137675,
+      "loss": 1.0767,
+      "step": 1840
+    },
+    {
+      "epoch": 0.14199951818838835,
+      "grad_norm": 0.11874844878911972,
+      "learning_rate": 0.0001910294723825227,
+      "loss": 1.0511,
+      "step": 1842
+    },
+    {
+      "epoch": 0.1421536979041195,
+      "grad_norm": 0.11898507922887802,
+      "learning_rate": 0.0001910191673536686,
+      "loss": 1.0866,
+      "step": 1844
+    },
+    {
+      "epoch": 0.14230787761985064,
+      "grad_norm": 0.1393211930990219,
+      "learning_rate": 0.00019100886232481453,
+      "loss": 1.0553,
+      "step": 1846
+    },
+    {
+      "epoch": 0.1424620573355818,
+      "grad_norm": 0.1382310539484024,
+      "learning_rate": 0.00019099855729596044,
+      "loss": 1.07,
+      "step": 1848
+    },
+    {
+      "epoch": 0.14261623705131293,
+      "grad_norm": 0.1471824198961258,
+      "learning_rate": 0.00019098825226710636,
+      "loss": 1.0893,
+      "step": 1850
+    },
+    {
+      "epoch": 0.14277041676704408,
+      "grad_norm": 0.12706084549427032,
+      "learning_rate": 0.0001909779472382523,
+      "loss": 1.0848,
+      "step": 1852
+    },
+    {
+      "epoch": 0.14292459648277522,
+      "grad_norm": 0.1324569135904312,
+      "learning_rate": 0.0001909676422093982,
+      "loss": 1.024,
+      "step": 1854
+    },
+    {
+      "epoch": 0.1430787761985064,
+      "grad_norm": 0.11245544254779816,
+      "learning_rate": 0.00019095733718054413,
+      "loss": 1.0802,
+      "step": 1856
+    },
+    {
+      "epoch": 0.14323295591423754,
+      "grad_norm": 0.15419217944145203,
+      "learning_rate": 0.00019094703215169004,
+      "loss": 1.1101,
+      "step": 1858
+    },
+    {
+      "epoch": 0.1433871356299687,
+      "grad_norm": 0.1071443036198616,
+      "learning_rate": 0.00019093672712283596,
+      "loss": 1.0576,
+      "step": 1860
+    },
+    {
+      "epoch": 0.14354131534569983,
+      "grad_norm": 0.1341090053319931,
+      "learning_rate": 0.00019092642209398187,
+      "loss": 1.0606,
+      "step": 1862
+    },
+    {
+      "epoch": 0.14369549506143098,
+      "grad_norm": 0.11848092079162598,
+      "learning_rate": 0.0001909161170651278,
+      "loss": 1.0714,
+      "step": 1864
+    },
+    {
+      "epoch": 0.14384967477716212,
+      "grad_norm": 0.12697815895080566,
+      "learning_rate": 0.0001909058120362737,
+      "loss": 1.092,
+      "step": 1866
+    },
+    {
+      "epoch": 0.14400385449289327,
+      "grad_norm": 0.11891257762908936,
+      "learning_rate": 0.00019089550700741962,
+      "loss": 0.9649,
+      "step": 1868
+    },
+    {
+      "epoch": 0.14415803420862444,
+      "grad_norm": 0.12616439163684845,
+      "learning_rate": 0.00019088520197856553,
+      "loss": 1.0962,
+      "step": 1870
+    },
+    {
+      "epoch": 0.1443122139243556,
+      "grad_norm": 0.12141067534685135,
+      "learning_rate": 0.00019087489694971147,
+      "loss": 1.0838,
+      "step": 1872
+    },
+    {
+      "epoch": 0.14446639364008673,
+      "grad_norm": 0.13279564678668976,
+      "learning_rate": 0.0001908645919208574,
+      "loss": 1.0484,
+      "step": 1874
+    },
+    {
+      "epoch": 0.14462057335581788,
+      "grad_norm": 0.15748505294322968,
+      "learning_rate": 0.0001908542868920033,
+      "loss": 1.1433,
+      "step": 1876
+    },
+    {
+      "epoch": 0.14477475307154902,
+      "grad_norm": 0.11593475937843323,
+      "learning_rate": 0.00019084398186314922,
+      "loss": 1.1483,
+      "step": 1878
+    },
+    {
+      "epoch": 0.14492893278728017,
+      "grad_norm": 0.14499489963054657,
+      "learning_rate": 0.00019083367683429513,
+      "loss": 1.0782,
+      "step": 1880
+    },
+    {
+      "epoch": 0.1450831125030113,
+      "grad_norm": 0.13570410013198853,
+      "learning_rate": 0.00019082337180544105,
+      "loss": 1.0989,
+      "step": 1882
+    },
+    {
+      "epoch": 0.14523729221874246,
+      "grad_norm": 0.12810774147510529,
+      "learning_rate": 0.000190813066776587,
+      "loss": 1.0374,
+      "step": 1884
+    },
+    {
+      "epoch": 0.14539147193447363,
+      "grad_norm": 0.11781581491231918,
+      "learning_rate": 0.0001908027617477329,
+      "loss": 1.0796,
+      "step": 1886
+    },
+    {
+      "epoch": 0.14554565165020478,
+      "grad_norm": 0.12243229150772095,
+      "learning_rate": 0.00019079245671887882,
+      "loss": 1.0477,
+      "step": 1888
+    },
+    {
+      "epoch": 0.14569983136593592,
+      "grad_norm": 0.1385030299425125,
+      "learning_rate": 0.00019078215169002474,
+      "loss": 1.0349,
+      "step": 1890
+    },
+    {
+      "epoch": 0.14585401108166707,
+      "grad_norm": 0.12011386454105377,
+      "learning_rate": 0.00019077184666117065,
+      "loss": 1.0718,
+      "step": 1892
+    },
+    {
+      "epoch": 0.1460081907973982,
+      "grad_norm": 0.12646062672138214,
+      "learning_rate": 0.0001907615416323166,
+      "loss": 1.1228,
+      "step": 1894
+    },
+    {
+      "epoch": 0.14616237051312936,
+      "grad_norm": 0.1284620612859726,
+      "learning_rate": 0.0001907512366034625,
+      "loss": 1.079,
+      "step": 1896
+    },
+    {
+      "epoch": 0.1463165502288605,
+      "grad_norm": 0.15374581515789032,
+      "learning_rate": 0.00019074093157460842,
+      "loss": 1.1147,
+      "step": 1898
+    },
+    {
+      "epoch": 0.14647072994459168,
+      "grad_norm": 0.1325882524251938,
+      "learning_rate": 0.00019073062654575434,
+      "loss": 1.0404,
+      "step": 1900
+    },
+    {
+      "epoch": 0.14647072994459168,
+      "eval_loss": 1.0869932174682617,
+      "eval_runtime": 185.4754,
+      "eval_samples_per_second": 91.349,
+      "eval_steps_per_second": 1.429,
+      "step": 1900
+    },
+    {
+      "epoch": 0.14662490966032282,
+      "grad_norm": 0.14041611552238464,
+      "learning_rate": 0.00019072032151690025,
+      "loss": 1.095,
+      "step": 1902
+    },
+    {
+      "epoch": 0.14677908937605397,
+      "grad_norm": 0.14162160456180573,
+      "learning_rate": 0.0001907100164880462,
+      "loss": 1.1714,
+      "step": 1904
+    },
+    {
+      "epoch": 0.1469332690917851,
+      "grad_norm": 0.12077832221984863,
+      "learning_rate": 0.0001906997114591921,
+      "loss": 1.1109,
+      "step": 1906
+    },
+    {
+      "epoch": 0.14708744880751626,
+      "grad_norm": 0.1738968789577484,
+      "learning_rate": 0.00019068940643033802,
+      "loss": 1.0838,
+      "step": 1908
+    },
+    {
+      "epoch": 0.1472416285232474,
+      "grad_norm": 0.13948039710521698,
+      "learning_rate": 0.00019067910140148394,
+      "loss": 1.0494,
+      "step": 1910
+    },
+    {
+      "epoch": 0.14739580823897855,
+      "grad_norm": 0.21179239451885223,
+      "learning_rate": 0.00019066879637262985,
+      "loss": 1.0962,
+      "step": 1912
+    },
+    {
+      "epoch": 0.14754998795470972,
+      "grad_norm": 0.12927787005901337,
+      "learning_rate": 0.00019065849134377577,
+      "loss": 1.1113,
+      "step": 1914
+    },
+    {
+      "epoch": 0.14770416767044087,
+      "grad_norm": 0.1296701431274414,
+      "learning_rate": 0.00019064818631492168,
+      "loss": 1.0603,
+      "step": 1916
+    },
+    {
+      "epoch": 0.147858347386172,
+      "grad_norm": 0.1282590925693512,
+      "learning_rate": 0.0001906378812860676,
+      "loss": 1.0594,
+      "step": 1918
+    },
+    {
+      "epoch": 0.14801252710190316,
+      "grad_norm": 0.13304758071899414,
+      "learning_rate": 0.0001906275762572135,
+      "loss": 1.0784,
+      "step": 1920
+    },
+    {
+      "epoch": 0.1481667068176343,
+      "grad_norm": 0.15661965310573578,
+      "learning_rate": 0.00019061727122835943,
+      "loss": 1.008,
+      "step": 1922
+    },
+    {
+      "epoch": 0.14832088653336545,
+      "grad_norm": 0.12986873090267181,
+      "learning_rate": 0.00019060696619950537,
+      "loss": 1.0788,
+      "step": 1924
+    },
+    {
+      "epoch": 0.1484750662490966,
+      "grad_norm": 0.1128251776099205,
+      "learning_rate": 0.00019059666117065128,
+      "loss": 1.1449,
+      "step": 1926
+    },
+    {
+      "epoch": 0.14862924596482774,
+      "grad_norm": 0.13722160458564758,
+      "learning_rate": 0.0001905863561417972,
+      "loss": 1.0914,
+      "step": 1928
+    },
+    {
+      "epoch": 0.1487834256805589,
+      "grad_norm": 0.1507786512374878,
+      "learning_rate": 0.00019057605111294311,
+      "loss": 1.0694,
+      "step": 1930
+    },
+    {
+      "epoch": 0.14893760539629006,
+      "grad_norm": 0.1368752121925354,
+      "learning_rate": 0.00019056574608408903,
+      "loss": 1.0417,
+      "step": 1932
+    },
+    {
+      "epoch": 0.1490917851120212,
+      "grad_norm": 0.12566259503364563,
+      "learning_rate": 0.00019055544105523497,
+      "loss": 1.0853,
+      "step": 1934
+    },
+    {
+      "epoch": 0.14924596482775235,
+      "grad_norm": 0.12362397462129593,
+      "learning_rate": 0.0001905451360263809,
+      "loss": 1.1136,
+      "step": 1936
+    },
+    {
+      "epoch": 0.1494001445434835,
+      "grad_norm": 0.12472514808177948,
+      "learning_rate": 0.0001905348309975268,
+      "loss": 1.0628,
+      "step": 1938
+    },
+    {
+      "epoch": 0.14955432425921464,
+      "grad_norm": 0.1355161964893341,
+      "learning_rate": 0.00019052452596867272,
+      "loss": 1.1211,
+      "step": 1940
+    },
+    {
+      "epoch": 0.14970850397494578,
+      "grad_norm": 0.13438721001148224,
+      "learning_rate": 0.00019051422093981863,
+      "loss": 1.0758,
+      "step": 1942
+    },
+    {
+      "epoch": 0.14986268369067696,
+      "grad_norm": 0.11768204718828201,
+      "learning_rate": 0.00019050391591096457,
+      "loss": 1.0533,
+      "step": 1944
+    },
+    {
+      "epoch": 0.1500168634064081,
+      "grad_norm": 0.13892577588558197,
+      "learning_rate": 0.0001904936108821105,
+      "loss": 1.1076,
+      "step": 1946
+    },
+    {
+      "epoch": 0.15017104312213925,
+      "grad_norm": 0.1532358080148697,
+      "learning_rate": 0.0001904833058532564,
+      "loss": 1.0706,
+      "step": 1948
+    },
+    {
+      "epoch": 0.1503252228378704,
+      "grad_norm": 0.13364464044570923,
+      "learning_rate": 0.00019047300082440232,
+      "loss": 1.1322,
+      "step": 1950
+    },
+    {
+      "epoch": 0.15047940255360154,
+      "grad_norm": 0.12663134932518005,
+      "learning_rate": 0.00019046269579554823,
+      "loss": 1.0749,
+      "step": 1952
+    },
+    {
+      "epoch": 0.15063358226933268,
+      "grad_norm": 0.1297607123851776,
+      "learning_rate": 0.00019045239076669417,
+      "loss": 1.0594,
+      "step": 1954
+    },
+    {
+      "epoch": 0.15078776198506383,
+      "grad_norm": 0.11931920051574707,
+      "learning_rate": 0.0001904420857378401,
+      "loss": 1.0522,
+      "step": 1956
+    },
+    {
+      "epoch": 0.150941941700795,
+      "grad_norm": 0.1334810107946396,
+      "learning_rate": 0.000190431780708986,
+      "loss": 1.0674,
+      "step": 1958
+    },
+    {
+      "epoch": 0.15109612141652615,
+      "grad_norm": 0.12633340060710907,
+      "learning_rate": 0.00019042147568013192,
+      "loss": 1.0139,
+      "step": 1960
+    },
+    {
+      "epoch": 0.1512503011322573,
+      "grad_norm": 0.12485836446285248,
+      "learning_rate": 0.00019041117065127783,
+      "loss": 1.0288,
+      "step": 1962
+    },
+    {
+      "epoch": 0.15140448084798844,
+      "grad_norm": 0.10940799117088318,
+      "learning_rate": 0.00019040086562242375,
+      "loss": 1.0475,
+      "step": 1964
+    },
+    {
+      "epoch": 0.15155866056371958,
+      "grad_norm": 0.12229325622320175,
+      "learning_rate": 0.00019039056059356966,
+      "loss": 1.0628,
+      "step": 1966
+    },
+    {
+      "epoch": 0.15171284027945073,
+      "grad_norm": 0.14333505928516388,
+      "learning_rate": 0.00019038025556471558,
+      "loss": 1.0423,
+      "step": 1968
+    },
+    {
+      "epoch": 0.15186701999518187,
+      "grad_norm": 0.12773017585277557,
+      "learning_rate": 0.0001903699505358615,
+      "loss": 1.1283,
+      "step": 1970
+    },
+    {
+      "epoch": 0.15202119971091305,
+      "grad_norm": 0.11913473904132843,
+      "learning_rate": 0.0001903596455070074,
+      "loss": 1.0646,
+      "step": 1972
+    },
+    {
+      "epoch": 0.1521753794266442,
+      "grad_norm": 0.13321518898010254,
+      "learning_rate": 0.00019034934047815332,
+      "loss": 1.0476,
+      "step": 1974
+    },
+    {
+      "epoch": 0.15232955914237534,
+      "grad_norm": 0.1362799108028412,
+      "learning_rate": 0.00019033903544929927,
+      "loss": 1.0937,
+      "step": 1976
+    },
+    {
+      "epoch": 0.15248373885810648,
+      "grad_norm": 0.13804180920124054,
+      "learning_rate": 0.00019032873042044518,
+      "loss": 1.113,
+      "step": 1978
+    },
+    {
+      "epoch": 0.15263791857383763,
+      "grad_norm": 0.1774570494890213,
+      "learning_rate": 0.0001903184253915911,
+      "loss": 1.0795,
+      "step": 1980
+    },
+    {
+      "epoch": 0.15279209828956877,
+      "grad_norm": 0.13106994330883026,
+      "learning_rate": 0.000190308120362737,
+      "loss": 1.098,
+      "step": 1982
+    },
+    {
+      "epoch": 0.15294627800529992,
+      "grad_norm": 0.14435411989688873,
+      "learning_rate": 0.00019029781533388293,
+      "loss": 1.0814,
+      "step": 1984
+    },
+    {
+      "epoch": 0.15310045772103106,
+      "grad_norm": 0.13178013265132904,
+      "learning_rate": 0.00019028751030502887,
+      "loss": 1.1002,
+      "step": 1986
+    },
+    {
+      "epoch": 0.15325463743676224,
+      "grad_norm": 0.1283218264579773,
+      "learning_rate": 0.00019027720527617478,
+      "loss": 1.0749,
+      "step": 1988
+    },
+    {
+      "epoch": 0.15340881715249338,
+      "grad_norm": 0.12113723158836365,
+      "learning_rate": 0.0001902669002473207,
+      "loss": 1.0831,
+      "step": 1990
+    },
+    {
+      "epoch": 0.15356299686822453,
+      "grad_norm": 0.12649892270565033,
+      "learning_rate": 0.0001902565952184666,
+      "loss": 1.0166,
+      "step": 1992
+    },
+    {
+      "epoch": 0.15371717658395567,
+      "grad_norm": 0.12823793292045593,
+      "learning_rate": 0.00019024629018961253,
+      "loss": 1.0273,
+      "step": 1994
+    },
+    {
+      "epoch": 0.15387135629968682,
+      "grad_norm": 0.1291527897119522,
+      "learning_rate": 0.00019023598516075847,
+      "loss": 1.1092,
+      "step": 1996
+    },
+    {
+      "epoch": 0.15402553601541796,
+      "grad_norm": 0.12588894367218018,
+      "learning_rate": 0.00019022568013190438,
+      "loss": 1.0627,
+      "step": 1998
+    },
+    {
+      "epoch": 0.1541797157311491,
+      "grad_norm": 0.12996312975883484,
+      "learning_rate": 0.0001902153751030503,
+      "loss": 1.1196,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1541797157311491,
+      "eval_loss": 1.0863893032073975,
+      "eval_runtime": 185.3254,
+      "eval_samples_per_second": 91.423,
+      "eval_steps_per_second": 1.43,
+      "step": 2000
+    },
+    {
+      "epoch": 0.15433389544688028,
+      "grad_norm": 0.14361834526062012,
+      "learning_rate": 0.00019020507007419621,
+      "loss": 1.1151,
+      "step": 2002
+    },
+    {
+      "epoch": 0.15448807516261143,
+      "grad_norm": 0.12650837004184723,
+      "learning_rate": 0.00019019476504534213,
+      "loss": 1.1155,
+      "step": 2004
+    },
+    {
+      "epoch": 0.15464225487834257,
+      "grad_norm": 0.13820499181747437,
+      "learning_rate": 0.00019018446001648807,
+      "loss": 1.1243,
+      "step": 2006
+    },
+    {
+      "epoch": 0.15479643459407372,
+      "grad_norm": 0.13205693662166595,
+      "learning_rate": 0.00019017415498763399,
+      "loss": 1.0626,
+      "step": 2008
+    },
+    {
+      "epoch": 0.15495061430980486,
+      "grad_norm": 0.13930106163024902,
+      "learning_rate": 0.0001901638499587799,
+      "loss": 1.1105,
+      "step": 2010
+    },
+    {
+      "epoch": 0.155104794025536,
+      "grad_norm": 0.14711922407150269,
+      "learning_rate": 0.00019015354492992582,
+      "loss": 1.0556,
+      "step": 2012
+    },
+    {
+      "epoch": 0.15525897374126715,
+      "grad_norm": 0.11909156292676926,
+      "learning_rate": 0.00019014323990107173,
+      "loss": 1.1025,
+      "step": 2014
+    },
+    {
+      "epoch": 0.15541315345699833,
+      "grad_norm": 0.14099714159965515,
+      "learning_rate": 0.00019013293487221767,
+      "loss": 1.064,
+      "step": 2016
+    },
+    {
+      "epoch": 0.15556733317272947,
+      "grad_norm": 0.11500216275453568,
+      "learning_rate": 0.0001901226298433636,
+      "loss": 1.1196,
+      "step": 2018
+    },
+    {
+      "epoch": 0.15572151288846062,
+      "grad_norm": 0.12341683357954025,
+      "learning_rate": 0.0001901123248145095,
+      "loss": 1.0625,
+      "step": 2020
+    },
+    {
+      "epoch": 0.15587569260419176,
+      "grad_norm": 0.1390669196844101,
+      "learning_rate": 0.00019010201978565542,
+      "loss": 1.0526,
+      "step": 2022
+    },
+    {
+      "epoch": 0.1560298723199229,
+      "grad_norm": 0.13482992351055145,
+      "learning_rate": 0.00019009171475680133,
+      "loss": 1.1074,
+      "step": 2024
+    },
+    {
+      "epoch": 0.15618405203565405,
+      "grad_norm": 0.12277045845985413,
+      "learning_rate": 0.00019008140972794725,
+      "loss": 1.0648,
+      "step": 2026
+    },
+    {
+      "epoch": 0.1563382317513852,
+      "grad_norm": 0.13579949736595154,
+      "learning_rate": 0.00019007110469909316,
+      "loss": 1.1235,
+      "step": 2028
+    },
+    {
+      "epoch": 0.15649241146711637,
+      "grad_norm": 0.14128637313842773,
+      "learning_rate": 0.00019006079967023908,
+      "loss": 1.0442,
+      "step": 2030
+    },
+    {
+      "epoch": 0.15664659118284752,
+      "grad_norm": 0.13722474873065948,
+      "learning_rate": 0.000190050494641385,
+      "loss": 1.1215,
+      "step": 2032
+    },
+    {
+      "epoch": 0.15680077089857866,
+      "grad_norm": 0.13500674068927765,
+      "learning_rate": 0.0001900401896125309,
+      "loss": 1.0776,
+      "step": 2034
+    },
+    {
+      "epoch": 0.1569549506143098,
+      "grad_norm": 0.11917294561862946,
+      "learning_rate": 0.00019002988458367685,
+      "loss": 1.0698,
+      "step": 2036
+    },
+    {
+      "epoch": 0.15710913033004095,
+      "grad_norm": 0.12245581299066544,
+      "learning_rate": 0.00019001957955482276,
+      "loss": 1.0166,
+      "step": 2038
+    },
+    {
+      "epoch": 0.1572633100457721,
+      "grad_norm": 0.12556669116020203,
+      "learning_rate": 0.00019000927452596868,
+      "loss": 1.0846,
+      "step": 2040
+    },
+    {
+      "epoch": 0.15741748976150324,
+      "grad_norm": 0.13316373527050018,
+      "learning_rate": 0.0001899989694971146,
+      "loss": 1.0566,
+      "step": 2042
+    },
+    {
+      "epoch": 0.1575716694772344,
+      "grad_norm": 0.1296815425157547,
+      "learning_rate": 0.0001899886644682605,
+      "loss": 1.0824,
+      "step": 2044
+    },
+    {
+      "epoch": 0.15772584919296556,
+      "grad_norm": 0.1288246214389801,
+      "learning_rate": 0.00018997835943940645,
+      "loss": 1.0974,
+      "step": 2046
+    },
+    {
+      "epoch": 0.1578800289086967,
+      "grad_norm": 0.1185479462146759,
+      "learning_rate": 0.00018996805441055237,
+      "loss": 1.1443,
+      "step": 2048
+    },
+    {
+      "epoch": 0.15803420862442785,
+      "grad_norm": 0.12504369020462036,
+      "learning_rate": 0.00018995774938169828,
+      "loss": 1.0899,
+      "step": 2050
+    },
+    {
+      "epoch": 0.158188388340159,
+      "grad_norm": 0.1266452521085739,
+      "learning_rate": 0.0001899474443528442,
+      "loss": 1.0654,
+      "step": 2052
+    },
+    {
+      "epoch": 0.15834256805589014,
+      "grad_norm": 0.13447126746177673,
+      "learning_rate": 0.0001899371393239901,
+      "loss": 1.0649,
+      "step": 2054
+    },
+    {
+      "epoch": 0.1584967477716213,
+      "grad_norm": 0.1446131467819214,
+      "learning_rate": 0.00018992683429513603,
+      "loss": 1.1439,
+      "step": 2056
+    },
+    {
+      "epoch": 0.15865092748735243,
+      "grad_norm": 0.12688389420509338,
+      "learning_rate": 0.00018991652926628197,
+      "loss": 1.0262,
+      "step": 2058
+    },
+    {
+      "epoch": 0.1588051072030836,
+      "grad_norm": 0.12581713497638702,
+      "learning_rate": 0.00018990622423742788,
+      "loss": 1.0723,
+      "step": 2060
+    },
+    {
+      "epoch": 0.15895928691881475,
+      "grad_norm": 0.15745951235294342,
+      "learning_rate": 0.0001898959192085738,
+      "loss": 1.1038,
+      "step": 2062
+    },
+    {
+      "epoch": 0.1591134666345459,
+      "grad_norm": 0.14457587897777557,
+      "learning_rate": 0.0001898856141797197,
+      "loss": 1.1072,
+      "step": 2064
+    },
+    {
+      "epoch": 0.15926764635027704,
+      "grad_norm": 0.11454683542251587,
+      "learning_rate": 0.00018987530915086563,
+      "loss": 1.0605,
+      "step": 2066
+    },
+    {
+      "epoch": 0.1594218260660082,
+      "grad_norm": 0.1137547716498375,
+      "learning_rate": 0.00018986500412201157,
+      "loss": 1.0405,
+      "step": 2068
+    },
+    {
+      "epoch": 0.15957600578173933,
+      "grad_norm": 0.1220378428697586,
+      "learning_rate": 0.00018985469909315748,
+      "loss": 1.086,
+      "step": 2070
+    },
+    {
+      "epoch": 0.15973018549747048,
+      "grad_norm": 0.13579098880290985,
+      "learning_rate": 0.0001898443940643034,
+      "loss": 1.0334,
+      "step": 2072
+    },
+    {
+      "epoch": 0.15988436521320165,
+      "grad_norm": 0.1529407948255539,
+      "learning_rate": 0.00018983408903544931,
+      "loss": 1.0614,
+      "step": 2074
+    },
+    {
+      "epoch": 0.1600385449289328,
+      "grad_norm": 0.13769444823265076,
+      "learning_rate": 0.00018982378400659523,
+      "loss": 1.1212,
+      "step": 2076
+    },
+    {
+      "epoch": 0.16019272464466394,
+      "grad_norm": 0.12095335125923157,
+      "learning_rate": 0.00018981347897774114,
+      "loss": 1.047,
+      "step": 2078
+    },
+    {
+      "epoch": 0.1603469043603951,
+      "grad_norm": 0.12483233958482742,
+      "learning_rate": 0.00018980317394888706,
+      "loss": 1.0808,
+      "step": 2080
+    },
+    {
+      "epoch": 0.16050108407612623,
+      "grad_norm": 0.12451382726430893,
+      "learning_rate": 0.00018979286892003297,
+      "loss": 1.1259,
+      "step": 2082
+    },
+    {
+      "epoch": 0.16065526379185738,
+      "grad_norm": 0.12540730834007263,
+      "learning_rate": 0.0001897825638911789,
+      "loss": 1.0761,
+      "step": 2084
+    },
+    {
+      "epoch": 0.16080944350758852,
+      "grad_norm": 0.12948516011238098,
+      "learning_rate": 0.0001897722588623248,
+      "loss": 1.0621,
+      "step": 2086
+    },
+    {
+      "epoch": 0.16096362322331967,
+      "grad_norm": 0.1349886953830719,
+      "learning_rate": 0.00018976195383347075,
+      "loss": 1.0549,
+      "step": 2088
+    },
+    {
+      "epoch": 0.16111780293905084,
+      "grad_norm": 0.1249813437461853,
+      "learning_rate": 0.00018975164880461666,
+      "loss": 1.0828,
+      "step": 2090
+    },
+    {
+      "epoch": 0.161271982654782,
+      "grad_norm": 0.1299104243516922,
+      "learning_rate": 0.00018974134377576258,
+      "loss": 1.097,
+      "step": 2092
+    },
+    {
+      "epoch": 0.16142616237051313,
+      "grad_norm": 0.13004744052886963,
+      "learning_rate": 0.0001897310387469085,
+      "loss": 1.0417,
+      "step": 2094
+    },
+    {
+      "epoch": 0.16158034208624428,
+      "grad_norm": 0.11553830653429031,
+      "learning_rate": 0.0001897207337180544,
+      "loss": 1.0563,
+      "step": 2096
+    },
+    {
+      "epoch": 0.16173452180197542,
+      "grad_norm": 0.12000396102666855,
+      "learning_rate": 0.00018971042868920035,
+      "loss": 1.077,
+      "step": 2098
+    },
+    {
+      "epoch": 0.16188870151770657,
+      "grad_norm": 0.13707685470581055,
+      "learning_rate": 0.00018970012366034626,
+      "loss": 1.0994,
+      "step": 2100
+    },
+    {
+      "epoch": 0.16188870151770657,
+      "eval_loss": 1.0858707427978516,
+      "eval_runtime": 185.7188,
+      "eval_samples_per_second": 91.229,
+      "eval_steps_per_second": 1.427,
+      "step": 2100
+    }
+  ],
+  "logging_steps": 2,
+  "max_steps": 38916,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 3,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.132999221824717e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}