diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,4178 @@
+{
+  "best_global_step": 5600,
+  "best_metric": 1.2055819034576416,
+  "best_model_checkpoint": "models/MNLP_M3_rag_model_test/checkpoint-5600",
+  "epoch": 2.9254471863167515,
+  "eval_steps": 200,
+  "global_step": 5600,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005222613918266093,
+      "grad_norm": 27.89534568786621,
+      "learning_rate": 7.000000000000001e-07,
+      "loss": 3.4324,
+      "step": 10
+    },
+    {
+      "epoch": 0.010445227836532185,
+      "grad_norm": 12.811655044555664,
+      "learning_rate": 1.7000000000000002e-06,
+      "loss": 3.1666,
+      "step": 20
+    },
+    {
+      "epoch": 0.015667841754798278,
+      "grad_norm": 9.790305137634277,
+      "learning_rate": 2.7000000000000004e-06,
+      "loss": 2.7959,
+      "step": 30
+    },
+    {
+      "epoch": 0.02089045567306437,
+      "grad_norm": 8.068702697753906,
+      "learning_rate": 3.7e-06,
+      "loss": 2.5106,
+      "step": 40
+    },
+    {
+      "epoch": 0.02611306959133046,
+      "grad_norm": 8.072208404541016,
+      "learning_rate": 4.7e-06,
+      "loss": 2.4756,
+      "step": 50
+    },
+    {
+      "epoch": 0.031335683509596556,
+      "grad_norm": 8.231782913208008,
+      "learning_rate": 5.7e-06,
+      "loss": 2.433,
+      "step": 60
+    },
+    {
+      "epoch": 0.036558297427862645,
+      "grad_norm": 9.866532325744629,
+      "learning_rate": 6.700000000000001e-06,
+      "loss": 2.3698,
+      "step": 70
+    },
+    {
+      "epoch": 0.04178091134612874,
+      "grad_norm": 9.043497085571289,
+      "learning_rate": 7.7e-06,
+      "loss": 2.3509,
+      "step": 80
+    },
+    {
+      "epoch": 0.04700352526439483,
+      "grad_norm": 8.417478561401367,
+      "learning_rate": 8.700000000000001e-06,
+      "loss": 2.3521,
+      "step": 90
+    },
+    {
+      "epoch": 0.05222613918266092,
+      "grad_norm": 8.51689624786377,
+      "learning_rate": 9.7e-06,
+      "loss": 2.3839,
+      "step": 100
+    },
+    {
+      "epoch": 0.057448753100927015,
+      "grad_norm": 8.41511058807373,
+      "learning_rate": 9.995398369708125e-06,
+      "loss": 2.1144,
+      "step": 110
+    },
+    {
+      "epoch": 0.06267136701919311,
+      "grad_norm": 9.541606903076172,
+      "learning_rate": 9.988824612148304e-06,
+      "loss": 2.1907,
+      "step": 120
+    },
+    {
+      "epoch": 0.0678939809374592,
+      "grad_norm": 8.702176094055176,
+      "learning_rate": 9.982250854588484e-06,
+      "loss": 2.1522,
+      "step": 130
+    },
+    {
+      "epoch": 0.07311659485572529,
+      "grad_norm": 7.285041809082031,
+      "learning_rate": 9.975677097028661e-06,
+      "loss": 2.289,
+      "step": 140
+    },
+    {
+      "epoch": 0.07833920877399138,
+      "grad_norm": 8.327420234680176,
+      "learning_rate": 9.96910333946884e-06,
+      "loss": 2.1041,
+      "step": 150
+    },
+    {
+      "epoch": 0.08356182269225748,
+      "grad_norm": 9.352178573608398,
+      "learning_rate": 9.96252958190902e-06,
+      "loss": 2.1858,
+      "step": 160
+    },
+    {
+      "epoch": 0.08878443661052357,
+      "grad_norm": 10.03535270690918,
+      "learning_rate": 9.955955824349198e-06,
+      "loss": 2.092,
+      "step": 170
+    },
+    {
+      "epoch": 0.09400705052878966,
+      "grad_norm": 8.207446098327637,
+      "learning_rate": 9.949382066789379e-06,
+      "loss": 2.0972,
+      "step": 180
+    },
+    {
+      "epoch": 0.09922966444705575,
+      "grad_norm": 10.35864543914795,
+      "learning_rate": 9.942808309229556e-06,
+      "loss": 2.0366,
+      "step": 190
+    },
+    {
+      "epoch": 0.10445227836532184,
+      "grad_norm": 8.569424629211426,
+      "learning_rate": 9.936234551669736e-06,
+      "loss": 2.0236,
+      "step": 200
+    },
+    {
+      "epoch": 0.10445227836532184,
+      "eval_loss": 2.0600883960723877,
+      "eval_runtime": 46.6244,
+      "eval_samples_per_second": 36.505,
+      "eval_steps_per_second": 4.568,
+      "step": 200
+    },
+    {
+      "epoch": 0.10967489228358794,
+      "grad_norm": 10.646219253540039,
+      "learning_rate": 9.929660794109915e-06,
+      "loss": 2.1485,
+      "step": 210
+    },
+    {
+      "epoch": 0.11489750620185403,
+      "grad_norm": 8.741375923156738,
+      "learning_rate": 9.923087036550093e-06,
+      "loss": 2.0663,
+      "step": 220
+    },
+    {
+      "epoch": 0.12012012012012012,
+      "grad_norm": 8.898869514465332,
+      "learning_rate": 9.916513278990272e-06,
+      "loss": 1.8793,
+      "step": 230
+    },
+    {
+      "epoch": 0.12534273403838622,
+      "grad_norm": 10.121482849121094,
+      "learning_rate": 9.90993952143045e-06,
+      "loss": 1.9377,
+      "step": 240
+    },
+    {
+      "epoch": 0.1305653479566523,
+      "grad_norm": 9.474202156066895,
+      "learning_rate": 9.903365763870629e-06,
+      "loss": 2.0534,
+      "step": 250
+    },
+    {
+      "epoch": 0.1357879618749184,
+      "grad_norm": 7.948584079742432,
+      "learning_rate": 9.896792006310808e-06,
+      "loss": 1.9262,
+      "step": 260
+    },
+    {
+      "epoch": 0.1410105757931845,
+      "grad_norm": 9.371474266052246,
+      "learning_rate": 9.890218248750986e-06,
+      "loss": 1.9202,
+      "step": 270
+    },
+    {
+      "epoch": 0.14623318971145058,
+      "grad_norm": 8.858964920043945,
+      "learning_rate": 9.883644491191165e-06,
+      "loss": 2.0666,
+      "step": 280
+    },
+    {
+      "epoch": 0.15145580362971667,
+      "grad_norm": 8.085942268371582,
+      "learning_rate": 9.877070733631345e-06,
+      "loss": 1.9757,
+      "step": 290
+    },
+    {
+      "epoch": 0.15667841754798276,
+      "grad_norm": 9.249700546264648,
+      "learning_rate": 9.870496976071522e-06,
+      "loss": 1.9912,
+      "step": 300
+    },
+    {
+      "epoch": 0.16190103146624885,
+      "grad_norm": 9.418254852294922,
+      "learning_rate": 9.863923218511702e-06,
+      "loss": 1.8573,
+      "step": 310
+    },
+    {
+      "epoch": 0.16712364538451496,
+      "grad_norm": 7.633480072021484,
+      "learning_rate": 9.857349460951881e-06,
+      "loss": 1.8782,
+      "step": 320
+    },
+    {
+      "epoch": 0.17234625930278105,
+      "grad_norm": 8.046884536743164,
+      "learning_rate": 9.850775703392059e-06,
+      "loss": 2.0481,
+      "step": 330
+    },
+    {
+      "epoch": 0.17756887322104714,
+      "grad_norm": 8.936604499816895,
+      "learning_rate": 9.844201945832238e-06,
+      "loss": 1.9284,
+      "step": 340
+    },
+    {
+      "epoch": 0.18279148713931323,
+      "grad_norm": 8.811029434204102,
+      "learning_rate": 9.837628188272417e-06,
+      "loss": 1.9936,
+      "step": 350
+    },
+    {
+      "epoch": 0.18801410105757932,
+      "grad_norm": 8.724671363830566,
+      "learning_rate": 9.831054430712597e-06,
+      "loss": 1.6656,
+      "step": 360
+    },
+    {
+      "epoch": 0.1932367149758454,
+      "grad_norm": 8.415898323059082,
+      "learning_rate": 9.824480673152776e-06,
+      "loss": 1.7273,
+      "step": 370
+    },
+    {
+      "epoch": 0.1984593288941115,
+      "grad_norm": 8.664811134338379,
+      "learning_rate": 9.817906915592954e-06,
+      "loss": 1.865,
+      "step": 380
+    },
+    {
+      "epoch": 0.2036819428123776,
+      "grad_norm": 8.129899978637695,
+      "learning_rate": 9.811333158033133e-06,
+      "loss": 1.7854,
+      "step": 390
+    },
+    {
+      "epoch": 0.20890455673064368,
+      "grad_norm": 9.967806816101074,
+      "learning_rate": 9.80475940047331e-06,
+      "loss": 1.7001,
+      "step": 400
+    },
+    {
+      "epoch": 0.20890455673064368,
+      "eval_loss": 1.8084577322006226,
+      "eval_runtime": 46.1675,
+      "eval_samples_per_second": 36.866,
+      "eval_steps_per_second": 4.614,
+      "step": 400
+    },
+    {
+      "epoch": 0.2141271706489098,
+      "grad_norm": 8.425873756408691,
+      "learning_rate": 9.79818564291349e-06,
+      "loss": 1.8895,
+      "step": 410
+    },
+    {
+      "epoch": 0.21934978456717588,
+      "grad_norm": 7.594646453857422,
+      "learning_rate": 9.79161188535367e-06,
+      "loss": 1.7879,
+      "step": 420
+    },
+    {
+      "epoch": 0.22457239848544197,
+      "grad_norm": 8.4673433303833,
+      "learning_rate": 9.785038127793847e-06,
+      "loss": 1.8445,
+      "step": 430
+    },
+    {
+      "epoch": 0.22979501240370806,
+      "grad_norm": 8.08828353881836,
+      "learning_rate": 9.778464370234027e-06,
+      "loss": 1.7835,
+      "step": 440
+    },
+    {
+      "epoch": 0.23501762632197415,
+      "grad_norm": 9.548452377319336,
+      "learning_rate": 9.771890612674206e-06,
+      "loss": 1.8427,
+      "step": 450
+    },
+    {
+      "epoch": 0.24024024024024024,
+      "grad_norm": 9.219648361206055,
+      "learning_rate": 9.765316855114384e-06,
+      "loss": 1.6848,
+      "step": 460
+    },
+    {
+      "epoch": 0.24546285415850633,
+      "grad_norm": 7.592192649841309,
+      "learning_rate": 9.758743097554563e-06,
+      "loss": 1.683,
+      "step": 470
+    },
+    {
+      "epoch": 0.25068546807677244,
+      "grad_norm": 7.887657165527344,
+      "learning_rate": 9.752169339994742e-06,
+      "loss": 1.7546,
+      "step": 480
+    },
+    {
+      "epoch": 0.2559080819950385,
+      "grad_norm": 8.5076904296875,
+      "learning_rate": 9.74559558243492e-06,
+      "loss": 1.6915,
+      "step": 490
+    },
+    {
+      "epoch": 0.2611306959133046,
+      "grad_norm": 8.41952896118164,
+      "learning_rate": 9.7390218248751e-06,
+      "loss": 1.6883,
+      "step": 500
+    },
+    {
+      "epoch": 0.2663533098315707,
+      "grad_norm": 8.776701927185059,
+      "learning_rate": 9.732448067315277e-06,
+      "loss": 1.8154,
+      "step": 510
+    },
+    {
+      "epoch": 0.2715759237498368,
+      "grad_norm": 7.871417045593262,
+      "learning_rate": 9.725874309755458e-06,
+      "loss": 1.7861,
+      "step": 520
+    },
+    {
+      "epoch": 0.27679853766810286,
+      "grad_norm": 8.374330520629883,
+      "learning_rate": 9.719300552195636e-06,
+      "loss": 1.7334,
+      "step": 530
+    },
+    {
+      "epoch": 0.282021151586369,
+      "grad_norm": 9.533878326416016,
+      "learning_rate": 9.712726794635815e-06,
+      "loss": 1.642,
+      "step": 540
+    },
+    {
+      "epoch": 0.2872437655046351,
+      "grad_norm": 8.845860481262207,
+      "learning_rate": 9.706153037075994e-06,
+      "loss": 1.5368,
+      "step": 550
+    },
+    {
+      "epoch": 0.29246637942290116,
+      "grad_norm": 8.25372314453125,
+      "learning_rate": 9.699579279516172e-06,
+      "loss": 1.8538,
+      "step": 560
+    },
+    {
+      "epoch": 0.2976889933411673,
+      "grad_norm": 8.217201232910156,
+      "learning_rate": 9.693005521956351e-06,
+      "loss": 1.5753,
+      "step": 570
+    },
+    {
+      "epoch": 0.30291160725943334,
+      "grad_norm": 9.172762870788574,
+      "learning_rate": 9.68643176439653e-06,
+      "loss": 1.485,
+      "step": 580
+    },
+    {
+      "epoch": 0.30813422117769945,
+      "grad_norm": 8.109066009521484,
+      "learning_rate": 9.679858006836708e-06,
+      "loss": 1.6386,
+      "step": 590
+    },
+    {
+      "epoch": 0.3133568350959655,
+      "grad_norm": 9.576763153076172,
+      "learning_rate": 9.673284249276888e-06,
+      "loss": 1.6593,
+      "step": 600
+    },
+    {
+      "epoch": 0.3133568350959655,
+      "eval_loss": 1.669974684715271,
+      "eval_runtime": 46.1982,
+      "eval_samples_per_second": 36.841,
+      "eval_steps_per_second": 4.611,
+      "step": 600
+    },
+    {
+      "epoch": 0.31857944901423163,
+      "grad_norm": 8.07900333404541,
+      "learning_rate": 9.666710491717067e-06,
+      "loss": 1.5609,
+      "step": 610
+    },
+    {
+      "epoch": 0.3238020629324977,
+      "grad_norm": 7.784631729125977,
+      "learning_rate": 9.660136734157245e-06,
+      "loss": 1.598,
+      "step": 620
+    },
+    {
+      "epoch": 0.3290246768507638,
+      "grad_norm": 7.267651557922363,
+      "learning_rate": 9.653562976597424e-06,
+      "loss": 1.693,
+      "step": 630
+    },
+    {
+      "epoch": 0.3342472907690299,
+      "grad_norm": 7.9032816886901855,
+      "learning_rate": 9.646989219037603e-06,
+      "loss": 1.5721,
+      "step": 640
+    },
+    {
+      "epoch": 0.339469904687296,
+      "grad_norm": 7.756552696228027,
+      "learning_rate": 9.640415461477781e-06,
+      "loss": 1.688,
+      "step": 650
+    },
+    {
+      "epoch": 0.3446925186055621,
+      "grad_norm": 7.442072868347168,
+      "learning_rate": 9.63384170391796e-06,
+      "loss": 1.6674,
+      "step": 660
+    },
+    {
+      "epoch": 0.34991513252382817,
+      "grad_norm": 7.6043548583984375,
+      "learning_rate": 9.627267946358138e-06,
+      "loss": 1.5278,
+      "step": 670
+    },
+    {
+      "epoch": 0.3551377464420943,
+      "grad_norm": 8.23291015625,
+      "learning_rate": 9.620694188798317e-06,
+      "loss": 1.588,
+      "step": 680
+    },
+    {
+      "epoch": 0.36036036036036034,
+      "grad_norm": 7.537237644195557,
+      "learning_rate": 9.614120431238497e-06,
+      "loss": 1.4416,
+      "step": 690
+    },
+    {
+      "epoch": 0.36558297427862646,
+      "grad_norm": 8.757375717163086,
+      "learning_rate": 9.607546673678676e-06,
+      "loss": 1.8258,
+      "step": 700
+    },
+    {
+      "epoch": 0.3708055881968925,
+      "grad_norm": 8.787406921386719,
+      "learning_rate": 9.600972916118855e-06,
+      "loss": 1.5436,
+      "step": 710
+    },
+    {
+      "epoch": 0.37602820211515864,
+      "grad_norm": 8.105976104736328,
+      "learning_rate": 9.594399158559033e-06,
+      "loss": 1.6407,
+      "step": 720
+    },
+    {
+      "epoch": 0.38125081603342476,
+      "grad_norm": 9.059203147888184,
+      "learning_rate": 9.587825400999212e-06,
+      "loss": 1.5917,
+      "step": 730
+    },
+    {
+      "epoch": 0.3864734299516908,
+      "grad_norm": 7.951854228973389,
+      "learning_rate": 9.581251643439392e-06,
+      "loss": 1.6077,
+      "step": 740
+    },
+    {
+      "epoch": 0.39169604386995693,
+      "grad_norm": 7.4102396965026855,
+      "learning_rate": 9.57467788587957e-06,
+      "loss": 1.7028,
+      "step": 750
+    },
+    {
+      "epoch": 0.396918657788223,
+      "grad_norm": 8.980511665344238,
+      "learning_rate": 9.568104128319749e-06,
+      "loss": 1.5217,
+      "step": 760
+    },
+    {
+      "epoch": 0.4021412717064891,
+      "grad_norm": 7.856898784637451,
+      "learning_rate": 9.561530370759928e-06,
+      "loss": 1.6657,
+      "step": 770
+    },
+    {
+      "epoch": 0.4073638856247552,
+      "grad_norm": 7.5405802726745605,
+      "learning_rate": 9.554956613200106e-06,
+      "loss": 1.5447,
+      "step": 780
+    },
+    {
+      "epoch": 0.4125864995430213,
+      "grad_norm": 7.326587200164795,
+      "learning_rate": 9.548382855640285e-06,
+      "loss": 1.4387,
+      "step": 790
+    },
+    {
+      "epoch": 0.41780911346128735,
+      "grad_norm": 7.001702785491943,
+      "learning_rate": 9.541809098080463e-06,
+      "loss": 1.5789,
+      "step": 800
+    },
+    {
+      "epoch": 0.41780911346128735,
+      "eval_loss": 1.5911635160446167,
+      "eval_runtime": 46.2223,
+      "eval_samples_per_second": 36.822,
+      "eval_steps_per_second": 4.608,
+      "step": 800
+    },
+    {
+      "epoch": 0.42303172737955347,
+      "grad_norm": 10.446452140808105,
+      "learning_rate": 9.535235340520642e-06,
+      "loss": 1.5838,
+      "step": 810
+    },
+    {
+      "epoch": 0.4282543412978196,
+      "grad_norm": 8.23008918762207,
+      "learning_rate": 9.528661582960821e-06,
+      "loss": 1.6463,
+      "step": 820
+    },
+    {
+      "epoch": 0.43347695521608565,
+      "grad_norm": 10.593551635742188,
+      "learning_rate": 9.522087825400999e-06,
+      "loss": 1.6947,
+      "step": 830
+    },
+    {
+      "epoch": 0.43869956913435176,
+      "grad_norm": 11.24173641204834,
+      "learning_rate": 9.515514067841178e-06,
+      "loss": 1.4659,
+      "step": 840
+    },
+    {
+      "epoch": 0.4439221830526178,
+      "grad_norm": 8.380345344543457,
+      "learning_rate": 9.508940310281358e-06,
+      "loss": 1.6864,
+      "step": 850
+    },
+    {
+      "epoch": 0.44914479697088394,
+      "grad_norm": 9.692063331604004,
+      "learning_rate": 9.502366552721535e-06,
+      "loss": 1.5631,
+      "step": 860
+    },
+    {
+      "epoch": 0.45436741088915,
+      "grad_norm": 7.017058372497559,
+      "learning_rate": 9.495792795161716e-06,
+      "loss": 1.5841,
+      "step": 870
+    },
+    {
+      "epoch": 0.4595900248074161,
+      "grad_norm": 7.2586822509765625,
+      "learning_rate": 9.489219037601894e-06,
+      "loss": 1.3882,
+      "step": 880
+    },
+    {
+      "epoch": 0.4648126387256822,
+      "grad_norm": 7.41990852355957,
+      "learning_rate": 9.482645280042073e-06,
+      "loss": 1.6894,
+      "step": 890
+    },
+    {
+      "epoch": 0.4700352526439483,
+      "grad_norm": 8.770058631896973,
+      "learning_rate": 9.476071522482253e-06,
+      "loss": 1.586,
+      "step": 900
+    },
+    {
+      "epoch": 0.4752578665622144,
+      "grad_norm": 7.8078742027282715,
+      "learning_rate": 9.46949776492243e-06,
+      "loss": 1.4878,
+      "step": 910
+    },
+    {
+      "epoch": 0.4804804804804805,
+      "grad_norm": 6.9704203605651855,
+      "learning_rate": 9.46292400736261e-06,
+      "loss": 1.4678,
+      "step": 920
+    },
+    {
+      "epoch": 0.4857030943987466,
+      "grad_norm": 8.024680137634277,
+      "learning_rate": 9.456350249802787e-06,
+      "loss": 1.5597,
+      "step": 930
+    },
+    {
+      "epoch": 0.49092570831701265,
+      "grad_norm": 8.284809112548828,
+      "learning_rate": 9.449776492242967e-06,
+      "loss": 1.571,
+      "step": 940
+    },
+    {
+      "epoch": 0.49614832223527877,
+      "grad_norm": 8.523279190063477,
+      "learning_rate": 9.443202734683146e-06,
+      "loss": 1.5098,
+      "step": 950
+    },
+    {
+      "epoch": 0.5013709361535449,
+      "grad_norm": 7.113609790802002,
+      "learning_rate": 9.436628977123324e-06,
+      "loss": 1.6931,
+      "step": 960
+    },
+    {
+      "epoch": 0.506593550071811,
+      "grad_norm": 8.182168006896973,
+      "learning_rate": 9.430055219563503e-06,
+      "loss": 1.3979,
+      "step": 970
+    },
+    {
+      "epoch": 0.511816163990077,
+      "grad_norm": 6.154737949371338,
+      "learning_rate": 9.423481462003682e-06,
+      "loss": 1.6433,
+      "step": 980
+    },
+    {
+      "epoch": 0.5170387779083431,
+      "grad_norm": 7.272144794464111,
+      "learning_rate": 9.41690770444386e-06,
+      "loss": 1.6019,
+      "step": 990
+    },
+    {
+      "epoch": 0.5222613918266092,
+      "grad_norm": 9.126653671264648,
+      "learning_rate": 9.41033394688404e-06,
+      "loss": 1.4966,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5222613918266092,
+      "eval_loss": 1.519429087638855,
+      "eval_runtime": 46.2139,
+      "eval_samples_per_second": 36.829,
+      "eval_steps_per_second": 4.609,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5274840057448753,
+      "grad_norm": 6.704170227050781,
+      "learning_rate": 9.403760189324219e-06,
+      "loss": 1.3031,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5327066196631414,
+      "grad_norm": 7.243802070617676,
+      "learning_rate": 9.397186431764396e-06,
+      "loss": 1.6704,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5379292335814075,
+      "grad_norm": 8.116981506347656,
+      "learning_rate": 9.390612674204576e-06,
+      "loss": 1.6063,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5431518474996736,
+      "grad_norm": 8.39439868927002,
+      "learning_rate": 9.384038916644755e-06,
+      "loss": 1.6131,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5483744614179397,
+      "grad_norm": 7.224155426025391,
+      "learning_rate": 9.377465159084934e-06,
+      "loss": 1.4247,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5535970753362057,
+      "grad_norm": 7.421390056610107,
+      "learning_rate": 9.370891401525112e-06,
+      "loss": 1.4148,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5588196892544719,
+      "grad_norm": 7.829183578491211,
+      "learning_rate": 9.364317643965291e-06,
+      "loss": 1.3336,
+      "step": 1070
+    },
+    {
+      "epoch": 0.564042303172738,
+      "grad_norm": 7.551697254180908,
+      "learning_rate": 9.35774388640547e-06,
+      "loss": 1.3667,
+      "step": 1080
+    },
+    {
+      "epoch": 0.569264917091004,
+      "grad_norm": 10.012944221496582,
+      "learning_rate": 9.351170128845648e-06,
+      "loss": 1.4699,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5744875310092702,
+      "grad_norm": 7.096408367156982,
+      "learning_rate": 9.344596371285828e-06,
+      "loss": 1.3641,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5797101449275363,
+      "grad_norm": 6.685956954956055,
+      "learning_rate": 9.338022613726007e-06,
+      "loss": 1.4036,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5849327588458023,
+      "grad_norm": 6.154745101928711,
+      "learning_rate": 9.331448856166185e-06,
+      "loss": 1.3924,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5901553727640684,
+      "grad_norm": 8.996173858642578,
+      "learning_rate": 9.324875098606364e-06,
+      "loss": 1.3899,
+      "step": 1130
+    },
+    {
+      "epoch": 0.5953779866823345,
+      "grad_norm": 7.491435527801514,
+      "learning_rate": 9.318301341046544e-06,
+      "loss": 1.406,
+      "step": 1140
+    },
+    {
+      "epoch": 0.6006006006006006,
+      "grad_norm": 8.112247467041016,
+      "learning_rate": 9.311727583486721e-06,
+      "loss": 1.3555,
+      "step": 1150
+    },
+    {
+      "epoch": 0.6058232145188667,
+      "grad_norm": 6.320436000823975,
+      "learning_rate": 9.3051538259269e-06,
+      "loss": 1.4438,
+      "step": 1160
+    },
+    {
+      "epoch": 0.6110458284371327,
+      "grad_norm": 8.957696914672852,
+      "learning_rate": 9.29858006836708e-06,
+      "loss": 1.5721,
+      "step": 1170
+    },
+    {
+      "epoch": 0.6162684423553989,
+      "grad_norm": 7.852613925933838,
+      "learning_rate": 9.292006310807258e-06,
+      "loss": 1.3379,
+      "step": 1180
+    },
+    {
+      "epoch": 0.621491056273665,
+      "grad_norm": 8.566577911376953,
+      "learning_rate": 9.285432553247437e-06,
+      "loss": 1.2671,
+      "step": 1190
+    },
+    {
+      "epoch": 0.626713670191931,
+      "grad_norm": 8.17737102508545,
+      "learning_rate": 9.278858795687615e-06,
+      "loss": 1.5569,
+      "step": 1200
+    },
+    {
+      "epoch": 0.626713670191931,
+      "eval_loss": 1.4577302932739258,
+      "eval_runtime": 46.2429,
+      "eval_samples_per_second": 36.806,
+      "eval_steps_per_second": 4.606,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6319362841101972,
+      "grad_norm": 9.098960876464844,
+      "learning_rate": 9.272285038127796e-06,
+      "loss": 1.3189,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6371588980284633,
+      "grad_norm": 7.19639778137207,
+      "learning_rate": 9.265711280567973e-06,
+      "loss": 1.4387,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6423815119467293,
+      "grad_norm": 12.178171157836914,
+      "learning_rate": 9.259137523008153e-06,
+      "loss": 1.3847,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6476041258649954,
+      "grad_norm": 7.771910667419434,
+      "learning_rate": 9.252563765448332e-06,
+      "loss": 1.4767,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6528267397832616,
+      "grad_norm": 7.271229267120361,
+      "learning_rate": 9.24599000788851e-06,
+      "loss": 1.6648,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6580493537015276,
+      "grad_norm": 6.665337562561035,
+      "learning_rate": 9.239416250328689e-06,
+      "loss": 1.5588,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6632719676197937,
+      "grad_norm": 9.990988731384277,
+      "learning_rate": 9.232842492768868e-06,
+      "loss": 1.5604,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6684945815380599,
+      "grad_norm": 11.0624418258667,
+      "learning_rate": 9.226268735209046e-06,
+      "loss": 1.3764,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6737171954563259,
+      "grad_norm": 7.9689788818359375,
+      "learning_rate": 9.219694977649225e-06,
+      "loss": 1.4476,
+      "step": 1290
+    },
+    {
+      "epoch": 0.678939809374592,
+      "grad_norm": 9.779829978942871,
+      "learning_rate": 9.213121220089405e-06,
+      "loss": 1.6465,
+      "step": 1300
+    },
+    {
+      "epoch": 0.684162423292858,
+      "grad_norm": 13.22938060760498,
+      "learning_rate": 9.206547462529582e-06,
+      "loss": 1.3137,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6893850372111242,
+      "grad_norm": 7.279387950897217,
+      "learning_rate": 9.199973704969762e-06,
+      "loss": 1.3996,
+      "step": 1320
+    },
+    {
+      "epoch": 0.6946076511293903,
+      "grad_norm": 8.289338111877441,
+      "learning_rate": 9.19339994740994e-06,
+      "loss": 1.6241,
+      "step": 1330
+    },
+    {
+      "epoch": 0.6998302650476563,
+      "grad_norm": 6.582716464996338,
+      "learning_rate": 9.186826189850119e-06,
+      "loss": 1.4579,
+      "step": 1340
+    },
+    {
+      "epoch": 0.7050528789659224,
+      "grad_norm": 6.584691524505615,
+      "learning_rate": 9.180252432290298e-06,
+      "loss": 1.4906,
+      "step": 1350
+    },
+    {
+      "epoch": 0.7102754928841886,
+      "grad_norm": 7.73353910446167,
+      "learning_rate": 9.173678674730476e-06,
+      "loss": 1.4636,
+      "step": 1360
+    },
+    {
+      "epoch": 0.7154981068024546,
+      "grad_norm": 8.247365951538086,
+      "learning_rate": 9.167104917170655e-06,
+      "loss": 1.322,
+      "step": 1370
+    },
+    {
+      "epoch": 0.7207207207207207,
+      "grad_norm": 10.286720275878906,
+      "learning_rate": 9.160531159610834e-06,
+      "loss": 1.3642,
+      "step": 1380
+    },
+    {
+      "epoch": 0.7259433346389869,
+      "grad_norm": 8.615984916687012,
+      "learning_rate": 9.153957402051014e-06,
+      "loss": 1.2931,
+      "step": 1390
+    },
+    {
+      "epoch": 0.7311659485572529,
+      "grad_norm": 12.748590469360352,
+      "learning_rate": 9.147383644491193e-06,
+      "loss": 1.2309,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7311659485572529,
+      "eval_loss": 1.4139466285705566,
+      "eval_runtime": 46.1975,
+      "eval_samples_per_second": 36.842,
+      "eval_steps_per_second": 4.611,
+      "step": 1400
+    },
+    {
+      "epoch": 0.736388562475519,
+      "grad_norm": 7.815068244934082,
+      "learning_rate": 9.14080988693137e-06,
+      "loss": 1.4541,
+      "step": 1410
+    },
+    {
+      "epoch": 0.741611176393785,
+      "grad_norm": 7.568152904510498,
+      "learning_rate": 9.13423612937155e-06,
+      "loss": 1.4903,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7468337903120512,
+      "grad_norm": 7.310227394104004,
+      "learning_rate": 9.12766237181173e-06,
+      "loss": 1.4544,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7520564042303173,
+      "grad_norm": 8.279141426086426,
+      "learning_rate": 9.121088614251907e-06,
+      "loss": 1.5115,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7572790181485833,
+      "grad_norm": 6.896315574645996,
+      "learning_rate": 9.114514856692086e-06,
+      "loss": 1.3089,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7625016320668495,
+      "grad_norm": 8.386058807373047,
+      "learning_rate": 9.107941099132264e-06,
+      "loss": 1.2252,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7677242459851156,
+      "grad_norm": 6.841330528259277,
+      "learning_rate": 9.101367341572443e-06,
+      "loss": 1.519,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7729468599033816,
+      "grad_norm": 12.19370174407959,
+      "learning_rate": 9.094793584012623e-06,
+      "loss": 1.2249,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7781694738216477,
+      "grad_norm": 7.211881637573242,
+      "learning_rate": 9.0882198264528e-06,
+      "loss": 1.3279,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7833920877399139,
+      "grad_norm": 7.7247724533081055,
+      "learning_rate": 9.08164606889298e-06,
+      "loss": 1.3658,
+      "step": 1500
+    },
+    {
+      "epoch": 0.7886147016581799,
+      "grad_norm": 6.654493808746338,
+      "learning_rate": 9.075072311333159e-06,
+      "loss": 1.2192,
+      "step": 1510
+    },
+    {
+      "epoch": 0.793837315576446,
+      "grad_norm": 9.495121955871582,
+      "learning_rate": 9.068498553773337e-06,
+      "loss": 1.4915,
+      "step": 1520
+    },
+    {
+      "epoch": 0.799059929494712,
+      "grad_norm": 8.51004409790039,
+      "learning_rate": 9.061924796213516e-06,
+      "loss": 1.3852,
+      "step": 1530
+    },
+    {
+      "epoch": 0.8042825434129782,
+      "grad_norm": 8.426909446716309,
+      "learning_rate": 9.055351038653695e-06,
+      "loss": 1.5215,
+      "step": 1540
+    },
+    {
+      "epoch": 0.8095051573312443,
+      "grad_norm": 5.057459831237793,
+      "learning_rate": 9.048777281093875e-06,
+      "loss": 1.3362,
+      "step": 1550
+    },
+    {
+      "epoch": 0.8147277712495103,
+      "grad_norm": 7.3518571853637695,
+      "learning_rate": 9.042203523534054e-06,
+      "loss": 1.5133,
+      "step": 1560
+    },
+    {
+      "epoch": 0.8199503851677765,
+      "grad_norm": 7.912439823150635,
+      "learning_rate": 9.035629765974232e-06,
+      "loss": 1.3317,
+      "step": 1570
+    },
+    {
+      "epoch": 0.8251729990860426,
+      "grad_norm": 9.68945598602295,
+      "learning_rate": 9.029056008414411e-06,
+      "loss": 1.3286,
+      "step": 1580
+    },
+    {
+      "epoch": 0.8303956130043086,
+      "grad_norm": 6.489112377166748,
+      "learning_rate": 9.022482250854589e-06,
+      "loss": 1.2283,
+      "step": 1590
+    },
+    {
+      "epoch": 0.8356182269225747,
+      "grad_norm": 7.945755481719971,
+      "learning_rate": 9.015908493294768e-06,
+      "loss": 1.2544,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8356182269225747,
+      "eval_loss": 1.3756215572357178,
+      "eval_runtime": 46.2433,
+      "eval_samples_per_second": 36.805,
+      "eval_steps_per_second": 4.606,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8408408408408409,
+      "grad_norm": 7.036093235015869,
+      "learning_rate": 9.009334735734947e-06,
+      "loss": 1.4884,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8460634547591069,
+      "grad_norm": 5.541379928588867,
+      "learning_rate": 9.002760978175125e-06,
+      "loss": 1.4642,
+      "step": 1620
+    },
+    {
+      "epoch": 0.851286068677373,
+      "grad_norm": 7.85528564453125,
+      "learning_rate": 8.996187220615304e-06,
+      "loss": 1.4053,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8565086825956392,
+      "grad_norm": 7.868051052093506,
+      "learning_rate": 8.989613463055484e-06,
+      "loss": 1.1438,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8617312965139052,
+      "grad_norm": 6.309744834899902,
+      "learning_rate": 8.983039705495661e-06,
+      "loss": 1.2862,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8669539104321713,
+      "grad_norm": 8.591092109680176,
+      "learning_rate": 8.97646594793584e-06,
+      "loss": 1.3227,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8721765243504374,
+      "grad_norm": 7.7726149559021,
+      "learning_rate": 8.96989219037602e-06,
+      "loss": 1.1849,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8773991382687035,
+      "grad_norm": 7.683166980743408,
+      "learning_rate": 8.963318432816198e-06,
+      "loss": 1.598,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8826217521869696,
+      "grad_norm": 8.678107261657715,
+      "learning_rate": 8.956744675256377e-06,
+      "loss": 1.3754,
+      "step": 1690
+    },
+    {
+      "epoch": 0.8878443661052356,
+      "grad_norm": 6.267312049865723,
+      "learning_rate": 8.950170917696556e-06,
+      "loss": 1.2975,
+      "step": 1700
+    },
+    {
+      "epoch": 0.8930669800235017,
+      "grad_norm": 7.084597587585449,
+      "learning_rate": 8.943597160136734e-06,
+      "loss": 1.1706,
+      "step": 1710
+    },
+    {
+      "epoch": 0.8982895939417679,
+      "grad_norm": 7.456942558288574,
+      "learning_rate": 8.937023402576913e-06,
+      "loss": 1.3991,
+      "step": 1720
+    },
+    {
+      "epoch": 0.903512207860034,
+      "grad_norm": 7.471573829650879,
+      "learning_rate": 8.930449645017093e-06,
+      "loss": 1.2524,
+      "step": 1730
+    },
+    {
+      "epoch": 0.9087348217783,
+      "grad_norm": 10.539592742919922,
+      "learning_rate": 8.923875887457272e-06,
+      "loss": 1.4209,
+      "step": 1740
+    },
+    {
+      "epoch": 0.9139574356965662,
+      "grad_norm": 7.277865886688232,
+      "learning_rate": 8.91730212989745e-06,
+      "loss": 1.3516,
+      "step": 1750
+    },
+    {
+      "epoch": 0.9191800496148322,
+      "grad_norm": 9.74389362335205,
+      "learning_rate": 8.910728372337629e-06,
+      "loss": 1.3255,
+      "step": 1760
+    },
+    {
+      "epoch": 0.9244026635330983,
+      "grad_norm": 10.399345397949219,
+      "learning_rate": 8.904154614777808e-06,
+      "loss": 1.3837,
+      "step": 1770
+    },
+    {
+      "epoch": 0.9296252774513644,
+      "grad_norm": 7.868115425109863,
+      "learning_rate": 8.897580857217986e-06,
+      "loss": 1.3837,
+      "step": 1780
+    },
+    {
+      "epoch": 0.9348478913696305,
+      "grad_norm": 6.750741004943848,
+      "learning_rate": 8.891007099658165e-06,
+      "loss": 1.2667,
+      "step": 1790
+    },
+    {
+      "epoch": 0.9400705052878966,
+      "grad_norm": 6.125620365142822,
+      "learning_rate": 8.884433342098345e-06,
+      "loss": 1.338,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9400705052878966,
+      "eval_loss": 1.3472024202346802,
+      "eval_runtime": 46.2182,
+      "eval_samples_per_second": 36.825,
+      "eval_steps_per_second": 4.609,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9452931192061627,
+      "grad_norm": 10.77094841003418,
+      "learning_rate": 8.877859584538522e-06,
+      "loss": 1.3902,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9505157331244288,
+      "grad_norm": 5.590978145599365,
+      "learning_rate": 8.871285826978702e-06,
+      "loss": 1.2914,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9557383470426949,
+      "grad_norm": 6.462856292724609,
+      "learning_rate": 8.864712069418881e-06,
+      "loss": 1.5046,
+      "step": 1830
+    },
+    {
+      "epoch": 0.960960960960961,
+      "grad_norm": 7.771232604980469,
+      "learning_rate": 8.858138311859059e-06,
+      "loss": 1.4848,
+      "step": 1840
+    },
+    {
+      "epoch": 0.966183574879227,
+      "grad_norm": 7.693990230560303,
+      "learning_rate": 8.851564554299238e-06,
+      "loss": 1.396,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9714061887974932,
+      "grad_norm": 6.7986159324646,
+      "learning_rate": 8.844990796739416e-06,
+      "loss": 1.2407,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9766288027157592,
+      "grad_norm": 8.107544898986816,
+      "learning_rate": 8.838417039179595e-06,
+      "loss": 1.1383,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9818514166340253,
+      "grad_norm": 8.577611923217773,
+      "learning_rate": 8.831843281619774e-06,
+      "loss": 1.4034,
+      "step": 1880
+    },
+    {
+      "epoch": 0.9870740305522914,
+      "grad_norm": 8.117776870727539,
+      "learning_rate": 8.825269524059952e-06,
+      "loss": 1.5123,
+      "step": 1890
+    },
+    {
+      "epoch": 0.9922966444705575,
+      "grad_norm": 6.738471984863281,
+      "learning_rate": 8.818695766500133e-06,
+      "loss": 1.3251,
+      "step": 1900
+    },
+    {
+      "epoch": 0.9975192583888236,
+      "grad_norm": 7.631872177124023,
+      "learning_rate": 8.81212200894031e-06,
+      "loss": 1.3811,
+      "step": 1910
+    },
+    {
+      "epoch": 1.0031335683509597,
+      "grad_norm": 7.611673831939697,
+      "learning_rate": 8.80554825138049e-06,
+      "loss": 1.1202,
+      "step": 1920
+    },
+    {
+      "epoch": 1.0083561822692257,
+      "grad_norm": 10.344902992248535,
+      "learning_rate": 8.79897449382067e-06,
+      "loss": 1.1347,
+      "step": 1930
+    },
+    {
+      "epoch": 1.0135787961874918,
+      "grad_norm": 7.126277923583984,
+      "learning_rate": 8.792400736260847e-06,
+      "loss": 1.1396,
+      "step": 1940
+    },
+    {
+      "epoch": 1.0188014101057579,
+      "grad_norm": 5.496697425842285,
+      "learning_rate": 8.785826978701027e-06,
+      "loss": 1.1543,
+      "step": 1950
+    },
+    {
+      "epoch": 1.024024024024024,
+      "grad_norm": 7.818673610687256,
+      "learning_rate": 8.779253221141206e-06,
+      "loss": 1.0574,
+      "step": 1960
+    },
+    {
+      "epoch": 1.0292466379422902,
+      "grad_norm": 7.81503438949585,
+      "learning_rate": 8.772679463581384e-06,
+      "loss": 1.1081,
+      "step": 1970
+    },
+    {
+      "epoch": 1.0344692518605563,
+      "grad_norm": 7.696155548095703,
+      "learning_rate": 8.766105706021563e-06,
+      "loss": 1.0364,
+      "step": 1980
+    },
+    {
+      "epoch": 1.0396918657788223,
+      "grad_norm": 6.808041095733643,
+      "learning_rate": 8.75953194846174e-06,
+      "loss": 1.2301,
+      "step": 1990
+    },
+    {
+      "epoch": 1.0449144796970884,
+      "grad_norm": 9.0170316696167,
+      "learning_rate": 8.75295819090192e-06,
+      "loss": 0.9611,
+      "step": 2000
+    },
+    {
+      "epoch": 1.0449144796970884,
+      "eval_loss": 1.3424558639526367,
+      "eval_runtime": 46.1975,
+      "eval_samples_per_second": 36.842,
+      "eval_steps_per_second": 4.611,
+      "step": 2000
+    },
+    {
+      "epoch": 1.0501370936153545,
+      "grad_norm": 7.797222137451172,
+      "learning_rate": 8.7463844333421e-06,
+      "loss": 1.0953,
+      "step": 2010
+    },
+    {
+      "epoch": 1.0553597075336205,
+      "grad_norm": 8.089154243469238,
+      "learning_rate": 8.739810675782277e-06,
+      "loss": 1.1462,
+      "step": 2020
+    },
+    {
+      "epoch": 1.0605823214518866,
+      "grad_norm": 7.9765625,
+      "learning_rate": 8.733894293978439e-06,
+      "loss": 0.9677,
+      "step": 2030
+    },
+    {
+      "epoch": 1.0658049353701529,
+      "grad_norm": 7.094350814819336,
+      "learning_rate": 8.727320536418618e-06,
+      "loss": 1.0856,
+      "step": 2040
+    },
+    {
+      "epoch": 1.071027549288419,
+      "grad_norm": 6.9282307624816895,
+      "learning_rate": 8.720746778858796e-06,
+      "loss": 0.9677,
+      "step": 2050
+    },
+    {
+      "epoch": 1.076250163206685,
+      "grad_norm": 7.576324462890625,
+      "learning_rate": 8.714173021298975e-06,
+      "loss": 0.8877,
+      "step": 2060
+    },
+    {
+      "epoch": 1.081472777124951,
+      "grad_norm": 6.454078674316406,
+      "learning_rate": 8.707599263739153e-06,
+      "loss": 1.0766,
+      "step": 2070
+    },
+    {
+      "epoch": 1.086695391043217,
+      "grad_norm": 6.857527732849121,
+      "learning_rate": 8.701025506179332e-06,
+      "loss": 1.1231,
+      "step": 2080
+    },
+    {
+      "epoch": 1.0919180049614832,
+      "grad_norm": 9.354379653930664,
+      "learning_rate": 8.694451748619512e-06,
+      "loss": 1.0554,
+      "step": 2090
+    },
+    {
+      "epoch": 1.0971406188797492,
+      "grad_norm": 7.970911502838135,
+      "learning_rate": 8.68787799105969e-06,
+      "loss": 1.0672,
+      "step": 2100
+    },
+    {
+      "epoch": 1.1023632327980155,
+      "grad_norm": 7.781393527984619,
+      "learning_rate": 8.681304233499869e-06,
+      "loss": 1.1021,
+      "step": 2110
+    },
+    {
+      "epoch": 1.1075858467162816,
+      "grad_norm": 11.121278762817383,
+      "learning_rate": 8.674730475940048e-06,
+      "loss": 1.0507,
+      "step": 2120
+    },
+    {
+      "epoch": 1.1128084606345476,
+      "grad_norm": 6.972072124481201,
+      "learning_rate": 8.668156718380227e-06,
+      "loss": 0.9596,
+      "step": 2130
+    },
+    {
+      "epoch": 1.1180310745528137,
+      "grad_norm": 8.896551132202148,
+      "learning_rate": 8.661582960820407e-06,
+      "loss": 1.0482,
+      "step": 2140
+    },
+    {
+      "epoch": 1.1232536884710798,
+      "grad_norm": 7.05515718460083,
+      "learning_rate": 8.655009203260584e-06,
+      "loss": 1.0665,
+      "step": 2150
+    },
+    {
+      "epoch": 1.1284763023893458,
+      "grad_norm": 7.063408851623535,
+      "learning_rate": 8.648435445700764e-06,
+      "loss": 0.9991,
+      "step": 2160
+    },
+    {
+      "epoch": 1.1336989163076119,
+      "grad_norm": 7.457261085510254,
+      "learning_rate": 8.641861688140943e-06,
+      "loss": 1.1902,
+      "step": 2170
+    },
+    {
+      "epoch": 1.1389215302258782,
+      "grad_norm": 8.556297302246094,
+      "learning_rate": 8.63528793058112e-06,
+      "loss": 1.2003,
+      "step": 2180
+    },
+    {
+      "epoch": 1.1441441441441442,
+      "grad_norm": 7.71785306930542,
+      "learning_rate": 8.6287141730213e-06,
+      "loss": 1.1433,
+      "step": 2190
+    },
+    {
+      "epoch": 1.1493667580624103,
+      "grad_norm": 5.20206356048584,
+      "learning_rate": 8.622140415461478e-06,
+      "loss": 0.9814,
+      "step": 2200
+    },
+    {
+      "epoch": 1.1493667580624103,
+      "eval_loss": 1.3300807476043701,
+      "eval_runtime": 46.2098,
+      "eval_samples_per_second": 36.832,
+      "eval_steps_per_second": 4.609,
+      "step": 2200
+    },
+    {
+      "epoch": 1.1545893719806763,
+      "grad_norm": 10.628904342651367,
+      "learning_rate": 8.615566657901657e-06,
+      "loss": 1.1436,
+      "step": 2210
+    },
+    {
+      "epoch": 1.1598119858989424,
+      "grad_norm": 9.307092666625977,
+      "learning_rate": 8.608992900341836e-06,
+      "loss": 1.0982,
+      "step": 2220
+    },
+    {
+      "epoch": 1.1650345998172085,
+      "grad_norm": 5.672645092010498,
+      "learning_rate": 8.602419142782014e-06,
+      "loss": 0.9906,
+      "step": 2230
+    },
+    {
+      "epoch": 1.1702572137354745,
+      "grad_norm": 7.028037071228027,
+      "learning_rate": 8.595845385222193e-06,
+      "loss": 1.1416,
+      "step": 2240
+    },
+    {
+      "epoch": 1.1754798276537408,
+      "grad_norm": 7.749709606170654,
+      "learning_rate": 8.589271627662373e-06,
+      "loss": 1.1821,
+      "step": 2250
+    },
+    {
+      "epoch": 1.1807024415720069,
+      "grad_norm": 8.712191581726074,
+      "learning_rate": 8.58269787010255e-06,
+      "loss": 1.0771,
+      "step": 2260
+    },
+    {
+      "epoch": 1.185925055490273,
+      "grad_norm": 6.774427890777588,
+      "learning_rate": 8.57612411254273e-06,
+      "loss": 1.1228,
+      "step": 2270
+    },
+    {
+      "epoch": 1.191147669408539,
+      "grad_norm": 9.674090385437012,
+      "learning_rate": 8.569550354982909e-06,
+      "loss": 1.0714,
+      "step": 2280
+    },
+    {
+      "epoch": 1.196370283326805,
+      "grad_norm": 11.087074279785156,
+      "learning_rate": 8.562976597423088e-06,
+      "loss": 1.0104,
+      "step": 2290
+    },
+    {
+      "epoch": 1.2015928972450711,
+      "grad_norm": 7.747646331787109,
+      "learning_rate": 8.556402839863268e-06,
+      "loss": 1.1871,
+      "step": 2300
+    },
+    {
+      "epoch": 1.2068155111633372,
+      "grad_norm": 6.897591590881348,
+      "learning_rate": 8.549829082303445e-06,
+      "loss": 0.9774,
+      "step": 2310
+    },
+    {
+      "epoch": 1.2120381250816032,
+      "grad_norm": 8.087874412536621,
+      "learning_rate": 8.543255324743625e-06,
+      "loss": 1.0487,
+      "step": 2320
+    },
+    {
+      "epoch": 1.2172607389998695,
+      "grad_norm": 8.251788139343262,
+      "learning_rate": 8.536681567183802e-06,
+      "loss": 1.1489,
+      "step": 2330
+    },
+    {
+      "epoch": 1.2224833529181356,
+      "grad_norm": 8.342131614685059,
+      "learning_rate": 8.530107809623982e-06,
+      "loss": 1.0752,
+      "step": 2340
+    },
+    {
+      "epoch": 1.2277059668364017,
+      "grad_norm": 10.579221725463867,
+      "learning_rate": 8.523534052064161e-06,
+      "loss": 1.0812,
+      "step": 2350
+    },
+    {
+      "epoch": 1.2329285807546677,
+      "grad_norm": 5.970239639282227,
+      "learning_rate": 8.516960294504339e-06,
+      "loss": 0.9434,
+      "step": 2360
+    },
+    {
+      "epoch": 1.2381511946729338,
+      "grad_norm": 7.801792621612549,
+      "learning_rate": 8.510386536944518e-06,
+      "loss": 1.1259,
+      "step": 2370
+    },
+    {
+      "epoch": 1.2433738085911998,
+      "grad_norm": 6.949589252471924,
+      "learning_rate": 8.503812779384697e-06,
+      "loss": 1.0598,
+      "step": 2380
+    },
+    {
+      "epoch": 1.248596422509466,
+      "grad_norm": 7.705820083618164,
+      "learning_rate": 8.497239021824875e-06,
+      "loss": 1.0536,
+      "step": 2390
+    },
+    {
+      "epoch": 1.253819036427732,
+      "grad_norm": 7.674275875091553,
+      "learning_rate": 8.490665264265054e-06,
+      "loss": 0.9851,
+      "step": 2400
+    },
+    {
+      "epoch": 1.253819036427732,
+      "eval_loss": 1.307387351989746,
+      "eval_runtime": 46.2194,
+      "eval_samples_per_second": 36.824,
+      "eval_steps_per_second": 4.608,
+      "step": 2400
+    },
+    {
+      "epoch": 1.2590416503459982,
+      "grad_norm": 7.880123615264893,
+      "learning_rate": 8.484091506705234e-06,
+      "loss": 1.1418,
+      "step": 2410
+    },
+    {
+      "epoch": 1.2642642642642643,
+      "grad_norm": 8.38881778717041,
+      "learning_rate": 8.477517749145411e-06,
+      "loss": 1.1259,
+      "step": 2420
+    },
+    {
+      "epoch": 1.2694868781825304,
+      "grad_norm": 8.130038261413574,
+      "learning_rate": 8.47094399158559e-06,
+      "loss": 1.0038,
+      "step": 2430
+    },
+    {
+      "epoch": 1.2747094921007964,
+      "grad_norm": 8.897841453552246,
+      "learning_rate": 8.46437023402577e-06,
+      "loss": 1.0479,
+      "step": 2440
+    },
+    {
+      "epoch": 1.2799321060190625,
+      "grad_norm": 5.228075981140137,
+      "learning_rate": 8.457796476465948e-06,
+      "loss": 0.8703,
+      "step": 2450
+    },
+    {
+      "epoch": 1.2851547199373288,
+      "grad_norm": 7.763029098510742,
+      "learning_rate": 8.451222718906127e-06,
+      "loss": 0.9687,
+      "step": 2460
+    },
+    {
+      "epoch": 1.2903773338555946,
+      "grad_norm": 8.072653770446777,
+      "learning_rate": 8.444648961346307e-06,
+      "loss": 1.0566,
+      "step": 2470
+    },
+    {
+      "epoch": 1.295599947773861,
+      "grad_norm": 5.563382148742676,
+      "learning_rate": 8.438075203786486e-06,
+      "loss": 1.0113,
+      "step": 2480
+    },
+    {
+      "epoch": 1.300822561692127,
+      "grad_norm": 7.0666608810424805,
+      "learning_rate": 8.431501446226664e-06,
+      "loss": 1.0784,
+      "step": 2490
+    },
+    {
+      "epoch": 1.306045175610393,
+      "grad_norm": 7.217309474945068,
+      "learning_rate": 8.424927688666843e-06,
+      "loss": 1.0815,
+      "step": 2500
+    },
+    {
+      "epoch": 1.311267789528659,
+      "grad_norm": 7.389529705047607,
+      "learning_rate": 8.418353931107022e-06,
+      "loss": 0.9945,
+      "step": 2510
+    },
+    {
+      "epoch": 1.3164904034469251,
+      "grad_norm": 8.694013595581055,
+      "learning_rate": 8.4117801735472e-06,
+      "loss": 0.9028,
+      "step": 2520
+    },
+    {
+      "epoch": 1.3217130173651912,
+      "grad_norm": 10.238712310791016,
+      "learning_rate": 8.40520641598738e-06,
+      "loss": 1.0965,
+      "step": 2530
+    },
+    {
+      "epoch": 1.3269356312834573,
+      "grad_norm": 5.861429214477539,
+      "learning_rate": 8.398632658427559e-06,
+      "loss": 0.9861,
+      "step": 2540
+    },
+    {
+      "epoch": 1.3321582452017235,
+      "grad_norm": 8.313767433166504,
+      "learning_rate": 8.392058900867736e-06,
+      "loss": 0.977,
+      "step": 2550
+    },
+    {
+      "epoch": 1.3373808591199896,
+      "grad_norm": 8.89799976348877,
+      "learning_rate": 8.385485143307916e-06,
+      "loss": 1.0753,
+      "step": 2560
+    },
+    {
+      "epoch": 1.3426034730382557,
+      "grad_norm": 9.983473777770996,
+      "learning_rate": 8.378911385748095e-06,
+      "loss": 1.0387,
+      "step": 2570
+    },
+    {
+      "epoch": 1.3478260869565217,
+      "grad_norm": 7.438155651092529,
+      "learning_rate": 8.372337628188273e-06,
+      "loss": 1.0159,
+      "step": 2580
+    },
+    {
+      "epoch": 1.3530487008747878,
+      "grad_norm": 5.991061687469482,
+      "learning_rate": 8.365763870628452e-06,
+      "loss": 0.941,
+      "step": 2590
+    },
+    {
+      "epoch": 1.3582713147930539,
+      "grad_norm": 8.431944847106934,
+      "learning_rate": 8.35919011306863e-06,
+      "loss": 1.0113,
+      "step": 2600
+    },
+    {
+      "epoch": 1.3582713147930539,
+      "eval_loss": 1.291563630104065,
+      "eval_runtime": 46.1904,
+      "eval_samples_per_second": 36.848,
+      "eval_steps_per_second": 4.611,
+      "step": 2600
+    },
+    {
+      "epoch": 1.36349392871132,
+      "grad_norm": 6.396197319030762,
+      "learning_rate": 8.352616355508809e-06,
+      "loss": 1.0564,
+      "step": 2610
+    },
+    {
+      "epoch": 1.3687165426295862,
+      "grad_norm": 8.248652458190918,
+      "learning_rate": 8.346042597948988e-06,
+      "loss": 1.035,
+      "step": 2620
+    },
+    {
+      "epoch": 1.3739391565478523,
+      "grad_norm": 10.068962097167969,
+      "learning_rate": 8.339468840389166e-06,
+      "loss": 1.1366,
+      "step": 2630
+    },
+    {
+      "epoch": 1.3791617704661183,
+      "grad_norm": 7.027943134307861,
+      "learning_rate": 8.332895082829347e-06,
+      "loss": 1.0452,
+      "step": 2640
+    },
+    {
+      "epoch": 1.3843843843843844,
+      "grad_norm": 8.624917030334473,
+      "learning_rate": 8.326321325269525e-06,
+      "loss": 0.9896,
+      "step": 2650
+    },
+    {
+      "epoch": 1.3896069983026504,
+      "grad_norm": 10.41063117980957,
+      "learning_rate": 8.319747567709704e-06,
+      "loss": 0.8731,
+      "step": 2660
+    },
+    {
+      "epoch": 1.3948296122209165,
+      "grad_norm": 9.194129943847656,
+      "learning_rate": 8.313173810149883e-06,
+      "loss": 1.1184,
+      "step": 2670
+    },
+    {
+      "epoch": 1.4000522261391826,
+      "grad_norm": 7.924031734466553,
+      "learning_rate": 8.306600052590061e-06,
+      "loss": 0.9832,
+      "step": 2680
+    },
+    {
+      "epoch": 1.4052748400574488,
+      "grad_norm": 7.612093925476074,
+      "learning_rate": 8.30002629503024e-06,
+      "loss": 0.8867,
+      "step": 2690
+    },
+    {
+      "epoch": 1.410497453975715,
+      "grad_norm": 5.16526985168457,
+      "learning_rate": 8.29345253747042e-06,
+      "loss": 0.9798,
+      "step": 2700
+    },
+    {
+      "epoch": 1.415720067893981,
+      "grad_norm": 7.717186450958252,
+      "learning_rate": 8.286878779910597e-06,
+      "loss": 0.9842,
+      "step": 2710
+    },
+    {
+      "epoch": 1.420942681812247,
+      "grad_norm": 7.59783411026001,
+      "learning_rate": 8.280305022350777e-06,
+      "loss": 1.1226,
+      "step": 2720
+    },
+    {
+      "epoch": 1.426165295730513,
+      "grad_norm": 7.938925266265869,
+      "learning_rate": 8.273731264790954e-06,
+      "loss": 1.1191,
+      "step": 2730
+    },
+    {
+      "epoch": 1.4313879096487792,
+      "grad_norm": 5.337181091308594,
+      "learning_rate": 8.267157507231134e-06,
+      "loss": 0.9871,
+      "step": 2740
+    },
+    {
+      "epoch": 1.4366105235670452,
+      "grad_norm": 8.043785095214844,
+      "learning_rate": 8.260583749671313e-06,
+      "loss": 1.013,
+      "step": 2750
+    },
+    {
+      "epoch": 1.4418331374853115,
+      "grad_norm": 8.082165718078613,
+      "learning_rate": 8.25400999211149e-06,
+      "loss": 1.1487,
+      "step": 2760
+    },
+    {
+      "epoch": 1.4470557514035776,
+      "grad_norm": 8.345974922180176,
+      "learning_rate": 8.24743623455167e-06,
+      "loss": 1.1525,
+      "step": 2770
+    },
+    {
+      "epoch": 1.4522783653218436,
+      "grad_norm": 7.04867696762085,
+      "learning_rate": 8.24086247699185e-06,
+      "loss": 0.8678,
+      "step": 2780
+    },
+    {
+      "epoch": 1.4575009792401097,
+      "grad_norm": 10.37601375579834,
+      "learning_rate": 8.234288719432027e-06,
+      "loss": 1.0822,
+      "step": 2790
+    },
+    {
+      "epoch": 1.4627235931583757,
+      "grad_norm": 7.350039958953857,
+      "learning_rate": 8.227714961872206e-06,
+      "loss": 0.9471,
+      "step": 2800
+    },
+    {
+      "epoch": 1.4627235931583757,
+      "eval_loss": 1.282351016998291,
+      "eval_runtime": 46.2302,
+      "eval_samples_per_second": 36.816,
+      "eval_steps_per_second": 4.607,
+      "step": 2800
+    },
+    {
+      "epoch": 1.4679462070766418,
+      "grad_norm": 8.35920524597168,
+      "learning_rate": 8.221141204312386e-06,
+      "loss": 1.0057,
+      "step": 2810
+    },
+    {
+      "epoch": 1.4731688209949079,
+      "grad_norm": 7.395483016967773,
+      "learning_rate": 8.214567446752565e-06,
+      "loss": 1.0621,
+      "step": 2820
+    },
+    {
+      "epoch": 1.4783914349131742,
+      "grad_norm": 7.589978218078613,
+      "learning_rate": 8.207993689192744e-06,
+      "loss": 1.1316,
+      "step": 2830
+    },
+    {
+      "epoch": 1.4836140488314402,
+      "grad_norm": 6.801819324493408,
+      "learning_rate": 8.201419931632922e-06,
+      "loss": 1.0843,
+      "step": 2840
+    },
+    {
+      "epoch": 1.4888366627497063,
+      "grad_norm": 7.094258785247803,
+      "learning_rate": 8.194846174073101e-06,
+      "loss": 1.0899,
+      "step": 2850
+    },
+    {
+      "epoch": 1.4940592766679723,
+      "grad_norm": 7.351092338562012,
+      "learning_rate": 8.18827241651328e-06,
+      "loss": 0.8335,
+      "step": 2860
+    },
+    {
+      "epoch": 1.4992818905862384,
+      "grad_norm": 7.691007614135742,
+      "learning_rate": 8.181698658953458e-06,
+      "loss": 0.951,
+      "step": 2870
+    },
+    {
+      "epoch": 1.5045045045045045,
+      "grad_norm": 6.353541851043701,
+      "learning_rate": 8.175124901393638e-06,
+      "loss": 0.9797,
+      "step": 2880
+    },
+    {
+      "epoch": 1.5097271184227705,
+      "grad_norm": 8.544390678405762,
+      "learning_rate": 8.168551143833815e-06,
+      "loss": 0.9335,
+      "step": 2890
+    },
+    {
+      "epoch": 1.5149497323410368,
+      "grad_norm": 7.043111324310303,
+      "learning_rate": 8.161977386273995e-06,
+      "loss": 1.094,
+      "step": 2900
+    },
+    {
+      "epoch": 1.5201723462593026,
+      "grad_norm": 8.29129695892334,
+      "learning_rate": 8.155403628714174e-06,
+      "loss": 1.031,
+      "step": 2910
+    },
+    {
+      "epoch": 1.525394960177569,
+      "grad_norm": 9.749144554138184,
+      "learning_rate": 8.148829871154352e-06,
+      "loss": 0.989,
+      "step": 2920
+    },
+    {
+      "epoch": 1.530617574095835,
+      "grad_norm": 7.455598831176758,
+      "learning_rate": 8.142256113594531e-06,
+      "loss": 1.0981,
+      "step": 2930
+    },
+    {
+      "epoch": 1.535840188014101,
+      "grad_norm": 8.99416732788086,
+      "learning_rate": 8.13568235603471e-06,
+      "loss": 0.9406,
+      "step": 2940
+    },
+    {
+      "epoch": 1.541062801932367,
+      "grad_norm": 8.05884075164795,
+      "learning_rate": 8.129108598474888e-06,
+      "loss": 0.9374,
+      "step": 2950
+    },
+    {
+      "epoch": 1.5462854158506332,
+      "grad_norm": 8.008783340454102,
+      "learning_rate": 8.122534840915067e-06,
+      "loss": 1.2935,
+      "step": 2960
+    },
+    {
+      "epoch": 1.5515080297688995,
+      "grad_norm": 4.450300693511963,
+      "learning_rate": 8.115961083355247e-06,
+      "loss": 0.998,
+      "step": 2970
+    },
+    {
+      "epoch": 1.5567306436871653,
+      "grad_norm": 8.787833213806152,
+      "learning_rate": 8.109387325795426e-06,
+      "loss": 1.1033,
+      "step": 2980
+    },
+    {
+      "epoch": 1.5619532576054316,
+      "grad_norm": 8.741140365600586,
+      "learning_rate": 8.102813568235605e-06,
+      "loss": 1.0123,
+      "step": 2990
+    },
+    {
+      "epoch": 1.5671758715236976,
+      "grad_norm": 7.553483486175537,
+      "learning_rate": 8.096239810675783e-06,
+      "loss": 0.9479,
+      "step": 3000
+    },
+    {
+      "epoch": 1.5671758715236976,
+      "eval_loss": 1.2631675004959106,
+      "eval_runtime": 46.3906,
+      "eval_samples_per_second": 36.688,
+      "eval_steps_per_second": 4.591,
+      "step": 3000
+    },
+    {
+      "epoch": 1.5723984854419637,
+      "grad_norm": 7.2385478019714355,
+      "learning_rate": 8.089666053115962e-06,
+      "loss": 0.9502,
+      "step": 3010
+    },
+    {
+      "epoch": 1.5776210993602298,
+      "grad_norm": 7.736097812652588,
+      "learning_rate": 8.08309229555614e-06,
+      "loss": 1.0187,
+      "step": 3020
+    },
+    {
+      "epoch": 1.5828437132784958,
+      "grad_norm": 10.500167846679688,
+      "learning_rate": 8.07651853799632e-06,
+      "loss": 1.1638,
+      "step": 3030
+    },
+    {
+      "epoch": 1.588066327196762,
+      "grad_norm": 7.112147808074951,
+      "learning_rate": 8.069944780436499e-06,
+      "loss": 0.9987,
+      "step": 3040
+    },
+    {
+      "epoch": 1.593288941115028,
+      "grad_norm": 7.6480536460876465,
+      "learning_rate": 8.063371022876676e-06,
+      "loss": 0.9721,
+      "step": 3050
+    },
+    {
+      "epoch": 1.5985115550332942,
+      "grad_norm": 5.637914657592773,
+      "learning_rate": 8.056797265316856e-06,
+      "loss": 1.0136,
+      "step": 3060
+    },
+    {
+      "epoch": 1.6037341689515603,
+      "grad_norm": 9.559964179992676,
+      "learning_rate": 8.050223507757035e-06,
+      "loss": 0.99,
+      "step": 3070
+    },
+    {
+      "epoch": 1.6089567828698264,
+      "grad_norm": 9.080857276916504,
+      "learning_rate": 8.043649750197213e-06,
+      "loss": 0.9547,
+      "step": 3080
+    },
+    {
+      "epoch": 1.6141793967880924,
+      "grad_norm": 7.321665287017822,
+      "learning_rate": 8.037075992637392e-06,
+      "loss": 1.1887,
+      "step": 3090
+    },
+    {
+      "epoch": 1.6194020107063585,
+      "grad_norm": 7.967632293701172,
+      "learning_rate": 8.030502235077571e-06,
+      "loss": 1.0033,
+      "step": 3100
+    },
+    {
+      "epoch": 1.6246246246246248,
+      "grad_norm": 8.761786460876465,
+      "learning_rate": 8.023928477517749e-06,
+      "loss": 1.0964,
+      "step": 3110
+    },
+    {
+      "epoch": 1.6298472385428906,
+      "grad_norm": 8.108577728271484,
+      "learning_rate": 8.017354719957928e-06,
+      "loss": 0.9006,
+      "step": 3120
+    },
+    {
+      "epoch": 1.6350698524611569,
+      "grad_norm": 9.67638874053955,
+      "learning_rate": 8.010780962398108e-06,
+      "loss": 0.9444,
+      "step": 3130
+    },
+    {
+      "epoch": 1.640292466379423,
+      "grad_norm": 5.613959312438965,
+      "learning_rate": 8.004207204838285e-06,
+      "loss": 0.9477,
+      "step": 3140
+    },
+    {
+      "epoch": 1.645515080297689,
+      "grad_norm": 10.248137474060059,
+      "learning_rate": 7.997633447278465e-06,
+      "loss": 0.973,
+      "step": 3150
+    },
+    {
+      "epoch": 1.650737694215955,
+      "grad_norm": 8.074259757995605,
+      "learning_rate": 7.991059689718644e-06,
+      "loss": 0.9694,
+      "step": 3160
+    },
+    {
+      "epoch": 1.6559603081342211,
+      "grad_norm": 7.905124187469482,
+      "learning_rate": 7.984485932158824e-06,
+      "loss": 0.9383,
+      "step": 3170
+    },
+    {
+      "epoch": 1.6611829220524874,
+      "grad_norm": 7.854189395904541,
+      "learning_rate": 7.977912174599001e-06,
+      "loss": 0.9687,
+      "step": 3180
+    },
+    {
+      "epoch": 1.6664055359707532,
+      "grad_norm": 8.677661895751953,
+      "learning_rate": 7.97133841703918e-06,
+      "loss": 1.0772,
+      "step": 3190
+    },
+    {
+      "epoch": 1.6716281498890195,
+      "grad_norm": 6.915931701660156,
+      "learning_rate": 7.96476465947936e-06,
+      "loss": 0.9937,
+      "step": 3200
+    },
+    {
+      "epoch": 1.6716281498890195,
+      "eval_loss": 1.2522579431533813,
+      "eval_runtime": 46.2527,
+      "eval_samples_per_second": 36.798,
+      "eval_steps_per_second": 4.605,
+      "step": 3200
+    },
+    {
+      "epoch": 1.6768507638072856,
+      "grad_norm": 7.813302993774414,
+      "learning_rate": 7.958190901919538e-06,
+      "loss": 0.9355,
+      "step": 3210
+    },
+    {
+      "epoch": 1.6820733777255517,
+      "grad_norm": 7.921642780303955,
+      "learning_rate": 7.951617144359717e-06,
+      "loss": 1.0172,
+      "step": 3220
+    },
+    {
+      "epoch": 1.6872959916438177,
+      "grad_norm": 6.058948040008545,
+      "learning_rate": 7.945043386799896e-06,
+      "loss": 0.922,
+      "step": 3230
+    },
+    {
+      "epoch": 1.6925186055620838,
+      "grad_norm": 6.718349456787109,
+      "learning_rate": 7.938469629240074e-06,
+      "loss": 0.9229,
+      "step": 3240
+    },
+    {
+      "epoch": 1.69774121948035,
+      "grad_norm": 7.593595504760742,
+      "learning_rate": 7.931895871680253e-06,
+      "loss": 0.8853,
+      "step": 3250
+    },
+    {
+      "epoch": 1.702963833398616,
+      "grad_norm": 8.581092834472656,
+      "learning_rate": 7.925322114120433e-06,
+      "loss": 0.9894,
+      "step": 3260
+    },
+    {
+      "epoch": 1.7081864473168822,
+      "grad_norm": 9.009191513061523,
+      "learning_rate": 7.91874835656061e-06,
+      "loss": 0.9499,
+      "step": 3270
+    },
+    {
+      "epoch": 1.7134090612351482,
+      "grad_norm": 7.549342155456543,
+      "learning_rate": 7.91217459900079e-06,
+      "loss": 1.0529,
+      "step": 3280
+    },
+    {
+      "epoch": 1.7186316751534143,
+      "grad_norm": 7.8952317237854,
+      "learning_rate": 7.905600841440967e-06,
+      "loss": 1.0283,
+      "step": 3290
+    },
+    {
+      "epoch": 1.7238542890716804,
+      "grad_norm": 4.43769645690918,
+      "learning_rate": 7.899027083881147e-06,
+      "loss": 0.8977,
+      "step": 3300
+    },
+    {
+      "epoch": 1.7290769029899464,
+      "grad_norm": 9.048827171325684,
+      "learning_rate": 7.892453326321326e-06,
+      "loss": 1.0214,
+      "step": 3310
+    },
+    {
+      "epoch": 1.7342995169082127,
+      "grad_norm": 8.52785873413086,
+      "learning_rate": 7.885879568761504e-06,
+      "loss": 1.0758,
+      "step": 3320
+    },
+    {
+      "epoch": 1.7395221308264786,
+      "grad_norm": 9.804828643798828,
+      "learning_rate": 7.879305811201685e-06,
+      "loss": 1.0046,
+      "step": 3330
+    },
+    {
+      "epoch": 1.7447447447447448,
+      "grad_norm": 5.0326642990112305,
+      "learning_rate": 7.872732053641862e-06,
+      "loss": 1.0151,
+      "step": 3340
+    },
+    {
+      "epoch": 1.7499673586630107,
+      "grad_norm": 8.352339744567871,
+      "learning_rate": 7.866158296082042e-06,
+      "loss": 1.1599,
+      "step": 3350
+    },
+    {
+      "epoch": 1.755189972581277,
+      "grad_norm": 8.454385757446289,
+      "learning_rate": 7.859584538522221e-06,
+      "loss": 1.021,
+      "step": 3360
+    },
+    {
+      "epoch": 1.760412586499543,
+      "grad_norm": 6.878560543060303,
+      "learning_rate": 7.853010780962399e-06,
+      "loss": 1.0795,
+      "step": 3370
+    },
+    {
+      "epoch": 1.765635200417809,
+      "grad_norm": 7.027520179748535,
+      "learning_rate": 7.846437023402578e-06,
+      "loss": 0.9958,
+      "step": 3380
+    },
+    {
+      "epoch": 1.7708578143360754,
+      "grad_norm": 6.573943614959717,
+      "learning_rate": 7.839863265842757e-06,
+      "loss": 0.9701,
+      "step": 3390
+    },
+    {
+      "epoch": 1.7760804282543412,
+      "grad_norm": 8.435296058654785,
+      "learning_rate": 7.833289508282935e-06,
+      "loss": 1.2055,
+      "step": 3400
+    },
+    {
+      "epoch": 1.7760804282543412,
+      "eval_loss": 1.2281008958816528,
+      "eval_runtime": 46.491,
+      "eval_samples_per_second": 36.609,
+      "eval_steps_per_second": 4.582,
+      "step": 3400
+    },
+    {
+      "epoch": 1.7813030421726075,
+      "grad_norm": 8.427298545837402,
+      "learning_rate": 7.826715750723114e-06,
+      "loss": 0.8982,
+      "step": 3410
+    },
+    {
+      "epoch": 1.7865256560908733,
+      "grad_norm": 6.709393501281738,
+      "learning_rate": 7.820141993163292e-06,
+      "loss": 0.9613,
+      "step": 3420
+    },
+    {
+      "epoch": 1.7917482700091396,
+      "grad_norm": 7.055306911468506,
+      "learning_rate": 7.813568235603471e-06,
+      "loss": 1.0241,
+      "step": 3430
+    },
+    {
+      "epoch": 1.7969708839274057,
+      "grad_norm": 6.494068145751953,
+      "learning_rate": 7.80699447804365e-06,
+      "loss": 0.9973,
+      "step": 3440
+    },
+    {
+      "epoch": 1.8021934978456717,
+      "grad_norm": 7.806325912475586,
+      "learning_rate": 7.800420720483828e-06,
+      "loss": 1.0703,
+      "step": 3450
+    },
+    {
+      "epoch": 1.807416111763938,
+      "grad_norm": 7.981148719787598,
+      "learning_rate": 7.793846962924008e-06,
+      "loss": 1.0622,
+      "step": 3460
+    },
+    {
+      "epoch": 1.8126387256822039,
+      "grad_norm": 11.118639945983887,
+      "learning_rate": 7.787273205364187e-06,
+      "loss": 0.951,
+      "step": 3470
+    },
+    {
+      "epoch": 1.8178613396004701,
+      "grad_norm": 6.713343620300293,
+      "learning_rate": 7.780699447804365e-06,
+      "loss": 0.979,
+      "step": 3480
+    },
+    {
+      "epoch": 1.823083953518736,
+      "grad_norm": 7.850152492523193,
+      "learning_rate": 7.774125690244544e-06,
+      "loss": 1.0042,
+      "step": 3490
+    },
+    {
+      "epoch": 1.8283065674370023,
+      "grad_norm": 9.41398811340332,
+      "learning_rate": 7.767551932684723e-06,
+      "loss": 0.9911,
+      "step": 3500
+    },
+    {
+      "epoch": 1.8335291813552683,
+      "grad_norm": 5.341313362121582,
+      "learning_rate": 7.760978175124903e-06,
+      "loss": 1.1345,
+      "step": 3510
+    },
+    {
+      "epoch": 1.8387517952735344,
+      "grad_norm": 7.4594950675964355,
+      "learning_rate": 7.754404417565082e-06,
+      "loss": 1.0226,
+      "step": 3520
+    },
+    {
+      "epoch": 1.8439744091918004,
+      "grad_norm": 7.304319381713867,
+      "learning_rate": 7.74783066000526e-06,
+      "loss": 1.033,
+      "step": 3530
+    },
+    {
+      "epoch": 1.8491970231100665,
+      "grad_norm": 7.15964412689209,
+      "learning_rate": 7.741256902445439e-06,
+      "loss": 1.0393,
+      "step": 3540
+    },
+    {
+      "epoch": 1.8544196370283328,
+      "grad_norm": 7.043346405029297,
+      "learning_rate": 7.734683144885617e-06,
+      "loss": 0.9624,
+      "step": 3550
+    },
+    {
+      "epoch": 1.8596422509465986,
+      "grad_norm": 7.86375617980957,
+      "learning_rate": 7.728109387325796e-06,
+      "loss": 0.9693,
+      "step": 3560
+    },
+    {
+      "epoch": 1.864864864864865,
+      "grad_norm": 7.74959659576416,
+      "learning_rate": 7.721535629765975e-06,
+      "loss": 0.9708,
+      "step": 3570
+    },
+    {
+      "epoch": 1.870087478783131,
+      "grad_norm": 9.733150482177734,
+      "learning_rate": 7.714961872206153e-06,
+      "loss": 1.2004,
+      "step": 3580
+    },
+    {
+      "epoch": 1.875310092701397,
+      "grad_norm": 5.977094650268555,
+      "learning_rate": 7.708388114646332e-06,
+      "loss": 0.9956,
+      "step": 3590
+    },
+    {
+      "epoch": 1.880532706619663,
+      "grad_norm": 5.454172134399414,
+      "learning_rate": 7.701814357086512e-06,
+      "loss": 0.9481,
+      "step": 3600
+    },
+    {
+      "epoch": 1.880532706619663,
+      "eval_loss": 1.215065598487854,
+      "eval_runtime": 46.3976,
+      "eval_samples_per_second": 36.683,
+      "eval_steps_per_second": 4.591,
+      "step": 3600
+    },
+    {
+      "epoch": 1.8857553205379292,
+      "grad_norm": 7.248448848724365,
+      "learning_rate": 7.69524059952669e-06,
+      "loss": 0.9859,
+      "step": 3610
+    },
+    {
+      "epoch": 1.8909779344561954,
+      "grad_norm": 7.5254669189453125,
+      "learning_rate": 7.688666841966869e-06,
+      "loss": 0.9096,
+      "step": 3620
+    },
+    {
+      "epoch": 1.8962005483744613,
+      "grad_norm": 5.095489978790283,
+      "learning_rate": 7.682093084407048e-06,
+      "loss": 0.9428,
+      "step": 3630
+    },
+    {
+      "epoch": 1.9014231622927276,
+      "grad_norm": 6.897408485412598,
+      "learning_rate": 7.675519326847226e-06,
+      "loss": 0.9568,
+      "step": 3640
+    },
+    {
+      "epoch": 1.9066457762109936,
+      "grad_norm": 12.702198028564453,
+      "learning_rate": 7.668945569287405e-06,
+      "loss": 0.8563,
+      "step": 3650
+    },
+    {
+      "epoch": 1.9118683901292597,
+      "grad_norm": 8.203999519348145,
+      "learning_rate": 7.662371811727584e-06,
+      "loss": 0.9841,
+      "step": 3660
+    },
+    {
+      "epoch": 1.9170910040475257,
+      "grad_norm": 6.950149059295654,
+      "learning_rate": 7.655798054167764e-06,
+      "loss": 0.9489,
+      "step": 3670
+    },
+    {
+      "epoch": 1.9223136179657918,
+      "grad_norm": 9.259650230407715,
+      "learning_rate": 7.649224296607941e-06,
+      "loss": 0.9244,
+      "step": 3680
+    },
+    {
+      "epoch": 1.927536231884058,
+      "grad_norm": 8.09697151184082,
+      "learning_rate": 7.64265053904812e-06,
+      "loss": 0.9717,
+      "step": 3690
+    },
+    {
+      "epoch": 1.932758845802324,
+      "grad_norm": 12.38032054901123,
+      "learning_rate": 7.6360767814883e-06,
+      "loss": 0.8376,
+      "step": 3700
+    },
+    {
+      "epoch": 1.9379814597205902,
+      "grad_norm": 6.492051601409912,
+      "learning_rate": 7.629503023928478e-06,
+      "loss": 0.9907,
+      "step": 3710
+    },
+    {
+      "epoch": 1.9432040736388563,
+      "grad_norm": 6.897779941558838,
+      "learning_rate": 7.622929266368657e-06,
+      "loss": 0.8404,
+      "step": 3720
+    },
+    {
+      "epoch": 1.9484266875571223,
+      "grad_norm": 5.3351240158081055,
+      "learning_rate": 7.616355508808836e-06,
+      "loss": 0.8059,
+      "step": 3730
+    },
+    {
+      "epoch": 1.9536493014753884,
+      "grad_norm": 6.559781551361084,
+      "learning_rate": 7.609781751249015e-06,
+      "loss": 1.0037,
+      "step": 3740
+    },
+    {
+      "epoch": 1.9588719153936545,
+      "grad_norm": 8.780523300170898,
+      "learning_rate": 7.6032079936891934e-06,
+      "loss": 0.9477,
+      "step": 3750
+    },
+    {
+      "epoch": 1.9640945293119207,
+      "grad_norm": 6.601717472076416,
+      "learning_rate": 7.596634236129372e-06,
+      "loss": 0.9746,
+      "step": 3760
+    },
+    {
+      "epoch": 1.9693171432301866,
+      "grad_norm": 6.906192779541016,
+      "learning_rate": 7.590060478569551e-06,
+      "loss": 0.9532,
+      "step": 3770
+    },
+    {
+      "epoch": 1.9745397571484529,
+      "grad_norm": 8.66415023803711,
+      "learning_rate": 7.58348672100973e-06,
+      "loss": 0.9325,
+      "step": 3780
+    },
+    {
+      "epoch": 1.979762371066719,
+      "grad_norm": 7.726676940917969,
+      "learning_rate": 7.576912963449908e-06,
+      "loss": 1.1076,
+      "step": 3790
+    },
+    {
+      "epoch": 1.984984984984985,
+      "grad_norm": 9.380224227905273,
+      "learning_rate": 7.570339205890087e-06,
+      "loss": 1.0061,
+      "step": 3800
+    },
+    {
+      "epoch": 1.984984984984985,
+      "eval_loss": 1.206125020980835,
+      "eval_runtime": 46.4741,
+      "eval_samples_per_second": 36.623,
+      "eval_steps_per_second": 4.583,
+      "step": 3800
+    },
+    {
+      "epoch": 1.990207598903251,
+      "grad_norm": 8.470906257629395,
+      "learning_rate": 7.563765448330266e-06,
+      "loss": 1.0322,
+      "step": 3810
+    },
+    {
+      "epoch": 1.9954302128215171,
+      "grad_norm": 7.907584190368652,
+      "learning_rate": 7.557191690770445e-06,
+      "loss": 0.8892,
+      "step": 3820
+    },
+    {
+      "epoch": 2.001044522783653,
+      "grad_norm": 5.961308479309082,
+      "learning_rate": 7.550617933210623e-06,
+      "loss": 0.8805,
+      "step": 3830
+    },
+    {
+      "epoch": 2.0062671367019194,
+      "grad_norm": 5.583798408508301,
+      "learning_rate": 7.544044175650803e-06,
+      "loss": 0.6595,
+      "step": 3840
+    },
+    {
+      "epoch": 2.011489750620185,
+      "grad_norm": 7.540070533752441,
+      "learning_rate": 7.537470418090982e-06,
+      "loss": 0.7234,
+      "step": 3850
+    },
+    {
+      "epoch": 2.0167123645384515,
+      "grad_norm": 7.355820655822754,
+      "learning_rate": 7.53089666053116e-06,
+      "loss": 0.8085,
+      "step": 3860
+    },
+    {
+      "epoch": 2.0219349784567178,
+      "grad_norm": 8.060832977294922,
+      "learning_rate": 7.52432290297134e-06,
+      "loss": 0.8341,
+      "step": 3870
+    },
+    {
+      "epoch": 2.0271575923749836,
+      "grad_norm": 7.983650207519531,
+      "learning_rate": 7.517749145411518e-06,
+      "loss": 0.8054,
+      "step": 3880
+    },
+    {
+      "epoch": 2.03238020629325,
+      "grad_norm": 7.6987810134887695,
+      "learning_rate": 7.511175387851697e-06,
+      "loss": 0.6774,
+      "step": 3890
+    },
+    {
+      "epoch": 2.0376028202115157,
+      "grad_norm": 10.233553886413574,
+      "learning_rate": 7.504601630291876e-06,
+      "loss": 0.7152,
+      "step": 3900
+    },
+    {
+      "epoch": 2.042825434129782,
+      "grad_norm": 6.328879356384277,
+      "learning_rate": 7.4980278727320545e-06,
+      "loss": 0.8492,
+      "step": 3910
+    },
+    {
+      "epoch": 2.048048048048048,
+      "grad_norm": 5.196915149688721,
+      "learning_rate": 7.491454115172233e-06,
+      "loss": 0.756,
+      "step": 3920
+    },
+    {
+      "epoch": 2.053270661966314,
+      "grad_norm": 6.743471145629883,
+      "learning_rate": 7.4848803576124115e-06,
+      "loss": 0.8221,
+      "step": 3930
+    },
+    {
+      "epoch": 2.0584932758845804,
+      "grad_norm": 10.30760383605957,
+      "learning_rate": 7.478306600052591e-06,
+      "loss": 0.93,
+      "step": 3940
+    },
+    {
+      "epoch": 2.0637158898028463,
+      "grad_norm": 9.386149406433105,
+      "learning_rate": 7.471732842492769e-06,
+      "loss": 0.714,
+      "step": 3950
+    },
+    {
+      "epoch": 2.0689385037211125,
+      "grad_norm": 7.039390563964844,
+      "learning_rate": 7.465159084932948e-06,
+      "loss": 0.74,
+      "step": 3960
+    },
+    {
+      "epoch": 2.0741611176393784,
+      "grad_norm": 6.082592010498047,
+      "learning_rate": 7.458585327373127e-06,
+      "loss": 0.7808,
+      "step": 3970
+    },
+    {
+      "epoch": 2.0793837315576447,
+      "grad_norm": 8.188385009765625,
+      "learning_rate": 7.452011569813306e-06,
+      "loss": 0.8064,
+      "step": 3980
+    },
+    {
+      "epoch": 2.0846063454759105,
+      "grad_norm": 7.643196105957031,
+      "learning_rate": 7.445437812253484e-06,
+      "loss": 0.7309,
+      "step": 3990
+    },
+    {
+      "epoch": 2.089828959394177,
+      "grad_norm": 8.841309547424316,
+      "learning_rate": 7.438864054693663e-06,
+      "loss": 0.7749,
+      "step": 4000
+    },
+    {
+      "epoch": 2.089828959394177,
+      "eval_loss": 1.2640846967697144,
+      "eval_runtime": 46.3887,
+      "eval_samples_per_second": 36.69,
+      "eval_steps_per_second": 4.592,
+      "step": 4000
+    },
+    {
+      "epoch": 2.095051573312443,
+      "grad_norm": 7.499510765075684,
+      "learning_rate": 7.432290297133842e-06,
+      "loss": 0.6646,
+      "step": 4010
+    },
+    {
+      "epoch": 2.100274187230709,
+      "grad_norm": 8.0418119430542,
+      "learning_rate": 7.425716539574021e-06,
+      "loss": 0.6537,
+      "step": 4020
+    },
+    {
+      "epoch": 2.105496801148975,
+      "grad_norm": 8.836782455444336,
+      "learning_rate": 7.419142782014201e-06,
+      "loss": 0.5779,
+      "step": 4030
+    },
+    {
+      "epoch": 2.110719415067241,
+      "grad_norm": 4.353819370269775,
+      "learning_rate": 7.412569024454379e-06,
+      "loss": 0.7561,
+      "step": 4040
+    },
+    {
+      "epoch": 2.1159420289855073,
+      "grad_norm": 8.226792335510254,
+      "learning_rate": 7.405995266894558e-06,
+      "loss": 0.6666,
+      "step": 4050
+    },
+    {
+      "epoch": 2.121164642903773,
+      "grad_norm": 8.977775573730469,
+      "learning_rate": 7.399421509334736e-06,
+      "loss": 0.7941,
+      "step": 4060
+    },
+    {
+      "epoch": 2.1263872568220394,
+      "grad_norm": 7.1391706466674805,
+      "learning_rate": 7.392847751774916e-06,
+      "loss": 0.8272,
+      "step": 4070
+    },
+    {
+      "epoch": 2.1316098707403057,
+      "grad_norm": 7.537032604217529,
+      "learning_rate": 7.386273994215094e-06,
+      "loss": 0.7583,
+      "step": 4080
+    },
+    {
+      "epoch": 2.1368324846585716,
+      "grad_norm": 9.434386253356934,
+      "learning_rate": 7.379700236655273e-06,
+      "loss": 0.6792,
+      "step": 4090
+    },
+    {
+      "epoch": 2.142055098576838,
+      "grad_norm": 8.694379806518555,
+      "learning_rate": 7.373126479095452e-06,
+      "loss": 0.8661,
+      "step": 4100
+    },
+    {
+      "epoch": 2.1472777124951037,
+      "grad_norm": 7.044814586639404,
+      "learning_rate": 7.3665527215356304e-06,
+      "loss": 0.6704,
+      "step": 4110
+    },
+    {
+      "epoch": 2.15250032641337,
+      "grad_norm": 8.827573776245117,
+      "learning_rate": 7.359978963975809e-06,
+      "loss": 0.765,
+      "step": 4120
+    },
+    {
+      "epoch": 2.157722940331636,
+      "grad_norm": 9.122138023376465,
+      "learning_rate": 7.3534052064159874e-06,
+      "loss": 0.674,
+      "step": 4130
+    },
+    {
+      "epoch": 2.162945554249902,
+      "grad_norm": 8.409847259521484,
+      "learning_rate": 7.346831448856167e-06,
+      "loss": 0.718,
+      "step": 4140
+    },
+    {
+      "epoch": 2.1681681681681684,
+      "grad_norm": 6.554537773132324,
+      "learning_rate": 7.340257691296345e-06,
+      "loss": 0.7853,
+      "step": 4150
+    },
+    {
+      "epoch": 2.173390782086434,
+      "grad_norm": 8.142253875732422,
+      "learning_rate": 7.333683933736524e-06,
+      "loss": 0.6993,
+      "step": 4160
+    },
+    {
+      "epoch": 2.1786133960047005,
+      "grad_norm": 6.217634677886963,
+      "learning_rate": 7.327110176176703e-06,
+      "loss": 0.7372,
+      "step": 4170
+    },
+    {
+      "epoch": 2.1838360099229663,
+      "grad_norm": 8.371528625488281,
+      "learning_rate": 7.320536418616882e-06,
+      "loss": 0.68,
+      "step": 4180
+    },
+    {
+      "epoch": 2.1890586238412326,
+      "grad_norm": 7.018899917602539,
+      "learning_rate": 7.313962661057061e-06,
+      "loss": 0.7883,
+      "step": 4190
+    },
+    {
+      "epoch": 2.1942812377594985,
+      "grad_norm": 7.779040336608887,
+      "learning_rate": 7.30738890349724e-06,
+      "loss": 0.7449,
+      "step": 4200
+    },
+    {
+      "epoch": 2.1942812377594985,
+      "eval_loss": 1.26158607006073,
+      "eval_runtime": 46.3831,
+      "eval_samples_per_second": 36.694,
+      "eval_steps_per_second": 4.592,
+      "step": 4200
+    },
+    {
+      "epoch": 2.1995038516777647,
+      "grad_norm": 9.133999824523926,
+      "learning_rate": 7.300815145937419e-06,
+      "loss": 0.7521,
+      "step": 4210
+    },
+    {
+      "epoch": 2.204726465596031,
+      "grad_norm": 10.447097778320312,
+      "learning_rate": 7.294241388377597e-06,
+      "loss": 0.8424,
+      "step": 4220
+    },
+    {
+      "epoch": 2.209949079514297,
+      "grad_norm": 8.978377342224121,
+      "learning_rate": 7.287667630817777e-06,
+      "loss": 0.7258,
+      "step": 4230
+    },
+    {
+      "epoch": 2.215171693432563,
+      "grad_norm": 9.498069763183594,
+      "learning_rate": 7.281093873257955e-06,
+      "loss": 0.7952,
+      "step": 4240
+    },
+    {
+      "epoch": 2.220394307350829,
+      "grad_norm": 7.864907264709473,
+      "learning_rate": 7.274520115698134e-06,
+      "loss": 0.7921,
+      "step": 4250
+    },
+    {
+      "epoch": 2.2256169212690953,
+      "grad_norm": 8.536930084228516,
+      "learning_rate": 7.267946358138312e-06,
+      "loss": 0.8745,
+      "step": 4260
+    },
+    {
+      "epoch": 2.230839535187361,
+      "grad_norm": 8.639608383178711,
+      "learning_rate": 7.2613726005784915e-06,
+      "loss": 0.7734,
+      "step": 4270
+    },
+    {
+      "epoch": 2.2360621491056274,
+      "grad_norm": 9.74687385559082,
+      "learning_rate": 7.255456218774653e-06,
+      "loss": 0.7512,
+      "step": 4280
+    },
+    {
+      "epoch": 2.2412847630238932,
+      "grad_norm": 5.085870742797852,
+      "learning_rate": 7.248882461214831e-06,
+      "loss": 0.7072,
+      "step": 4290
+    },
+    {
+      "epoch": 2.2465073769421595,
+      "grad_norm": 8.538248062133789,
+      "learning_rate": 7.24230870365501e-06,
+      "loss": 0.7686,
+      "step": 4300
+    },
+    {
+      "epoch": 2.251729990860426,
+      "grad_norm": 7.15536642074585,
+      "learning_rate": 7.235734946095189e-06,
+      "loss": 0.7064,
+      "step": 4310
+    },
+    {
+      "epoch": 2.2569526047786916,
+      "grad_norm": 6.049582004547119,
+      "learning_rate": 7.229161188535368e-06,
+      "loss": 0.8178,
+      "step": 4320
+    },
+    {
+      "epoch": 2.262175218696958,
+      "grad_norm": 7.065965175628662,
+      "learning_rate": 7.222587430975546e-06,
+      "loss": 0.676,
+      "step": 4330
+    },
+    {
+      "epoch": 2.2673978326152238,
+      "grad_norm": 8.788618087768555,
+      "learning_rate": 7.216013673415725e-06,
+      "loss": 0.743,
+      "step": 4340
+    },
+    {
+      "epoch": 2.27262044653349,
+      "grad_norm": 7.415438175201416,
+      "learning_rate": 7.209439915855904e-06,
+      "loss": 0.7076,
+      "step": 4350
+    },
+    {
+      "epoch": 2.2778430604517563,
+      "grad_norm": 5.244143486022949,
+      "learning_rate": 7.2028661582960824e-06,
+      "loss": 0.6873,
+      "step": 4360
+    },
+    {
+      "epoch": 2.283065674370022,
+      "grad_norm": 8.050501823425293,
+      "learning_rate": 7.196292400736261e-06,
+      "loss": 0.7375,
+      "step": 4370
+    },
+    {
+      "epoch": 2.2882882882882885,
+      "grad_norm": 9.382434844970703,
+      "learning_rate": 7.18971864317644e-06,
+      "loss": 0.7662,
+      "step": 4380
+    },
+    {
+      "epoch": 2.2935109022065543,
+      "grad_norm": 9.074572563171387,
+      "learning_rate": 7.183144885616619e-06,
+      "loss": 0.7775,
+      "step": 4390
+    },
+    {
+      "epoch": 2.2987335161248206,
+      "grad_norm": 9.558895111083984,
+      "learning_rate": 7.176571128056797e-06,
+      "loss": 0.8324,
+      "step": 4400
+    },
+    {
+      "epoch": 2.2987335161248206,
+      "eval_loss": 1.246055006980896,
+      "eval_runtime": 46.3387,
+      "eval_samples_per_second": 36.73,
+      "eval_steps_per_second": 4.597,
+      "step": 4400
+    },
+    {
+      "epoch": 2.3039561300430864,
+      "grad_norm": 7.064749717712402,
+      "learning_rate": 7.1699973704969775e-06,
+      "loss": 0.6728,
+      "step": 4410
+    },
+    {
+      "epoch": 2.3091787439613527,
+      "grad_norm": 8.828902244567871,
+      "learning_rate": 7.163423612937156e-06,
+      "loss": 0.8473,
+      "step": 4420
+    },
+    {
+      "epoch": 2.3144013578796185,
+      "grad_norm": 6.347407817840576,
+      "learning_rate": 7.1568498553773345e-06,
+      "loss": 0.6786,
+      "step": 4430
+    },
+    {
+      "epoch": 2.319623971797885,
+      "grad_norm": 5.167905807495117,
+      "learning_rate": 7.150276097817514e-06,
+      "loss": 0.6778,
+      "step": 4440
+    },
+    {
+      "epoch": 2.324846585716151,
+      "grad_norm": 9.10237979888916,
+      "learning_rate": 7.143702340257692e-06,
+      "loss": 0.7664,
+      "step": 4450
+    },
+    {
+      "epoch": 2.330069199634417,
+      "grad_norm": 8.039263725280762,
+      "learning_rate": 7.137128582697871e-06,
+      "loss": 0.716,
+      "step": 4460
+    },
+    {
+      "epoch": 2.3352918135526832,
+      "grad_norm": 5.672702789306641,
+      "learning_rate": 7.130554825138049e-06,
+      "loss": 0.7147,
+      "step": 4470
+    },
+    {
+      "epoch": 2.340514427470949,
+      "grad_norm": 7.255804538726807,
+      "learning_rate": 7.123981067578229e-06,
+      "loss": 0.7427,
+      "step": 4480
+    },
+    {
+      "epoch": 2.3457370413892153,
+      "grad_norm": 7.924871444702148,
+      "learning_rate": 7.117407310018407e-06,
+      "loss": 0.7103,
+      "step": 4490
+    },
+    {
+      "epoch": 2.3509596553074816,
+      "grad_norm": 6.1431708335876465,
+      "learning_rate": 7.110833552458586e-06,
+      "loss": 0.6292,
+      "step": 4500
+    },
+    {
+      "epoch": 2.3561822692257475,
+      "grad_norm": 9.339113235473633,
+      "learning_rate": 7.104259794898765e-06,
+      "loss": 0.8119,
+      "step": 4510
+    },
+    {
+      "epoch": 2.3614048831440138,
+      "grad_norm": 8.561595916748047,
+      "learning_rate": 7.0976860373389435e-06,
+      "loss": 0.7959,
+      "step": 4520
+    },
+    {
+      "epoch": 2.3666274970622796,
+      "grad_norm": 8.174324035644531,
+      "learning_rate": 7.091112279779122e-06,
+      "loss": 0.6731,
+      "step": 4530
+    },
+    {
+      "epoch": 2.371850110980546,
+      "grad_norm": 8.378727912902832,
+      "learning_rate": 7.0845385222193005e-06,
+      "loss": 0.707,
+      "step": 4540
+    },
+    {
+      "epoch": 2.3770727248988117,
+      "grad_norm": 8.307716369628906,
+      "learning_rate": 7.07796476465948e-06,
+      "loss": 0.8,
+      "step": 4550
+    },
+    {
+      "epoch": 2.382295338817078,
+      "grad_norm": 9.376656532287598,
+      "learning_rate": 7.071391007099658e-06,
+      "loss": 0.832,
+      "step": 4560
+    },
+    {
+      "epoch": 2.387517952735344,
+      "grad_norm": 8.780073165893555,
+      "learning_rate": 7.064817249539837e-06,
+      "loss": 0.6899,
+      "step": 4570
+    },
+    {
+      "epoch": 2.39274056665361,
+      "grad_norm": 11.08247184753418,
+      "learning_rate": 7.058243491980017e-06,
+      "loss": 0.6915,
+      "step": 4580
+    },
+    {
+      "epoch": 2.3979631805718764,
+      "grad_norm": 7.881500720977783,
+      "learning_rate": 7.0516697344201955e-06,
+      "loss": 0.6109,
+      "step": 4590
+    },
+    {
+      "epoch": 2.4031857944901422,
+      "grad_norm": 8.939738273620605,
+      "learning_rate": 7.045095976860374e-06,
+      "loss": 0.6944,
+      "step": 4600
+    },
+    {
+      "epoch": 2.4031857944901422,
+      "eval_loss": 1.247771143913269,
+      "eval_runtime": 46.3198,
+      "eval_samples_per_second": 36.745,
+      "eval_steps_per_second": 4.598,
+      "step": 4600
+    },
+    {
+      "epoch": 2.4084084084084085,
+      "grad_norm": 10.969844818115234,
+      "learning_rate": 7.038522219300553e-06,
+      "loss": 0.7537,
+      "step": 4610
+    },
+    {
+      "epoch": 2.4136310223266744,
+      "grad_norm": 7.069279670715332,
+      "learning_rate": 7.031948461740732e-06,
+      "loss": 0.6966,
+      "step": 4620
+    },
+    {
+      "epoch": 2.4188536362449407,
+      "grad_norm": 9.71384334564209,
+      "learning_rate": 7.02537470418091e-06,
+      "loss": 0.7609,
+      "step": 4630
+    },
+    {
+      "epoch": 2.4240762501632065,
+      "grad_norm": 6.699389934539795,
+      "learning_rate": 7.01880094662109e-06,
+      "loss": 0.6327,
+      "step": 4640
+    },
+    {
+      "epoch": 2.4292988640814728,
+      "grad_norm": 8.881025314331055,
+      "learning_rate": 7.012227189061268e-06,
+      "loss": 0.8115,
+      "step": 4650
+    },
+    {
+      "epoch": 2.434521477999739,
+      "grad_norm": 6.328512191772461,
+      "learning_rate": 7.005653431501447e-06,
+      "loss": 0.7266,
+      "step": 4660
+    },
+    {
+      "epoch": 2.439744091918005,
+      "grad_norm": 6.8868279457092285,
+      "learning_rate": 6.999079673941625e-06,
+      "loss": 0.7584,
+      "step": 4670
+    },
+    {
+      "epoch": 2.444966705836271,
+      "grad_norm": 8.252555847167969,
+      "learning_rate": 6.992505916381805e-06,
+      "loss": 0.7738,
+      "step": 4680
+    },
+    {
+      "epoch": 2.450189319754537,
+      "grad_norm": 6.152466297149658,
+      "learning_rate": 6.985932158821983e-06,
+      "loss": 0.7227,
+      "step": 4690
+    },
+    {
+      "epoch": 2.4554119336728033,
+      "grad_norm": 8.81529426574707,
+      "learning_rate": 6.979358401262162e-06,
+      "loss": 0.7793,
+      "step": 4700
+    },
+    {
+      "epoch": 2.460634547591069,
+      "grad_norm": 11.22712516784668,
+      "learning_rate": 6.972784643702341e-06,
+      "loss": 0.7545,
+      "step": 4710
+    },
+    {
+      "epoch": 2.4658571615093354,
+      "grad_norm": 8.15162181854248,
+      "learning_rate": 6.9662108861425194e-06,
+      "loss": 0.8213,
+      "step": 4720
+    },
+    {
+      "epoch": 2.4710797754276017,
+      "grad_norm": 10.381488800048828,
+      "learning_rate": 6.959637128582698e-06,
+      "loss": 0.7706,
+      "step": 4730
+    },
+    {
+      "epoch": 2.4763023893458675,
+      "grad_norm": 5.261969566345215,
+      "learning_rate": 6.9530633710228764e-06,
+      "loss": 0.7216,
+      "step": 4740
+    },
+    {
+      "epoch": 2.481525003264134,
+      "grad_norm": 7.4040117263793945,
+      "learning_rate": 6.946489613463056e-06,
+      "loss": 0.6696,
+      "step": 4750
+    },
+    {
+      "epoch": 2.4867476171823997,
+      "grad_norm": 6.897395133972168,
+      "learning_rate": 6.939915855903235e-06,
+      "loss": 0.7881,
+      "step": 4760
+    },
+    {
+      "epoch": 2.491970231100666,
+      "grad_norm": 10.105294227600098,
+      "learning_rate": 6.9333420983434145e-06,
+      "loss": 0.6668,
+      "step": 4770
+    },
+    {
+      "epoch": 2.497192845018932,
+      "grad_norm": 6.959880828857422,
+      "learning_rate": 6.926768340783593e-06,
+      "loss": 0.7377,
+      "step": 4780
+    },
+    {
+      "epoch": 2.502415458937198,
+      "grad_norm": 10.688835144042969,
+      "learning_rate": 6.9201945832237715e-06,
+      "loss": 0.7178,
+      "step": 4790
+    },
+    {
+      "epoch": 2.507638072855464,
+      "grad_norm": 9.69953441619873,
+      "learning_rate": 6.91362082566395e-06,
+      "loss": 0.7491,
+      "step": 4800
+    },
+    {
+      "epoch": 2.507638072855464,
+      "eval_loss": 1.2456127405166626,
+      "eval_runtime": 46.9191,
+      "eval_samples_per_second": 36.275,
+      "eval_steps_per_second": 4.54,
+      "step": 4800
+    },
+    {
+      "epoch": 2.51286068677373,
+      "grad_norm": 7.690919399261475,
+      "learning_rate": 6.907047068104129e-06,
+      "loss": 0.7302,
+      "step": 4810
+    },
+    {
+      "epoch": 2.5180833006919965,
+      "grad_norm": 7.342029571533203,
+      "learning_rate": 6.900473310544308e-06,
+      "loss": 0.694,
+      "step": 4820
+    },
+    {
+      "epoch": 2.5233059146102623,
+      "grad_norm": 7.751829147338867,
+      "learning_rate": 6.893899552984486e-06,
+      "loss": 0.6835,
+      "step": 4830
+    },
+    {
+      "epoch": 2.5285285285285286,
+      "grad_norm": 7.896656036376953,
+      "learning_rate": 6.887325795424666e-06,
+      "loss": 0.7437,
+      "step": 4840
+    },
+    {
+      "epoch": 2.5337511424467944,
+      "grad_norm": 7.776586055755615,
+      "learning_rate": 6.880752037864844e-06,
+      "loss": 0.6117,
+      "step": 4850
+    },
+    {
+      "epoch": 2.5389737563650607,
+      "grad_norm": 9.255610466003418,
+      "learning_rate": 6.874178280305023e-06,
+      "loss": 0.7713,
+      "step": 4860
+    },
+    {
+      "epoch": 2.544196370283327,
+      "grad_norm": 8.877751350402832,
+      "learning_rate": 6.867604522745201e-06,
+      "loss": 0.6392,
+      "step": 4870
+    },
+    {
+      "epoch": 2.549418984201593,
+      "grad_norm": 5.893667221069336,
+      "learning_rate": 6.8610307651853805e-06,
+      "loss": 0.758,
+      "step": 4880
+    },
+    {
+      "epoch": 2.554641598119859,
+      "grad_norm": 5.860952854156494,
+      "learning_rate": 6.854457007625559e-06,
+      "loss": 0.6937,
+      "step": 4890
+    },
+    {
+      "epoch": 2.559864212038125,
+      "grad_norm": 6.213397979736328,
+      "learning_rate": 6.8478832500657375e-06,
+      "loss": 0.6206,
+      "step": 4900
+    },
+    {
+      "epoch": 2.5650868259563913,
+      "grad_norm": 7.741673946380615,
+      "learning_rate": 6.841309492505917e-06,
+      "loss": 0.8378,
+      "step": 4910
+    },
+    {
+      "epoch": 2.5703094398746575,
+      "grad_norm": 7.0414838790893555,
+      "learning_rate": 6.834735734946095e-06,
+      "loss": 0.7588,
+      "step": 4920
+    },
+    {
+      "epoch": 2.5755320537929234,
+      "grad_norm": 8.802332878112793,
+      "learning_rate": 6.828161977386275e-06,
+      "loss": 0.7929,
+      "step": 4930
+    },
+    {
+      "epoch": 2.580754667711189,
+      "grad_norm": 8.432696342468262,
+      "learning_rate": 6.821588219826454e-06,
+      "loss": 0.8244,
+      "step": 4940
+    },
+    {
+      "epoch": 2.5859772816294555,
+      "grad_norm": 4.923203945159912,
+      "learning_rate": 6.8150144622666325e-06,
+      "loss": 0.7292,
+      "step": 4950
+    },
+    {
+      "epoch": 2.591199895547722,
+      "grad_norm": 8.111337661743164,
+      "learning_rate": 6.808440704706811e-06,
+      "loss": 0.7311,
+      "step": 4960
+    },
+    {
+      "epoch": 2.5964225094659876,
+      "grad_norm": 8.650612831115723,
+      "learning_rate": 6.80186694714699e-06,
+      "loss": 0.7428,
+      "step": 4970
+    },
+    {
+      "epoch": 2.601645123384254,
+      "grad_norm": 9.554792404174805,
+      "learning_rate": 6.795293189587169e-06,
+      "loss": 0.817,
+      "step": 4980
+    },
+    {
+      "epoch": 2.6068677373025197,
+      "grad_norm": 9.558310508728027,
+      "learning_rate": 6.788719432027347e-06,
+      "loss": 0.7322,
+      "step": 4990
+    },
+    {
+      "epoch": 2.612090351220786,
+      "grad_norm": 7.577544689178467,
+      "learning_rate": 6.782145674467526e-06,
+      "loss": 0.7557,
+      "step": 5000
+    },
+    {
+      "epoch": 2.612090351220786,
+      "eval_loss": 1.2190589904785156,
+      "eval_runtime": 46.4012,
+      "eval_samples_per_second": 36.68,
+      "eval_steps_per_second": 4.59,
+      "step": 5000
+    },
+    {
+      "epoch": 2.6173129651390523,
+      "grad_norm": 9.112717628479004,
+      "learning_rate": 6.775571916907705e-06,
+      "loss": 0.7545,
+      "step": 5010
+    },
+    {
+      "epoch": 2.622535579057318,
+      "grad_norm": 9.887261390686035,
+      "learning_rate": 6.768998159347884e-06,
+      "loss": 0.7087,
+      "step": 5020
+    },
+    {
+      "epoch": 2.6277581929755844,
+      "grad_norm": 9.290141105651855,
+      "learning_rate": 6.762424401788062e-06,
+      "loss": 0.7582,
+      "step": 5030
+    },
+    {
+      "epoch": 2.6329808068938503,
+      "grad_norm": 5.160643100738525,
+      "learning_rate": 6.755850644228242e-06,
+      "loss": 0.7339,
+      "step": 5040
+    },
+    {
+      "epoch": 2.6382034208121166,
+      "grad_norm": 7.384275913238525,
+      "learning_rate": 6.74927688666842e-06,
+      "loss": 0.6564,
+      "step": 5050
+    },
+    {
+      "epoch": 2.6434260347303824,
+      "grad_norm": 10.130136489868164,
+      "learning_rate": 6.742703129108599e-06,
+      "loss": 0.7619,
+      "step": 5060
+    },
+    {
+      "epoch": 2.6486486486486487,
+      "grad_norm": 5.019765377044678,
+      "learning_rate": 6.736129371548777e-06,
+      "loss": 0.6594,
+      "step": 5070
+    },
+    {
+      "epoch": 2.6538712625669145,
+      "grad_norm": 8.21762752532959,
+      "learning_rate": 6.7295556139889564e-06,
+      "loss": 0.7934,
+      "step": 5080
+    },
+    {
+      "epoch": 2.659093876485181,
+      "grad_norm": 8.9004487991333,
+      "learning_rate": 6.722981856429135e-06,
+      "loss": 0.7252,
+      "step": 5090
+    },
+    {
+      "epoch": 2.664316490403447,
+      "grad_norm": 8.407866477966309,
+      "learning_rate": 6.716408098869315e-06,
+      "loss": 0.7514,
+      "step": 5100
+    },
+    {
+      "epoch": 2.669539104321713,
+      "grad_norm": 12.734916687011719,
+      "learning_rate": 6.709834341309494e-06,
+      "loss": 0.7225,
+      "step": 5110
+    },
+    {
+      "epoch": 2.674761718239979,
+      "grad_norm": 7.250882625579834,
+      "learning_rate": 6.703260583749672e-06,
+      "loss": 0.6623,
+      "step": 5120
+    },
+    {
+      "epoch": 2.679984332158245,
+      "grad_norm": 6.4962663650512695,
+      "learning_rate": 6.696686826189851e-06,
+      "loss": 0.7286,
+      "step": 5130
+    },
+    {
+      "epoch": 2.6852069460765113,
+      "grad_norm": 6.720248222351074,
+      "learning_rate": 6.69011306863003e-06,
+      "loss": 0.6411,
+      "step": 5140
+    },
+    {
+      "epoch": 2.6904295599947776,
+      "grad_norm": 7.895679950714111,
+      "learning_rate": 6.6835393110702085e-06,
+      "loss": 0.761,
+      "step": 5150
+    },
+    {
+      "epoch": 2.6956521739130435,
+      "grad_norm": 9.329249382019043,
+      "learning_rate": 6.676965553510387e-06,
+      "loss": 0.6963,
+      "step": 5160
+    },
+    {
+      "epoch": 2.7008747878313093,
+      "grad_norm": 7.362915515899658,
+      "learning_rate": 6.670391795950566e-06,
+      "loss": 0.7546,
+      "step": 5170
+    },
+    {
+      "epoch": 2.7060974017495756,
+      "grad_norm": 9.045941352844238,
+      "learning_rate": 6.663818038390745e-06,
+      "loss": 0.6971,
+      "step": 5180
+    },
+    {
+      "epoch": 2.711320015667842,
+      "grad_norm": 12.443278312683105,
+      "learning_rate": 6.657244280830923e-06,
+      "loss": 0.7261,
+      "step": 5190
+    },
+    {
+      "epoch": 2.7165426295861077,
+      "grad_norm": 9.796110153198242,
+      "learning_rate": 6.650670523271102e-06,
+      "loss": 0.7577,
+      "step": 5200
+    },
+    {
+      "epoch": 2.7165426295861077,
+      "eval_loss": 1.2298212051391602,
+      "eval_runtime": 46.3629,
+      "eval_samples_per_second": 36.71,
+      "eval_steps_per_second": 4.594,
+      "step": 5200
+    },
+    {
+      "epoch": 2.721765243504374,
+      "grad_norm": 6.790576457977295,
+      "learning_rate": 6.644096765711281e-06,
+      "loss": 0.7437,
+      "step": 5210
+    },
+    {
+      "epoch": 2.72698785742264,
+      "grad_norm": 6.432572841644287,
+      "learning_rate": 6.63752300815146e-06,
+      "loss": 0.6898,
+      "step": 5220
+    },
+    {
+      "epoch": 2.732210471340906,
+      "grad_norm": 6.689311504364014,
+      "learning_rate": 6.630949250591638e-06,
+      "loss": 0.736,
+      "step": 5230
+    },
+    {
+      "epoch": 2.7374330852591724,
+      "grad_norm": 8.386734962463379,
+      "learning_rate": 6.6243754930318175e-06,
+      "loss": 0.7809,
+      "step": 5240
+    },
+    {
+      "epoch": 2.7426556991774382,
+      "grad_norm": 8.303974151611328,
+      "learning_rate": 6.617801735471996e-06,
+      "loss": 0.7536,
+      "step": 5250
+    },
+    {
+      "epoch": 2.7478783130957045,
+      "grad_norm": 10.667426109313965,
+      "learning_rate": 6.6112279779121745e-06,
+      "loss": 0.8118,
+      "step": 5260
+    },
+    {
+      "epoch": 2.7531009270139704,
+      "grad_norm": 6.878983974456787,
+      "learning_rate": 6.604654220352355e-06,
+      "loss": 0.7044,
+      "step": 5270
+    },
+    {
+      "epoch": 2.7583235409322366,
+      "grad_norm": 9.650110244750977,
+      "learning_rate": 6.598080462792533e-06,
+      "loss": 0.7329,
+      "step": 5280
+    },
+    {
+      "epoch": 2.763546154850503,
+      "grad_norm": 6.491201877593994,
+      "learning_rate": 6.591506705232712e-06,
+      "loss": 0.6498,
+      "step": 5290
+    },
+    {
+      "epoch": 2.7687687687687688,
+      "grad_norm": 8.977750778198242,
+      "learning_rate": 6.584932947672891e-06,
+      "loss": 0.7035,
+      "step": 5300
+    },
+    {
+      "epoch": 2.7739913826870346,
+      "grad_norm": 6.382355213165283,
+      "learning_rate": 6.5783591901130695e-06,
+      "loss": 0.6769,
+      "step": 5310
+    },
+    {
+      "epoch": 2.779213996605301,
+      "grad_norm": 9.238271713256836,
+      "learning_rate": 6.571785432553248e-06,
+      "loss": 0.7547,
+      "step": 5320
+    },
+    {
+      "epoch": 2.784436610523567,
+      "grad_norm": 7.663956165313721,
+      "learning_rate": 6.5652116749934265e-06,
+      "loss": 0.7363,
+      "step": 5330
+    },
+    {
+      "epoch": 2.789659224441833,
+      "grad_norm": 10.917364120483398,
+      "learning_rate": 6.558637917433606e-06,
+      "loss": 0.7218,
+      "step": 5340
+    },
+    {
+      "epoch": 2.7948818383600993,
+      "grad_norm": 8.472268104553223,
+      "learning_rate": 6.552064159873784e-06,
+      "loss": 0.7329,
+      "step": 5350
+    },
+    {
+      "epoch": 2.800104452278365,
+      "grad_norm": 9.487913131713867,
+      "learning_rate": 6.545490402313963e-06,
+      "loss": 0.7496,
+      "step": 5360
+    },
+    {
+      "epoch": 2.8053270661966314,
+      "grad_norm": 7.357760906219482,
+      "learning_rate": 6.538916644754142e-06,
+      "loss": 0.6209,
+      "step": 5370
+    },
+    {
+      "epoch": 2.8105496801148977,
+      "grad_norm": 8.204305648803711,
+      "learning_rate": 6.532342887194321e-06,
+      "loss": 0.7739,
+      "step": 5380
+    },
+    {
+      "epoch": 2.8157722940331635,
+      "grad_norm": 6.225192070007324,
+      "learning_rate": 6.525769129634499e-06,
+      "loss": 0.6911,
+      "step": 5390
+    },
+    {
+      "epoch": 2.82099490795143,
+      "grad_norm": 9.028958320617676,
+      "learning_rate": 6.519195372074678e-06,
+      "loss": 0.7801,
+      "step": 5400
+    },
+    {
+      "epoch": 2.82099490795143,
+      "eval_loss": 1.220414400100708,
+      "eval_runtime": 46.3932,
+      "eval_samples_per_second": 36.686,
+      "eval_steps_per_second": 4.591,
+      "step": 5400
+    },
+    {
+      "epoch": 2.8262175218696957,
+      "grad_norm": 8.968846321105957,
+      "learning_rate": 6.512621614514857e-06,
+      "loss": 0.7202,
+      "step": 5410
+    },
+    {
+      "epoch": 2.831440135787962,
+      "grad_norm": 8.162824630737305,
+      "learning_rate": 6.506047856955036e-06,
+      "loss": 0.733,
+      "step": 5420
+    },
+    {
+      "epoch": 2.8366627497062282,
+      "grad_norm": 8.892393112182617,
+      "learning_rate": 6.499474099395214e-06,
+      "loss": 0.8351,
+      "step": 5430
+    },
+    {
+      "epoch": 2.841885363624494,
+      "grad_norm": 7.610676288604736,
+      "learning_rate": 6.492900341835394e-06,
+      "loss": 0.6797,
+      "step": 5440
+    },
+    {
+      "epoch": 2.84710797754276,
+      "grad_norm": 6.808664798736572,
+      "learning_rate": 6.486326584275573e-06,
+      "loss": 0.7261,
+      "step": 5450
+    },
+    {
+      "epoch": 2.852330591461026,
+      "grad_norm": 12.52797794342041,
+      "learning_rate": 6.479752826715751e-06,
+      "loss": 0.7066,
+      "step": 5460
+    },
+    {
+      "epoch": 2.8575532053792925,
+      "grad_norm": 7.522469520568848,
+      "learning_rate": 6.473179069155931e-06,
+      "loss": 0.6178,
+      "step": 5470
+    },
+    {
+      "epoch": 2.8627758192975583,
+      "grad_norm": 6.208746910095215,
+      "learning_rate": 6.466605311596109e-06,
+      "loss": 0.663,
+      "step": 5480
+    },
+    {
+      "epoch": 2.8679984332158246,
+      "grad_norm": 7.486382007598877,
+      "learning_rate": 6.460031554036288e-06,
+      "loss": 0.652,
+      "step": 5490
+    },
+    {
+      "epoch": 2.8732210471340904,
+      "grad_norm": 7.342718124389648,
+      "learning_rate": 6.453457796476467e-06,
+      "loss": 0.7086,
+      "step": 5500
+    },
+    {
+      "epoch": 2.8784436610523567,
+      "grad_norm": 11.11368465423584,
+      "learning_rate": 6.4468840389166455e-06,
+      "loss": 0.7731,
+      "step": 5510
+    },
+    {
+      "epoch": 2.883666274970623,
+      "grad_norm": 5.823308944702148,
+      "learning_rate": 6.440310281356824e-06,
+      "loss": 0.6701,
+      "step": 5520
+    },
+    {
+      "epoch": 2.888888888888889,
+      "grad_norm": 9.190802574157715,
+      "learning_rate": 6.4337365237970025e-06,
+      "loss": 0.8433,
+      "step": 5530
+    },
+    {
+      "epoch": 2.894111502807155,
+      "grad_norm": 7.263264179229736,
+      "learning_rate": 6.427162766237182e-06,
+      "loss": 0.6523,
+      "step": 5540
+    },
+    {
+      "epoch": 2.899334116725421,
+      "grad_norm": 9.167975425720215,
+      "learning_rate": 6.42058900867736e-06,
+      "loss": 0.6965,
+      "step": 5550
+    },
+    {
+      "epoch": 2.9045567306436872,
+      "grad_norm": 7.3201375007629395,
+      "learning_rate": 6.414015251117539e-06,
+      "loss": 0.7339,
+      "step": 5560
+    },
+    {
+      "epoch": 2.9097793445619535,
+      "grad_norm": 8.229689598083496,
+      "learning_rate": 6.407441493557718e-06,
+      "loss": 0.7504,
+      "step": 5570
+    },
+    {
+      "epoch": 2.9150019584802194,
+      "grad_norm": 8.792162895202637,
+      "learning_rate": 6.400867735997897e-06,
+      "loss": 0.7689,
+      "step": 5580
+    },
+    {
+      "epoch": 2.920224572398485,
+      "grad_norm": 9.17432689666748,
+      "learning_rate": 6.394293978438075e-06,
+      "loss": 0.7747,
+      "step": 5590
+    },
+    {
+      "epoch": 2.9254471863167515,
+      "grad_norm": 6.218533039093018,
+      "learning_rate": 6.387720220878254e-06,
+      "loss": 0.8414,
+      "step": 5600
+    },
+    {
+      "epoch": 2.9254471863167515,
+      "eval_loss": 1.2055819034576416,
+      "eval_runtime": 46.3894,
+      "eval_samples_per_second": 36.689,
+      "eval_steps_per_second": 4.592,
+      "step": 5600
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 15312,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.063042284067226e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}