diff --git "a/contextlm_gpt2_xl/trainer_state.json" "b/contextlm_gpt2_xl/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/contextlm_gpt2_xl/trainer_state.json"
@@ -0,0 +1,12240 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9999854940017697,
+  "eval_steps": 1000,
+  "global_step": 17234,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0005802399292107286,
+      "grad_norm": 9.509501457214355,
+      "learning_rate": 4.176334106728538e-06,
+      "loss": 10.385,
+      "step": 10
+    },
+    {
+      "epoch": 0.0011604798584214572,
+      "grad_norm": 2.0677127838134766,
+      "learning_rate": 8.816705336426914e-06,
+      "loss": 8.9985,
+      "step": 20
+    },
+    {
+      "epoch": 0.001740719787632186,
+      "grad_norm": 2.0270631313323975,
+      "learning_rate": 1.345707656612529e-05,
+      "loss": 8.659,
+      "step": 30
+    },
+    {
+      "epoch": 0.0023209597168429145,
+      "grad_norm": 4.9875383377075195,
+      "learning_rate": 1.8097447795823665e-05,
+      "loss": 8.3344,
+      "step": 40
+    },
+    {
+      "epoch": 0.002901199646053643,
+      "grad_norm": 2.1933412551879883,
+      "learning_rate": 2.273781902552204e-05,
+      "loss": 8.0255,
+      "step": 50
+    },
+    {
+      "epoch": 0.003481439575264372,
+      "grad_norm": 3.3649749755859375,
+      "learning_rate": 2.737819025522042e-05,
+      "loss": 7.7313,
+      "step": 60
+    },
+    {
+      "epoch": 0.0040616795044751,
+      "grad_norm": 3.5209426879882812,
+      "learning_rate": 3.20185614849188e-05,
+      "loss": 7.4545,
+      "step": 70
+    },
+    {
+      "epoch": 0.004641919433685829,
+      "grad_norm": 1.8262568712234497,
+      "learning_rate": 3.665893271461717e-05,
+      "loss": 7.1915,
+      "step": 80
+    },
+    {
+      "epoch": 0.005222159362896558,
+      "grad_norm": 1.9789323806762695,
+      "learning_rate": 4.129930394431555e-05,
+      "loss": 6.9657,
+      "step": 90
+    },
+    {
+      "epoch": 0.005802399292107286,
+      "grad_norm": 1.8022093772888184,
+      "learning_rate": 4.593967517401392e-05,
+      "loss": 6.8231,
+      "step": 100
+    },
+    {
+      "epoch": 0.006382639221318015,
+      "grad_norm": 2.8330202102661133,
+      "learning_rate": 5.05800464037123e-05,
+      "loss": 6.7133,
+      "step": 110
+    },
+    {
+      "epoch": 0.006962879150528744,
+      "grad_norm": 1.8549864292144775,
+      "learning_rate": 5.522041763341067e-05,
+      "loss": 6.6013,
+      "step": 120
+    },
+    {
+      "epoch": 0.007543119079739472,
+      "grad_norm": 1.6776927709579468,
+      "learning_rate": 5.986078886310905e-05,
+      "loss": 6.5068,
+      "step": 130
+    },
+    {
+      "epoch": 0.0081233590089502,
+      "grad_norm": 1.2742928266525269,
+      "learning_rate": 6.450116009280742e-05,
+      "loss": 6.4149,
+      "step": 140
+    },
+    {
+      "epoch": 0.008703598938160929,
+      "grad_norm": 1.3102290630340576,
+      "learning_rate": 6.91415313225058e-05,
+      "loss": 6.326,
+      "step": 150
+    },
+    {
+      "epoch": 0.009283838867371658,
+      "grad_norm": 1.5298289060592651,
+      "learning_rate": 7.378190255220419e-05,
+      "loss": 6.2688,
+      "step": 160
+    },
+    {
+      "epoch": 0.009864078796582387,
+      "grad_norm": 1.2847580909729004,
+      "learning_rate": 7.842227378190256e-05,
+      "loss": 6.2044,
+      "step": 170
+    },
+    {
+      "epoch": 0.010444318725793116,
+      "grad_norm": 0.9664031863212585,
+      "learning_rate": 8.306264501160093e-05,
+      "loss": 6.1479,
+      "step": 180
+    },
+    {
+      "epoch": 0.011024558655003845,
+      "grad_norm": 1.668137550354004,
+      "learning_rate": 8.77030162412993e-05,
+      "loss": 6.1069,
+      "step": 190
+    },
+    {
+      "epoch": 0.011604798584214572,
+      "grad_norm": 1.3086931705474854,
+      "learning_rate": 9.234338747099769e-05,
+      "loss": 6.0787,
+      "step": 200
+    },
+    {
+      "epoch": 0.012185038513425301,
+      "grad_norm": 1.773193120956421,
+      "learning_rate": 9.698375870069606e-05,
+      "loss": 6.0132,
+      "step": 210
+    },
+    {
+      "epoch": 0.01276527844263603,
+      "grad_norm": 0.8364600539207458,
+      "learning_rate": 0.00010162412993039443,
+      "loss": 5.9783,
+      "step": 220
+    },
+    {
+      "epoch": 0.013345518371846759,
+      "grad_norm": 1.0191259384155273,
+      "learning_rate": 0.0001062645011600928,
+      "loss": 5.9312,
+      "step": 230
+    },
+    {
+      "epoch": 0.013925758301057488,
+      "grad_norm": 0.904647946357727,
+      "learning_rate": 0.00011090487238979119,
+      "loss": 5.924,
+      "step": 240
+    },
+    {
+      "epoch": 0.014505998230268215,
+      "grad_norm": 0.6738490462303162,
+      "learning_rate": 0.00011554524361948958,
+      "loss": 5.8692,
+      "step": 250
+    },
+    {
+      "epoch": 0.015086238159478944,
+      "grad_norm": 0.7382608652114868,
+      "learning_rate": 0.00012018561484918794,
+      "loss": 5.8497,
+      "step": 260
+    },
+    {
+      "epoch": 0.015666478088689675,
+      "grad_norm": 0.7634956240653992,
+      "learning_rate": 0.0001248259860788863,
+      "loss": 5.8059,
+      "step": 270
+    },
+    {
+      "epoch": 0.0162467180179004,
+      "grad_norm": 0.6887519955635071,
+      "learning_rate": 0.0001294663573085847,
+      "loss": 5.7723,
+      "step": 280
+    },
+    {
+      "epoch": 0.01682695794711113,
+      "grad_norm": 1.0721409320831299,
+      "learning_rate": 0.00013410672853828308,
+      "loss": 5.7549,
+      "step": 290
+    },
+    {
+      "epoch": 0.017407197876321858,
+      "grad_norm": 0.8256045579910278,
+      "learning_rate": 0.00013874709976798144,
+      "loss": 5.7243,
+      "step": 300
+    },
+    {
+      "epoch": 0.017987437805532587,
+      "grad_norm": 0.5584640502929688,
+      "learning_rate": 0.00014338747099767982,
+      "loss": 5.6909,
+      "step": 310
+    },
+    {
+      "epoch": 0.018567677734743316,
+      "grad_norm": 0.6018552780151367,
+      "learning_rate": 0.0001480278422273782,
+      "loss": 5.6591,
+      "step": 320
+    },
+    {
+      "epoch": 0.019147917663954045,
+      "grad_norm": 0.7498115301132202,
+      "learning_rate": 0.00015266821345707657,
+      "loss": 5.6199,
+      "step": 330
+    },
+    {
+      "epoch": 0.019728157593164774,
+      "grad_norm": 0.6623280048370361,
+      "learning_rate": 0.00015730858468677495,
+      "loss": 5.5741,
+      "step": 340
+    },
+    {
+      "epoch": 0.020308397522375503,
+      "grad_norm": 0.5316705703735352,
+      "learning_rate": 0.0001619489559164733,
+      "loss": 5.5444,
+      "step": 350
+    },
+    {
+      "epoch": 0.020888637451586232,
+      "grad_norm": 0.5719560980796814,
+      "learning_rate": 0.0001665893271461717,
+      "loss": 5.501,
+      "step": 360
+    },
+    {
+      "epoch": 0.02146887738079696,
+      "grad_norm": 0.8803657293319702,
+      "learning_rate": 0.00017122969837587008,
+      "loss": 5.4982,
+      "step": 370
+    },
+    {
+      "epoch": 0.02204911731000769,
+      "grad_norm": 0.46674010157585144,
+      "learning_rate": 0.00017587006960556844,
+      "loss": 5.4638,
+      "step": 380
+    },
+    {
+      "epoch": 0.022629357239218415,
+      "grad_norm": 0.9015390276908875,
+      "learning_rate": 0.00018051044083526683,
+      "loss": 5.4292,
+      "step": 390
+    },
+    {
+      "epoch": 0.023209597168429144,
+      "grad_norm": 0.5593010783195496,
+      "learning_rate": 0.0001851508120649652,
+      "loss": 5.4009,
+      "step": 400
+    },
+    {
+      "epoch": 0.023789837097639873,
+      "grad_norm": 0.4915124177932739,
+      "learning_rate": 0.0001897911832946636,
+      "loss": 5.3654,
+      "step": 410
+    },
+    {
+      "epoch": 0.024370077026850602,
+      "grad_norm": 0.6447678208351135,
+      "learning_rate": 0.00019443155452436196,
+      "loss": 5.3323,
+      "step": 420
+    },
+    {
+      "epoch": 0.02495031695606133,
+      "grad_norm": 0.43100541830062866,
+      "learning_rate": 0.00019907192575406032,
+      "loss": 5.3044,
+      "step": 430
+    },
+    {
+      "epoch": 0.02553055688527206,
+      "grad_norm": 0.5044266581535339,
+      "learning_rate": 0.00020371229698375873,
+      "loss": 5.2817,
+      "step": 440
+    },
+    {
+      "epoch": 0.02611079681448279,
+      "grad_norm": 0.44544899463653564,
+      "learning_rate": 0.00020835266821345706,
+      "loss": 5.2623,
+      "step": 450
+    },
+    {
+      "epoch": 0.026691036743693518,
+      "grad_norm": 0.6966679096221924,
+      "learning_rate": 0.00021299303944315545,
+      "loss": 5.2158,
+      "step": 460
+    },
+    {
+      "epoch": 0.027271276672904247,
+      "grad_norm": 0.6552466154098511,
+      "learning_rate": 0.00021763341067285383,
+      "loss": 5.1987,
+      "step": 470
+    },
+    {
+      "epoch": 0.027851516602114976,
+      "grad_norm": 0.3580920398235321,
+      "learning_rate": 0.00022227378190255222,
+      "loss": 5.1516,
+      "step": 480
+    },
+    {
+      "epoch": 0.028431756531325705,
+      "grad_norm": 0.6580965518951416,
+      "learning_rate": 0.0002269141531322506,
+      "loss": 5.1102,
+      "step": 490
+    },
+    {
+      "epoch": 0.02901199646053643,
+      "grad_norm": 0.40035542845726013,
+      "learning_rate": 0.000231554524361949,
+      "loss": 5.086,
+      "step": 500
+    },
+    {
+      "epoch": 0.02959223638974716,
+      "grad_norm": 0.46560588479042053,
+      "learning_rate": 0.00023619489559164735,
+      "loss": 5.0338,
+      "step": 510
+    },
+    {
+      "epoch": 0.030172476318957888,
+      "grad_norm": 0.610862135887146,
+      "learning_rate": 0.00024083526682134573,
+      "loss": 5.0033,
+      "step": 520
+    },
+    {
+      "epoch": 0.030752716248168617,
+      "grad_norm": 0.43752819299697876,
+      "learning_rate": 0.00024547563805104406,
+      "loss": 4.9667,
+      "step": 530
+    },
+    {
+      "epoch": 0.03133295617737935,
+      "grad_norm": 0.38704100251197815,
+      "learning_rate": 0.00025011600928074245,
+      "loss": 4.9253,
+      "step": 540
+    },
+    {
+      "epoch": 0.031913196106590075,
+      "grad_norm": 0.4786874055862427,
+      "learning_rate": 0.00025475638051044084,
+      "loss": 4.9177,
+      "step": 550
+    },
+    {
+      "epoch": 0.0324934360358008,
+      "grad_norm": 0.36762577295303345,
+      "learning_rate": 0.0002593967517401392,
+      "loss": 4.8879,
+      "step": 560
+    },
+    {
+      "epoch": 0.03307367596501153,
+      "grad_norm": 0.407249391078949,
+      "learning_rate": 0.0002640371229698376,
+      "loss": 4.8372,
+      "step": 570
+    },
+    {
+      "epoch": 0.03365391589422226,
+      "grad_norm": 0.4547578692436218,
+      "learning_rate": 0.000268677494199536,
+      "loss": 4.8013,
+      "step": 580
+    },
+    {
+      "epoch": 0.03423415582343299,
+      "grad_norm": 0.34854525327682495,
+      "learning_rate": 0.0002733178654292344,
+      "loss": 4.7609,
+      "step": 590
+    },
+    {
+      "epoch": 0.034814395752643716,
+      "grad_norm": 0.33431318402290344,
+      "learning_rate": 0.00027795823665893276,
+      "loss": 4.7366,
+      "step": 600
+    },
+    {
+      "epoch": 0.03539463568185445,
+      "grad_norm": 0.4790738523006439,
+      "learning_rate": 0.0002825986078886311,
+      "loss": 4.7052,
+      "step": 610
+    },
+    {
+      "epoch": 0.035974875611065174,
+      "grad_norm": 0.4698006510734558,
+      "learning_rate": 0.0002872389791183295,
+      "loss": 4.7075,
+      "step": 620
+    },
+    {
+      "epoch": 0.036555115540275906,
+      "grad_norm": 0.32214587926864624,
+      "learning_rate": 0.00029187935034802787,
+      "loss": 4.6669,
+      "step": 630
+    },
+    {
+      "epoch": 0.03713535546948663,
+      "grad_norm": 0.29147374629974365,
+      "learning_rate": 0.00029651972157772625,
+      "loss": 4.6087,
+      "step": 640
+    },
+    {
+      "epoch": 0.037715595398697364,
+      "grad_norm": 0.37832900881767273,
+      "learning_rate": 0.0003011600928074246,
+      "loss": 4.5891,
+      "step": 650
+    },
+    {
+      "epoch": 0.03829583532790809,
+      "grad_norm": 0.4138408899307251,
+      "learning_rate": 0.00030580046403712297,
+      "loss": 4.5831,
+      "step": 660
+    },
+    {
+      "epoch": 0.038876075257118815,
+      "grad_norm": 0.3426309823989868,
+      "learning_rate": 0.00031044083526682135,
+      "loss": 4.5467,
+      "step": 670
+    },
+    {
+      "epoch": 0.03945631518632955,
+      "grad_norm": 0.3773086369037628,
+      "learning_rate": 0.00031508120649651974,
+      "loss": 4.5238,
+      "step": 680
+    },
+    {
+      "epoch": 0.04003655511554027,
+      "grad_norm": 0.39989471435546875,
+      "learning_rate": 0.00031972157772621807,
+      "loss": 4.5152,
+      "step": 690
+    },
+    {
+      "epoch": 0.040616795044751006,
+      "grad_norm": 0.40890148282051086,
+      "learning_rate": 0.00032436194895591646,
+      "loss": 4.46,
+      "step": 700
+    },
+    {
+      "epoch": 0.04119703497396173,
+      "grad_norm": 0.38713014125823975,
+      "learning_rate": 0.00032900232018561484,
+      "loss": 4.4458,
+      "step": 710
+    },
+    {
+      "epoch": 0.041777274903172464,
+      "grad_norm": 0.3816765248775482,
+      "learning_rate": 0.00033364269141531323,
+      "loss": 4.4313,
+      "step": 720
+    },
+    {
+      "epoch": 0.04235751483238319,
+      "grad_norm": 0.2601516842842102,
+      "learning_rate": 0.0003382830626450116,
+      "loss": 4.3935,
+      "step": 730
+    },
+    {
+      "epoch": 0.04293775476159392,
+      "grad_norm": 0.5483155250549316,
+      "learning_rate": 0.00034292343387471,
+      "loss": 4.3647,
+      "step": 740
+    },
+    {
+      "epoch": 0.04351799469080465,
+      "grad_norm": 0.41322359442710876,
+      "learning_rate": 0.0003475638051044084,
+      "loss": 4.3641,
+      "step": 750
+    },
+    {
+      "epoch": 0.04409823462001538,
+      "grad_norm": 0.2739952802658081,
+      "learning_rate": 0.00035220417633410677,
+      "loss": 4.3179,
+      "step": 760
+    },
+    {
+      "epoch": 0.044678474549226105,
+      "grad_norm": 0.311780720949173,
+      "learning_rate": 0.00035684454756380516,
+      "loss": 4.2882,
+      "step": 770
+    },
+    {
+      "epoch": 0.04525871447843683,
+      "grad_norm": 0.34148845076560974,
+      "learning_rate": 0.0003614849187935035,
+      "loss": 4.2675,
+      "step": 780
+    },
+    {
+      "epoch": 0.04583895440764756,
+      "grad_norm": 0.3027192950248718,
+      "learning_rate": 0.0003661252900232019,
+      "loss": 4.2453,
+      "step": 790
+    },
+    {
+      "epoch": 0.04641919433685829,
+      "grad_norm": 0.31899961829185486,
+      "learning_rate": 0.00037076566125290026,
+      "loss": 4.2297,
+      "step": 800
+    },
+    {
+      "epoch": 0.04699943426606902,
+      "grad_norm": 0.29658201336860657,
+      "learning_rate": 0.00037540603248259865,
+      "loss": 4.2002,
+      "step": 810
+    },
+    {
+      "epoch": 0.047579674195279746,
+      "grad_norm": 0.3228875696659088,
+      "learning_rate": 0.00038004640371229703,
+      "loss": 4.1797,
+      "step": 820
+    },
+    {
+      "epoch": 0.04815991412449048,
+      "grad_norm": 0.3000330328941345,
+      "learning_rate": 0.00038468677494199536,
+      "loss": 4.1557,
+      "step": 830
+    },
+    {
+      "epoch": 0.048740154053701204,
+      "grad_norm": 0.3490864336490631,
+      "learning_rate": 0.00038932714617169375,
+      "loss": 4.1553,
+      "step": 840
+    },
+    {
+      "epoch": 0.049320393982911936,
+      "grad_norm": 0.3470998704433441,
+      "learning_rate": 0.00039396751740139213,
+      "loss": 4.1359,
+      "step": 850
+    },
+    {
+      "epoch": 0.04990063391212266,
+      "grad_norm": 0.28483402729034424,
+      "learning_rate": 0.00039860788863109047,
+      "loss": 4.1102,
+      "step": 860
+    },
+    {
+      "epoch": 0.050480873841333394,
+      "grad_norm": 0.2661000192165375,
+      "learning_rate": 0.0003999998195768387,
+      "loss": 4.089,
+      "step": 870
+    },
+    {
+      "epoch": 0.05106111377054412,
+      "grad_norm": 0.28349924087524414,
+      "learning_rate": 0.0003999989358723423,
+      "loss": 4.0595,
+      "step": 880
+    },
+    {
+      "epoch": 0.051641353699754845,
+      "grad_norm": 0.31661197543144226,
+      "learning_rate": 0.0003999973157508127,
+      "loss": 4.059,
+      "step": 890
+    },
+    {
+      "epoch": 0.05222159362896558,
+      "grad_norm": 0.28986701369285583,
+      "learning_rate": 0.0003999949592182153,
+      "loss": 4.0408,
+      "step": 900
+    },
+    {
+      "epoch": 0.0528018335581763,
+      "grad_norm": 0.21339298784732819,
+      "learning_rate": 0.0003999918662832272,
+      "loss": 4.0218,
+      "step": 910
+    },
+    {
+      "epoch": 0.053382073487387036,
+      "grad_norm": 0.3597091734409332,
+      "learning_rate": 0.00039998803695723685,
+      "loss": 3.9897,
+      "step": 920
+    },
+    {
+      "epoch": 0.05396231341659776,
+      "grad_norm": 0.3721710443496704,
+      "learning_rate": 0.0003999834712543442,
+      "loss": 3.9904,
+      "step": 930
+    },
+    {
+      "epoch": 0.054542553345808494,
+      "grad_norm": 0.2166401445865631,
+      "learning_rate": 0.0003999781691913607,
+      "loss": 3.9665,
+      "step": 940
+    },
+    {
+      "epoch": 0.05512279327501922,
+      "grad_norm": 0.24356764554977417,
+      "learning_rate": 0.00039997213078780903,
+      "loss": 3.9508,
+      "step": 950
+    },
+    {
+      "epoch": 0.05570303320422995,
+      "grad_norm": 0.21014730632305145,
+      "learning_rate": 0.00039996535606592334,
+      "loss": 3.9392,
+      "step": 960
+    },
+    {
+      "epoch": 0.05628327313344068,
+      "grad_norm": 0.26985248923301697,
+      "learning_rate": 0.0003999578450506487,
+      "loss": 3.9171,
+      "step": 970
+    },
+    {
+      "epoch": 0.05686351306265141,
+      "grad_norm": 0.28635284304618835,
+      "learning_rate": 0.00039994959776964165,
+      "loss": 3.9142,
+      "step": 980
+    },
+    {
+      "epoch": 0.057443752991862135,
+      "grad_norm": 0.26228785514831543,
+      "learning_rate": 0.0003999406142532694,
+      "loss": 3.8858,
+      "step": 990
+    },
+    {
+      "epoch": 0.05802399292107286,
+      "grad_norm": 0.24055221676826477,
+      "learning_rate": 0.00039993089453461023,
+      "loss": 3.886,
+      "step": 1000
+    },
+    {
+      "epoch": 0.05802399292107286,
+      "eval_loss": 3.8423373699188232,
+      "eval_runtime": 5.4409,
+      "eval_samples_per_second": 795.825,
+      "eval_steps_per_second": 1.654,
+      "step": 1000
+    },
+    {
+      "epoch": 0.05860423285028359,
+      "grad_norm": 0.23449479043483734,
+      "learning_rate": 0.00039992043864945325,
+      "loss": 3.8626,
+      "step": 1010
+    },
+    {
+      "epoch": 0.05918447277949432,
+      "grad_norm": 0.24179957807064056,
+      "learning_rate": 0.00039990924663629797,
+      "loss": 3.847,
+      "step": 1020
+    },
+    {
+      "epoch": 0.05976471270870505,
+      "grad_norm": 0.23391254246234894,
+      "learning_rate": 0.00039989731853635465,
+      "loss": 3.8457,
+      "step": 1030
+    },
+    {
+      "epoch": 0.060344952637915776,
+      "grad_norm": 0.3046986758708954,
+      "learning_rate": 0.0003998846543935438,
+      "loss": 3.8457,
+      "step": 1040
+    },
+    {
+      "epoch": 0.06092519256712651,
+      "grad_norm": 0.2446785569190979,
+      "learning_rate": 0.00039987125425449603,
+      "loss": 3.8428,
+      "step": 1050
+    },
+    {
+      "epoch": 0.061505432496337234,
+      "grad_norm": 0.21244779229164124,
+      "learning_rate": 0.00039985711816855224,
+      "loss": 3.8053,
+      "step": 1060
+    },
+    {
+      "epoch": 0.062085672425547966,
+      "grad_norm": 0.23864899575710297,
+      "learning_rate": 0.00039984224618776285,
+      "loss": 3.8097,
+      "step": 1070
+    },
+    {
+      "epoch": 0.0626659123547587,
+      "grad_norm": 0.2297637015581131,
+      "learning_rate": 0.0003998266383668881,
+      "loss": 3.7986,
+      "step": 1080
+    },
+    {
+      "epoch": 0.06324615228396942,
+      "grad_norm": 0.21897932887077332,
+      "learning_rate": 0.0003998102947633975,
+      "loss": 3.7788,
+      "step": 1090
+    },
+    {
+      "epoch": 0.06382639221318015,
+      "grad_norm": 0.2325911521911621,
+      "learning_rate": 0.0003997932154374701,
+      "loss": 3.778,
+      "step": 1100
+    },
+    {
+      "epoch": 0.06440663214239088,
+      "grad_norm": 0.2542111277580261,
+      "learning_rate": 0.0003997754004519936,
+      "loss": 3.7512,
+      "step": 1110
+    },
+    {
+      "epoch": 0.0649868720716016,
+      "grad_norm": 0.22122596204280853,
+      "learning_rate": 0.00039975684987256476,
+      "loss": 3.7449,
+      "step": 1120
+    },
+    {
+      "epoch": 0.06556711200081233,
+      "grad_norm": 0.21252059936523438,
+      "learning_rate": 0.00039973756376748875,
+      "loss": 3.7379,
+      "step": 1130
+    },
+    {
+      "epoch": 0.06614735193002307,
+      "grad_norm": 0.2249770313501358,
+      "learning_rate": 0.0003997175422077789,
+      "loss": 3.7369,
+      "step": 1140
+    },
+    {
+      "epoch": 0.0667275918592338,
+      "grad_norm": 0.1798800528049469,
+      "learning_rate": 0.00039969678526715686,
+      "loss": 3.7269,
+      "step": 1150
+    },
+    {
+      "epoch": 0.06730783178844452,
+      "grad_norm": 0.22175556421279907,
+      "learning_rate": 0.0003996752930220518,
+      "loss": 3.7057,
+      "step": 1160
+    },
+    {
+      "epoch": 0.06788807171765525,
+      "grad_norm": 0.1973702758550644,
+      "learning_rate": 0.0003996530655516003,
+      "loss": 3.6938,
+      "step": 1170
+    },
+    {
+      "epoch": 0.06846831164686598,
+      "grad_norm": 0.17350292205810547,
+      "learning_rate": 0.00039963010293764646,
+      "loss": 3.6902,
+      "step": 1180
+    },
+    {
+      "epoch": 0.06904855157607671,
+      "grad_norm": 0.44916781783103943,
+      "learning_rate": 0.0003996064052647408,
+      "loss": 3.7085,
+      "step": 1190
+    },
+    {
+      "epoch": 0.06962879150528743,
+      "grad_norm": 0.17633803188800812,
+      "learning_rate": 0.0003995819726201408,
+      "loss": 3.7162,
+      "step": 1200
+    },
+    {
+      "epoch": 0.07020903143449816,
+      "grad_norm": 0.18631690740585327,
+      "learning_rate": 0.00039955680509380995,
+      "loss": 3.6841,
+      "step": 1210
+    },
+    {
+      "epoch": 0.0707892713637089,
+      "grad_norm": 0.2087530940771103,
+      "learning_rate": 0.0003995309027784177,
+      "loss": 3.6755,
+      "step": 1220
+    },
+    {
+      "epoch": 0.07136951129291962,
+      "grad_norm": 0.2354528307914734,
+      "learning_rate": 0.0003995042657693391,
+      "loss": 3.6574,
+      "step": 1230
+    },
+    {
+      "epoch": 0.07194975122213035,
+      "grad_norm": 0.20092642307281494,
+      "learning_rate": 0.00039947689416465444,
+      "loss": 3.6568,
+      "step": 1240
+    },
+    {
+      "epoch": 0.07252999115134108,
+      "grad_norm": 0.17386938631534576,
+      "learning_rate": 0.00039944878806514884,
+      "loss": 3.6434,
+      "step": 1250
+    },
+    {
+      "epoch": 0.07311023108055181,
+      "grad_norm": 0.22579102218151093,
+      "learning_rate": 0.0003994199475743119,
+      "loss": 3.6382,
+      "step": 1260
+    },
+    {
+      "epoch": 0.07369047100976253,
+      "grad_norm": 0.1960754096508026,
+      "learning_rate": 0.0003993903727983373,
+      "loss": 3.6343,
+      "step": 1270
+    },
+    {
+      "epoch": 0.07427071093897326,
+      "grad_norm": 0.35644450783729553,
+      "learning_rate": 0.00039936006384612237,
+      "loss": 3.6366,
+      "step": 1280
+    },
+    {
+      "epoch": 0.074850950868184,
+      "grad_norm": 0.15999135375022888,
+      "learning_rate": 0.000399329020829268,
+      "loss": 3.6337,
+      "step": 1290
+    },
+    {
+      "epoch": 0.07543119079739473,
+      "grad_norm": 0.1807032823562622,
+      "learning_rate": 0.00039929724386207784,
+      "loss": 3.6182,
+      "step": 1300
+    },
+    {
+      "epoch": 0.07601143072660545,
+      "grad_norm": 0.19118531048297882,
+      "learning_rate": 0.00039926473306155794,
+      "loss": 3.6138,
+      "step": 1310
+    },
+    {
+      "epoch": 0.07659167065581618,
+      "grad_norm": 0.20418357849121094,
+      "learning_rate": 0.00039923148854741644,
+      "loss": 3.6009,
+      "step": 1320
+    },
+    {
+      "epoch": 0.07717191058502691,
+      "grad_norm": 0.17397375404834747,
+      "learning_rate": 0.0003991975104420632,
+      "loss": 3.5977,
+      "step": 1330
+    },
+    {
+      "epoch": 0.07775215051423763,
+      "grad_norm": 0.18904046714305878,
+      "learning_rate": 0.0003991627988706091,
+      "loss": 3.5917,
+      "step": 1340
+    },
+    {
+      "epoch": 0.07833239044344836,
+      "grad_norm": 0.1725725531578064,
+      "learning_rate": 0.0003991273539608658,
+      "loss": 3.5848,
+      "step": 1350
+    },
+    {
+      "epoch": 0.0789126303726591,
+      "grad_norm": 0.1786000281572342,
+      "learning_rate": 0.0003990911758433452,
+      "loss": 3.589,
+      "step": 1360
+    },
+    {
+      "epoch": 0.07949287030186983,
+      "grad_norm": 0.1812940239906311,
+      "learning_rate": 0.00039905426465125895,
+      "loss": 3.5774,
+      "step": 1370
+    },
+    {
+      "epoch": 0.08007311023108055,
+      "grad_norm": 0.17902950942516327,
+      "learning_rate": 0.00039901662052051787,
+      "loss": 3.5683,
+      "step": 1380
+    },
+    {
+      "epoch": 0.08065335016029128,
+      "grad_norm": 0.25934261083602905,
+      "learning_rate": 0.0003989782435897316,
+      "loss": 3.555,
+      "step": 1390
+    },
+    {
+      "epoch": 0.08123359008950201,
+      "grad_norm": 0.16889074444770813,
+      "learning_rate": 0.00039893913400020797,
+      "loss": 3.5638,
+      "step": 1400
+    },
+    {
+      "epoch": 0.08181383001871274,
+      "grad_norm": 0.1863388568162918,
+      "learning_rate": 0.00039889929189595264,
+      "loss": 3.5548,
+      "step": 1410
+    },
+    {
+      "epoch": 0.08239406994792346,
+      "grad_norm": 0.19880317151546478,
+      "learning_rate": 0.00039885871742366847,
+      "loss": 3.5539,
+      "step": 1420
+    },
+    {
+      "epoch": 0.0829743098771342,
+      "grad_norm": 0.2024102807044983,
+      "learning_rate": 0.00039881741073275476,
+      "loss": 3.5467,
+      "step": 1430
+    },
+    {
+      "epoch": 0.08355454980634493,
+      "grad_norm": 0.18990029394626617,
+      "learning_rate": 0.0003987753719753073,
+      "loss": 3.5297,
+      "step": 1440
+    },
+    {
+      "epoch": 0.08413478973555565,
+      "grad_norm": 0.15605872869491577,
+      "learning_rate": 0.00039873260130611694,
+      "loss": 3.5173,
+      "step": 1450
+    },
+    {
+      "epoch": 0.08471502966476638,
+      "grad_norm": 0.19115321338176727,
+      "learning_rate": 0.00039868909888267,
+      "loss": 3.5208,
+      "step": 1460
+    },
+    {
+      "epoch": 0.08529526959397711,
+      "grad_norm": 0.20802512764930725,
+      "learning_rate": 0.0003986448648651468,
+      "loss": 3.5295,
+      "step": 1470
+    },
+    {
+      "epoch": 0.08587550952318784,
+      "grad_norm": 0.207048699259758,
+      "learning_rate": 0.0003985998994164216,
+      "loss": 3.5266,
+      "step": 1480
+    },
+    {
+      "epoch": 0.08645574945239856,
+      "grad_norm": 0.21221551299095154,
+      "learning_rate": 0.00039855420270206213,
+      "loss": 3.5165,
+      "step": 1490
+    },
+    {
+      "epoch": 0.0870359893816093,
+      "grad_norm": 0.17720042169094086,
+      "learning_rate": 0.0003985077748903282,
+      "loss": 3.4996,
+      "step": 1500
+    },
+    {
+      "epoch": 0.08761622931082003,
+      "grad_norm": 0.1978517323732376,
+      "learning_rate": 0.0003984606161521721,
+      "loss": 3.4972,
+      "step": 1510
+    },
+    {
+      "epoch": 0.08819646924003076,
+      "grad_norm": 0.21575991809368134,
+      "learning_rate": 0.00039841272666123705,
+      "loss": 3.4846,
+      "step": 1520
+    },
+    {
+      "epoch": 0.08877670916924148,
+      "grad_norm": 0.16001969575881958,
+      "learning_rate": 0.0003983641065938573,
+      "loss": 3.4937,
+      "step": 1530
+    },
+    {
+      "epoch": 0.08935694909845221,
+      "grad_norm": 0.18556085228919983,
+      "learning_rate": 0.00039831475612905697,
+      "loss": 3.4797,
+      "step": 1540
+    },
+    {
+      "epoch": 0.08993718902766294,
+      "grad_norm": 0.193649560213089,
+      "learning_rate": 0.00039826467544854975,
+      "loss": 3.4808,
+      "step": 1550
+    },
+    {
+      "epoch": 0.09051742895687366,
+      "grad_norm": 0.18965889513492584,
+      "learning_rate": 0.00039821386473673775,
+      "loss": 3.4831,
+      "step": 1560
+    },
+    {
+      "epoch": 0.09109766888608439,
+      "grad_norm": 0.19180066883563995,
+      "learning_rate": 0.00039816232418071155,
+      "loss": 3.4768,
+      "step": 1570
+    },
+    {
+      "epoch": 0.09167790881529513,
+      "grad_norm": 0.18388795852661133,
+      "learning_rate": 0.0003981100539702487,
+      "loss": 3.4739,
+      "step": 1580
+    },
+    {
+      "epoch": 0.09225814874450586,
+      "grad_norm": 0.18927408754825592,
+      "learning_rate": 0.00039805705429781375,
+      "loss": 3.4572,
+      "step": 1590
+    },
+    {
+      "epoch": 0.09283838867371658,
+      "grad_norm": 0.1485849916934967,
+      "learning_rate": 0.00039800332535855695,
+      "loss": 3.4651,
+      "step": 1600
+    },
+    {
+      "epoch": 0.09341862860292731,
+      "grad_norm": 0.15955494344234467,
+      "learning_rate": 0.0003979488673503138,
+      "loss": 3.4644,
+      "step": 1610
+    },
+    {
+      "epoch": 0.09399886853213804,
+      "grad_norm": 0.17274880409240723,
+      "learning_rate": 0.0003978936804736046,
+      "loss": 3.4548,
+      "step": 1620
+    },
+    {
+      "epoch": 0.09457910846134877,
+      "grad_norm": 0.19138464331626892,
+      "learning_rate": 0.00039783776493163307,
+      "loss": 3.4504,
+      "step": 1630
+    },
+    {
+      "epoch": 0.09515934839055949,
+      "grad_norm": 0.19574107229709625,
+      "learning_rate": 0.0003977811209302861,
+      "loss": 3.4505,
+      "step": 1640
+    },
+    {
+      "epoch": 0.09573958831977022,
+      "grad_norm": 0.14684069156646729,
+      "learning_rate": 0.0003977237486781329,
+      "loss": 3.4536,
+      "step": 1650
+    },
+    {
+      "epoch": 0.09631982824898096,
+      "grad_norm": 0.17721091210842133,
+      "learning_rate": 0.00039766564838642404,
+      "loss": 3.4438,
+      "step": 1660
+    },
+    {
+      "epoch": 0.09690006817819168,
+      "grad_norm": 0.19373926520347595,
+      "learning_rate": 0.00039760682026909093,
+      "loss": 3.4303,
+      "step": 1670
+    },
+    {
+      "epoch": 0.09748030810740241,
+      "grad_norm": 0.15782880783081055,
+      "learning_rate": 0.00039754726454274485,
+      "loss": 3.4287,
+      "step": 1680
+    },
+    {
+      "epoch": 0.09806054803661314,
+      "grad_norm": 0.15952658653259277,
+      "learning_rate": 0.00039748698142667616,
+      "loss": 3.4409,
+      "step": 1690
+    },
+    {
+      "epoch": 0.09864078796582387,
+      "grad_norm": 0.1631140410900116,
+      "learning_rate": 0.00039742597114285377,
+      "loss": 3.4218,
+      "step": 1700
+    },
+    {
+      "epoch": 0.09922102789503459,
+      "grad_norm": 0.20951348543167114,
+      "learning_rate": 0.0003973642339159237,
+      "loss": 3.4259,
+      "step": 1710
+    },
+    {
+      "epoch": 0.09980126782424532,
+      "grad_norm": 0.18334078788757324,
+      "learning_rate": 0.0003973017699732092,
+      "loss": 3.4249,
+      "step": 1720
+    },
+    {
+      "epoch": 0.10038150775345606,
+      "grad_norm": 0.15507562458515167,
+      "learning_rate": 0.0003972385795447087,
+      "loss": 3.4177,
+      "step": 1730
+    },
+    {
+      "epoch": 0.10096174768266679,
+      "grad_norm": 0.19322511553764343,
+      "learning_rate": 0.0003971746628630962,
+      "loss": 3.4167,
+      "step": 1740
+    },
+    {
+      "epoch": 0.10154198761187751,
+      "grad_norm": 0.17755462229251862,
+      "learning_rate": 0.0003971100201637196,
+      "loss": 3.4152,
+      "step": 1750
+    },
+    {
+      "epoch": 0.10212222754108824,
+      "grad_norm": 0.1856836974620819,
+      "learning_rate": 0.0003970446516846,
+      "loss": 3.3932,
+      "step": 1760
+    },
+    {
+      "epoch": 0.10270246747029897,
+      "grad_norm": 0.19244609773159027,
+      "learning_rate": 0.0003969785576664311,
+      "loss": 3.4051,
+      "step": 1770
+    },
+    {
+      "epoch": 0.10328270739950969,
+      "grad_norm": 0.181436225771904,
+      "learning_rate": 0.0003969117383525779,
+      "loss": 3.3995,
+      "step": 1780
+    },
+    {
+      "epoch": 0.10386294732872042,
+      "grad_norm": 0.1531616598367691,
+      "learning_rate": 0.0003968441939890762,
+      "loss": 3.4055,
+      "step": 1790
+    },
+    {
+      "epoch": 0.10444318725793116,
+      "grad_norm": 0.1904386729001999,
+      "learning_rate": 0.00039677592482463135,
+      "loss": 3.3906,
+      "step": 1800
+    },
+    {
+      "epoch": 0.10502342718714189,
+      "grad_norm": 0.17353110015392303,
+      "learning_rate": 0.0003967069311106176,
+      "loss": 3.3925,
+      "step": 1810
+    },
+    {
+      "epoch": 0.1056036671163526,
+      "grad_norm": 0.15772603452205658,
+      "learning_rate": 0.000396637213101077,
+      "loss": 3.392,
+      "step": 1820
+    },
+    {
+      "epoch": 0.10618390704556334,
+      "grad_norm": 0.18432147800922394,
+      "learning_rate": 0.00039656677105271863,
+      "loss": 3.3907,
+      "step": 1830
+    },
+    {
+      "epoch": 0.10676414697477407,
+      "grad_norm": 0.16060298681259155,
+      "learning_rate": 0.0003964956052249174,
+      "loss": 3.3801,
+      "step": 1840
+    },
+    {
+      "epoch": 0.1073443869039848,
+      "grad_norm": 0.17931994795799255,
+      "learning_rate": 0.00039642371587971344,
+      "loss": 3.3788,
+      "step": 1850
+    },
+    {
+      "epoch": 0.10792462683319552,
+      "grad_norm": 0.15293100476264954,
+      "learning_rate": 0.0003963511032818108,
+      "loss": 3.3751,
+      "step": 1860
+    },
+    {
+      "epoch": 0.10850486676240625,
+      "grad_norm": 0.17011559009552002,
+      "learning_rate": 0.0003962777676985767,
+      "loss": 3.3794,
+      "step": 1870
+    },
+    {
+      "epoch": 0.10908510669161699,
+      "grad_norm": 0.13994856178760529,
+      "learning_rate": 0.00039620370940004037,
+      "loss": 3.3718,
+      "step": 1880
+    },
+    {
+      "epoch": 0.1096653466208277,
+      "grad_norm": 0.17306166887283325,
+      "learning_rate": 0.0003961289286588923,
+      "loss": 3.3733,
+      "step": 1890
+    },
+    {
+      "epoch": 0.11024558655003844,
+      "grad_norm": 0.19824837148189545,
+      "learning_rate": 0.000396053425750483,
+      "loss": 3.3625,
+      "step": 1900
+    },
+    {
+      "epoch": 0.11082582647924917,
+      "grad_norm": 0.16269779205322266,
+      "learning_rate": 0.00039597720095282203,
+      "loss": 3.3623,
+      "step": 1910
+    },
+    {
+      "epoch": 0.1114060664084599,
+      "grad_norm": 0.16221415996551514,
+      "learning_rate": 0.00039590025454657715,
+      "loss": 3.3556,
+      "step": 1920
+    },
+    {
+      "epoch": 0.11198630633767062,
+      "grad_norm": 0.16675835847854614,
+      "learning_rate": 0.000395822586815073,
+      "loss": 3.3626,
+      "step": 1930
+    },
+    {
+      "epoch": 0.11256654626688135,
+      "grad_norm": 0.1594047099351883,
+      "learning_rate": 0.0003957441980442904,
+      "loss": 3.3613,
+      "step": 1940
+    },
+    {
+      "epoch": 0.11314678619609209,
+      "grad_norm": 0.17084339261054993,
+      "learning_rate": 0.000395665088522865,
+      "loss": 3.3418,
+      "step": 1950
+    },
+    {
+      "epoch": 0.11372702612530282,
+      "grad_norm": 0.17646485567092896,
+      "learning_rate": 0.00039558525854208634,
+      "loss": 3.3425,
+      "step": 1960
+    },
+    {
+      "epoch": 0.11430726605451354,
+      "grad_norm": 0.16079320013523102,
+      "learning_rate": 0.00039550470839589666,
+      "loss": 3.3522,
+      "step": 1970
+    },
+    {
+      "epoch": 0.11488750598372427,
+      "grad_norm": 0.16689667105674744,
+      "learning_rate": 0.00039542343838089024,
+      "loss": 3.3528,
+      "step": 1980
+    },
+    {
+      "epoch": 0.115467745912935,
+      "grad_norm": 0.15823589265346527,
+      "learning_rate": 0.00039534144879631165,
+      "loss": 3.3392,
+      "step": 1990
+    },
+    {
+      "epoch": 0.11604798584214572,
+      "grad_norm": 0.1432604044675827,
+      "learning_rate": 0.00039525873994405514,
+      "loss": 3.3459,
+      "step": 2000
+    },
+    {
+      "epoch": 0.11604798584214572,
+      "eval_loss": 3.3026654720306396,
+      "eval_runtime": 5.3961,
+      "eval_samples_per_second": 802.435,
+      "eval_steps_per_second": 1.668,
+      "step": 2000
+    },
+    {
+      "epoch": 0.11662822577135645,
+      "grad_norm": 0.21145105361938477,
+      "learning_rate": 0.0003951753121286634,
+      "loss": 3.3382,
+      "step": 2010
+    },
+    {
+      "epoch": 0.11720846570056719,
+      "grad_norm": 0.1759408414363861,
+      "learning_rate": 0.00039509116565732643,
+      "loss": 3.3362,
+      "step": 2020
+    },
+    {
+      "epoch": 0.11778870562977792,
+      "grad_norm": 0.15327772498130798,
+      "learning_rate": 0.0003950063008398802,
+      "loss": 3.3375,
+      "step": 2030
+    },
+    {
+      "epoch": 0.11836894555898864,
+      "grad_norm": 0.16556823253631592,
+      "learning_rate": 0.000394920717988806,
+      "loss": 3.329,
+      "step": 2040
+    },
+    {
+      "epoch": 0.11894918548819937,
+      "grad_norm": 0.16622750461101532,
+      "learning_rate": 0.0003948344174192288,
+      "loss": 3.3216,
+      "step": 2050
+    },
+    {
+      "epoch": 0.1195294254174101,
+      "grad_norm": 0.17805641889572144,
+      "learning_rate": 0.00039474739944891636,
+      "loss": 3.3326,
+      "step": 2060
+    },
+    {
+      "epoch": 0.12010966534662083,
+      "grad_norm": 0.15195246040821075,
+      "learning_rate": 0.000394659664398278,
+      "loss": 3.3334,
+      "step": 2070
+    },
+    {
+      "epoch": 0.12068990527583155,
+      "grad_norm": 0.16342763602733612,
+      "learning_rate": 0.0003945712125903632,
+      "loss": 3.3203,
+      "step": 2080
+    },
+    {
+      "epoch": 0.12127014520504228,
+      "grad_norm": 0.15589125454425812,
+      "learning_rate": 0.00039448204435086096,
+      "loss": 3.3184,
+      "step": 2090
+    },
+    {
+      "epoch": 0.12185038513425302,
+      "grad_norm": 0.1632692962884903,
+      "learning_rate": 0.000394392160008098,
+      "loss": 3.3093,
+      "step": 2100
+    },
+    {
+      "epoch": 0.12243062506346374,
+      "grad_norm": 0.17584744095802307,
+      "learning_rate": 0.000394301559893038,
+      "loss": 3.3245,
+      "step": 2110
+    },
+    {
+      "epoch": 0.12301086499267447,
+      "grad_norm": 0.17610713839530945,
+      "learning_rate": 0.0003942102443392799,
+      "loss": 3.3209,
+      "step": 2120
+    },
+    {
+      "epoch": 0.1235911049218852,
+      "grad_norm": 0.17064009606838226,
+      "learning_rate": 0.00039411821368305725,
+      "loss": 3.3037,
+      "step": 2130
+    },
+    {
+      "epoch": 0.12417134485109593,
+      "grad_norm": 0.1743483990430832,
+      "learning_rate": 0.00039402546826323645,
+      "loss": 3.3111,
+      "step": 2140
+    },
+    {
+      "epoch": 0.12475158478030665,
+      "grad_norm": 0.15367501974105835,
+      "learning_rate": 0.000393932008421316,
+      "loss": 3.3087,
+      "step": 2150
+    },
+    {
+      "epoch": 0.1253318247095174,
+      "grad_norm": 0.21368132531642914,
+      "learning_rate": 0.00039383783450142474,
+      "loss": 3.3015,
+      "step": 2160
+    },
+    {
+      "epoch": 0.12591206463872812,
+      "grad_norm": 0.15691019594669342,
+      "learning_rate": 0.00039374294685032095,
+      "loss": 3.3067,
+      "step": 2170
+    },
+    {
+      "epoch": 0.12649230456793883,
+      "grad_norm": 0.164012610912323,
+      "learning_rate": 0.00039364734581739084,
+      "loss": 3.3069,
+      "step": 2180
+    },
+    {
+      "epoch": 0.12707254449714958,
+      "grad_norm": 0.14891429245471954,
+      "learning_rate": 0.0003935510317546475,
+      "loss": 3.3027,
+      "step": 2190
+    },
+    {
+      "epoch": 0.1276527844263603,
+      "grad_norm": 0.17328360676765442,
+      "learning_rate": 0.0003934540050167294,
+      "loss": 3.3122,
+      "step": 2200
+    },
+    {
+      "epoch": 0.12823302435557102,
+      "grad_norm": 0.16395007073879242,
+      "learning_rate": 0.00039335626596089906,
+      "loss": 3.2821,
+      "step": 2210
+    },
+    {
+      "epoch": 0.12881326428478176,
+      "grad_norm": 0.1645556390285492,
+      "learning_rate": 0.00039325781494704197,
+      "loss": 3.2988,
+      "step": 2220
+    },
+    {
+      "epoch": 0.12939350421399248,
+      "grad_norm": 0.22876814007759094,
+      "learning_rate": 0.0003931586523376652,
+      "loss": 3.3028,
+      "step": 2230
+    },
+    {
+      "epoch": 0.1299737441432032,
+      "grad_norm": 0.14323562383651733,
+      "learning_rate": 0.00039305877849789565,
+      "loss": 3.2887,
+      "step": 2240
+    },
+    {
+      "epoch": 0.13055398407241395,
+      "grad_norm": 0.15895529091358185,
+      "learning_rate": 0.0003929581937954794,
+      "loss": 3.2914,
+      "step": 2250
+    },
+    {
+      "epoch": 0.13113422400162467,
+      "grad_norm": 0.1550154834985733,
+      "learning_rate": 0.0003928568986007798,
+      "loss": 3.2814,
+      "step": 2260
+    },
+    {
+      "epoch": 0.1317144639308354,
+      "grad_norm": 0.19095434248447418,
+      "learning_rate": 0.00039275489328677646,
+      "loss": 3.2766,
+      "step": 2270
+    },
+    {
+      "epoch": 0.13229470386004613,
+      "grad_norm": 0.15697865188121796,
+      "learning_rate": 0.0003926521782290635,
+      "loss": 3.2826,
+      "step": 2280
+    },
+    {
+      "epoch": 0.13287494378925685,
+      "grad_norm": 0.16411994397640228,
+      "learning_rate": 0.00039254875380584863,
+      "loss": 3.2737,
+      "step": 2290
+    },
+    {
+      "epoch": 0.1334551837184676,
+      "grad_norm": 0.18466068804264069,
+      "learning_rate": 0.00039244462039795137,
+      "loss": 3.2768,
+      "step": 2300
+    },
+    {
+      "epoch": 0.13403542364767831,
+      "grad_norm": 0.15912406146526337,
+      "learning_rate": 0.00039233977838880183,
+      "loss": 3.2842,
+      "step": 2310
+    },
+    {
+      "epoch": 0.13461566357688903,
+      "grad_norm": 0.16722336411476135,
+      "learning_rate": 0.0003922342281644393,
+      "loss": 3.2664,
+      "step": 2320
+    },
+    {
+      "epoch": 0.13519590350609978,
+      "grad_norm": 0.1424356997013092,
+      "learning_rate": 0.00039212797011351066,
+      "loss": 3.2694,
+      "step": 2330
+    },
+    {
+      "epoch": 0.1357761434353105,
+      "grad_norm": 0.16306522488594055,
+      "learning_rate": 0.0003920210046272693,
+      "loss": 3.2668,
+      "step": 2340
+    },
+    {
+      "epoch": 0.13635638336452122,
+      "grad_norm": 0.18254053592681885,
+      "learning_rate": 0.00039191333209957335,
+      "loss": 3.2637,
+      "step": 2350
+    },
+    {
+      "epoch": 0.13693662329373196,
+      "grad_norm": 0.1430349200963974,
+      "learning_rate": 0.0003918049529268843,
+      "loss": 3.2579,
+      "step": 2360
+    },
+    {
+      "epoch": 0.13751686322294268,
+      "grad_norm": 0.15292882919311523,
+      "learning_rate": 0.00039169586750826564,
+      "loss": 3.2661,
+      "step": 2370
+    },
+    {
+      "epoch": 0.13809710315215343,
+      "grad_norm": 0.20139199495315552,
+      "learning_rate": 0.00039158607624538124,
+      "loss": 3.2595,
+      "step": 2380
+    },
+    {
+      "epoch": 0.13867734308136415,
+      "grad_norm": 0.14147156476974487,
+      "learning_rate": 0.0003914755795424941,
+      "loss": 3.2484,
+      "step": 2390
+    },
+    {
+      "epoch": 0.13925758301057486,
+      "grad_norm": 0.14732101559638977,
+      "learning_rate": 0.0003913643778064646,
+      "loss": 3.2564,
+      "step": 2400
+    },
+    {
+      "epoch": 0.1398378229397856,
+      "grad_norm": 0.18487505614757538,
+      "learning_rate": 0.00039125247144674923,
+      "loss": 3.245,
+      "step": 2410
+    },
+    {
+      "epoch": 0.14041806286899633,
+      "grad_norm": 0.1768069863319397,
+      "learning_rate": 0.0003911398608753989,
+      "loss": 3.2564,
+      "step": 2420
+    },
+    {
+      "epoch": 0.14099830279820705,
+      "grad_norm": 0.15146300196647644,
+      "learning_rate": 0.0003910265465070576,
+      "loss": 3.2616,
+      "step": 2430
+    },
+    {
+      "epoch": 0.1415785427274178,
+      "grad_norm": 0.15523262321949005,
+      "learning_rate": 0.00039091252875896054,
+      "loss": 3.2485,
+      "step": 2440
+    },
+    {
+      "epoch": 0.1421587826566285,
+      "grad_norm": 0.18047015368938446,
+      "learning_rate": 0.0003907978080509332,
+      "loss": 3.25,
+      "step": 2450
+    },
+    {
+      "epoch": 0.14273902258583923,
+      "grad_norm": 0.15500716865062714,
+      "learning_rate": 0.00039068238480538916,
+      "loss": 3.2571,
+      "step": 2460
+    },
+    {
+      "epoch": 0.14331926251504998,
+      "grad_norm": 0.1513693481683731,
+      "learning_rate": 0.0003905662594473289,
+      "loss": 3.2598,
+      "step": 2470
+    },
+    {
+      "epoch": 0.1438995024442607,
+      "grad_norm": 0.15952032804489136,
+      "learning_rate": 0.00039044943240433815,
+      "loss": 3.2426,
+      "step": 2480
+    },
+    {
+      "epoch": 0.14447974237347144,
+      "grad_norm": 0.1477235108613968,
+      "learning_rate": 0.0003903319041065863,
+      "loss": 3.2344,
+      "step": 2490
+    },
+    {
+      "epoch": 0.14505998230268216,
+      "grad_norm": 0.16264913976192474,
+      "learning_rate": 0.00039021367498682494,
+      "loss": 3.2407,
+      "step": 2500
+    },
+    {
+      "epoch": 0.14564022223189288,
+      "grad_norm": 0.16381001472473145,
+      "learning_rate": 0.000390094745480386,
+      "loss": 3.2408,
+      "step": 2510
+    },
+    {
+      "epoch": 0.14622046216110363,
+      "grad_norm": 0.1747412234544754,
+      "learning_rate": 0.00038997511602518044,
+      "loss": 3.2469,
+      "step": 2520
+    },
+    {
+      "epoch": 0.14680070209031434,
+      "grad_norm": 0.1646890491247177,
+      "learning_rate": 0.00038985478706169633,
+      "loss": 3.2368,
+      "step": 2530
+    },
+    {
+      "epoch": 0.14738094201952506,
+      "grad_norm": 0.18659403920173645,
+      "learning_rate": 0.00038973375903299766,
+      "loss": 3.229,
+      "step": 2540
+    },
+    {
+      "epoch": 0.1479611819487358,
+      "grad_norm": 0.15826112031936646,
+      "learning_rate": 0.0003896120323847222,
+      "loss": 3.2389,
+      "step": 2550
+    },
+    {
+      "epoch": 0.14854142187794653,
+      "grad_norm": 0.1658063381910324,
+      "learning_rate": 0.00038948960756508025,
+      "loss": 3.2326,
+      "step": 2560
+    },
+    {
+      "epoch": 0.14912166180715725,
+      "grad_norm": 0.18063782155513763,
+      "learning_rate": 0.0003893664850248529,
+      "loss": 3.2328,
+      "step": 2570
+    },
+    {
+      "epoch": 0.149701901736368,
+      "grad_norm": 0.16473062336444855,
+      "learning_rate": 0.0003892426652173901,
+      "loss": 3.2275,
+      "step": 2580
+    },
+    {
+      "epoch": 0.1502821416655787,
+      "grad_norm": 0.1553630381822586,
+      "learning_rate": 0.00038911814859860953,
+      "loss": 3.225,
+      "step": 2590
+    },
+    {
+      "epoch": 0.15086238159478946,
+      "grad_norm": 0.1639242172241211,
+      "learning_rate": 0.00038899293562699423,
+      "loss": 3.2222,
+      "step": 2600
+    },
+    {
+      "epoch": 0.15144262152400018,
+      "grad_norm": 0.16087086498737335,
+      "learning_rate": 0.00038886702676359166,
+      "loss": 3.2169,
+      "step": 2610
+    },
+    {
+      "epoch": 0.1520228614532109,
+      "grad_norm": 0.15263104438781738,
+      "learning_rate": 0.0003887404224720113,
+      "loss": 3.2239,
+      "step": 2620
+    },
+    {
+      "epoch": 0.15260310138242164,
+      "grad_norm": 0.16660773754119873,
+      "learning_rate": 0.0003886131232184235,
+      "loss": 3.2197,
+      "step": 2630
+    },
+    {
+      "epoch": 0.15318334131163236,
+      "grad_norm": 0.16617350280284882,
+      "learning_rate": 0.00038848512947155744,
+      "loss": 3.2181,
+      "step": 2640
+    },
+    {
+      "epoch": 0.15376358124084308,
+      "grad_norm": 0.1546907275915146,
+      "learning_rate": 0.00038835644170269945,
+      "loss": 3.2182,
+      "step": 2650
+    },
+    {
+      "epoch": 0.15434382117005382,
+      "grad_norm": 0.16329720616340637,
+      "learning_rate": 0.0003882270603856914,
+      "loss": 3.2224,
+      "step": 2660
+    },
+    {
+      "epoch": 0.15492406109926454,
+      "grad_norm": 0.1458432823419571,
+      "learning_rate": 0.00038809698599692884,
+      "loss": 3.217,
+      "step": 2670
+    },
+    {
+      "epoch": 0.15550430102847526,
+      "grad_norm": 0.17481467127799988,
+      "learning_rate": 0.00038796621901535935,
+      "loss": 3.2046,
+      "step": 2680
+    },
+    {
+      "epoch": 0.156084540957686,
+      "grad_norm": 0.15761947631835938,
+      "learning_rate": 0.00038783475992248067,
+      "loss": 3.2087,
+      "step": 2690
+    },
+    {
+      "epoch": 0.15666478088689673,
+      "grad_norm": 0.14478649199008942,
+      "learning_rate": 0.0003877026092023388,
+      "loss": 3.1956,
+      "step": 2700
+    },
+    {
+      "epoch": 0.15724502081610747,
+      "grad_norm": 0.16423922777175903,
+      "learning_rate": 0.00038756976734152673,
+      "loss": 3.21,
+      "step": 2710
+    },
+    {
+      "epoch": 0.1578252607453182,
+      "grad_norm": 0.16854038834571838,
+      "learning_rate": 0.000387436234829182,
+      "loss": 3.2031,
+      "step": 2720
+    },
+    {
+      "epoch": 0.1584055006745289,
+      "grad_norm": 0.1525936722755432,
+      "learning_rate": 0.00038730201215698534,
+      "loss": 3.2038,
+      "step": 2730
+    },
+    {
+      "epoch": 0.15898574060373966,
+      "grad_norm": 0.15923213958740234,
+      "learning_rate": 0.00038716709981915864,
+      "loss": 3.2074,
+      "step": 2740
+    },
+    {
+      "epoch": 0.15956598053295037,
+      "grad_norm": 0.1662934571504593,
+      "learning_rate": 0.0003870314983124633,
+      "loss": 3.1966,
+      "step": 2750
+    },
+    {
+      "epoch": 0.1601462204621611,
+      "grad_norm": 0.17807535827159882,
+      "learning_rate": 0.0003868952081361983,
+      "loss": 3.2076,
+      "step": 2760
+    },
+    {
+      "epoch": 0.16072646039137184,
+      "grad_norm": 0.1638825237751007,
+      "learning_rate": 0.0003867582297921983,
+      "loss": 3.2042,
+      "step": 2770
+    },
+    {
+      "epoch": 0.16130670032058256,
+      "grad_norm": 0.15023517608642578,
+      "learning_rate": 0.0003866205637848319,
+      "loss": 3.1912,
+      "step": 2780
+    },
+    {
+      "epoch": 0.16188694024979328,
+      "grad_norm": 0.17124484479427338,
+      "learning_rate": 0.00038648221062099987,
+      "loss": 3.2016,
+      "step": 2790
+    },
+    {
+      "epoch": 0.16246718017900402,
+      "grad_norm": 0.15234492719173431,
+      "learning_rate": 0.0003863431708101329,
+      "loss": 3.1892,
+      "step": 2800
+    },
+    {
+      "epoch": 0.16304742010821474,
+      "grad_norm": 0.16199001669883728,
+      "learning_rate": 0.0003862034448641902,
+      "loss": 3.1906,
+      "step": 2810
+    },
+    {
+      "epoch": 0.1636276600374255,
+      "grad_norm": 0.17630840837955475,
+      "learning_rate": 0.0003860630332976574,
+      "loss": 3.1921,
+      "step": 2820
+    },
+    {
+      "epoch": 0.1642078999666362,
+      "grad_norm": 0.16295549273490906,
+      "learning_rate": 0.0003859219366275445,
+      "loss": 3.1942,
+      "step": 2830
+    },
+    {
+      "epoch": 0.16478813989584692,
+      "grad_norm": 0.14281953871250153,
+      "learning_rate": 0.0003857801553733843,
+      "loss": 3.1869,
+      "step": 2840
+    },
+    {
+      "epoch": 0.16536837982505767,
+      "grad_norm": 0.16084730625152588,
+      "learning_rate": 0.00038563769005723025,
+      "loss": 3.1836,
+      "step": 2850
+    },
+    {
+      "epoch": 0.1659486197542684,
+      "grad_norm": 0.17616966366767883,
+      "learning_rate": 0.00038549454120365443,
+      "loss": 3.1909,
+      "step": 2860
+    },
+    {
+      "epoch": 0.1665288596834791,
+      "grad_norm": 0.16749773919582367,
+      "learning_rate": 0.00038535070933974603,
+      "loss": 3.1856,
+      "step": 2870
+    },
+    {
+      "epoch": 0.16710909961268985,
+      "grad_norm": 0.1418871134519577,
+      "learning_rate": 0.00038520619499510906,
+      "loss": 3.1804,
+      "step": 2880
+    },
+    {
+      "epoch": 0.16768933954190057,
+      "grad_norm": 0.1656585931777954,
+      "learning_rate": 0.00038506099870186036,
+      "loss": 3.1827,
+      "step": 2890
+    },
+    {
+      "epoch": 0.1682695794711113,
+      "grad_norm": 0.17027276754379272,
+      "learning_rate": 0.000384915120994628,
+      "loss": 3.1773,
+      "step": 2900
+    },
+    {
+      "epoch": 0.16884981940032204,
+      "grad_norm": 0.14748819172382355,
+      "learning_rate": 0.0003847685624105489,
+      "loss": 3.1852,
+      "step": 2910
+    },
+    {
+      "epoch": 0.16943005932953276,
+      "grad_norm": 0.14570088684558868,
+      "learning_rate": 0.00038462132348926725,
+      "loss": 3.1734,
+      "step": 2920
+    },
+    {
+      "epoch": 0.1700102992587435,
+      "grad_norm": 0.15523803234100342,
+      "learning_rate": 0.000384473404772932,
+      "loss": 3.1766,
+      "step": 2930
+    },
+    {
+      "epoch": 0.17059053918795422,
+      "grad_norm": 0.1739635467529297,
+      "learning_rate": 0.00038432480680619544,
+      "loss": 3.1826,
+      "step": 2940
+    },
+    {
+      "epoch": 0.17117077911716494,
+      "grad_norm": 0.171515554189682,
+      "learning_rate": 0.0003841755301362109,
+      "loss": 3.1771,
+      "step": 2950
+    },
+    {
+      "epoch": 0.17175101904637569,
+      "grad_norm": 0.15663164854049683,
+      "learning_rate": 0.00038402557531263073,
+      "loss": 3.1713,
+      "step": 2960
+    },
+    {
+      "epoch": 0.1723312589755864,
+      "grad_norm": 0.17138132452964783,
+      "learning_rate": 0.0003838749428876042,
+      "loss": 3.1771,
+      "step": 2970
+    },
+    {
+      "epoch": 0.17291149890479712,
+      "grad_norm": 0.1759568601846695,
+      "learning_rate": 0.0003837236334157757,
+      "loss": 3.1656,
+      "step": 2980
+    },
+    {
+      "epoch": 0.17349173883400787,
+      "grad_norm": 0.15324968099594116,
+      "learning_rate": 0.0003835716474542826,
+      "loss": 3.1619,
+      "step": 2990
+    },
+    {
+      "epoch": 0.1740719787632186,
+      "grad_norm": 0.1392621397972107,
+      "learning_rate": 0.00038341898556275316,
+      "loss": 3.1809,
+      "step": 3000
+    },
+    {
+      "epoch": 0.1740719787632186,
+      "eval_loss": 3.133876085281372,
+      "eval_runtime": 5.4039,
+      "eval_samples_per_second": 801.268,
+      "eval_steps_per_second": 1.665,
+      "step": 3000
+    },
+    {
+      "epoch": 0.1746522186924293,
+      "grad_norm": 0.1699080765247345,
+      "learning_rate": 0.00038326564830330436,
+      "loss": 3.161,
+      "step": 3010
+    },
+    {
+      "epoch": 0.17523245862164005,
+      "grad_norm": 0.15621140599250793,
+      "learning_rate": 0.0003831116362405401,
+      "loss": 3.1638,
+      "step": 3020
+    },
+    {
+      "epoch": 0.17581269855085077,
+      "grad_norm": 0.1653863489627838,
+      "learning_rate": 0.000382956949941549,
+      "loss": 3.166,
+      "step": 3030
+    },
+    {
+      "epoch": 0.17639293848006152,
+      "grad_norm": 0.16162124276161194,
+      "learning_rate": 0.0003828015899759021,
+      "loss": 3.1626,
+      "step": 3040
+    },
+    {
+      "epoch": 0.17697317840927224,
+      "grad_norm": 0.15676504373550415,
+      "learning_rate": 0.0003826455569156511,
+      "loss": 3.1747,
+      "step": 3050
+    },
+    {
+      "epoch": 0.17755341833848295,
+      "grad_norm": 0.15469352900981903,
+      "learning_rate": 0.00038248885133532613,
+      "loss": 3.1647,
+      "step": 3060
+    },
+    {
+      "epoch": 0.1781336582676937,
+      "grad_norm": 0.14125365018844604,
+      "learning_rate": 0.00038233147381193345,
+      "loss": 3.1577,
+      "step": 3070
+    },
+    {
+      "epoch": 0.17871389819690442,
+      "grad_norm": 0.1607305109500885,
+      "learning_rate": 0.00038217342492495376,
+      "loss": 3.155,
+      "step": 3080
+    },
+    {
+      "epoch": 0.17929413812611514,
+      "grad_norm": 0.15099143981933594,
+      "learning_rate": 0.0003820147052563394,
+      "loss": 3.156,
+      "step": 3090
+    },
+    {
+      "epoch": 0.17987437805532588,
+      "grad_norm": 0.1593606024980545,
+      "learning_rate": 0.00038185531539051303,
+      "loss": 3.1629,
+      "step": 3100
+    },
+    {
+      "epoch": 0.1804546179845366,
+      "grad_norm": 0.1826634258031845,
+      "learning_rate": 0.00038169525591436466,
+      "loss": 3.1433,
+      "step": 3110
+    },
+    {
+      "epoch": 0.18103485791374732,
+      "grad_norm": 0.15707820653915405,
+      "learning_rate": 0.00038153452741725017,
+      "loss": 3.1562,
+      "step": 3120
+    },
+    {
+      "epoch": 0.18161509784295807,
+      "grad_norm": 0.15769994258880615,
+      "learning_rate": 0.0003813731304909887,
+      "loss": 3.152,
+      "step": 3130
+    },
+    {
+      "epoch": 0.18219533777216879,
+      "grad_norm": 0.15145163238048553,
+      "learning_rate": 0.0003812110657298605,
+      "loss": 3.1527,
+      "step": 3140
+    },
+    {
+      "epoch": 0.18277557770137953,
+      "grad_norm": 0.15320977568626404,
+      "learning_rate": 0.0003810483337306052,
+      "loss": 3.1437,
+      "step": 3150
+    },
+    {
+      "epoch": 0.18335581763059025,
+      "grad_norm": 0.16322501003742218,
+      "learning_rate": 0.0003808849350924189,
+      "loss": 3.1564,
+      "step": 3160
+    },
+    {
+      "epoch": 0.18393605755980097,
+      "grad_norm": 0.14782309532165527,
+      "learning_rate": 0.0003807208704169527,
+      "loss": 3.1436,
+      "step": 3170
+    },
+    {
+      "epoch": 0.18451629748901172,
+      "grad_norm": 0.17319355905056,
+      "learning_rate": 0.0003805561403083097,
+      "loss": 3.1473,
+      "step": 3180
+    },
+    {
+      "epoch": 0.18509653741822243,
+      "grad_norm": 0.15451788902282715,
+      "learning_rate": 0.0003803907453730436,
+      "loss": 3.1395,
+      "step": 3190
+    },
+    {
+      "epoch": 0.18567677734743315,
+      "grad_norm": 0.17169536650180817,
+      "learning_rate": 0.00038022468622015576,
+      "loss": 3.1458,
+      "step": 3200
+    },
+    {
+      "epoch": 0.1862570172766439,
+      "grad_norm": 0.1508411318063736,
+      "learning_rate": 0.0003800579634610934,
+      "loss": 3.1425,
+      "step": 3210
+    },
+    {
+      "epoch": 0.18683725720585462,
+      "grad_norm": 0.17760975658893585,
+      "learning_rate": 0.00037989057770974725,
+      "loss": 3.144,
+      "step": 3220
+    },
+    {
+      "epoch": 0.18741749713506534,
+      "grad_norm": 0.15805186331272125,
+      "learning_rate": 0.0003797225295824491,
+      "loss": 3.1457,
+      "step": 3230
+    },
+    {
+      "epoch": 0.18799773706427608,
+      "grad_norm": 0.15923067927360535,
+      "learning_rate": 0.0003795538196979698,
+      "loss": 3.1366,
+      "step": 3240
+    },
+    {
+      "epoch": 0.1885779769934868,
+      "grad_norm": 0.16095119714736938,
+      "learning_rate": 0.00037938444867751677,
+      "loss": 3.1417,
+      "step": 3250
+    },
+    {
+      "epoch": 0.18915821692269755,
+      "grad_norm": 0.1521240770816803,
+      "learning_rate": 0.00037921441714473196,
+      "loss": 3.1393,
+      "step": 3260
+    },
+    {
+      "epoch": 0.18973845685190827,
+      "grad_norm": 0.16122375428676605,
+      "learning_rate": 0.0003790437257256892,
+      "loss": 3.1197,
+      "step": 3270
+    },
+    {
+      "epoch": 0.19031869678111898,
+      "grad_norm": 0.16436485946178436,
+      "learning_rate": 0.0003788723750488922,
+      "loss": 3.1349,
+      "step": 3280
+    },
+    {
+      "epoch": 0.19089893671032973,
+      "grad_norm": 0.15300863981246948,
+      "learning_rate": 0.00037870036574527206,
+      "loss": 3.1362,
+      "step": 3290
+    },
+    {
+      "epoch": 0.19147917663954045,
+      "grad_norm": 0.16245847940444946,
+      "learning_rate": 0.00037852769844818506,
+      "loss": 3.1323,
+      "step": 3300
+    },
+    {
+      "epoch": 0.19205941656875117,
+      "grad_norm": 0.14438757300376892,
+      "learning_rate": 0.00037835437379341036,
+      "loss": 3.1291,
+      "step": 3310
+    },
+    {
+      "epoch": 0.19263965649796191,
+      "grad_norm": 0.17143815755844116,
+      "learning_rate": 0.0003781803924191474,
+      "loss": 3.1266,
+      "step": 3320
+    },
+    {
+      "epoch": 0.19321989642717263,
+      "grad_norm": 0.15372861921787262,
+      "learning_rate": 0.0003780057549660139,
+      "loss": 3.141,
+      "step": 3330
+    },
+    {
+      "epoch": 0.19380013635638335,
+      "grad_norm": 0.15773558616638184,
+      "learning_rate": 0.0003778304620770432,
+      "loss": 3.1219,
+      "step": 3340
+    },
+    {
+      "epoch": 0.1943803762855941,
+      "grad_norm": 0.144402414560318,
+      "learning_rate": 0.0003776545143976821,
+      "loss": 3.129,
+      "step": 3350
+    },
+    {
+      "epoch": 0.19496061621480482,
+      "grad_norm": 0.14592647552490234,
+      "learning_rate": 0.00037747791257578846,
+      "loss": 3.117,
+      "step": 3360
+    },
+    {
+      "epoch": 0.19554085614401556,
+      "grad_norm": 0.146622434258461,
+      "learning_rate": 0.0003773006572616286,
+      "loss": 3.1317,
+      "step": 3370
+    },
+    {
+      "epoch": 0.19612109607322628,
+      "grad_norm": 0.14668260514736176,
+      "learning_rate": 0.00037712274910787515,
+      "loss": 3.1243,
+      "step": 3380
+    },
+    {
+      "epoch": 0.196701336002437,
+      "grad_norm": 0.166849285364151,
+      "learning_rate": 0.0003769441887696046,
+      "loss": 3.1273,
+      "step": 3390
+    },
+    {
+      "epoch": 0.19728157593164775,
+      "grad_norm": 0.16720564663410187,
+      "learning_rate": 0.0003767649769042948,
+      "loss": 3.118,
+      "step": 3400
+    },
+    {
+      "epoch": 0.19786181586085846,
+      "grad_norm": 0.17589610815048218,
+      "learning_rate": 0.00037658511417182263,
+      "loss": 3.1216,
+      "step": 3410
+    },
+    {
+      "epoch": 0.19844205579006918,
+      "grad_norm": 0.16391609609127045,
+      "learning_rate": 0.00037640460123446146,
+      "loss": 3.1152,
+      "step": 3420
+    },
+    {
+      "epoch": 0.19902229571927993,
+      "grad_norm": 0.14618930220603943,
+      "learning_rate": 0.000376223438756879,
+      "loss": 3.1129,
+      "step": 3430
+    },
+    {
+      "epoch": 0.19960253564849065,
+      "grad_norm": 0.1584542989730835,
+      "learning_rate": 0.0003760416274061343,
+      "loss": 3.1122,
+      "step": 3440
+    },
+    {
+      "epoch": 0.20018277557770137,
+      "grad_norm": 0.14501620829105377,
+      "learning_rate": 0.00037585916785167584,
+      "loss": 3.122,
+      "step": 3450
+    },
+    {
+      "epoch": 0.2007630155069121,
+      "grad_norm": 0.15992431342601776,
+      "learning_rate": 0.0003756760607653388,
+      "loss": 3.1164,
+      "step": 3460
+    },
+    {
+      "epoch": 0.20134325543612283,
+      "grad_norm": 0.15558482706546783,
+      "learning_rate": 0.0003754923068213428,
+      "loss": 3.1118,
+      "step": 3470
+    },
+    {
+      "epoch": 0.20192349536533358,
+      "grad_norm": 0.1663794219493866,
+      "learning_rate": 0.00037530790669628887,
+      "loss": 3.1149,
+      "step": 3480
+    },
+    {
+      "epoch": 0.2025037352945443,
+      "grad_norm": 0.1476125866174698,
+      "learning_rate": 0.0003751228610691578,
+      "loss": 3.1124,
+      "step": 3490
+    },
+    {
+      "epoch": 0.20308397522375501,
+      "grad_norm": 0.14965076744556427,
+      "learning_rate": 0.00037493717062130684,
+      "loss": 3.1167,
+      "step": 3500
+    },
+    {
+      "epoch": 0.20366421515296576,
+      "grad_norm": 0.17828093469142914,
+      "learning_rate": 0.0003747508360364677,
+      "loss": 3.1022,
+      "step": 3510
+    },
+    {
+      "epoch": 0.20424445508217648,
+      "grad_norm": 0.18014219403266907,
+      "learning_rate": 0.0003745638580007439,
+      "loss": 3.1002,
+      "step": 3520
+    },
+    {
+      "epoch": 0.2048246950113872,
+      "grad_norm": 0.1711190640926361,
+      "learning_rate": 0.0003743762372026081,
+      "loss": 3.1091,
+      "step": 3530
+    },
+    {
+      "epoch": 0.20540493494059794,
+      "grad_norm": 0.14033159613609314,
+      "learning_rate": 0.00037418797433289974,
+      "loss": 3.1023,
+      "step": 3540
+    },
+    {
+      "epoch": 0.20598517486980866,
+      "grad_norm": 0.15103302896022797,
+      "learning_rate": 0.00037399907008482246,
+      "loss": 3.0971,
+      "step": 3550
+    },
+    {
+      "epoch": 0.20656541479901938,
+      "grad_norm": 0.14676210284233093,
+      "learning_rate": 0.00037380952515394145,
+      "loss": 3.1013,
+      "step": 3560
+    },
+    {
+      "epoch": 0.20714565472823013,
+      "grad_norm": 0.15218688547611237,
+      "learning_rate": 0.000373619340238181,
+      "loss": 3.1072,
+      "step": 3570
+    },
+    {
+      "epoch": 0.20772589465744085,
+      "grad_norm": 0.16119298338890076,
+      "learning_rate": 0.00037342851603782193,
+      "loss": 3.1034,
+      "step": 3580
+    },
+    {
+      "epoch": 0.2083061345866516,
+      "grad_norm": 0.18104802072048187,
+      "learning_rate": 0.0003732370532554989,
+      "loss": 3.1047,
+      "step": 3590
+    },
+    {
+      "epoch": 0.2088863745158623,
+      "grad_norm": 0.16622230410575867,
+      "learning_rate": 0.00037304495259619794,
+      "loss": 3.0968,
+      "step": 3600
+    },
+    {
+      "epoch": 0.20946661444507303,
+      "grad_norm": 0.1655486822128296,
+      "learning_rate": 0.0003728522147672538,
+      "loss": 3.0997,
+      "step": 3610
+    },
+    {
+      "epoch": 0.21004685437428378,
+      "grad_norm": 0.14865244925022125,
+      "learning_rate": 0.0003726588404783474,
+      "loss": 3.1107,
+      "step": 3620
+    },
+    {
+      "epoch": 0.2106270943034945,
+      "grad_norm": 0.1397307813167572,
+      "learning_rate": 0.00037246483044150314,
+      "loss": 3.0949,
+      "step": 3630
+    },
+    {
+      "epoch": 0.2112073342327052,
+      "grad_norm": 0.15176993608474731,
+      "learning_rate": 0.0003722701853710862,
+      "loss": 3.0983,
+      "step": 3640
+    },
+    {
+      "epoch": 0.21178757416191596,
+      "grad_norm": 0.15583378076553345,
+      "learning_rate": 0.0003720749059838002,
+      "loss": 3.1001,
+      "step": 3650
+    },
+    {
+      "epoch": 0.21236781409112668,
+      "grad_norm": 0.14740754663944244,
+      "learning_rate": 0.0003718789929986843,
+      "loss": 3.0963,
+      "step": 3660
+    },
+    {
+      "epoch": 0.2129480540203374,
+      "grad_norm": 0.14856573939323425,
+      "learning_rate": 0.0003716824471371105,
+      "loss": 3.0926,
+      "step": 3670
+    },
+    {
+      "epoch": 0.21352829394954814,
+      "grad_norm": 0.1573585569858551,
+      "learning_rate": 0.0003714852691227814,
+      "loss": 3.0958,
+      "step": 3680
+    },
+    {
+      "epoch": 0.21410853387875886,
+      "grad_norm": 0.18037015199661255,
+      "learning_rate": 0.00037128745968172713,
+      "loss": 3.0936,
+      "step": 3690
+    },
+    {
+      "epoch": 0.2146887738079696,
+      "grad_norm": 0.15116359293460846,
+      "learning_rate": 0.00037108901954230263,
+      "loss": 3.0875,
+      "step": 3700
+    },
+    {
+      "epoch": 0.21526901373718033,
+      "grad_norm": 0.16249211132526398,
+      "learning_rate": 0.0003708899494351854,
+      "loss": 3.0927,
+      "step": 3710
+    },
+    {
+      "epoch": 0.21584925366639104,
+      "grad_norm": 0.1532929241657257,
+      "learning_rate": 0.00037069025009337246,
+      "loss": 3.0924,
+      "step": 3720
+    },
+    {
+      "epoch": 0.2164294935956018,
+      "grad_norm": 0.16931064426898956,
+      "learning_rate": 0.00037048992225217756,
+      "loss": 3.0961,
+      "step": 3730
+    },
+    {
+      "epoch": 0.2170097335248125,
+      "grad_norm": 0.15800388157367706,
+      "learning_rate": 0.0003702889666492289,
+      "loss": 3.0853,
+      "step": 3740
+    },
+    {
+      "epoch": 0.21758997345402323,
+      "grad_norm": 0.15049238502979279,
+      "learning_rate": 0.00037008738402446604,
+      "loss": 3.0863,
+      "step": 3750
+    },
+    {
+      "epoch": 0.21817021338323397,
+      "grad_norm": 0.1531439870595932,
+      "learning_rate": 0.0003698851751201373,
+      "loss": 3.0868,
+      "step": 3760
+    },
+    {
+      "epoch": 0.2187504533124447,
+      "grad_norm": 0.14501748979091644,
+      "learning_rate": 0.000369682340680797,
+      "loss": 3.0793,
+      "step": 3770
+    },
+    {
+      "epoch": 0.2193306932416554,
+      "grad_norm": 0.15761543810367584,
+      "learning_rate": 0.00036947888145330294,
+      "loss": 3.0912,
+      "step": 3780
+    },
+    {
+      "epoch": 0.21991093317086616,
+      "grad_norm": 0.15072381496429443,
+      "learning_rate": 0.00036927479818681325,
+      "loss": 3.0839,
+      "step": 3790
+    },
+    {
+      "epoch": 0.22049117310007688,
+      "grad_norm": 0.1547534018754959,
+      "learning_rate": 0.0003690700916327838,
+      "loss": 3.0793,
+      "step": 3800
+    },
+    {
+      "epoch": 0.22107141302928762,
+      "grad_norm": 0.16465094685554504,
+      "learning_rate": 0.0003688647625449657,
+      "loss": 3.0857,
+      "step": 3810
+    },
+    {
+      "epoch": 0.22165165295849834,
+      "grad_norm": 0.15939410030841827,
+      "learning_rate": 0.00036865881167940214,
+      "loss": 3.0748,
+      "step": 3820
+    },
+    {
+      "epoch": 0.22223189288770906,
+      "grad_norm": 0.1516985446214676,
+      "learning_rate": 0.00036845223979442565,
+      "loss": 3.0824,
+      "step": 3830
+    },
+    {
+      "epoch": 0.2228121328169198,
+      "grad_norm": 0.157265767455101,
+      "learning_rate": 0.00036824504765065573,
+      "loss": 3.0839,
+      "step": 3840
+    },
+    {
+      "epoch": 0.22339237274613052,
+      "grad_norm": 0.1470394879579544,
+      "learning_rate": 0.0003680372360109954,
+      "loss": 3.0831,
+      "step": 3850
+    },
+    {
+      "epoch": 0.22397261267534124,
+      "grad_norm": 0.16063229739665985,
+      "learning_rate": 0.000367828805640629,
+      "loss": 3.0752,
+      "step": 3860
+    },
+    {
+      "epoch": 0.224552852604552,
+      "grad_norm": 0.1397065967321396,
+      "learning_rate": 0.0003676197573070189,
+      "loss": 3.0689,
+      "step": 3870
+    },
+    {
+      "epoch": 0.2251330925337627,
+      "grad_norm": 0.16024722158908844,
+      "learning_rate": 0.0003674100917799031,
+      "loss": 3.0718,
+      "step": 3880
+    },
+    {
+      "epoch": 0.22571333246297343,
+      "grad_norm": 0.15182538330554962,
+      "learning_rate": 0.0003671998098312919,
+      "loss": 3.07,
+      "step": 3890
+    },
+    {
+      "epoch": 0.22629357239218417,
+      "grad_norm": 0.17511528730392456,
+      "learning_rate": 0.0003669889122354655,
+      "loss": 3.0768,
+      "step": 3900
+    },
+    {
+      "epoch": 0.2268738123213949,
+      "grad_norm": 0.1410207599401474,
+      "learning_rate": 0.00036677739976897095,
+      "loss": 3.0727,
+      "step": 3910
+    },
+    {
+      "epoch": 0.22745405225060564,
+      "grad_norm": 0.14334461092948914,
+      "learning_rate": 0.00036656527321061934,
+      "loss": 3.0677,
+      "step": 3920
+    },
+    {
+      "epoch": 0.22803429217981636,
+      "grad_norm": 0.16972528398036957,
+      "learning_rate": 0.0003663525333414828,
+      "loss": 3.0662,
+      "step": 3930
+    },
+    {
+      "epoch": 0.22861453210902707,
+      "grad_norm": 0.13627688586711884,
+      "learning_rate": 0.0003661391809448919,
+      "loss": 3.0743,
+      "step": 3940
+    },
+    {
+      "epoch": 0.22919477203823782,
+      "grad_norm": 0.14983052015304565,
+      "learning_rate": 0.0003659252168064325,
+      "loss": 3.0657,
+      "step": 3950
+    },
+    {
+      "epoch": 0.22977501196744854,
+      "grad_norm": 0.16689680516719818,
+      "learning_rate": 0.00036571064171394294,
+      "loss": 3.078,
+      "step": 3960
+    },
+    {
+      "epoch": 0.23035525189665926,
+      "grad_norm": 0.16488981246948242,
+      "learning_rate": 0.00036549545645751124,
+      "loss": 3.0733,
+      "step": 3970
+    },
+    {
+      "epoch": 0.23093549182587,
+      "grad_norm": 0.15926587581634521,
+      "learning_rate": 0.0003652796618294719,
+      "loss": 3.0602,
+      "step": 3980
+    },
+    {
+      "epoch": 0.23151573175508072,
+      "grad_norm": 0.16738209128379822,
+      "learning_rate": 0.0003650632586244036,
+      "loss": 3.0589,
+      "step": 3990
+    },
+    {
+      "epoch": 0.23209597168429144,
+      "grad_norm": 0.16227886080741882,
+      "learning_rate": 0.00036484624763912535,
+      "loss": 3.0677,
+      "step": 4000
+    },
+    {
+      "epoch": 0.23209597168429144,
+      "eval_loss": 3.035964250564575,
+      "eval_runtime": 5.3945,
+      "eval_samples_per_second": 802.668,
+      "eval_steps_per_second": 1.668,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2326762116135022,
+      "grad_norm": 0.15232205390930176,
+      "learning_rate": 0.00036462862967269455,
+      "loss": 3.0652,
+      "step": 4010
+    },
+    {
+      "epoch": 0.2332564515427129,
+      "grad_norm": 0.16944393515586853,
+      "learning_rate": 0.0003644104055264032,
+      "loss": 3.0726,
+      "step": 4020
+    },
+    {
+      "epoch": 0.23383669147192365,
+      "grad_norm": 0.16069863736629486,
+      "learning_rate": 0.00036419157600377553,
+      "loss": 3.0671,
+      "step": 4030
+    },
+    {
+      "epoch": 0.23441693140113437,
+      "grad_norm": 0.13939155638217926,
+      "learning_rate": 0.00036397214191056474,
+      "loss": 3.0597,
+      "step": 4040
+    },
+    {
+      "epoch": 0.2349971713303451,
+      "grad_norm": 0.14409920573234558,
+      "learning_rate": 0.0003637521040547502,
+      "loss": 3.0608,
+      "step": 4050
+    },
+    {
+      "epoch": 0.23557741125955584,
+      "grad_norm": 0.144365593791008,
+      "learning_rate": 0.0003635314632465343,
+      "loss": 3.0722,
+      "step": 4060
+    },
+    {
+      "epoch": 0.23615765118876655,
+      "grad_norm": 0.15792323648929596,
+      "learning_rate": 0.00036331022029833967,
+      "loss": 3.056,
+      "step": 4070
+    },
+    {
+      "epoch": 0.23673789111797727,
+      "grad_norm": 0.1579774171113968,
+      "learning_rate": 0.00036308837602480593,
+      "loss": 3.0596,
+      "step": 4080
+    },
+    {
+      "epoch": 0.23731813104718802,
+      "grad_norm": 0.14511054754257202,
+      "learning_rate": 0.00036286593124278696,
+      "loss": 3.0552,
+      "step": 4090
+    },
+    {
+      "epoch": 0.23789837097639874,
+      "grad_norm": 0.14825165271759033,
+      "learning_rate": 0.0003626428867713478,
+      "loss": 3.0444,
+      "step": 4100
+    },
+    {
+      "epoch": 0.23847861090560946,
+      "grad_norm": 0.16554515063762665,
+      "learning_rate": 0.00036241924343176146,
+      "loss": 3.0519,
+      "step": 4110
+    },
+    {
+      "epoch": 0.2390588508348202,
+      "grad_norm": 0.13520917296409607,
+      "learning_rate": 0.00036219500204750626,
+      "loss": 3.0523,
+      "step": 4120
+    },
+    {
+      "epoch": 0.23963909076403092,
+      "grad_norm": 0.16214175522327423,
+      "learning_rate": 0.00036197016344426244,
+      "loss": 3.056,
+      "step": 4130
+    },
+    {
+      "epoch": 0.24021933069324167,
+      "grad_norm": 0.15843996405601501,
+      "learning_rate": 0.0003617447284499093,
+      "loss": 3.0582,
+      "step": 4140
+    },
+    {
+      "epoch": 0.24079957062245239,
+      "grad_norm": 0.1621207445859909,
+      "learning_rate": 0.000361518697894522,
+      "loss": 3.0547,
+      "step": 4150
+    },
+    {
+      "epoch": 0.2413798105516631,
+      "grad_norm": 0.16439932584762573,
+      "learning_rate": 0.0003612920726103688,
+      "loss": 3.0564,
+      "step": 4160
+    },
+    {
+      "epoch": 0.24196005048087385,
+      "grad_norm": 0.168155238032341,
+      "learning_rate": 0.00036106485343190785,
+      "loss": 3.0524,
+      "step": 4170
+    },
+    {
+      "epoch": 0.24254029041008457,
+      "grad_norm": 0.17214582860469818,
+      "learning_rate": 0.0003608370411957838,
+      "loss": 3.0549,
+      "step": 4180
+    },
+    {
+      "epoch": 0.2431205303392953,
+      "grad_norm": 0.14759258925914764,
+      "learning_rate": 0.00036060863674082516,
+      "loss": 3.0482,
+      "step": 4190
+    },
+    {
+      "epoch": 0.24370077026850603,
+      "grad_norm": 0.15004561841487885,
+      "learning_rate": 0.00036037964090804113,
+      "loss": 3.0485,
+      "step": 4200
+    },
+    {
+      "epoch": 0.24428101019771675,
+      "grad_norm": 0.15239353477954865,
+      "learning_rate": 0.0003601500545406184,
+      "loss": 3.0451,
+      "step": 4210
+    },
+    {
+      "epoch": 0.24486125012692747,
+      "grad_norm": 0.17232055962085724,
+      "learning_rate": 0.00035991987848391793,
+      "loss": 3.0633,
+      "step": 4220
+    },
+    {
+      "epoch": 0.24544149005613822,
+      "grad_norm": 0.15132233500480652,
+      "learning_rate": 0.0003596891135854722,
+      "loss": 3.0469,
+      "step": 4230
+    },
+    {
+      "epoch": 0.24602172998534894,
+      "grad_norm": 0.15383832156658173,
+      "learning_rate": 0.00035945776069498154,
+      "loss": 3.043,
+      "step": 4240
+    },
+    {
+      "epoch": 0.24660196991455968,
+      "grad_norm": 0.1648101806640625,
+      "learning_rate": 0.0003592258206643117,
+      "loss": 3.0494,
+      "step": 4250
+    },
+    {
+      "epoch": 0.2471822098437704,
+      "grad_norm": 0.14788667857646942,
+      "learning_rate": 0.0003589932943474901,
+      "loss": 3.051,
+      "step": 4260
+    },
+    {
+      "epoch": 0.24776244977298112,
+      "grad_norm": 0.15840385854244232,
+      "learning_rate": 0.00035876018260070307,
+      "loss": 3.04,
+      "step": 4270
+    },
+    {
+      "epoch": 0.24834268970219187,
+      "grad_norm": 0.1488913893699646,
+      "learning_rate": 0.0003585264862822924,
+      "loss": 3.0482,
+      "step": 4280
+    },
+    {
+      "epoch": 0.24892292963140258,
+      "grad_norm": 0.1538337618112564,
+      "learning_rate": 0.00035829220625275247,
+      "loss": 3.0438,
+      "step": 4290
+    },
+    {
+      "epoch": 0.2495031695606133,
+      "grad_norm": 0.15288054943084717,
+      "learning_rate": 0.00035805734337472677,
+      "loss": 3.047,
+      "step": 4300
+    },
+    {
+      "epoch": 0.25008340948982405,
+      "grad_norm": 0.14262276887893677,
+      "learning_rate": 0.0003578218985130052,
+      "loss": 3.0327,
+      "step": 4310
+    },
+    {
+      "epoch": 0.2506636494190348,
+      "grad_norm": 0.1732674241065979,
+      "learning_rate": 0.0003575858725345203,
+      "loss": 3.04,
+      "step": 4320
+    },
+    {
+      "epoch": 0.2512438893482455,
+      "grad_norm": 0.14097104966640472,
+      "learning_rate": 0.00035734926630834443,
+      "loss": 3.0479,
+      "step": 4330
+    },
+    {
+      "epoch": 0.25182412927745623,
+      "grad_norm": 0.1385360062122345,
+      "learning_rate": 0.0003571120807056866,
+      "loss": 3.0382,
+      "step": 4340
+    },
+    {
+      "epoch": 0.252404369206667,
+      "grad_norm": 0.14046671986579895,
+      "learning_rate": 0.0003568743165998889,
+      "loss": 3.0426,
+      "step": 4350
+    },
+    {
+      "epoch": 0.25298460913587767,
+      "grad_norm": 0.17887942492961884,
+      "learning_rate": 0.0003566359748664238,
+      "loss": 3.0435,
+      "step": 4360
+    },
+    {
+      "epoch": 0.2535648490650884,
+      "grad_norm": 0.14854948222637177,
+      "learning_rate": 0.00035639705638289054,
+      "loss": 3.0453,
+      "step": 4370
+    },
+    {
+      "epoch": 0.25414508899429916,
+      "grad_norm": 0.15000031888484955,
+      "learning_rate": 0.0003561575620290119,
+      "loss": 3.044,
+      "step": 4380
+    },
+    {
+      "epoch": 0.25472532892350985,
+      "grad_norm": 0.1506197601556778,
+      "learning_rate": 0.0003559174926866312,
+      "loss": 3.0411,
+      "step": 4390
+    },
+    {
+      "epoch": 0.2553055688527206,
+      "grad_norm": 0.1484203040599823,
+      "learning_rate": 0.000355676849239709,
+      "loss": 3.0383,
+      "step": 4400
+    },
+    {
+      "epoch": 0.25588580878193135,
+      "grad_norm": 0.15664157271385193,
+      "learning_rate": 0.00035543563257431967,
+      "loss": 3.03,
+      "step": 4410
+    },
+    {
+      "epoch": 0.25646604871114204,
+      "grad_norm": 0.1449301540851593,
+      "learning_rate": 0.00035519384357864814,
+      "loss": 3.0264,
+      "step": 4420
+    },
+    {
+      "epoch": 0.2570462886403528,
+      "grad_norm": 0.15928883850574493,
+      "learning_rate": 0.0003549514831429869,
+      "loss": 3.0256,
+      "step": 4430
+    },
+    {
+      "epoch": 0.25762652856956353,
+      "grad_norm": 0.15556131303310394,
+      "learning_rate": 0.0003547085521597324,
+      "loss": 3.042,
+      "step": 4440
+    },
+    {
+      "epoch": 0.2582067684987742,
+      "grad_norm": 0.15970775485038757,
+      "learning_rate": 0.00035446505152338196,
+      "loss": 3.0259,
+      "step": 4450
+    },
+    {
+      "epoch": 0.25878700842798497,
+      "grad_norm": 0.16376914083957672,
+      "learning_rate": 0.00035422098213053053,
+      "loss": 3.0263,
+      "step": 4460
+    },
+    {
+      "epoch": 0.2593672483571957,
+      "grad_norm": 0.15761475265026093,
+      "learning_rate": 0.00035397634487986716,
+      "loss": 3.0231,
+      "step": 4470
+    },
+    {
+      "epoch": 0.2599474882864064,
+      "grad_norm": 0.1443856954574585,
+      "learning_rate": 0.00035373114067217175,
+      "loss": 3.0262,
+      "step": 4480
+    },
+    {
+      "epoch": 0.26052772821561715,
+      "grad_norm": 0.16176897287368774,
+      "learning_rate": 0.000353485370410312,
+      "loss": 3.0209,
+      "step": 4490
+    },
+    {
+      "epoch": 0.2611079681448279,
+      "grad_norm": 0.15875308215618134,
+      "learning_rate": 0.00035323903499923966,
+      "loss": 3.0182,
+      "step": 4500
+    },
+    {
+      "epoch": 0.26168820807403864,
+      "grad_norm": 0.16142487525939941,
+      "learning_rate": 0.0003529921353459875,
+      "loss": 3.0303,
+      "step": 4510
+    },
+    {
+      "epoch": 0.26226844800324933,
+      "grad_norm": 0.1423715204000473,
+      "learning_rate": 0.00035274467235966604,
+      "loss": 3.027,
+      "step": 4520
+    },
+    {
+      "epoch": 0.2628486879324601,
+      "grad_norm": 0.1590990573167801,
+      "learning_rate": 0.0003524966469514598,
+      "loss": 3.0175,
+      "step": 4530
+    },
+    {
+      "epoch": 0.2634289278616708,
+      "grad_norm": 0.16158755123615265,
+      "learning_rate": 0.0003522480600346244,
+      "loss": 3.0201,
+      "step": 4540
+    },
+    {
+      "epoch": 0.2640091677908815,
+      "grad_norm": 0.14409056305885315,
+      "learning_rate": 0.00035199891252448286,
+      "loss": 3.0249,
+      "step": 4550
+    },
+    {
+      "epoch": 0.26458940772009226,
+      "grad_norm": 0.1601254642009735,
+      "learning_rate": 0.0003517492053384224,
+      "loss": 3.0259,
+      "step": 4560
+    },
+    {
+      "epoch": 0.265169647649303,
+      "grad_norm": 0.1478479653596878,
+      "learning_rate": 0.00035149893939589105,
+      "loss": 3.0304,
+      "step": 4570
+    },
+    {
+      "epoch": 0.2657498875785137,
+      "grad_norm": 0.15490871667861938,
+      "learning_rate": 0.0003512481156183943,
+      "loss": 3.0322,
+      "step": 4580
+    },
+    {
+      "epoch": 0.26633012750772445,
+      "grad_norm": 0.16943325102329254,
+      "learning_rate": 0.00035099673492949135,
+      "loss": 3.0206,
+      "step": 4590
+    },
+    {
+      "epoch": 0.2669103674369352,
+      "grad_norm": 0.14153891801834106,
+      "learning_rate": 0.00035074479825479256,
+      "loss": 3.0222,
+      "step": 4600
+    },
+    {
+      "epoch": 0.2674906073661459,
+      "grad_norm": 0.16155371069908142,
+      "learning_rate": 0.0003504923065219549,
+      "loss": 3.0235,
+      "step": 4610
+    },
+    {
+      "epoch": 0.26807084729535663,
+      "grad_norm": 0.16925618052482605,
+      "learning_rate": 0.0003502392606606795,
+      "loss": 3.023,
+      "step": 4620
+    },
+    {
+      "epoch": 0.2686510872245674,
+      "grad_norm": 0.1443454921245575,
+      "learning_rate": 0.00034998566160270765,
+      "loss": 3.0171,
+      "step": 4630
+    },
+    {
+      "epoch": 0.26923132715377807,
+      "grad_norm": 0.15553627908229828,
+      "learning_rate": 0.0003497315102818177,
+      "loss": 3.0249,
+      "step": 4640
+    },
+    {
+      "epoch": 0.2698115670829888,
+      "grad_norm": 0.15279638767242432,
+      "learning_rate": 0.00034947680763382146,
+      "loss": 3.0172,
+      "step": 4650
+    },
+    {
+      "epoch": 0.27039180701219956,
+      "grad_norm": 0.17935776710510254,
+      "learning_rate": 0.00034922155459656077,
+      "loss": 3.0227,
+      "step": 4660
+    },
+    {
+      "epoch": 0.27097204694141025,
+      "grad_norm": 0.15077771246433258,
+      "learning_rate": 0.000348965752109904,
+      "loss": 3.0012,
+      "step": 4670
+    },
+    {
+      "epoch": 0.271552286870621,
+      "grad_norm": 0.17899669706821442,
+      "learning_rate": 0.0003487094011157427,
+      "loss": 3.0186,
+      "step": 4680
+    },
+    {
+      "epoch": 0.27213252679983174,
+      "grad_norm": 0.1435423046350479,
+      "learning_rate": 0.00034845250255798813,
+      "loss": 3.0165,
+      "step": 4690
+    },
+    {
+      "epoch": 0.27271276672904243,
+      "grad_norm": 0.15444038808345795,
+      "learning_rate": 0.0003481950573825676,
+      "loss": 3.0207,
+      "step": 4700
+    },
+    {
+      "epoch": 0.2732930066582532,
+      "grad_norm": 0.14248330891132355,
+      "learning_rate": 0.0003479370665374213,
+      "loss": 3.0107,
+      "step": 4710
+    },
+    {
+      "epoch": 0.2738732465874639,
+      "grad_norm": 0.14584462344646454,
+      "learning_rate": 0.0003476785309724986,
+      "loss": 3.0094,
+      "step": 4720
+    },
+    {
+      "epoch": 0.27445348651667467,
+      "grad_norm": 0.1478513926267624,
+      "learning_rate": 0.0003474194516397544,
+      "loss": 3.0145,
+      "step": 4730
+    },
+    {
+      "epoch": 0.27503372644588536,
+      "grad_norm": 0.16704440116882324,
+      "learning_rate": 0.00034715982949314603,
+      "loss": 3.0119,
+      "step": 4740
+    },
+    {
+      "epoch": 0.2756139663750961,
+      "grad_norm": 0.1426423341035843,
+      "learning_rate": 0.0003468996654886294,
+      "loss": 3.0093,
+      "step": 4750
+    },
+    {
+      "epoch": 0.27619420630430686,
+      "grad_norm": 0.14460447430610657,
+      "learning_rate": 0.00034663896058415565,
+      "loss": 3.0113,
+      "step": 4760
+    },
+    {
+      "epoch": 0.27677444623351755,
+      "grad_norm": 0.1554524302482605,
+      "learning_rate": 0.0003463777157396676,
+      "loss": 3.0009,
+      "step": 4770
+    },
+    {
+      "epoch": 0.2773546861627283,
+      "grad_norm": 0.14673510193824768,
+      "learning_rate": 0.00034611593191709593,
+      "loss": 3.0107,
+      "step": 4780
+    },
+    {
+      "epoch": 0.27793492609193904,
+      "grad_norm": 0.15779437124729156,
+      "learning_rate": 0.0003458536100803564,
+      "loss": 3.0242,
+      "step": 4790
+    },
+    {
+      "epoch": 0.27851516602114973,
+      "grad_norm": 0.15662328898906708,
+      "learning_rate": 0.0003455907511953452,
+      "loss": 3.0155,
+      "step": 4800
+    },
+    {
+      "epoch": 0.2790954059503605,
+      "grad_norm": 0.1433698534965515,
+      "learning_rate": 0.00034532735622993643,
+      "loss": 3.0027,
+      "step": 4810
+    },
+    {
+      "epoch": 0.2796756458795712,
+      "grad_norm": 0.13105028867721558,
+      "learning_rate": 0.000345063426153978,
+      "loss": 3.0059,
+      "step": 4820
+    },
+    {
+      "epoch": 0.2802558858087819,
+      "grad_norm": 0.1466207504272461,
+      "learning_rate": 0.00034479896193928794,
+      "loss": 3.0132,
+      "step": 4830
+    },
+    {
+      "epoch": 0.28083612573799266,
+      "grad_norm": 0.1647462099790573,
+      "learning_rate": 0.00034453396455965134,
+      "loss": 3.0014,
+      "step": 4840
+    },
+    {
+      "epoch": 0.2814163656672034,
+      "grad_norm": 0.14212268590927124,
+      "learning_rate": 0.0003442684349908162,
+      "loss": 3.0069,
+      "step": 4850
+    },
+    {
+      "epoch": 0.2819966055964141,
+      "grad_norm": 0.15203924477100372,
+      "learning_rate": 0.00034400237421049033,
+      "loss": 3.0011,
+      "step": 4860
+    },
+    {
+      "epoch": 0.28257684552562484,
+      "grad_norm": 0.15818612277507782,
+      "learning_rate": 0.0003437357831983373,
+      "loss": 3.0014,
+      "step": 4870
+    },
+    {
+      "epoch": 0.2831570854548356,
+      "grad_norm": 0.15356150269508362,
+      "learning_rate": 0.0003434686629359732,
+      "loss": 3.0052,
+      "step": 4880
+    },
+    {
+      "epoch": 0.2837373253840463,
+      "grad_norm": 0.14168018102645874,
+      "learning_rate": 0.0003432010144069628,
+      "loss": 3.0062,
+      "step": 4890
+    },
+    {
+      "epoch": 0.284317565313257,
+      "grad_norm": 0.1359260231256485,
+      "learning_rate": 0.0003429328385968159,
+      "loss": 2.9985,
+      "step": 4900
+    },
+    {
+      "epoch": 0.28489780524246777,
+      "grad_norm": 0.1524299532175064,
+      "learning_rate": 0.00034266413649298414,
+      "loss": 3.0019,
+      "step": 4910
+    },
+    {
+      "epoch": 0.28547804517167846,
+      "grad_norm": 0.16266267001628876,
+      "learning_rate": 0.00034239490908485664,
+      "loss": 2.9963,
+      "step": 4920
+    },
+    {
+      "epoch": 0.2860582851008892,
+      "grad_norm": 0.18476419150829315,
+      "learning_rate": 0.00034212515736375704,
+      "loss": 3.0011,
+      "step": 4930
+    },
+    {
+      "epoch": 0.28663852503009996,
+      "grad_norm": 0.14814235270023346,
+      "learning_rate": 0.00034185488232293937,
+      "loss": 3.0012,
+      "step": 4940
+    },
+    {
+      "epoch": 0.2872187649593107,
+      "grad_norm": 0.13690084218978882,
+      "learning_rate": 0.00034158408495758467,
+      "loss": 3.0005,
+      "step": 4950
+    },
+    {
+      "epoch": 0.2877990048885214,
+      "grad_norm": 0.1411154866218567,
+      "learning_rate": 0.00034131276626479714,
+      "loss": 2.9906,
+      "step": 4960
+    },
+    {
+      "epoch": 0.28837924481773214,
+      "grad_norm": 0.1581835001707077,
+      "learning_rate": 0.0003410409272436008,
+      "loss": 3.0064,
+      "step": 4970
+    },
+    {
+      "epoch": 0.2889594847469429,
+      "grad_norm": 0.1560392826795578,
+      "learning_rate": 0.0003407685688949352,
+      "loss": 3.002,
+      "step": 4980
+    },
+    {
+      "epoch": 0.2895397246761536,
+      "grad_norm": 0.14799857139587402,
+      "learning_rate": 0.0003404956922216524,
+      "loss": 2.9964,
+      "step": 4990
+    },
+    {
+      "epoch": 0.2901199646053643,
+      "grad_norm": 0.14189326763153076,
+      "learning_rate": 0.000340222298228513,
+      "loss": 2.9938,
+      "step": 5000
+    },
+    {
+      "epoch": 0.2901199646053643,
+      "eval_loss": 2.9654901027679443,
+      "eval_runtime": 5.3907,
+      "eval_samples_per_second": 803.242,
+      "eval_steps_per_second": 1.67,
+      "step": 5000
+    },
+    {
+      "epoch": 0.29070020453457507,
+      "grad_norm": 0.14645496010780334,
+      "learning_rate": 0.00033994838792218213,
+      "loss": 2.9949,
+      "step": 5010
+    },
+    {
+      "epoch": 0.29128044446378576,
+      "grad_norm": 0.16313683986663818,
+      "learning_rate": 0.00033967396231122634,
+      "loss": 2.9859,
+      "step": 5020
+    },
+    {
+      "epoch": 0.2918606843929965,
+      "grad_norm": 0.14886920154094696,
+      "learning_rate": 0.00033939902240610946,
+      "loss": 2.993,
+      "step": 5030
+    },
+    {
+      "epoch": 0.29244092432220725,
+      "grad_norm": 0.15066947042942047,
+      "learning_rate": 0.0003391235692191891,
+      "loss": 2.995,
+      "step": 5040
+    },
+    {
+      "epoch": 0.29302116425141794,
+      "grad_norm": 0.14852523803710938,
+      "learning_rate": 0.0003388476037647125,
+      "loss": 2.9928,
+      "step": 5050
+    },
+    {
+      "epoch": 0.2936014041806287,
+      "grad_norm": 0.14773225784301758,
+      "learning_rate": 0.0003385711270588137,
+      "loss": 2.9887,
+      "step": 5060
+    },
+    {
+      "epoch": 0.29418164410983944,
+      "grad_norm": 0.1471126675605774,
+      "learning_rate": 0.0003382941401195087,
+      "loss": 2.9868,
+      "step": 5070
+    },
+    {
+      "epoch": 0.2947618840390501,
+      "grad_norm": 0.17213530838489532,
+      "learning_rate": 0.00033801664396669254,
+      "loss": 2.9993,
+      "step": 5080
+    },
+    {
+      "epoch": 0.2953421239682609,
+      "grad_norm": 0.17217926681041718,
+      "learning_rate": 0.00033773863962213496,
+      "loss": 2.9852,
+      "step": 5090
+    },
+    {
+      "epoch": 0.2959223638974716,
+      "grad_norm": 0.14535531401634216,
+      "learning_rate": 0.0003374601281094771,
+      "loss": 2.9892,
+      "step": 5100
+    },
+    {
+      "epoch": 0.2965026038266823,
+      "grad_norm": 0.15037214756011963,
+      "learning_rate": 0.0003371811104542277,
+      "loss": 2.9831,
+      "step": 5110
+    },
+    {
+      "epoch": 0.29708284375589306,
+      "grad_norm": 0.15636031329631805,
+      "learning_rate": 0.00033690158768375894,
+      "loss": 2.982,
+      "step": 5120
+    },
+    {
+      "epoch": 0.2976630836851038,
+      "grad_norm": 0.1443617343902588,
+      "learning_rate": 0.0003366215608273028,
+      "loss": 2.9893,
+      "step": 5130
+    },
+    {
+      "epoch": 0.2982433236143145,
+      "grad_norm": 0.15526960790157318,
+      "learning_rate": 0.0003363410309159477,
+      "loss": 2.9988,
+      "step": 5140
+    },
+    {
+      "epoch": 0.29882356354352524,
+      "grad_norm": 0.1502479910850525,
+      "learning_rate": 0.00033605999898263396,
+      "loss": 2.9802,
+      "step": 5150
+    },
+    {
+      "epoch": 0.299403803472736,
+      "grad_norm": 0.15904352068901062,
+      "learning_rate": 0.0003357784660621507,
+      "loss": 2.9866,
+      "step": 5160
+    },
+    {
+      "epoch": 0.29998404340194673,
+      "grad_norm": 0.1367734670639038,
+      "learning_rate": 0.00033549643319113163,
+      "loss": 2.992,
+      "step": 5170
+    },
+    {
+      "epoch": 0.3005642833311574,
+      "grad_norm": 0.1586538702249527,
+      "learning_rate": 0.00033521390140805134,
+      "loss": 2.9801,
+      "step": 5180
+    },
+    {
+      "epoch": 0.30114452326036817,
+      "grad_norm": 0.13850967586040497,
+      "learning_rate": 0.00033493087175322147,
+      "loss": 2.9865,
+      "step": 5190
+    },
+    {
+      "epoch": 0.3017247631895789,
+      "grad_norm": 0.15822730958461761,
+      "learning_rate": 0.00033464734526878674,
+      "loss": 2.9862,
+      "step": 5200
+    },
+    {
+      "epoch": 0.3023050031187896,
+      "grad_norm": 0.15228135883808136,
+      "learning_rate": 0.00033436332299872153,
+      "loss": 2.9831,
+      "step": 5210
+    },
+    {
+      "epoch": 0.30288524304800035,
+      "grad_norm": 0.15129725635051727,
+      "learning_rate": 0.00033407880598882545,
+      "loss": 2.9803,
+      "step": 5220
+    },
+    {
+      "epoch": 0.3034654829772111,
+      "grad_norm": 0.14200474321842194,
+      "learning_rate": 0.00033379379528672,
+      "loss": 2.9857,
+      "step": 5230
+    },
+    {
+      "epoch": 0.3040457229064218,
+      "grad_norm": 0.14871156215667725,
+      "learning_rate": 0.00033350829194184444,
+      "loss": 2.9838,
+      "step": 5240
+    },
+    {
+      "epoch": 0.30462596283563254,
+      "grad_norm": 0.157440185546875,
+      "learning_rate": 0.00033322229700545196,
+      "loss": 2.9818,
+      "step": 5250
+    },
+    {
+      "epoch": 0.3052062027648433,
+      "grad_norm": 0.13436777889728546,
+      "learning_rate": 0.000332935811530606,
+      "loss": 2.9785,
+      "step": 5260
+    },
+    {
+      "epoch": 0.305786442694054,
+      "grad_norm": 0.1572754830121994,
+      "learning_rate": 0.0003326488365721759,
+      "loss": 2.9882,
+      "step": 5270
+    },
+    {
+      "epoch": 0.3063666826232647,
+      "grad_norm": 0.13976529240608215,
+      "learning_rate": 0.00033236137318683363,
+      "loss": 2.9742,
+      "step": 5280
+    },
+    {
+      "epoch": 0.30694692255247547,
+      "grad_norm": 0.15709726512432098,
+      "learning_rate": 0.0003320734224330495,
+      "loss": 2.9786,
+      "step": 5290
+    },
+    {
+      "epoch": 0.30752716248168616,
+      "grad_norm": 0.16040396690368652,
+      "learning_rate": 0.00033178498537108833,
+      "loss": 2.9863,
+      "step": 5300
+    },
+    {
+      "epoch": 0.3081074024108969,
+      "grad_norm": 0.1624976396560669,
+      "learning_rate": 0.0003314960630630056,
+      "loss": 2.981,
+      "step": 5310
+    },
+    {
+      "epoch": 0.30868764234010765,
+      "grad_norm": 0.1401677429676056,
+      "learning_rate": 0.00033120665657264344,
+      "loss": 2.9798,
+      "step": 5320
+    },
+    {
+      "epoch": 0.30926788226931834,
+      "grad_norm": 0.15790094435214996,
+      "learning_rate": 0.00033091676696562697,
+      "loss": 2.97,
+      "step": 5330
+    },
+    {
+      "epoch": 0.3098481221985291,
+      "grad_norm": 0.13581514358520508,
+      "learning_rate": 0.0003306263953093601,
+      "loss": 2.9696,
+      "step": 5340
+    },
+    {
+      "epoch": 0.31042836212773983,
+      "grad_norm": 0.15074850618839264,
+      "learning_rate": 0.00033033554267302155,
+      "loss": 2.9666,
+      "step": 5350
+    },
+    {
+      "epoch": 0.3110086020569505,
+      "grad_norm": 0.1477966606616974,
+      "learning_rate": 0.0003300442101275614,
+      "loss": 2.976,
+      "step": 5360
+    },
+    {
+      "epoch": 0.31158884198616127,
+      "grad_norm": 0.16635243594646454,
+      "learning_rate": 0.00032975239874569645,
+      "loss": 2.9843,
+      "step": 5370
+    },
+    {
+      "epoch": 0.312169081915372,
+      "grad_norm": 0.1615062654018402,
+      "learning_rate": 0.00032946010960190677,
+      "loss": 2.9767,
+      "step": 5380
+    },
+    {
+      "epoch": 0.31274932184458276,
+      "grad_norm": 0.14296725392341614,
+      "learning_rate": 0.0003291673437724317,
+      "loss": 2.9669,
+      "step": 5390
+    },
+    {
+      "epoch": 0.31332956177379345,
+      "grad_norm": 0.14347875118255615,
+      "learning_rate": 0.0003288741023352656,
+      "loss": 2.9744,
+      "step": 5400
+    },
+    {
+      "epoch": 0.3139098017030042,
+      "grad_norm": 0.14779694378376007,
+      "learning_rate": 0.0003285803863701542,
+      "loss": 2.973,
+      "step": 5410
+    },
+    {
+      "epoch": 0.31449004163221495,
+      "grad_norm": 0.15234588086605072,
+      "learning_rate": 0.00032828619695859045,
+      "loss": 2.973,
+      "step": 5420
+    },
+    {
+      "epoch": 0.31507028156142564,
+      "grad_norm": 0.13710370659828186,
+      "learning_rate": 0.00032799153518381065,
+      "loss": 2.9752,
+      "step": 5430
+    },
+    {
+      "epoch": 0.3156505214906364,
+      "grad_norm": 0.16290375590324402,
+      "learning_rate": 0.00032769640213079024,
+      "loss": 2.9683,
+      "step": 5440
+    },
+    {
+      "epoch": 0.31623076141984713,
+      "grad_norm": 0.14308467507362366,
+      "learning_rate": 0.00032740079888624014,
+      "loss": 2.9705,
+      "step": 5450
+    },
+    {
+      "epoch": 0.3168110013490578,
+      "grad_norm": 0.14017410576343536,
+      "learning_rate": 0.0003271047265386022,
+      "loss": 2.9737,
+      "step": 5460
+    },
+    {
+      "epoch": 0.31739124127826857,
+      "grad_norm": 0.14751465618610382,
+      "learning_rate": 0.00032680818617804617,
+      "loss": 2.9683,
+      "step": 5470
+    },
+    {
+      "epoch": 0.3179714812074793,
+      "grad_norm": 0.15221184492111206,
+      "learning_rate": 0.0003265111788964645,
+      "loss": 2.9676,
+      "step": 5480
+    },
+    {
+      "epoch": 0.31855172113669,
+      "grad_norm": 0.15240968763828278,
+      "learning_rate": 0.00032621370578746916,
+      "loss": 2.9666,
+      "step": 5490
+    },
+    {
+      "epoch": 0.31913196106590075,
+      "grad_norm": 0.1354389488697052,
+      "learning_rate": 0.00032591576794638733,
+      "loss": 2.967,
+      "step": 5500
+    },
+    {
+      "epoch": 0.3197122009951115,
+      "grad_norm": 0.16345831751823425,
+      "learning_rate": 0.0003256173664702573,
+      "loss": 2.9655,
+      "step": 5510
+    },
+    {
+      "epoch": 0.3202924409243222,
+      "grad_norm": 0.14933979511260986,
+      "learning_rate": 0.00032531850245782465,
+      "loss": 2.9572,
+      "step": 5520
+    },
+    {
+      "epoch": 0.32087268085353293,
+      "grad_norm": 0.15477211773395538,
+      "learning_rate": 0.0003250191770095379,
+      "loss": 2.9679,
+      "step": 5530
+    },
+    {
+      "epoch": 0.3214529207827437,
+      "grad_norm": 0.1461821049451828,
+      "learning_rate": 0.0003247193912275448,
+      "loss": 2.9648,
+      "step": 5540
+    },
+    {
+      "epoch": 0.32203316071195437,
+      "grad_norm": 0.14192292094230652,
+      "learning_rate": 0.00032441914621568783,
+      "loss": 2.9637,
+      "step": 5550
+    },
+    {
+      "epoch": 0.3226134006411651,
+      "grad_norm": 0.15110647678375244,
+      "learning_rate": 0.00032411844307950074,
+      "loss": 2.9644,
+      "step": 5560
+    },
+    {
+      "epoch": 0.32319364057037586,
+      "grad_norm": 0.14680705964565277,
+      "learning_rate": 0.0003238172829262039,
+      "loss": 2.9645,
+      "step": 5570
+    },
+    {
+      "epoch": 0.32377388049958655,
+      "grad_norm": 0.14200454950332642,
+      "learning_rate": 0.00032351566686470064,
+      "loss": 2.9482,
+      "step": 5580
+    },
+    {
+      "epoch": 0.3243541204287973,
+      "grad_norm": 0.1601497381925583,
+      "learning_rate": 0.00032321359600557273,
+      "loss": 2.9551,
+      "step": 5590
+    },
+    {
+      "epoch": 0.32493436035800805,
+      "grad_norm": 0.14178360998630524,
+      "learning_rate": 0.00032291107146107686,
+      "loss": 2.9588,
+      "step": 5600
+    },
+    {
+      "epoch": 0.3255146002872188,
+      "grad_norm": 0.14087755978107452,
+      "learning_rate": 0.00032260809434514004,
+      "loss": 2.9599,
+      "step": 5610
+    },
+    {
+      "epoch": 0.3260948402164295,
+      "grad_norm": 0.14895635843276978,
+      "learning_rate": 0.00032230466577335575,
+      "loss": 2.9638,
+      "step": 5620
+    },
+    {
+      "epoch": 0.32667508014564023,
+      "grad_norm": 0.14719648659229279,
+      "learning_rate": 0.00032200078686297985,
+      "loss": 2.9506,
+      "step": 5630
+    },
+    {
+      "epoch": 0.327255320074851,
+      "grad_norm": 0.15351636707782745,
+      "learning_rate": 0.00032169645873292616,
+      "loss": 2.9642,
+      "step": 5640
+    },
+    {
+      "epoch": 0.32783556000406167,
+      "grad_norm": 0.14748744666576385,
+      "learning_rate": 0.0003213916825037629,
+      "loss": 2.9635,
+      "step": 5650
+    },
+    {
+      "epoch": 0.3284157999332724,
+      "grad_norm": 0.15472351014614105,
+      "learning_rate": 0.000321086459297708,
+      "loss": 2.9561,
+      "step": 5660
+    },
+    {
+      "epoch": 0.32899603986248316,
+      "grad_norm": 0.16757521033287048,
+      "learning_rate": 0.0003207807902386252,
+      "loss": 2.9488,
+      "step": 5670
+    },
+    {
+      "epoch": 0.32957627979169385,
+      "grad_norm": 0.13843347132205963,
+      "learning_rate": 0.00032047467645202017,
+      "loss": 2.9595,
+      "step": 5680
+    },
+    {
+      "epoch": 0.3301565197209046,
+      "grad_norm": 0.1418449878692627,
+      "learning_rate": 0.0003201681190650358,
+      "loss": 2.9732,
+      "step": 5690
+    },
+    {
+      "epoch": 0.33073675965011534,
+      "grad_norm": 0.14602024853229523,
+      "learning_rate": 0.00031986111920644854,
+      "loss": 2.9641,
+      "step": 5700
+    },
+    {
+      "epoch": 0.33131699957932603,
+      "grad_norm": 0.14498759806156158,
+      "learning_rate": 0.0003195536780066641,
+      "loss": 2.9616,
+      "step": 5710
+    },
+    {
+      "epoch": 0.3318972395085368,
+      "grad_norm": 0.1715255081653595,
+      "learning_rate": 0.0003192457965977131,
+      "loss": 2.9466,
+      "step": 5720
+    },
+    {
+      "epoch": 0.3324774794377475,
+      "grad_norm": 0.154770627617836,
+      "learning_rate": 0.0003189374761132472,
+      "loss": 2.9502,
+      "step": 5730
+    },
+    {
+      "epoch": 0.3330577193669582,
+      "grad_norm": 0.1444711685180664,
+      "learning_rate": 0.00031862871768853463,
+      "loss": 2.9541,
+      "step": 5740
+    },
+    {
+      "epoch": 0.33363795929616896,
+      "grad_norm": 0.1380070149898529,
+      "learning_rate": 0.0003183195224604563,
+      "loss": 2.9603,
+      "step": 5750
+    },
+    {
+      "epoch": 0.3342181992253797,
+      "grad_norm": 0.14697673916816711,
+      "learning_rate": 0.00031800989156750153,
+      "loss": 2.9534,
+      "step": 5760
+    },
+    {
+      "epoch": 0.3347984391545904,
+      "grad_norm": 0.16386087238788605,
+      "learning_rate": 0.00031769982614976357,
+      "loss": 2.9613,
+      "step": 5770
+    },
+    {
+      "epoch": 0.33537867908380115,
+      "grad_norm": 0.14437103271484375,
+      "learning_rate": 0.0003173893273489358,
+      "loss": 2.9544,
+      "step": 5780
+    },
+    {
+      "epoch": 0.3359589190130119,
+      "grad_norm": 0.14022108912467957,
+      "learning_rate": 0.00031707839630830734,
+      "loss": 2.9623,
+      "step": 5790
+    },
+    {
+      "epoch": 0.3365391589422226,
+      "grad_norm": 0.14068008959293365,
+      "learning_rate": 0.0003167670341727589,
+      "loss": 2.9499,
+      "step": 5800
+    },
+    {
+      "epoch": 0.33711939887143333,
+      "grad_norm": 0.14008371531963348,
+      "learning_rate": 0.00031645524208875843,
+      "loss": 2.9505,
+      "step": 5810
+    },
+    {
+      "epoch": 0.3376996388006441,
+      "grad_norm": 0.14702627062797546,
+      "learning_rate": 0.0003161430212043571,
+      "loss": 2.9456,
+      "step": 5820
+    },
+    {
+      "epoch": 0.3382798787298548,
+      "grad_norm": 0.13805069029331207,
+      "learning_rate": 0.0003158303726691848,
+      "loss": 2.9483,
+      "step": 5830
+    },
+    {
+      "epoch": 0.3388601186590655,
+      "grad_norm": 0.14876188337802887,
+      "learning_rate": 0.0003155172976344463,
+      "loss": 2.9478,
+      "step": 5840
+    },
+    {
+      "epoch": 0.33944035858827626,
+      "grad_norm": 0.15237033367156982,
+      "learning_rate": 0.0003152037972529167,
+      "loss": 2.9565,
+      "step": 5850
+    },
+    {
+      "epoch": 0.340020598517487,
+      "grad_norm": 0.14114312827587128,
+      "learning_rate": 0.0003148898726789371,
+      "loss": 2.9498,
+      "step": 5860
+    },
+    {
+      "epoch": 0.3406008384466977,
+      "grad_norm": 0.14159853756427765,
+      "learning_rate": 0.0003145755250684107,
+      "loss": 2.9576,
+      "step": 5870
+    },
+    {
+      "epoch": 0.34118107837590844,
+      "grad_norm": 0.143374502658844,
+      "learning_rate": 0.00031426075557879844,
+      "loss": 2.9532,
+      "step": 5880
+    },
+    {
+      "epoch": 0.3417613183051192,
+      "grad_norm": 0.14989744126796722,
+      "learning_rate": 0.0003139455653691146,
+      "loss": 2.9493,
+      "step": 5890
+    },
+    {
+      "epoch": 0.3423415582343299,
+      "grad_norm": 0.15993821620941162,
+      "learning_rate": 0.0003136299555999223,
+      "loss": 2.946,
+      "step": 5900
+    },
+    {
+      "epoch": 0.3429217981635406,
+      "grad_norm": 0.15719273686408997,
+      "learning_rate": 0.00031331392743333,
+      "loss": 2.9489,
+      "step": 5910
+    },
+    {
+      "epoch": 0.34350203809275137,
+      "grad_norm": 0.15121470391750336,
+      "learning_rate": 0.00031299748203298647,
+      "loss": 2.94,
+      "step": 5920
+    },
+    {
+      "epoch": 0.34408227802196206,
+      "grad_norm": 0.14775414764881134,
+      "learning_rate": 0.0003126806205640767,
+      "loss": 2.9317,
+      "step": 5930
+    },
+    {
+      "epoch": 0.3446625179511728,
+      "grad_norm": 0.158615842461586,
+      "learning_rate": 0.0003123633441933179,
+      "loss": 2.943,
+      "step": 5940
+    },
+    {
+      "epoch": 0.34524275788038356,
+      "grad_norm": 0.1479695737361908,
+      "learning_rate": 0.0003120456540889549,
+      "loss": 2.9435,
+      "step": 5950
+    },
+    {
+      "epoch": 0.34582299780959425,
+      "grad_norm": 0.13843965530395508,
+      "learning_rate": 0.00031172755142075604,
+      "loss": 2.9422,
+      "step": 5960
+    },
+    {
+      "epoch": 0.346403237738805,
+      "grad_norm": 0.1605447232723236,
+      "learning_rate": 0.00031140903736000855,
+      "loss": 2.9473,
+      "step": 5970
+    },
+    {
+      "epoch": 0.34698347766801574,
+      "grad_norm": 0.15243135392665863,
+      "learning_rate": 0.0003110901130795146,
+      "loss": 2.9397,
+      "step": 5980
+    },
+    {
+      "epoch": 0.34756371759722643,
+      "grad_norm": 0.14652499556541443,
+      "learning_rate": 0.00031077077975358677,
+      "loss": 2.9353,
+      "step": 5990
+    },
+    {
+      "epoch": 0.3481439575264372,
+      "grad_norm": 0.1344570368528366,
+      "learning_rate": 0.0003104510385580438,
+      "loss": 2.954,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3481439575264372,
+      "eval_loss": 2.910837173461914,
+      "eval_runtime": 5.406,
+      "eval_samples_per_second": 800.965,
+      "eval_steps_per_second": 1.665,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3487241974556479,
+      "grad_norm": 0.1439635157585144,
+      "learning_rate": 0.0003101308906702064,
+      "loss": 2.9376,
+      "step": 6010
+    },
+    {
+      "epoch": 0.3493044373848586,
+      "grad_norm": 0.14717575907707214,
+      "learning_rate": 0.00030981033726889255,
+      "loss": 2.946,
+      "step": 6020
+    },
+    {
+      "epoch": 0.34988467731406936,
+      "grad_norm": 0.1374998688697815,
+      "learning_rate": 0.00030948937953441337,
+      "loss": 2.9403,
+      "step": 6030
+    },
+    {
+      "epoch": 0.3504649172432801,
+      "grad_norm": 0.1412826031446457,
+      "learning_rate": 0.000309168018648569,
+      "loss": 2.9355,
+      "step": 6040
+    },
+    {
+      "epoch": 0.35104515717249085,
+      "grad_norm": 0.14428307116031647,
+      "learning_rate": 0.0003088462557946438,
+      "loss": 2.9445,
+      "step": 6050
+    },
+    {
+      "epoch": 0.35162539710170154,
+      "grad_norm": 0.1365126520395279,
+      "learning_rate": 0.00030852409215740233,
+      "loss": 2.9444,
+      "step": 6060
+    },
+    {
+      "epoch": 0.3522056370309123,
+      "grad_norm": 0.14436426758766174,
+      "learning_rate": 0.0003082015289230848,
+      "loss": 2.9283,
+      "step": 6070
+    },
+    {
+      "epoch": 0.35278587696012303,
+      "grad_norm": 0.14028888940811157,
+      "learning_rate": 0.000307878567279403,
+      "loss": 2.9419,
+      "step": 6080
+    },
+    {
+      "epoch": 0.3533661168893337,
+      "grad_norm": 0.15606454014778137,
+      "learning_rate": 0.0003075552084155354,
+      "loss": 2.9475,
+      "step": 6090
+    },
+    {
+      "epoch": 0.35394635681854447,
+      "grad_norm": 0.15795260667800903,
+      "learning_rate": 0.00030723145352212316,
+      "loss": 2.9387,
+      "step": 6100
+    },
+    {
+      "epoch": 0.3545265967477552,
+      "grad_norm": 0.14786306023597717,
+      "learning_rate": 0.0003069073037912658,
+      "loss": 2.9329,
+      "step": 6110
+    },
+    {
+      "epoch": 0.3551068366769659,
+      "grad_norm": 0.14278551936149597,
+      "learning_rate": 0.00030658276041651655,
+      "loss": 2.9327,
+      "step": 6120
+    },
+    {
+      "epoch": 0.35568707660617666,
+      "grad_norm": 0.15043289959430695,
+      "learning_rate": 0.0003062578245928782,
+      "loss": 2.9364,
+      "step": 6130
+    },
+    {
+      "epoch": 0.3562673165353874,
+      "grad_norm": 0.15945866703987122,
+      "learning_rate": 0.0003059324975167984,
+      "loss": 2.9343,
+      "step": 6140
+    },
+    {
+      "epoch": 0.3568475564645981,
+      "grad_norm": 0.14819258451461792,
+      "learning_rate": 0.00030560678038616545,
+      "loss": 2.9362,
+      "step": 6150
+    },
+    {
+      "epoch": 0.35742779639380884,
+      "grad_norm": 0.14421693980693817,
+      "learning_rate": 0.00030528067440030416,
+      "loss": 2.9339,
+      "step": 6160
+    },
+    {
+      "epoch": 0.3580080363230196,
+      "grad_norm": 0.1377636194229126,
+      "learning_rate": 0.00030495418075997076,
+      "loss": 2.9405,
+      "step": 6170
+    },
+    {
+      "epoch": 0.3585882762522303,
+      "grad_norm": 0.15939627587795258,
+      "learning_rate": 0.0003046273006673491,
+      "loss": 2.9381,
+      "step": 6180
+    },
+    {
+      "epoch": 0.359168516181441,
+      "grad_norm": 0.15114201605319977,
+      "learning_rate": 0.00030430003532604593,
+      "loss": 2.9364,
+      "step": 6190
+    },
+    {
+      "epoch": 0.35974875611065177,
+      "grad_norm": 0.1577521562576294,
+      "learning_rate": 0.0003039723859410865,
+      "loss": 2.9261,
+      "step": 6200
+    },
+    {
+      "epoch": 0.36032899603986246,
+      "grad_norm": 0.1542629450559616,
+      "learning_rate": 0.00030364435371891017,
+      "loss": 2.9255,
+      "step": 6210
+    },
+    {
+      "epoch": 0.3609092359690732,
+      "grad_norm": 0.1399134397506714,
+      "learning_rate": 0.000303315939867366,
+      "loss": 2.9322,
+      "step": 6220
+    },
+    {
+      "epoch": 0.36148947589828395,
+      "grad_norm": 0.1502622365951538,
+      "learning_rate": 0.0003029871455957081,
+      "loss": 2.9259,
+      "step": 6230
+    },
+    {
+      "epoch": 0.36206971582749464,
+      "grad_norm": 0.152298703789711,
+      "learning_rate": 0.00030265797211459137,
+      "loss": 2.9391,
+      "step": 6240
+    },
+    {
+      "epoch": 0.3626499557567054,
+      "grad_norm": 0.14043064415454865,
+      "learning_rate": 0.0003023284206360673,
+      "loss": 2.9278,
+      "step": 6250
+    },
+    {
+      "epoch": 0.36323019568591614,
+      "grad_norm": 0.1389763206243515,
+      "learning_rate": 0.0003019984923735787,
+      "loss": 2.9297,
+      "step": 6260
+    },
+    {
+      "epoch": 0.3638104356151269,
+      "grad_norm": 0.13958598673343658,
+      "learning_rate": 0.0003016681885419562,
+      "loss": 2.9278,
+      "step": 6270
+    },
+    {
+      "epoch": 0.36439067554433757,
+      "grad_norm": 0.1489957571029663,
+      "learning_rate": 0.00030133751035741295,
+      "loss": 2.9351,
+      "step": 6280
+    },
+    {
+      "epoch": 0.3649709154735483,
+      "grad_norm": 0.13442069292068481,
+      "learning_rate": 0.0003010064590375407,
+      "loss": 2.932,
+      "step": 6290
+    },
+    {
+      "epoch": 0.36555115540275906,
+      "grad_norm": 0.13820981979370117,
+      "learning_rate": 0.00030067503580130515,
+      "loss": 2.9297,
+      "step": 6300
+    },
+    {
+      "epoch": 0.36613139533196976,
+      "grad_norm": 0.1524563729763031,
+      "learning_rate": 0.00030034324186904135,
+      "loss": 2.9296,
+      "step": 6310
+    },
+    {
+      "epoch": 0.3667116352611805,
+      "grad_norm": 0.1481136530637741,
+      "learning_rate": 0.0003000110784624493,
+      "loss": 2.9181,
+      "step": 6320
+    },
+    {
+      "epoch": 0.36729187519039125,
+      "grad_norm": 0.1579747349023819,
+      "learning_rate": 0.00029967854680458945,
+      "loss": 2.9283,
+      "step": 6330
+    },
+    {
+      "epoch": 0.36787211511960194,
+      "grad_norm": 0.15123365819454193,
+      "learning_rate": 0.0002993456481198783,
+      "loss": 2.9284,
+      "step": 6340
+    },
+    {
+      "epoch": 0.3684523550488127,
+      "grad_norm": 0.14494162797927856,
+      "learning_rate": 0.00029901238363408357,
+      "loss": 2.9292,
+      "step": 6350
+    },
+    {
+      "epoch": 0.36903259497802343,
+      "grad_norm": 0.15171104669570923,
+      "learning_rate": 0.00029867875457431994,
+      "loss": 2.9232,
+      "step": 6360
+    },
+    {
+      "epoch": 0.3696128349072341,
+      "grad_norm": 0.13486771285533905,
+      "learning_rate": 0.0002983447621690447,
+      "loss": 2.9239,
+      "step": 6370
+    },
+    {
+      "epoch": 0.37019307483644487,
+      "grad_norm": 0.12901608645915985,
+      "learning_rate": 0.0002980104076480528,
+      "loss": 2.9247,
+      "step": 6380
+    },
+    {
+      "epoch": 0.3707733147656556,
+      "grad_norm": 0.14679357409477234,
+      "learning_rate": 0.00029767569224247267,
+      "loss": 2.9212,
+      "step": 6390
+    },
+    {
+      "epoch": 0.3713535546948663,
+      "grad_norm": 0.148614764213562,
+      "learning_rate": 0.0002973406171847615,
+      "loss": 2.9303,
+      "step": 6400
+    },
+    {
+      "epoch": 0.37193379462407705,
+      "grad_norm": 0.1406886726617813,
+      "learning_rate": 0.0002970051837087007,
+      "loss": 2.9228,
+      "step": 6410
+    },
+    {
+      "epoch": 0.3725140345532878,
+      "grad_norm": 0.1530819684267044,
+      "learning_rate": 0.00029666939304939143,
+      "loss": 2.9251,
+      "step": 6420
+    },
+    {
+      "epoch": 0.3730942744824985,
+      "grad_norm": 0.14401894807815552,
+      "learning_rate": 0.0002963332464432502,
+      "loss": 2.9211,
+      "step": 6430
+    },
+    {
+      "epoch": 0.37367451441170924,
+      "grad_norm": 0.14230068027973175,
+      "learning_rate": 0.000295996745128004,
+      "loss": 2.9283,
+      "step": 6440
+    },
+    {
+      "epoch": 0.37425475434092,
+      "grad_norm": 0.14690563082695007,
+      "learning_rate": 0.00029565989034268584,
+      "loss": 2.9243,
+      "step": 6450
+    },
+    {
+      "epoch": 0.3748349942701307,
+      "grad_norm": 0.15261906385421753,
+      "learning_rate": 0.0002953226833276304,
+      "loss": 2.9246,
+      "step": 6460
+    },
+    {
+      "epoch": 0.3754152341993414,
+      "grad_norm": 0.14708411693572998,
+      "learning_rate": 0.0002949851253244691,
+      "loss": 2.9203,
+      "step": 6470
+    },
+    {
+      "epoch": 0.37599547412855217,
+      "grad_norm": 0.13880537450313568,
+      "learning_rate": 0.0002946472175761261,
+      "loss": 2.9217,
+      "step": 6480
+    },
+    {
+      "epoch": 0.3765757140577629,
+      "grad_norm": 0.14952170848846436,
+      "learning_rate": 0.00029430896132681293,
+      "loss": 2.9137,
+      "step": 6490
+    },
+    {
+      "epoch": 0.3771559539869736,
+      "grad_norm": 0.14160752296447754,
+      "learning_rate": 0.0002939703578220246,
+      "loss": 2.9172,
+      "step": 6500
+    },
+    {
+      "epoch": 0.37773619391618435,
+      "grad_norm": 0.1431347280740738,
+      "learning_rate": 0.0002936314083085348,
+      "loss": 2.9132,
+      "step": 6510
+    },
+    {
+      "epoch": 0.3783164338453951,
+      "grad_norm": 0.1614401489496231,
+      "learning_rate": 0.0002932921140343909,
+      "loss": 2.9216,
+      "step": 6520
+    },
+    {
+      "epoch": 0.3788966737746058,
+      "grad_norm": 0.1511864811182022,
+      "learning_rate": 0.0002929524762489102,
+      "loss": 2.9178,
+      "step": 6530
+    },
+    {
+      "epoch": 0.37947691370381653,
+      "grad_norm": 0.15035977959632874,
+      "learning_rate": 0.0002926124962026744,
+      "loss": 2.9144,
+      "step": 6540
+    },
+    {
+      "epoch": 0.3800571536330273,
+      "grad_norm": 0.1370215266942978,
+      "learning_rate": 0.0002922721751475259,
+      "loss": 2.9157,
+      "step": 6550
+    },
+    {
+      "epoch": 0.38063739356223797,
+      "grad_norm": 0.1499612033367157,
+      "learning_rate": 0.00029193151433656227,
+      "loss": 2.915,
+      "step": 6560
+    },
+    {
+      "epoch": 0.3812176334914487,
+      "grad_norm": 0.15834592282772064,
+      "learning_rate": 0.00029159051502413233,
+      "loss": 2.9155,
+      "step": 6570
+    },
+    {
+      "epoch": 0.38179787342065946,
+      "grad_norm": 0.13816189765930176,
+      "learning_rate": 0.0002912491784658313,
+      "loss": 2.9242,
+      "step": 6580
+    },
+    {
+      "epoch": 0.38237811334987015,
+      "grad_norm": 0.16079404950141907,
+      "learning_rate": 0.00029090750591849614,
+      "loss": 2.9132,
+      "step": 6590
+    },
+    {
+      "epoch": 0.3829583532790809,
+      "grad_norm": 0.14459337294101715,
+      "learning_rate": 0.000290565498640201,
+      "loss": 2.9124,
+      "step": 6600
+    },
+    {
+      "epoch": 0.38353859320829164,
+      "grad_norm": 0.14939634501934052,
+      "learning_rate": 0.00029022315789025246,
+      "loss": 2.9052,
+      "step": 6610
+    },
+    {
+      "epoch": 0.38411883313750234,
+      "grad_norm": 0.1586650013923645,
+      "learning_rate": 0.0002898804849291851,
+      "loss": 2.914,
+      "step": 6620
+    },
+    {
+      "epoch": 0.3846990730667131,
+      "grad_norm": 0.14331689476966858,
+      "learning_rate": 0.0002895374810187565,
+      "loss": 2.9082,
+      "step": 6630
+    },
+    {
+      "epoch": 0.38527931299592383,
+      "grad_norm": 0.13959959149360657,
+      "learning_rate": 0.00028919414742194314,
+      "loss": 2.9198,
+      "step": 6640
+    },
+    {
+      "epoch": 0.3858595529251345,
+      "grad_norm": 0.13745518028736115,
+      "learning_rate": 0.00028885048540293524,
+      "loss": 2.9066,
+      "step": 6650
+    },
+    {
+      "epoch": 0.38643979285434527,
+      "grad_norm": 0.1419799029827118,
+      "learning_rate": 0.00028850649622713236,
+      "loss": 2.9121,
+      "step": 6660
+    },
+    {
+      "epoch": 0.387020032783556,
+      "grad_norm": 0.14384348690509796,
+      "learning_rate": 0.00028816218116113867,
+      "loss": 2.9162,
+      "step": 6670
+    },
+    {
+      "epoch": 0.3876002727127667,
+      "grad_norm": 0.15276901423931122,
+      "learning_rate": 0.0002878175414727583,
+      "loss": 2.9058,
+      "step": 6680
+    },
+    {
+      "epoch": 0.38818051264197745,
+      "grad_norm": 0.14799462258815765,
+      "learning_rate": 0.00028747257843099076,
+      "loss": 2.9062,
+      "step": 6690
+    },
+    {
+      "epoch": 0.3887607525711882,
+      "grad_norm": 0.14701081812381744,
+      "learning_rate": 0.000287127293306026,
+      "loss": 2.9033,
+      "step": 6700
+    },
+    {
+      "epoch": 0.38934099250039894,
+      "grad_norm": 0.14179976284503937,
+      "learning_rate": 0.0002867816873692401,
+      "loss": 2.915,
+      "step": 6710
+    },
+    {
+      "epoch": 0.38992123242960963,
+      "grad_norm": 0.15130779147148132,
+      "learning_rate": 0.0002864357618931902,
+      "loss": 2.9104,
+      "step": 6720
+    },
+    {
+      "epoch": 0.3905014723588204,
+      "grad_norm": 0.1438591182231903,
+      "learning_rate": 0.00028608951815161033,
+      "loss": 2.9072,
+      "step": 6730
+    },
+    {
+      "epoch": 0.3910817122880311,
+      "grad_norm": 0.13668999075889587,
+      "learning_rate": 0.0002857429574194061,
+      "loss": 2.914,
+      "step": 6740
+    },
+    {
+      "epoch": 0.3916619522172418,
+      "grad_norm": 0.14742673933506012,
+      "learning_rate": 0.00028539608097265056,
+      "loss": 2.911,
+      "step": 6750
+    },
+    {
+      "epoch": 0.39224219214645256,
+      "grad_norm": 0.1358884572982788,
+      "learning_rate": 0.0002850488900885789,
+      "loss": 2.9041,
+      "step": 6760
+    },
+    {
+      "epoch": 0.3928224320756633,
+      "grad_norm": 0.1417282521724701,
+      "learning_rate": 0.0002847013860455845,
+      "loss": 2.9038,
+      "step": 6770
+    },
+    {
+      "epoch": 0.393402672004874,
+      "grad_norm": 0.14336806535720825,
+      "learning_rate": 0.00028435357012321355,
+      "loss": 2.8981,
+      "step": 6780
+    },
+    {
+      "epoch": 0.39398291193408475,
+      "grad_norm": 0.1505221426486969,
+      "learning_rate": 0.00028400544360216074,
+      "loss": 2.9099,
+      "step": 6790
+    },
+    {
+      "epoch": 0.3945631518632955,
+      "grad_norm": 0.13609366118907928,
+      "learning_rate": 0.0002836570077642644,
+      "loss": 2.91,
+      "step": 6800
+    },
+    {
+      "epoch": 0.3951433917925062,
+      "grad_norm": 0.14644168317317963,
+      "learning_rate": 0.00028330826389250195,
+      "loss": 2.9037,
+      "step": 6810
+    },
+    {
+      "epoch": 0.39572363172171693,
+      "grad_norm": 0.13570183515548706,
+      "learning_rate": 0.0002829592132709846,
+      "loss": 2.9081,
+      "step": 6820
+    },
+    {
+      "epoch": 0.3963038716509277,
+      "grad_norm": 0.13495145738124847,
+      "learning_rate": 0.0002826098571849534,
+      "loss": 2.9046,
+      "step": 6830
+    },
+    {
+      "epoch": 0.39688411158013837,
+      "grad_norm": 0.155324786901474,
+      "learning_rate": 0.00028226019692077406,
+      "loss": 2.8966,
+      "step": 6840
+    },
+    {
+      "epoch": 0.3974643515093491,
+      "grad_norm": 0.15253056585788727,
+      "learning_rate": 0.0002819102337659323,
+      "loss": 2.8914,
+      "step": 6850
+    },
+    {
+      "epoch": 0.39804459143855986,
+      "grad_norm": 0.1406160295009613,
+      "learning_rate": 0.0002815599690090291,
+      "loss": 2.9042,
+      "step": 6860
+    },
+    {
+      "epoch": 0.39862483136777055,
+      "grad_norm": 0.152736097574234,
+      "learning_rate": 0.00028120940393977614,
+      "loss": 2.9106,
+      "step": 6870
+    },
+    {
+      "epoch": 0.3992050712969813,
+      "grad_norm": 0.13760948181152344,
+      "learning_rate": 0.00028085853984899053,
+      "loss": 2.9015,
+      "step": 6880
+    },
+    {
+      "epoch": 0.39978531122619204,
+      "grad_norm": 0.14538171887397766,
+      "learning_rate": 0.0002805073780285906,
+      "loss": 2.9018,
+      "step": 6890
+    },
+    {
+      "epoch": 0.40036555115540273,
+      "grad_norm": 0.14871671795845032,
+      "learning_rate": 0.0002801559197715911,
+      "loss": 2.9011,
+      "step": 6900
+    },
+    {
+      "epoch": 0.4009457910846135,
+      "grad_norm": 0.1533951461315155,
+      "learning_rate": 0.0002798041663720981,
+      "loss": 2.8993,
+      "step": 6910
+    },
+    {
+      "epoch": 0.4015260310138242,
+      "grad_norm": 0.14396968483924866,
+      "learning_rate": 0.0002794521191253045,
+      "loss": 2.8938,
+      "step": 6920
+    },
+    {
+      "epoch": 0.40210627094303497,
+      "grad_norm": 0.16459780931472778,
+      "learning_rate": 0.000279099779327485,
+      "loss": 2.8966,
+      "step": 6930
+    },
+    {
+      "epoch": 0.40268651087224566,
+      "grad_norm": 0.14155222475528717,
+      "learning_rate": 0.0002787471482759918,
+      "loss": 2.8944,
+      "step": 6940
+    },
+    {
+      "epoch": 0.4032667508014564,
+      "grad_norm": 0.14908990263938904,
+      "learning_rate": 0.0002783942272692493,
+      "loss": 2.9055,
+      "step": 6950
+    },
+    {
+      "epoch": 0.40384699073066715,
+      "grad_norm": 0.14655596017837524,
+      "learning_rate": 0.0002780410176067496,
+      "loss": 2.8998,
+      "step": 6960
+    },
+    {
+      "epoch": 0.40442723065987785,
+      "grad_norm": 0.1373496949672699,
+      "learning_rate": 0.00027768752058904777,
+      "loss": 2.9049,
+      "step": 6970
+    },
+    {
+      "epoch": 0.4050074705890886,
+      "grad_norm": 0.14998483657836914,
+      "learning_rate": 0.0002773337375177568,
+      "loss": 2.892,
+      "step": 6980
+    },
+    {
+      "epoch": 0.40558771051829934,
+      "grad_norm": 0.14310023188591003,
+      "learning_rate": 0.00027697966969554295,
+      "loss": 2.8971,
+      "step": 6990
+    },
+    {
+      "epoch": 0.40616795044751003,
+      "grad_norm": 0.15001101791858673,
+      "learning_rate": 0.00027662531842612115,
+      "loss": 2.8931,
+      "step": 7000
+    },
+    {
+      "epoch": 0.40616795044751003,
+      "eval_loss": 2.8682682514190674,
+      "eval_runtime": 5.3865,
+      "eval_samples_per_second": 803.854,
+      "eval_steps_per_second": 1.671,
+      "step": 7000
+    },
+    {
+      "epoch": 0.4067481903767208,
+      "grad_norm": 0.14453168213367462,
+      "learning_rate": 0.0002762706850142498,
+      "loss": 2.8964,
+      "step": 7010
+    },
+    {
+      "epoch": 0.4073284303059315,
+      "grad_norm": 0.15523375570774078,
+      "learning_rate": 0.0002759157707657264,
+      "loss": 2.8987,
+      "step": 7020
+    },
+    {
+      "epoch": 0.4079086702351422,
+      "grad_norm": 0.14011220633983612,
+      "learning_rate": 0.0002755605769873823,
+      "loss": 2.9069,
+      "step": 7030
+    },
+    {
+      "epoch": 0.40848891016435296,
+      "grad_norm": 0.1405903846025467,
+      "learning_rate": 0.0002752051049870782,
+      "loss": 2.8941,
+      "step": 7040
+    },
+    {
+      "epoch": 0.4090691500935637,
+      "grad_norm": 0.13568729162216187,
+      "learning_rate": 0.00027484935607369925,
+      "loss": 2.8864,
+      "step": 7050
+    },
+    {
+      "epoch": 0.4096493900227744,
+      "grad_norm": 0.13538648188114166,
+      "learning_rate": 0.00027449333155715023,
+      "loss": 2.9006,
+      "step": 7060
+    },
+    {
+      "epoch": 0.41022962995198514,
+      "grad_norm": 0.15839791297912598,
+      "learning_rate": 0.00027413703274835067,
+      "loss": 2.8905,
+      "step": 7070
+    },
+    {
+      "epoch": 0.4108098698811959,
+      "grad_norm": 0.14201544225215912,
+      "learning_rate": 0.0002737804609592302,
+      "loss": 2.9017,
+      "step": 7080
+    },
+    {
+      "epoch": 0.4113901098104066,
+      "grad_norm": 0.135043665766716,
+      "learning_rate": 0.0002734236175027234,
+      "loss": 2.8998,
+      "step": 7090
+    },
+    {
+      "epoch": 0.4119703497396173,
+      "grad_norm": 0.13998910784721375,
+      "learning_rate": 0.00027306650369276526,
+      "loss": 2.8953,
+      "step": 7100
+    },
+    {
+      "epoch": 0.41255058966882807,
+      "grad_norm": 0.13548001646995544,
+      "learning_rate": 0.0002727091208442864,
+      "loss": 2.89,
+      "step": 7110
+    },
+    {
+      "epoch": 0.41313082959803876,
+      "grad_norm": 0.14130084216594696,
+      "learning_rate": 0.0002723514702732077,
+      "loss": 2.8918,
+      "step": 7120
+    },
+    {
+      "epoch": 0.4137110695272495,
+      "grad_norm": 0.13718628883361816,
+      "learning_rate": 0.0002719935532964361,
+      "loss": 2.8879,
+      "step": 7130
+    },
+    {
+      "epoch": 0.41429130945646025,
+      "grad_norm": 0.1469535529613495,
+      "learning_rate": 0.00027163537123185943,
+      "loss": 2.8788,
+      "step": 7140
+    },
+    {
+      "epoch": 0.414871549385671,
+      "grad_norm": 0.13813258707523346,
+      "learning_rate": 0.0002712769253983416,
+      "loss": 2.8904,
+      "step": 7150
+    },
+    {
+      "epoch": 0.4154517893148817,
+      "grad_norm": 0.14960864186286926,
+      "learning_rate": 0.0002709182171157176,
+      "loss": 2.8982,
+      "step": 7160
+    },
+    {
+      "epoch": 0.41603202924409244,
+      "grad_norm": 0.13153991103172302,
+      "learning_rate": 0.00027055924770478905,
+      "loss": 2.8803,
+      "step": 7170
+    },
+    {
+      "epoch": 0.4166122691733032,
+      "grad_norm": 0.15670733153820038,
+      "learning_rate": 0.0002702000184873189,
+      "loss": 2.8961,
+      "step": 7180
+    },
+    {
+      "epoch": 0.4171925091025139,
+      "grad_norm": 0.14150060713291168,
+      "learning_rate": 0.00026984053078602665,
+      "loss": 2.8943,
+      "step": 7190
+    },
+    {
+      "epoch": 0.4177727490317246,
+      "grad_norm": 0.13610132038593292,
+      "learning_rate": 0.0002694807859245837,
+      "loss": 2.8932,
+      "step": 7200
+    },
+    {
+      "epoch": 0.41835298896093537,
+      "grad_norm": 0.13310842216014862,
+      "learning_rate": 0.0002691207852276084,
+      "loss": 2.8876,
+      "step": 7210
+    },
+    {
+      "epoch": 0.41893322889014606,
+      "grad_norm": 0.135100319981575,
+      "learning_rate": 0.00026876053002066104,
+      "loss": 2.894,
+      "step": 7220
+    },
+    {
+      "epoch": 0.4195134688193568,
+      "grad_norm": 0.13600456714630127,
+      "learning_rate": 0.00026840002163023896,
+      "loss": 2.8777,
+      "step": 7230
+    },
+    {
+      "epoch": 0.42009370874856755,
+      "grad_norm": 0.1357976198196411,
+      "learning_rate": 0.00026803926138377186,
+      "loss": 2.8869,
+      "step": 7240
+    },
+    {
+      "epoch": 0.42067394867777824,
+      "grad_norm": 0.13665033876895905,
+      "learning_rate": 0.00026767825060961664,
+      "loss": 2.8824,
+      "step": 7250
+    },
+    {
+      "epoch": 0.421254188606989,
+      "grad_norm": 0.14291678369045258,
+      "learning_rate": 0.00026731699063705294,
+      "loss": 2.8981,
+      "step": 7260
+    },
+    {
+      "epoch": 0.42183442853619973,
+      "grad_norm": 0.14100785553455353,
+      "learning_rate": 0.0002669554827962778,
+      "loss": 2.8894,
+      "step": 7270
+    },
+    {
+      "epoch": 0.4224146684654104,
+      "grad_norm": 0.1521817296743393,
+      "learning_rate": 0.000266593728418401,
+      "loss": 2.8745,
+      "step": 7280
+    },
+    {
+      "epoch": 0.42299490839462117,
+      "grad_norm": 0.16641181707382202,
+      "learning_rate": 0.0002662317288354399,
+      "loss": 2.8901,
+      "step": 7290
+    },
+    {
+      "epoch": 0.4235751483238319,
+      "grad_norm": 0.14584742486476898,
+      "learning_rate": 0.000265869485380315,
+      "loss": 2.8804,
+      "step": 7300
+    },
+    {
+      "epoch": 0.4241553882530426,
+      "grad_norm": 0.14091430604457855,
+      "learning_rate": 0.00026550699938684454,
+      "loss": 2.8814,
+      "step": 7310
+    },
+    {
+      "epoch": 0.42473562818225336,
+      "grad_norm": 0.13544070720672607,
+      "learning_rate": 0.0002651442721897401,
+      "loss": 2.8865,
+      "step": 7320
+    },
+    {
+      "epoch": 0.4253158681114641,
+      "grad_norm": 0.13583482801914215,
+      "learning_rate": 0.0002647813051246011,
+      "loss": 2.8687,
+      "step": 7330
+    },
+    {
+      "epoch": 0.4258961080406748,
+      "grad_norm": 0.15594419836997986,
+      "learning_rate": 0.0002644180995279103,
+      "loss": 2.8812,
+      "step": 7340
+    },
+    {
+      "epoch": 0.42647634796988554,
+      "grad_norm": 0.1415625512599945,
+      "learning_rate": 0.0002640546567370288,
+      "loss": 2.8922,
+      "step": 7350
+    },
+    {
+      "epoch": 0.4270565878990963,
+      "grad_norm": 0.13848547637462616,
+      "learning_rate": 0.000263690978090191,
+      "loss": 2.8816,
+      "step": 7360
+    },
+    {
+      "epoch": 0.42763682782830703,
+      "grad_norm": 0.1387799084186554,
+      "learning_rate": 0.00026332706492649977,
+      "loss": 2.8866,
+      "step": 7370
+    },
+    {
+      "epoch": 0.4282170677575177,
+      "grad_norm": 0.13819080591201782,
+      "learning_rate": 0.0002629629185859215,
+      "loss": 2.8838,
+      "step": 7380
+    },
+    {
+      "epoch": 0.42879730768672847,
+      "grad_norm": 0.14040718972682953,
+      "learning_rate": 0.00026259854040928124,
+      "loss": 2.8766,
+      "step": 7390
+    },
+    {
+      "epoch": 0.4293775476159392,
+      "grad_norm": 0.14268594980239868,
+      "learning_rate": 0.0002622339317382575,
+      "loss": 2.8778,
+      "step": 7400
+    },
+    {
+      "epoch": 0.4299577875451499,
+      "grad_norm": 0.13744668662548065,
+      "learning_rate": 0.00026186909391537767,
+      "loss": 2.8843,
+      "step": 7410
+    },
+    {
+      "epoch": 0.43053802747436065,
+      "grad_norm": 0.1380259245634079,
+      "learning_rate": 0.0002615040282840128,
+      "loss": 2.8819,
+      "step": 7420
+    },
+    {
+      "epoch": 0.4311182674035714,
+      "grad_norm": 0.15185829997062683,
+      "learning_rate": 0.00026113873618837275,
+      "loss": 2.8734,
+      "step": 7430
+    },
+    {
+      "epoch": 0.4316985073327821,
+      "grad_norm": 0.1312202364206314,
+      "learning_rate": 0.00026077321897350134,
+      "loss": 2.8769,
+      "step": 7440
+    },
+    {
+      "epoch": 0.43227874726199284,
+      "grad_norm": 0.15877887606620789,
+      "learning_rate": 0.0002604074779852713,
+      "loss": 2.8784,
+      "step": 7450
+    },
+    {
+      "epoch": 0.4328589871912036,
+      "grad_norm": 0.15031276643276215,
+      "learning_rate": 0.0002600415145703791,
+      "loss": 2.8942,
+      "step": 7460
+    },
+    {
+      "epoch": 0.43343922712041427,
+      "grad_norm": 0.14033295214176178,
+      "learning_rate": 0.00025967533007634056,
+      "loss": 2.8734,
+      "step": 7470
+    },
+    {
+      "epoch": 0.434019467049625,
+      "grad_norm": 0.13741450011730194,
+      "learning_rate": 0.00025930892585148525,
+      "loss": 2.8772,
+      "step": 7480
+    },
+    {
+      "epoch": 0.43459970697883576,
+      "grad_norm": 0.13238540291786194,
+      "learning_rate": 0.0002589423032449519,
+      "loss": 2.8872,
+      "step": 7490
+    },
+    {
+      "epoch": 0.43517994690804646,
+      "grad_norm": 0.15299426019191742,
+      "learning_rate": 0.0002585754636066833,
+      "loss": 2.8773,
+      "step": 7500
+    },
+    {
+      "epoch": 0.4357601868372572,
+      "grad_norm": 0.1423303186893463,
+      "learning_rate": 0.00025820840828742156,
+      "loss": 2.8814,
+      "step": 7510
+    },
+    {
+      "epoch": 0.43634042676646795,
+      "grad_norm": 0.135323166847229,
+      "learning_rate": 0.0002578411386387027,
+      "loss": 2.868,
+      "step": 7520
+    },
+    {
+      "epoch": 0.43692066669567864,
+      "grad_norm": 0.14203716814517975,
+      "learning_rate": 0.00025747365601285215,
+      "loss": 2.8856,
+      "step": 7530
+    },
+    {
+      "epoch": 0.4375009066248894,
+      "grad_norm": 0.14312061667442322,
+      "learning_rate": 0.00025710596176297936,
+      "loss": 2.8793,
+      "step": 7540
+    },
+    {
+      "epoch": 0.43808114655410013,
+      "grad_norm": 0.13628032803535461,
+      "learning_rate": 0.0002567380572429731,
+      "loss": 2.8727,
+      "step": 7550
+    },
+    {
+      "epoch": 0.4386613864833108,
+      "grad_norm": 0.13479942083358765,
+      "learning_rate": 0.00025636994380749635,
+      "loss": 2.8826,
+      "step": 7560
+    },
+    {
+      "epoch": 0.43924162641252157,
+      "grad_norm": 0.13710373640060425,
+      "learning_rate": 0.0002560016228119814,
+      "loss": 2.8741,
+      "step": 7570
+    },
+    {
+      "epoch": 0.4398218663417323,
+      "grad_norm": 0.14385974407196045,
+      "learning_rate": 0.0002556330956126246,
+      "loss": 2.8721,
+      "step": 7580
+    },
+    {
+      "epoch": 0.44040210627094306,
+      "grad_norm": 0.13590388000011444,
+      "learning_rate": 0.0002552643635663818,
+      "loss": 2.8751,
+      "step": 7590
+    },
+    {
+      "epoch": 0.44098234620015375,
+      "grad_norm": 0.14734290540218353,
+      "learning_rate": 0.000254895428030963,
+      "loss": 2.8657,
+      "step": 7600
+    },
+    {
+      "epoch": 0.4415625861293645,
+      "grad_norm": 0.14358721673488617,
+      "learning_rate": 0.00025452629036482754,
+      "loss": 2.8658,
+      "step": 7610
+    },
+    {
+      "epoch": 0.44214282605857524,
+      "grad_norm": 0.1496707648038864,
+      "learning_rate": 0.00025415695192717886,
+      "loss": 2.8647,
+      "step": 7620
+    },
+    {
+      "epoch": 0.44272306598778594,
+      "grad_norm": 0.14276018738746643,
+      "learning_rate": 0.0002537874140779599,
+      "loss": 2.8746,
+      "step": 7630
+    },
+    {
+      "epoch": 0.4433033059169967,
+      "grad_norm": 0.14037010073661804,
+      "learning_rate": 0.0002534176781778477,
+      "loss": 2.8721,
+      "step": 7640
+    },
+    {
+      "epoch": 0.44388354584620743,
+      "grad_norm": 0.13529075682163239,
+      "learning_rate": 0.00025304774558824854,
+      "loss": 2.8778,
+      "step": 7650
+    },
+    {
+      "epoch": 0.4444637857754181,
+      "grad_norm": 0.14173325896263123,
+      "learning_rate": 0.000252677617671293,
+      "loss": 2.8699,
+      "step": 7660
+    },
+    {
+      "epoch": 0.44504402570462887,
+      "grad_norm": 0.1350284367799759,
+      "learning_rate": 0.0002523072957898308,
+      "loss": 2.874,
+      "step": 7670
+    },
+    {
+      "epoch": 0.4456242656338396,
+      "grad_norm": 0.13940376043319702,
+      "learning_rate": 0.00025193678130742595,
+      "loss": 2.8709,
+      "step": 7680
+    },
+    {
+      "epoch": 0.4462045055630503,
+      "grad_norm": 0.13375508785247803,
+      "learning_rate": 0.00025156607558835155,
+      "loss": 2.8789,
+      "step": 7690
+    },
+    {
+      "epoch": 0.44678474549226105,
+      "grad_norm": 0.13682647049427032,
+      "learning_rate": 0.000251195179997585,
+      "loss": 2.8637,
+      "step": 7700
+    },
+    {
+      "epoch": 0.4473649854214718,
+      "grad_norm": 0.15215632319450378,
+      "learning_rate": 0.00025082409590080257,
+      "loss": 2.8656,
+      "step": 7710
+    },
+    {
+      "epoch": 0.4479452253506825,
+      "grad_norm": 0.13502159714698792,
+      "learning_rate": 0.0002504528246643749,
+      "loss": 2.8723,
+      "step": 7720
+    },
+    {
+      "epoch": 0.44852546527989323,
+      "grad_norm": 0.1432695984840393,
+      "learning_rate": 0.00025008136765536143,
+      "loss": 2.8769,
+      "step": 7730
+    },
+    {
+      "epoch": 0.449105705209104,
+      "grad_norm": 0.1322629153728485,
+      "learning_rate": 0.0002497097262415058,
+      "loss": 2.8718,
+      "step": 7740
+    },
+    {
+      "epoch": 0.44968594513831467,
+      "grad_norm": 0.14167630672454834,
+      "learning_rate": 0.00024933790179123086,
+      "loss": 2.876,
+      "step": 7750
+    },
+    {
+      "epoch": 0.4502661850675254,
+      "grad_norm": 0.1397087574005127,
+      "learning_rate": 0.000248965895673633,
+      "loss": 2.859,
+      "step": 7760
+    },
+    {
+      "epoch": 0.45084642499673616,
+      "grad_norm": 0.1338258534669876,
+      "learning_rate": 0.00024859370925847766,
+      "loss": 2.8832,
+      "step": 7770
+    },
+    {
+      "epoch": 0.45142666492594685,
+      "grad_norm": 0.13774670660495758,
+      "learning_rate": 0.0002482213439161943,
+      "loss": 2.8647,
+      "step": 7780
+    },
+    {
+      "epoch": 0.4520069048551576,
+      "grad_norm": 0.13594569265842438,
+      "learning_rate": 0.0002478488010178711,
+      "loss": 2.8695,
+      "step": 7790
+    },
+    {
+      "epoch": 0.45258714478436834,
+      "grad_norm": 0.13634726405143738,
+      "learning_rate": 0.0002474760819352501,
+      "loss": 2.8559,
+      "step": 7800
+    },
+    {
+      "epoch": 0.4531673847135791,
+      "grad_norm": 0.15008622407913208,
+      "learning_rate": 0.0002471031880407219,
+      "loss": 2.8595,
+      "step": 7810
+    },
+    {
+      "epoch": 0.4537476246427898,
+      "grad_norm": 0.132884681224823,
+      "learning_rate": 0.000246730120707321,
+      "loss": 2.8614,
+      "step": 7820
+    },
+    {
+      "epoch": 0.45432786457200053,
+      "grad_norm": 0.14270992577075958,
+      "learning_rate": 0.00024635688130872027,
+      "loss": 2.8676,
+      "step": 7830
+    },
+    {
+      "epoch": 0.4549081045012113,
+      "grad_norm": 0.13346141576766968,
+      "learning_rate": 0.00024598347121922636,
+      "loss": 2.8663,
+      "step": 7840
+    },
+    {
+      "epoch": 0.45548834443042197,
+      "grad_norm": 0.1440788060426712,
+      "learning_rate": 0.00024560989181377434,
+      "loss": 2.8742,
+      "step": 7850
+    },
+    {
+      "epoch": 0.4560685843596327,
+      "grad_norm": 0.13537128269672394,
+      "learning_rate": 0.00024523614446792267,
+      "loss": 2.8677,
+      "step": 7860
+    },
+    {
+      "epoch": 0.45664882428884346,
+      "grad_norm": 0.1451151967048645,
+      "learning_rate": 0.0002448622305578483,
+      "loss": 2.855,
+      "step": 7870
+    },
+    {
+      "epoch": 0.45722906421805415,
+      "grad_norm": 0.13695837557315826,
+      "learning_rate": 0.00024448815146034135,
+      "loss": 2.8736,
+      "step": 7880
+    },
+    {
+      "epoch": 0.4578093041472649,
+      "grad_norm": 0.13232262432575226,
+      "learning_rate": 0.00024411390855280023,
+      "loss": 2.865,
+      "step": 7890
+    },
+    {
+      "epoch": 0.45838954407647564,
+      "grad_norm": 0.1366211622953415,
+      "learning_rate": 0.00024373950321322663,
+      "loss": 2.86,
+      "step": 7900
+    },
+    {
+      "epoch": 0.45896978400568633,
+      "grad_norm": 0.14180238544940948,
+      "learning_rate": 0.00024336493682022012,
+      "loss": 2.8601,
+      "step": 7910
+    },
+    {
+      "epoch": 0.4595500239348971,
+      "grad_norm": 0.1388852894306183,
+      "learning_rate": 0.00024299021075297343,
+      "loss": 2.86,
+      "step": 7920
+    },
+    {
+      "epoch": 0.4601302638641078,
+      "grad_norm": 0.1402927190065384,
+      "learning_rate": 0.0002426153263912673,
+      "loss": 2.8718,
+      "step": 7930
+    },
+    {
+      "epoch": 0.4607105037933185,
+      "grad_norm": 0.1446109563112259,
+      "learning_rate": 0.00024224028511546505,
+      "loss": 2.8632,
+      "step": 7940
+    },
+    {
+      "epoch": 0.46129074372252926,
+      "grad_norm": 0.138419508934021,
+      "learning_rate": 0.00024186508830650806,
+      "loss": 2.8598,
+      "step": 7950
+    },
+    {
+      "epoch": 0.46187098365174,
+      "grad_norm": 0.14222180843353271,
+      "learning_rate": 0.00024148973734591027,
+      "loss": 2.861,
+      "step": 7960
+    },
+    {
+      "epoch": 0.4624512235809507,
+      "grad_norm": 0.12518581748008728,
+      "learning_rate": 0.00024111423361575322,
+      "loss": 2.8533,
+      "step": 7970
+    },
+    {
+      "epoch": 0.46303146351016145,
+      "grad_norm": 0.13554495573043823,
+      "learning_rate": 0.00024073857849868092,
+      "loss": 2.8599,
+      "step": 7980
+    },
+    {
+      "epoch": 0.4636117034393722,
+      "grad_norm": 0.13020184636116028,
+      "learning_rate": 0.000240362773377895,
+      "loss": 2.8551,
+      "step": 7990
+    },
+    {
+      "epoch": 0.4641919433685829,
+      "grad_norm": 0.14233264327049255,
+      "learning_rate": 0.00023998681963714914,
+      "loss": 2.8584,
+      "step": 8000
+    },
+    {
+      "epoch": 0.4641919433685829,
+      "eval_loss": 2.8317151069641113,
+      "eval_runtime": 5.391,
+      "eval_samples_per_second": 803.193,
+      "eval_steps_per_second": 1.669,
+      "step": 8000
+    },
+    {
+      "epoch": 0.46477218329779363,
+      "grad_norm": 0.1347561776638031,
+      "learning_rate": 0.00023961071866074454,
+      "loss": 2.8583,
+      "step": 8010
+    },
+    {
+      "epoch": 0.4653524232270044,
+      "grad_norm": 0.13999909162521362,
+      "learning_rate": 0.0002392344718335243,
+      "loss": 2.8618,
+      "step": 8020
+    },
+    {
+      "epoch": 0.4659326631562151,
+      "grad_norm": 0.15302954614162445,
+      "learning_rate": 0.00023885808054086867,
+      "loss": 2.857,
+      "step": 8030
+    },
+    {
+      "epoch": 0.4665129030854258,
+      "grad_norm": 0.1391817033290863,
+      "learning_rate": 0.00023848154616868978,
+      "loss": 2.855,
+      "step": 8040
+    },
+    {
+      "epoch": 0.46709314301463656,
+      "grad_norm": 0.13904796540737152,
+      "learning_rate": 0.0002381048701034267,
+      "loss": 2.8585,
+      "step": 8050
+    },
+    {
+      "epoch": 0.4676733829438473,
+      "grad_norm": 0.14073707163333893,
+      "learning_rate": 0.00023772805373204018,
+      "loss": 2.8599,
+      "step": 8060
+    },
+    {
+      "epoch": 0.468253622873058,
+      "grad_norm": 0.1362558752298355,
+      "learning_rate": 0.0002373510984420075,
+      "loss": 2.8605,
+      "step": 8070
+    },
+    {
+      "epoch": 0.46883386280226874,
+      "grad_norm": 0.14300206303596497,
+      "learning_rate": 0.00023697400562131758,
+      "loss": 2.8514,
+      "step": 8080
+    },
+    {
+      "epoch": 0.4694141027314795,
+      "grad_norm": 0.15007098019123077,
+      "learning_rate": 0.00023659677665846562,
+      "loss": 2.8581,
+      "step": 8090
+    },
+    {
+      "epoch": 0.4699943426606902,
+      "grad_norm": 0.1375647932291031,
+      "learning_rate": 0.00023621941294244824,
+      "loss": 2.851,
+      "step": 8100
+    },
+    {
+      "epoch": 0.4705745825899009,
+      "grad_norm": 0.14102703332901,
+      "learning_rate": 0.0002358419158627582,
+      "loss": 2.853,
+      "step": 8110
+    },
+    {
+      "epoch": 0.47115482251911167,
+      "grad_norm": 0.14791908860206604,
+      "learning_rate": 0.00023546428680937926,
+      "loss": 2.8586,
+      "step": 8120
+    },
+    {
+      "epoch": 0.47173506244832236,
+      "grad_norm": 0.1465589553117752,
+      "learning_rate": 0.00023508652717278111,
+      "loss": 2.856,
+      "step": 8130
+    },
+    {
+      "epoch": 0.4723153023775331,
+      "grad_norm": 0.1365519016981125,
+      "learning_rate": 0.00023470863834391438,
+      "loss": 2.8643,
+      "step": 8140
+    },
+    {
+      "epoch": 0.47289554230674385,
+      "grad_norm": 0.13424411416053772,
+      "learning_rate": 0.00023433062171420522,
+      "loss": 2.8562,
+      "step": 8150
+    },
+    {
+      "epoch": 0.47347578223595455,
+      "grad_norm": 0.15107622742652893,
+      "learning_rate": 0.00023395247867555053,
+      "loss": 2.857,
+      "step": 8160
+    },
+    {
+      "epoch": 0.4740560221651653,
+      "grad_norm": 0.13523408770561218,
+      "learning_rate": 0.00023357421062031265,
+      "loss": 2.853,
+      "step": 8170
+    },
+    {
+      "epoch": 0.47463626209437604,
+      "grad_norm": 0.1367097645998001,
+      "learning_rate": 0.0002331958189413141,
+      "loss": 2.853,
+      "step": 8180
+    },
+    {
+      "epoch": 0.47521650202358673,
+      "grad_norm": 0.139958456158638,
+      "learning_rate": 0.00023281730503183274,
+      "loss": 2.8529,
+      "step": 8190
+    },
+    {
+      "epoch": 0.4757967419527975,
+      "grad_norm": 0.1375078707933426,
+      "learning_rate": 0.00023243867028559633,
+      "loss": 2.8492,
+      "step": 8200
+    },
+    {
+      "epoch": 0.4763769818820082,
+      "grad_norm": 0.1325850486755371,
+      "learning_rate": 0.0002320599160967778,
+      "loss": 2.8555,
+      "step": 8210
+    },
+    {
+      "epoch": 0.4769572218112189,
+      "grad_norm": 0.13762585818767548,
+      "learning_rate": 0.00023168104385998963,
+      "loss": 2.8443,
+      "step": 8220
+    },
+    {
+      "epoch": 0.47753746174042966,
+      "grad_norm": 0.13820673525333405,
+      "learning_rate": 0.0002313020549702792,
+      "loss": 2.8495,
+      "step": 8230
+    },
+    {
+      "epoch": 0.4781177016696404,
+      "grad_norm": 0.13856127858161926,
+      "learning_rate": 0.00023092295082312325,
+      "loss": 2.8484,
+      "step": 8240
+    },
+    {
+      "epoch": 0.47869794159885115,
+      "grad_norm": 0.13684526085853577,
+      "learning_rate": 0.000230543732814423,
+      "loss": 2.8481,
+      "step": 8250
+    },
+    {
+      "epoch": 0.47927818152806184,
+      "grad_norm": 0.13395436108112335,
+      "learning_rate": 0.0002301644023404988,
+      "loss": 2.8482,
+      "step": 8260
+    },
+    {
+      "epoch": 0.4798584214572726,
+      "grad_norm": 0.14342832565307617,
+      "learning_rate": 0.00022978496079808526,
+      "loss": 2.8523,
+      "step": 8270
+    },
+    {
+      "epoch": 0.48043866138648333,
+      "grad_norm": 0.14550542831420898,
+      "learning_rate": 0.00022940540958432584,
+      "loss": 2.8512,
+      "step": 8280
+    },
+    {
+      "epoch": 0.481018901315694,
+      "grad_norm": 0.14272627234458923,
+      "learning_rate": 0.00022902575009676795,
+      "loss": 2.8479,
+      "step": 8290
+    },
+    {
+      "epoch": 0.48159914124490477,
+      "grad_norm": 0.13808168470859528,
+      "learning_rate": 0.00022864598373335753,
+      "loss": 2.8539,
+      "step": 8300
+    },
+    {
+      "epoch": 0.4821793811741155,
+      "grad_norm": 0.13078927993774414,
+      "learning_rate": 0.00022826611189243407,
+      "loss": 2.8494,
+      "step": 8310
+    },
+    {
+      "epoch": 0.4827596211033262,
+      "grad_norm": 0.12881894409656525,
+      "learning_rate": 0.0002278861359727256,
+      "loss": 2.8447,
+      "step": 8320
+    },
+    {
+      "epoch": 0.48333986103253695,
+      "grad_norm": 0.12993864715099335,
+      "learning_rate": 0.00022750605737334323,
+      "loss": 2.8411,
+      "step": 8330
+    },
+    {
+      "epoch": 0.4839201009617477,
+      "grad_norm": 0.13983088731765747,
+      "learning_rate": 0.00022712587749377608,
+      "loss": 2.8414,
+      "step": 8340
+    },
+    {
+      "epoch": 0.4845003408909584,
+      "grad_norm": 0.1545930653810501,
+      "learning_rate": 0.0002267455977338864,
+      "loss": 2.8432,
+      "step": 8350
+    },
+    {
+      "epoch": 0.48508058082016914,
+      "grad_norm": 0.13540688157081604,
+      "learning_rate": 0.00022636521949390406,
+      "loss": 2.8493,
+      "step": 8360
+    },
+    {
+      "epoch": 0.4856608207493799,
+      "grad_norm": 0.15347440540790558,
+      "learning_rate": 0.0002259847441744216,
+      "loss": 2.8424,
+      "step": 8370
+    },
+    {
+      "epoch": 0.4862410606785906,
+      "grad_norm": 0.1298235058784485,
+      "learning_rate": 0.00022560417317638907,
+      "loss": 2.8462,
+      "step": 8380
+    },
+    {
+      "epoch": 0.4868213006078013,
+      "grad_norm": 0.15003375709056854,
+      "learning_rate": 0.00022522350790110863,
+      "loss": 2.845,
+      "step": 8390
+    },
+    {
+      "epoch": 0.48740154053701207,
+      "grad_norm": 0.15155071020126343,
+      "learning_rate": 0.00022484274975022973,
+      "loss": 2.8421,
+      "step": 8400
+    },
+    {
+      "epoch": 0.48798178046622276,
+      "grad_norm": 0.12803903222084045,
+      "learning_rate": 0.0002244619001257438,
+      "loss": 2.8456,
+      "step": 8410
+    },
+    {
+      "epoch": 0.4885620203954335,
+      "grad_norm": 0.14884670078754425,
+      "learning_rate": 0.00022408096042997905,
+      "loss": 2.8433,
+      "step": 8420
+    },
+    {
+      "epoch": 0.48914226032464425,
+      "grad_norm": 0.14629362523555756,
+      "learning_rate": 0.00022369993206559533,
+      "loss": 2.8419,
+      "step": 8430
+    },
+    {
+      "epoch": 0.48972250025385494,
+      "grad_norm": 0.14142775535583496,
+      "learning_rate": 0.00022331881643557905,
+      "loss": 2.8509,
+      "step": 8440
+    },
+    {
+      "epoch": 0.4903027401830657,
+      "grad_norm": 0.14854149520397186,
+      "learning_rate": 0.00022293761494323783,
+      "loss": 2.8364,
+      "step": 8450
+    },
+    {
+      "epoch": 0.49088298011227643,
+      "grad_norm": 0.14419108629226685,
+      "learning_rate": 0.00022255632899219547,
+      "loss": 2.8466,
+      "step": 8460
+    },
+    {
+      "epoch": 0.4914632200414872,
+      "grad_norm": 0.13738323748111725,
+      "learning_rate": 0.0002221749599863868,
+      "loss": 2.844,
+      "step": 8470
+    },
+    {
+      "epoch": 0.49204345997069787,
+      "grad_norm": 0.14036454260349274,
+      "learning_rate": 0.00022179350933005255,
+      "loss": 2.8401,
+      "step": 8480
+    },
+    {
+      "epoch": 0.4926236998999086,
+      "grad_norm": 0.1321595460176468,
+      "learning_rate": 0.00022141197842773385,
+      "loss": 2.848,
+      "step": 8490
+    },
+    {
+      "epoch": 0.49320393982911936,
+      "grad_norm": 0.13454943895339966,
+      "learning_rate": 0.0002210303686842676,
+      "loss": 2.8359,
+      "step": 8500
+    },
+    {
+      "epoch": 0.49378417975833006,
+      "grad_norm": 0.13193447887897491,
+      "learning_rate": 0.00022064868150478066,
+      "loss": 2.8477,
+      "step": 8510
+    },
+    {
+      "epoch": 0.4943644196875408,
+      "grad_norm": 0.13707976043224335,
+      "learning_rate": 0.0002202669182946854,
+      "loss": 2.8433,
+      "step": 8520
+    },
+    {
+      "epoch": 0.49494465961675155,
+      "grad_norm": 0.1408550888299942,
+      "learning_rate": 0.00021988508045967385,
+      "loss": 2.8376,
+      "step": 8530
+    },
+    {
+      "epoch": 0.49552489954596224,
+      "grad_norm": 0.13284894824028015,
+      "learning_rate": 0.00021950316940571294,
+      "loss": 2.8442,
+      "step": 8540
+    },
+    {
+      "epoch": 0.496105139475173,
+      "grad_norm": 0.1373133808374405,
+      "learning_rate": 0.0002191211865390392,
+      "loss": 2.8333,
+      "step": 8550
+    },
+    {
+      "epoch": 0.49668537940438373,
+      "grad_norm": 0.1252400428056717,
+      "learning_rate": 0.00021873913326615356,
+      "loss": 2.8383,
+      "step": 8560
+    },
+    {
+      "epoch": 0.4972656193335944,
+      "grad_norm": 0.13226158916950226,
+      "learning_rate": 0.0002183570109938161,
+      "loss": 2.8334,
+      "step": 8570
+    },
+    {
+      "epoch": 0.49784585926280517,
+      "grad_norm": 0.1358543038368225,
+      "learning_rate": 0.00021797482112904118,
+      "loss": 2.8407,
+      "step": 8580
+    },
+    {
+      "epoch": 0.4984260991920159,
+      "grad_norm": 0.14613795280456543,
+      "learning_rate": 0.00021759256507909185,
+      "loss": 2.8415,
+      "step": 8590
+    },
+    {
+      "epoch": 0.4990063391212266,
+      "grad_norm": 0.14075227081775665,
+      "learning_rate": 0.00021721024425147496,
+      "loss": 2.8347,
+      "step": 8600
+    },
+    {
+      "epoch": 0.49958657905043735,
+      "grad_norm": 0.1412448287010193,
+      "learning_rate": 0.00021682786005393587,
+      "loss": 2.8366,
+      "step": 8610
+    },
+    {
+      "epoch": 0.5001668189796481,
+      "grad_norm": 0.13218757510185242,
+      "learning_rate": 0.00021644541389445317,
+      "loss": 2.8401,
+      "step": 8620
+    },
+    {
+      "epoch": 0.5007470589088588,
+      "grad_norm": 0.1320735365152359,
+      "learning_rate": 0.00021606290718123377,
+      "loss": 2.8443,
+      "step": 8630
+    },
+    {
+      "epoch": 0.5013272988380696,
+      "grad_norm": 0.13078896701335907,
+      "learning_rate": 0.0002156803413227074,
+      "loss": 2.8471,
+      "step": 8640
+    },
+    {
+      "epoch": 0.5019075387672802,
+      "grad_norm": 0.13882210850715637,
+      "learning_rate": 0.00021529771772752163,
+      "loss": 2.8499,
+      "step": 8650
+    },
+    {
+      "epoch": 0.502487778696491,
+      "grad_norm": 0.1350562423467636,
+      "learning_rate": 0.00021491503780453672,
+      "loss": 2.8324,
+      "step": 8660
+    },
+    {
+      "epoch": 0.5030680186257017,
+      "grad_norm": 0.1424356997013092,
+      "learning_rate": 0.0002145323029628201,
+      "loss": 2.8423,
+      "step": 8670
+    },
+    {
+      "epoch": 0.5036482585549125,
+      "grad_norm": 0.13661132752895355,
+      "learning_rate": 0.0002141495146116416,
+      "loss": 2.8403,
+      "step": 8680
+    },
+    {
+      "epoch": 0.5042284984841232,
+      "grad_norm": 0.13870751857757568,
+      "learning_rate": 0.00021376667416046806,
+      "loss": 2.8355,
+      "step": 8690
+    },
+    {
+      "epoch": 0.504808738413334,
+      "grad_norm": 0.1345418244600296,
+      "learning_rate": 0.0002133837830189581,
+      "loss": 2.8396,
+      "step": 8700
+    },
+    {
+      "epoch": 0.5053889783425446,
+      "grad_norm": 0.13897638022899628,
+      "learning_rate": 0.00021300084259695697,
+      "loss": 2.8376,
+      "step": 8710
+    },
+    {
+      "epoch": 0.5059692182717553,
+      "grad_norm": 0.14594705402851105,
+      "learning_rate": 0.00021261785430449153,
+      "loss": 2.8382,
+      "step": 8720
+    },
+    {
+      "epoch": 0.5065494582009661,
+      "grad_norm": 0.13326287269592285,
+      "learning_rate": 0.00021223481955176467,
+      "loss": 2.8332,
+      "step": 8730
+    },
+    {
+      "epoch": 0.5071296981301768,
+      "grad_norm": 0.1389443427324295,
+      "learning_rate": 0.00021185173974915057,
+      "loss": 2.8356,
+      "step": 8740
+    },
+    {
+      "epoch": 0.5077099380593876,
+      "grad_norm": 0.14216211438179016,
+      "learning_rate": 0.0002114686163071892,
+      "loss": 2.8374,
+      "step": 8750
+    },
+    {
+      "epoch": 0.5082901779885983,
+      "grad_norm": 0.13781870901584625,
+      "learning_rate": 0.00021108545063658113,
+      "loss": 2.8358,
+      "step": 8760
+    },
+    {
+      "epoch": 0.508870417917809,
+      "grad_norm": 0.13731315732002258,
+      "learning_rate": 0.00021070224414818247,
+      "loss": 2.824,
+      "step": 8770
+    },
+    {
+      "epoch": 0.5094506578470197,
+      "grad_norm": 0.13154597580432892,
+      "learning_rate": 0.00021031899825299974,
+      "loss": 2.8328,
+      "step": 8780
+    },
+    {
+      "epoch": 0.5100308977762305,
+      "grad_norm": 0.1381417214870453,
+      "learning_rate": 0.00020993571436218452,
+      "loss": 2.8275,
+      "step": 8790
+    },
+    {
+      "epoch": 0.5106111377054412,
+      "grad_norm": 0.14278624951839447,
+      "learning_rate": 0.00020955239388702817,
+      "loss": 2.8339,
+      "step": 8800
+    },
+    {
+      "epoch": 0.5111913776346519,
+      "grad_norm": 0.13169626891613007,
+      "learning_rate": 0.00020916903823895683,
+      "loss": 2.8297,
+      "step": 8810
+    },
+    {
+      "epoch": 0.5117716175638627,
+      "grad_norm": 0.12725697457790375,
+      "learning_rate": 0.0002087856488295262,
+      "loss": 2.8334,
+      "step": 8820
+    },
+    {
+      "epoch": 0.5123518574930734,
+      "grad_norm": 0.1367608606815338,
+      "learning_rate": 0.00020840222707041616,
+      "loss": 2.8345,
+      "step": 8830
+    },
+    {
+      "epoch": 0.5129320974222841,
+      "grad_norm": 0.13655100762844086,
+      "learning_rate": 0.00020801877437342584,
+      "loss": 2.8295,
+      "step": 8840
+    },
+    {
+      "epoch": 0.5135123373514948,
+      "grad_norm": 0.13129231333732605,
+      "learning_rate": 0.00020763529215046827,
+      "loss": 2.8403,
+      "step": 8850
+    },
+    {
+      "epoch": 0.5140925772807056,
+      "grad_norm": 0.1390671730041504,
+      "learning_rate": 0.0002072517818135652,
+      "loss": 2.8419,
+      "step": 8860
+    },
+    {
+      "epoch": 0.5146728172099163,
+      "grad_norm": 0.13736438751220703,
+      "learning_rate": 0.00020686824477484178,
+      "loss": 2.8326,
+      "step": 8870
+    },
+    {
+      "epoch": 0.5152530571391271,
+      "grad_norm": 0.1363706886768341,
+      "learning_rate": 0.0002064846824465216,
+      "loss": 2.8308,
+      "step": 8880
+    },
+    {
+      "epoch": 0.5158332970683378,
+      "grad_norm": 0.1408400982618332,
+      "learning_rate": 0.00020610109624092133,
+      "loss": 2.8328,
+      "step": 8890
+    },
+    {
+      "epoch": 0.5164135369975484,
+      "grad_norm": 0.13489697873592377,
+      "learning_rate": 0.00020571748757044556,
+      "loss": 2.8258,
+      "step": 8900
+    },
+    {
+      "epoch": 0.5169937769267592,
+      "grad_norm": 0.13712669909000397,
+      "learning_rate": 0.00020533385784758163,
+      "loss": 2.8357,
+      "step": 8910
+    },
+    {
+      "epoch": 0.5175740168559699,
+      "grad_norm": 0.13442225754261017,
+      "learning_rate": 0.00020495020848489438,
+      "loss": 2.8213,
+      "step": 8920
+    },
+    {
+      "epoch": 0.5181542567851807,
+      "grad_norm": 0.13573797047138214,
+      "learning_rate": 0.00020456654089502085,
+      "loss": 2.8359,
+      "step": 8930
+    },
+    {
+      "epoch": 0.5187344967143914,
+      "grad_norm": 0.12967585027217865,
+      "learning_rate": 0.0002041828564906654,
+      "loss": 2.8287,
+      "step": 8940
+    },
+    {
+      "epoch": 0.5193147366436022,
+      "grad_norm": 0.13563676178455353,
+      "learning_rate": 0.00020379915668459412,
+      "loss": 2.8245,
+      "step": 8950
+    },
+    {
+      "epoch": 0.5198949765728128,
+      "grad_norm": 0.14425864815711975,
+      "learning_rate": 0.00020341544288963,
+      "loss": 2.8288,
+      "step": 8960
+    },
+    {
+      "epoch": 0.5204752165020236,
+      "grad_norm": 0.13940462470054626,
+      "learning_rate": 0.00020303171651864737,
+      "loss": 2.8248,
+      "step": 8970
+    },
+    {
+      "epoch": 0.5210554564312343,
+      "grad_norm": 0.13967497646808624,
+      "learning_rate": 0.00020264797898456692,
+      "loss": 2.8302,
+      "step": 8980
+    },
+    {
+      "epoch": 0.521635696360445,
+      "grad_norm": 0.14502641558647156,
+      "learning_rate": 0.00020226423170035043,
+      "loss": 2.8241,
+      "step": 8990
+    },
+    {
+      "epoch": 0.5222159362896558,
+      "grad_norm": 0.12917964160442352,
+      "learning_rate": 0.00020188047607899563,
+      "loss": 2.8266,
+      "step": 9000
+    },
+    {
+      "epoch": 0.5222159362896558,
+      "eval_loss": 2.798532724380493,
+      "eval_runtime": 5.3934,
+      "eval_samples_per_second": 802.828,
+      "eval_steps_per_second": 1.669,
+      "step": 9000
+    },
+    {
+      "epoch": 0.5227961762188665,
+      "grad_norm": 0.131534606218338,
+      "learning_rate": 0.00020149671353353088,
+      "loss": 2.8277,
+      "step": 9010
+    },
+    {
+      "epoch": 0.5233764161480773,
+      "grad_norm": 0.13375796377658844,
+      "learning_rate": 0.00020111294547701017,
+      "loss": 2.8293,
+      "step": 9020
+    },
+    {
+      "epoch": 0.5239566560772879,
+      "grad_norm": 0.13121195137500763,
+      "learning_rate": 0.00020072917332250757,
+      "loss": 2.8292,
+      "step": 9030
+    },
+    {
+      "epoch": 0.5245368960064987,
+      "grad_norm": 0.12965995073318481,
+      "learning_rate": 0.0002003453984831124,
+      "loss": 2.8317,
+      "step": 9040
+    },
+    {
+      "epoch": 0.5251171359357094,
+      "grad_norm": 0.13647259771823883,
+      "learning_rate": 0.0001999616223719239,
+      "loss": 2.83,
+      "step": 9050
+    },
+    {
+      "epoch": 0.5256973758649202,
+      "grad_norm": 0.13376837968826294,
+      "learning_rate": 0.0001995778464020458,
+      "loss": 2.8333,
+      "step": 9060
+    },
+    {
+      "epoch": 0.5262776157941309,
+      "grad_norm": 0.13384296000003815,
+      "learning_rate": 0.00019919407198658155,
+      "loss": 2.8144,
+      "step": 9070
+    },
+    {
+      "epoch": 0.5268578557233417,
+      "grad_norm": 0.13470128178596497,
+      "learning_rate": 0.00019881030053862857,
+      "loss": 2.8216,
+      "step": 9080
+    },
+    {
+      "epoch": 0.5274380956525523,
+      "grad_norm": 0.14076776802539825,
+      "learning_rate": 0.0001984265334712737,
+      "loss": 2.8259,
+      "step": 9090
+    },
+    {
+      "epoch": 0.528018335581763,
+      "grad_norm": 0.12356127053499222,
+      "learning_rate": 0.00019804277219758737,
+      "loss": 2.825,
+      "step": 9100
+    },
+    {
+      "epoch": 0.5285985755109738,
+      "grad_norm": 0.13532927632331848,
+      "learning_rate": 0.00019765901813061882,
+      "loss": 2.8228,
+      "step": 9110
+    },
+    {
+      "epoch": 0.5291788154401845,
+      "grad_norm": 0.14467062056064606,
+      "learning_rate": 0.00019727527268339088,
+      "loss": 2.8316,
+      "step": 9120
+    },
+    {
+      "epoch": 0.5297590553693953,
+      "grad_norm": 0.14286072552204132,
+      "learning_rate": 0.00019689153726889423,
+      "loss": 2.824,
+      "step": 9130
+    },
+    {
+      "epoch": 0.530339295298606,
+      "grad_norm": 0.13074620068073273,
+      "learning_rate": 0.00019650781330008305,
+      "loss": 2.823,
+      "step": 9140
+    },
+    {
+      "epoch": 0.5309195352278167,
+      "grad_norm": 0.12763041257858276,
+      "learning_rate": 0.00019612410218986908,
+      "loss": 2.8227,
+      "step": 9150
+    },
+    {
+      "epoch": 0.5314997751570274,
+      "grad_norm": 0.1378396898508072,
+      "learning_rate": 0.0001957404053511169,
+      "loss": 2.8189,
+      "step": 9160
+    },
+    {
+      "epoch": 0.5320800150862381,
+      "grad_norm": 0.1301429569721222,
+      "learning_rate": 0.0001953567241966385,
+      "loss": 2.8173,
+      "step": 9170
+    },
+    {
+      "epoch": 0.5326602550154489,
+      "grad_norm": 0.13029593229293823,
+      "learning_rate": 0.00019497306013918793,
+      "loss": 2.8295,
+      "step": 9180
+    },
+    {
+      "epoch": 0.5332404949446596,
+      "grad_norm": 0.13020405173301697,
+      "learning_rate": 0.00019458941459145657,
+      "loss": 2.8249,
+      "step": 9190
+    },
+    {
+      "epoch": 0.5338207348738704,
+      "grad_norm": 0.1313934177160263,
+      "learning_rate": 0.00019420578896606747,
+      "loss": 2.8262,
+      "step": 9200
+    },
+    {
+      "epoch": 0.534400974803081,
+      "grad_norm": 0.13342641294002533,
+      "learning_rate": 0.00019382218467557048,
+      "loss": 2.8289,
+      "step": 9210
+    },
+    {
+      "epoch": 0.5349812147322918,
+      "grad_norm": 0.13609477877616882,
+      "learning_rate": 0.00019343860313243659,
+      "loss": 2.8192,
+      "step": 9220
+    },
+    {
+      "epoch": 0.5355614546615025,
+      "grad_norm": 0.1358291357755661,
+      "learning_rate": 0.00019305504574905328,
+      "loss": 2.8275,
+      "step": 9230
+    },
+    {
+      "epoch": 0.5361416945907133,
+      "grad_norm": 0.1305118352174759,
+      "learning_rate": 0.00019267151393771918,
+      "loss": 2.8214,
+      "step": 9240
+    },
+    {
+      "epoch": 0.536721934519924,
+      "grad_norm": 0.12923507392406464,
+      "learning_rate": 0.0001922880091106384,
+      "loss": 2.8266,
+      "step": 9250
+    },
+    {
+      "epoch": 0.5373021744491348,
+      "grad_norm": 0.13450828194618225,
+      "learning_rate": 0.00019190453267991598,
+      "loss": 2.8187,
+      "step": 9260
+    },
+    {
+      "epoch": 0.5378824143783455,
+      "grad_norm": 0.1334317922592163,
+      "learning_rate": 0.00019152108605755222,
+      "loss": 2.819,
+      "step": 9270
+    },
+    {
+      "epoch": 0.5384626543075561,
+      "grad_norm": 0.13587439060211182,
+      "learning_rate": 0.0001911376706554379,
+      "loss": 2.8167,
+      "step": 9280
+    },
+    {
+      "epoch": 0.5390428942367669,
+      "grad_norm": 0.13316716253757477,
+      "learning_rate": 0.00019075428788534863,
+      "loss": 2.8143,
+      "step": 9290
+    },
+    {
+      "epoch": 0.5396231341659776,
+      "grad_norm": 0.13156826794147491,
+      "learning_rate": 0.00019037093915893986,
+      "loss": 2.816,
+      "step": 9300
+    },
+    {
+      "epoch": 0.5402033740951884,
+      "grad_norm": 0.1294325441122055,
+      "learning_rate": 0.00018998762588774188,
+      "loss": 2.815,
+      "step": 9310
+    },
+    {
+      "epoch": 0.5407836140243991,
+      "grad_norm": 0.1401001513004303,
+      "learning_rate": 0.0001896043494831542,
+      "loss": 2.813,
+      "step": 9320
+    },
+    {
+      "epoch": 0.5413638539536099,
+      "grad_norm": 0.13030453026294708,
+      "learning_rate": 0.00018922111135644083,
+      "loss": 2.8207,
+      "step": 9330
+    },
+    {
+      "epoch": 0.5419440938828205,
+      "grad_norm": 0.13054735958576202,
+      "learning_rate": 0.00018883791291872452,
+      "loss": 2.8208,
+      "step": 9340
+    },
+    {
+      "epoch": 0.5425243338120312,
+      "grad_norm": 0.13646872341632843,
+      "learning_rate": 0.00018845475558098215,
+      "loss": 2.814,
+      "step": 9350
+    },
+    {
+      "epoch": 0.543104573741242,
+      "grad_norm": 0.12413671612739563,
+      "learning_rate": 0.00018807164075403923,
+      "loss": 2.8181,
+      "step": 9360
+    },
+    {
+      "epoch": 0.5436848136704527,
+      "grad_norm": 0.12649095058441162,
+      "learning_rate": 0.0001876885698485646,
+      "loss": 2.8199,
+      "step": 9370
+    },
+    {
+      "epoch": 0.5442650535996635,
+      "grad_norm": 0.12739893794059753,
+      "learning_rate": 0.00018730554427506558,
+      "loss": 2.8129,
+      "step": 9380
+    },
+    {
+      "epoch": 0.5448452935288742,
+      "grad_norm": 0.14016854763031006,
+      "learning_rate": 0.00018692256544388227,
+      "loss": 2.8023,
+      "step": 9390
+    },
+    {
+      "epoch": 0.5454255334580849,
+      "grad_norm": 0.1389675885438919,
+      "learning_rate": 0.00018653963476518296,
+      "loss": 2.8174,
+      "step": 9400
+    },
+    {
+      "epoch": 0.5460057733872956,
+      "grad_norm": 0.13839608430862427,
+      "learning_rate": 0.00018615675364895857,
+      "loss": 2.8153,
+      "step": 9410
+    },
+    {
+      "epoch": 0.5465860133165064,
+      "grad_norm": 0.14397138357162476,
+      "learning_rate": 0.00018577392350501736,
+      "loss": 2.8256,
+      "step": 9420
+    },
+    {
+      "epoch": 0.5471662532457171,
+      "grad_norm": 0.13635268807411194,
+      "learning_rate": 0.0001853911457429802,
+      "loss": 2.8074,
+      "step": 9430
+    },
+    {
+      "epoch": 0.5477464931749279,
+      "grad_norm": 0.14737777411937714,
+      "learning_rate": 0.0001850084217722747,
+      "loss": 2.8211,
+      "step": 9440
+    },
+    {
+      "epoch": 0.5483267331041386,
+      "grad_norm": 0.13884200155735016,
+      "learning_rate": 0.00018462575300213076,
+      "loss": 2.8142,
+      "step": 9450
+    },
+    {
+      "epoch": 0.5489069730333493,
+      "grad_norm": 0.13363014161586761,
+      "learning_rate": 0.0001842431408415748,
+      "loss": 2.8046,
+      "step": 9460
+    },
+    {
+      "epoch": 0.54948721296256,
+      "grad_norm": 0.13794805109500885,
+      "learning_rate": 0.00018386058669942487,
+      "loss": 2.8011,
+      "step": 9470
+    },
+    {
+      "epoch": 0.5500674528917707,
+      "grad_norm": 0.13166405260562897,
+      "learning_rate": 0.00018347809198428555,
+      "loss": 2.8138,
+      "step": 9480
+    },
+    {
+      "epoch": 0.5506476928209815,
+      "grad_norm": 0.13902068138122559,
+      "learning_rate": 0.00018309565810454222,
+      "loss": 2.8204,
+      "step": 9490
+    },
+    {
+      "epoch": 0.5512279327501922,
+      "grad_norm": 0.12628786265850067,
+      "learning_rate": 0.00018271328646835672,
+      "loss": 2.8203,
+      "step": 9500
+    },
+    {
+      "epoch": 0.551808172679403,
+      "grad_norm": 0.1412256807088852,
+      "learning_rate": 0.00018233097848366125,
+      "loss": 2.8169,
+      "step": 9510
+    },
+    {
+      "epoch": 0.5523884126086137,
+      "grad_norm": 0.13156931102275848,
+      "learning_rate": 0.00018194873555815394,
+      "loss": 2.8215,
+      "step": 9520
+    },
+    {
+      "epoch": 0.5529686525378243,
+      "grad_norm": 0.14922703802585602,
+      "learning_rate": 0.0001815665590992934,
+      "loss": 2.8131,
+      "step": 9530
+    },
+    {
+      "epoch": 0.5535488924670351,
+      "grad_norm": 0.13142195343971252,
+      "learning_rate": 0.0001811844505142932,
+      "loss": 2.8113,
+      "step": 9540
+    },
+    {
+      "epoch": 0.5541291323962458,
+      "grad_norm": 0.13327111303806305,
+      "learning_rate": 0.0001808024112101174,
+      "loss": 2.8157,
+      "step": 9550
+    },
+    {
+      "epoch": 0.5547093723254566,
+      "grad_norm": 0.12934935092926025,
+      "learning_rate": 0.0001804204425934745,
+      "loss": 2.8038,
+      "step": 9560
+    },
+    {
+      "epoch": 0.5552896122546673,
+      "grad_norm": 0.12223649024963379,
+      "learning_rate": 0.0001800385460708131,
+      "loss": 2.8231,
+      "step": 9570
+    },
+    {
+      "epoch": 0.5558698521838781,
+      "grad_norm": 0.13266624510288239,
+      "learning_rate": 0.00017965672304831614,
+      "loss": 2.8154,
+      "step": 9580
+    },
+    {
+      "epoch": 0.5564500921130887,
+      "grad_norm": 0.12882035970687866,
+      "learning_rate": 0.00017927497493189603,
+      "loss": 2.8011,
+      "step": 9590
+    },
+    {
+      "epoch": 0.5570303320422995,
+      "grad_norm": 0.14030689001083374,
+      "learning_rate": 0.0001788933031271894,
+      "loss": 2.8148,
+      "step": 9600
+    },
+    {
+      "epoch": 0.5576105719715102,
+      "grad_norm": 0.14994706213474274,
+      "learning_rate": 0.00017851170903955167,
+      "loss": 2.8033,
+      "step": 9610
+    },
+    {
+      "epoch": 0.558190811900721,
+      "grad_norm": 0.13254228234291077,
+      "learning_rate": 0.00017813019407405232,
+      "loss": 2.8123,
+      "step": 9620
+    },
+    {
+      "epoch": 0.5587710518299317,
+      "grad_norm": 0.13098488748073578,
+      "learning_rate": 0.0001777487596354694,
+      "loss": 2.81,
+      "step": 9630
+    },
+    {
+      "epoch": 0.5593512917591424,
+      "grad_norm": 0.13294130563735962,
+      "learning_rate": 0.00017736740712828443,
+      "loss": 2.8191,
+      "step": 9640
+    },
+    {
+      "epoch": 0.5599315316883531,
+      "grad_norm": 0.13288183510303497,
+      "learning_rate": 0.00017698613795667746,
+      "loss": 2.8131,
+      "step": 9650
+    },
+    {
+      "epoch": 0.5605117716175638,
+      "grad_norm": 0.12782664597034454,
+      "learning_rate": 0.00017660495352452132,
+      "loss": 2.8103,
+      "step": 9660
+    },
+    {
+      "epoch": 0.5610920115467746,
+      "grad_norm": 0.1320827752351761,
+      "learning_rate": 0.00017622385523537713,
+      "loss": 2.807,
+      "step": 9670
+    },
+    {
+      "epoch": 0.5616722514759853,
+      "grad_norm": 0.13161487877368927,
+      "learning_rate": 0.00017584284449248864,
+      "loss": 2.8104,
+      "step": 9680
+    },
+    {
+      "epoch": 0.5622524914051961,
+      "grad_norm": 0.1345384418964386,
+      "learning_rate": 0.00017546192269877748,
+      "loss": 2.812,
+      "step": 9690
+    },
+    {
+      "epoch": 0.5628327313344068,
+      "grad_norm": 0.1259540617465973,
+      "learning_rate": 0.00017508109125683737,
+      "loss": 2.8013,
+      "step": 9700
+    },
+    {
+      "epoch": 0.5634129712636176,
+      "grad_norm": 0.12676270306110382,
+      "learning_rate": 0.00017470035156892972,
+      "loss": 2.8109,
+      "step": 9710
+    },
+    {
+      "epoch": 0.5639932111928282,
+      "grad_norm": 0.13035669922828674,
+      "learning_rate": 0.00017431970503697795,
+      "loss": 2.8096,
+      "step": 9720
+    },
+    {
+      "epoch": 0.5645734511220389,
+      "grad_norm": 0.13196563720703125,
+      "learning_rate": 0.00017393915306256237,
+      "loss": 2.8044,
+      "step": 9730
+    },
+    {
+      "epoch": 0.5651536910512497,
+      "grad_norm": 0.12951119244098663,
+      "learning_rate": 0.00017355869704691537,
+      "loss": 2.8023,
+      "step": 9740
+    },
+    {
+      "epoch": 0.5657339309804604,
+      "grad_norm": 0.13105542957782745,
+      "learning_rate": 0.00017317833839091567,
+      "loss": 2.806,
+      "step": 9750
+    },
+    {
+      "epoch": 0.5663141709096712,
+      "grad_norm": 0.1464497148990631,
+      "learning_rate": 0.00017279807849508377,
+      "loss": 2.8131,
+      "step": 9760
+    },
+    {
+      "epoch": 0.5668944108388819,
+      "grad_norm": 0.14261843264102936,
+      "learning_rate": 0.00017241791875957657,
+      "loss": 2.812,
+      "step": 9770
+    },
+    {
+      "epoch": 0.5674746507680926,
+      "grad_norm": 0.1395425796508789,
+      "learning_rate": 0.0001720378605841818,
+      "loss": 2.8075,
+      "step": 9780
+    },
+    {
+      "epoch": 0.5680548906973033,
+      "grad_norm": 0.13989108800888062,
+      "learning_rate": 0.00017165790536831366,
+      "loss": 2.8076,
+      "step": 9790
+    },
+    {
+      "epoch": 0.568635130626514,
+      "grad_norm": 0.14962467551231384,
+      "learning_rate": 0.00017127805451100692,
+      "loss": 2.8027,
+      "step": 9800
+    },
+    {
+      "epoch": 0.5692153705557248,
+      "grad_norm": 0.13398703932762146,
+      "learning_rate": 0.0001708983094109124,
+      "loss": 2.8068,
+      "step": 9810
+    },
+    {
+      "epoch": 0.5697956104849355,
+      "grad_norm": 0.12953974306583405,
+      "learning_rate": 0.00017051867146629116,
+      "loss": 2.8114,
+      "step": 9820
+    },
+    {
+      "epoch": 0.5703758504141463,
+      "grad_norm": 0.14943692088127136,
+      "learning_rate": 0.00017013914207501,
+      "loss": 2.8009,
+      "step": 9830
+    },
+    {
+      "epoch": 0.5709560903433569,
+      "grad_norm": 0.13319486379623413,
+      "learning_rate": 0.00016975972263453585,
+      "loss": 2.8085,
+      "step": 9840
+    },
+    {
+      "epoch": 0.5715363302725677,
+      "grad_norm": 0.13063213229179382,
+      "learning_rate": 0.00016938041454193082,
+      "loss": 2.803,
+      "step": 9850
+    },
+    {
+      "epoch": 0.5721165702017784,
+      "grad_norm": 0.1308039277791977,
+      "learning_rate": 0.00016900121919384716,
+      "loss": 2.8039,
+      "step": 9860
+    },
+    {
+      "epoch": 0.5726968101309892,
+      "grad_norm": 0.12958461046218872,
+      "learning_rate": 0.0001686221379865217,
+      "loss": 2.8047,
+      "step": 9870
+    },
+    {
+      "epoch": 0.5732770500601999,
+      "grad_norm": 0.12382495403289795,
+      "learning_rate": 0.0001682431723157712,
+      "loss": 2.819,
+      "step": 9880
+    },
+    {
+      "epoch": 0.5738572899894107,
+      "grad_norm": 0.1312320977449417,
+      "learning_rate": 0.00016786432357698708,
+      "loss": 2.7955,
+      "step": 9890
+    },
+    {
+      "epoch": 0.5744375299186214,
+      "grad_norm": 0.12677842378616333,
+      "learning_rate": 0.00016748559316512993,
+      "loss": 2.7956,
+      "step": 9900
+    },
+    {
+      "epoch": 0.575017769847832,
+      "grad_norm": 0.130837082862854,
+      "learning_rate": 0.00016710698247472493,
+      "loss": 2.7972,
+      "step": 9910
+    },
+    {
+      "epoch": 0.5755980097770428,
+      "grad_norm": 0.13894180953502655,
+      "learning_rate": 0.0001667284928998562,
+      "loss": 2.8055,
+      "step": 9920
+    },
+    {
+      "epoch": 0.5761782497062535,
+      "grad_norm": 0.14453203976154327,
+      "learning_rate": 0.00016635012583416205,
+      "loss": 2.8029,
+      "step": 9930
+    },
+    {
+      "epoch": 0.5767584896354643,
+      "grad_norm": 0.12817683815956116,
+      "learning_rate": 0.0001659718826708296,
+      "loss": 2.7971,
+      "step": 9940
+    },
+    {
+      "epoch": 0.577338729564675,
+      "grad_norm": 0.1308722198009491,
+      "learning_rate": 0.00016559376480258987,
+      "loss": 2.7924,
+      "step": 9950
+    },
+    {
+      "epoch": 0.5779189694938858,
+      "grad_norm": 0.12425903975963593,
+      "learning_rate": 0.00016521577362171253,
+      "loss": 2.795,
+      "step": 9960
+    },
+    {
+      "epoch": 0.5784992094230964,
+      "grad_norm": 0.13473841547966003,
+      "learning_rate": 0.0001648379105200005,
+      "loss": 2.8023,
+      "step": 9970
+    },
+    {
+      "epoch": 0.5790794493523072,
+      "grad_norm": 0.13867899775505066,
+      "learning_rate": 0.00016446017688878547,
+      "loss": 2.803,
+      "step": 9980
+    },
+    {
+      "epoch": 0.5796596892815179,
+      "grad_norm": 0.1321575790643692,
+      "learning_rate": 0.00016408257411892215,
+      "loss": 2.7967,
+      "step": 9990
+    },
+    {
+      "epoch": 0.5802399292107286,
+      "grad_norm": 0.12522001564502716,
+      "learning_rate": 0.00016370510360078354,
+      "loss": 2.7939,
+      "step": 10000
+    },
+    {
+      "epoch": 0.5802399292107286,
+      "eval_loss": 2.7709856033325195,
+      "eval_runtime": 5.4112,
+      "eval_samples_per_second": 800.188,
+      "eval_steps_per_second": 1.663,
+      "step": 10000
+    },
+    {
+      "epoch": 0.5808201691399394,
+      "grad_norm": 0.13485607504844666,
+      "learning_rate": 0.0001633277667242557,
+      "loss": 2.7988,
+      "step": 10010
+    },
+    {
+      "epoch": 0.5814004090691501,
+      "grad_norm": 0.1311897337436676,
+      "learning_rate": 0.00016295056487873242,
+      "loss": 2.8103,
+      "step": 10020
+    },
+    {
+      "epoch": 0.5819806489983608,
+      "grad_norm": 0.13095928728580475,
+      "learning_rate": 0.00016257349945311044,
+      "loss": 2.7913,
+      "step": 10030
+    },
+    {
+      "epoch": 0.5825608889275715,
+      "grad_norm": 0.13543324172496796,
+      "learning_rate": 0.00016219657183578416,
+      "loss": 2.793,
+      "step": 10040
+    },
+    {
+      "epoch": 0.5831411288567823,
+      "grad_norm": 0.13157446682453156,
+      "learning_rate": 0.00016181978341464064,
+      "loss": 2.8001,
+      "step": 10050
+    },
+    {
+      "epoch": 0.583721368785993,
+      "grad_norm": 0.12915684282779694,
+      "learning_rate": 0.00016144313557705416,
+      "loss": 2.7973,
+      "step": 10060
+    },
+    {
+      "epoch": 0.5843016087152038,
+      "grad_norm": 0.12583227455615997,
+      "learning_rate": 0.0001610666297098816,
+      "loss": 2.7976,
+      "step": 10070
+    },
+    {
+      "epoch": 0.5848818486444145,
+      "grad_norm": 0.12779027223587036,
+      "learning_rate": 0.00016069026719945707,
+      "loss": 2.794,
+      "step": 10080
+    },
+    {
+      "epoch": 0.5854620885736251,
+      "grad_norm": 0.12447459995746613,
+      "learning_rate": 0.0001603140494315866,
+      "loss": 2.797,
+      "step": 10090
+    },
+    {
+      "epoch": 0.5860423285028359,
+      "grad_norm": 0.128029927611351,
+      "learning_rate": 0.00015993797779154356,
+      "loss": 2.8017,
+      "step": 10100
+    },
+    {
+      "epoch": 0.5866225684320466,
+      "grad_norm": 0.1264951229095459,
+      "learning_rate": 0.00015956205366406305,
+      "loss": 2.7931,
+      "step": 10110
+    },
+    {
+      "epoch": 0.5872028083612574,
+      "grad_norm": 0.13305509090423584,
+      "learning_rate": 0.0001591862784333371,
+      "loss": 2.7975,
+      "step": 10120
+    },
+    {
+      "epoch": 0.5877830482904681,
+      "grad_norm": 0.13381877541542053,
+      "learning_rate": 0.00015881065348300957,
+      "loss": 2.803,
+      "step": 10130
+    },
+    {
+      "epoch": 0.5883632882196789,
+      "grad_norm": 0.12950487434864044,
+      "learning_rate": 0.00015843518019617074,
+      "loss": 2.7968,
+      "step": 10140
+    },
+    {
+      "epoch": 0.5889435281488896,
+      "grad_norm": 0.12617862224578857,
+      "learning_rate": 0.0001580598599553527,
+      "loss": 2.7882,
+      "step": 10150
+    },
+    {
+      "epoch": 0.5895237680781003,
+      "grad_norm": 0.13075517117977142,
+      "learning_rate": 0.00015768469414252376,
+      "loss": 2.8059,
+      "step": 10160
+    },
+    {
+      "epoch": 0.590104008007311,
+      "grad_norm": 0.13265134394168854,
+      "learning_rate": 0.0001573096841390839,
+      "loss": 2.7919,
+      "step": 10170
+    },
+    {
+      "epoch": 0.5906842479365217,
+      "grad_norm": 0.12705199420452118,
+      "learning_rate": 0.00015693483132585908,
+      "loss": 2.7983,
+      "step": 10180
+    },
+    {
+      "epoch": 0.5912644878657325,
+      "grad_norm": 0.13738813996315002,
+      "learning_rate": 0.00015656013708309672,
+      "loss": 2.7949,
+      "step": 10190
+    },
+    {
+      "epoch": 0.5918447277949432,
+      "grad_norm": 0.13547635078430176,
+      "learning_rate": 0.0001561856027904603,
+      "loss": 2.7971,
+      "step": 10200
+    },
+    {
+      "epoch": 0.592424967724154,
+      "grad_norm": 0.12752999365329742,
+      "learning_rate": 0.00015581122982702425,
+      "loss": 2.798,
+      "step": 10210
+    },
+    {
+      "epoch": 0.5930052076533646,
+      "grad_norm": 0.12572290003299713,
+      "learning_rate": 0.00015543701957126916,
+      "loss": 2.7963,
+      "step": 10220
+    },
+    {
+      "epoch": 0.5935854475825754,
+      "grad_norm": 0.13692601025104523,
+      "learning_rate": 0.0001550629734010762,
+      "loss": 2.7906,
+      "step": 10230
+    },
+    {
+      "epoch": 0.5941656875117861,
+      "grad_norm": 0.12763461470603943,
+      "learning_rate": 0.00015468909269372266,
+      "loss": 2.7867,
+      "step": 10240
+    },
+    {
+      "epoch": 0.5947459274409969,
+      "grad_norm": 0.12994034588336945,
+      "learning_rate": 0.00015431537882587649,
+      "loss": 2.792,
+      "step": 10250
+    },
+    {
+      "epoch": 0.5953261673702076,
+      "grad_norm": 0.13037025928497314,
+      "learning_rate": 0.00015394183317359126,
+      "loss": 2.7964,
+      "step": 10260
+    },
+    {
+      "epoch": 0.5959064072994184,
+      "grad_norm": 0.13446973264217377,
+      "learning_rate": 0.00015356845711230128,
+      "loss": 2.7919,
+      "step": 10270
+    },
+    {
+      "epoch": 0.596486647228629,
+      "grad_norm": 0.1517745852470398,
+      "learning_rate": 0.00015319525201681617,
+      "loss": 2.7862,
+      "step": 10280
+    },
+    {
+      "epoch": 0.5970668871578397,
+      "grad_norm": 0.13981299102306366,
+      "learning_rate": 0.00015282221926131632,
+      "loss": 2.783,
+      "step": 10290
+    },
+    {
+      "epoch": 0.5976471270870505,
+      "grad_norm": 0.12882374227046967,
+      "learning_rate": 0.00015244936021934733,
+      "loss": 2.794,
+      "step": 10300
+    },
+    {
+      "epoch": 0.5982273670162612,
+      "grad_norm": 0.12829270958900452,
+      "learning_rate": 0.00015207667626381528,
+      "loss": 2.7928,
+      "step": 10310
+    },
+    {
+      "epoch": 0.598807606945472,
+      "grad_norm": 0.13218654692173004,
+      "learning_rate": 0.0001517041687669816,
+      "loss": 2.7832,
+      "step": 10320
+    },
+    {
+      "epoch": 0.5993878468746827,
+      "grad_norm": 0.13118650019168854,
+      "learning_rate": 0.0001513318391004578,
+      "loss": 2.7923,
+      "step": 10330
+    },
+    {
+      "epoch": 0.5999680868038935,
+      "grad_norm": 0.12444707006216049,
+      "learning_rate": 0.00015095968863520088,
+      "loss": 2.7941,
+      "step": 10340
+    },
+    {
+      "epoch": 0.6005483267331041,
+      "grad_norm": 0.13274915516376495,
+      "learning_rate": 0.00015058771874150762,
+      "loss": 2.7907,
+      "step": 10350
+    },
+    {
+      "epoch": 0.6011285666623148,
+      "grad_norm": 0.1304982304573059,
+      "learning_rate": 0.00015021593078901025,
+      "loss": 2.7849,
+      "step": 10360
+    },
+    {
+      "epoch": 0.6017088065915256,
+      "grad_norm": 0.12944455444812775,
+      "learning_rate": 0.000149844326146671,
+      "loss": 2.7902,
+      "step": 10370
+    },
+    {
+      "epoch": 0.6022890465207363,
+      "grad_norm": 0.13214483857154846,
+      "learning_rate": 0.000149472906182777,
+      "loss": 2.7986,
+      "step": 10380
+    },
+    {
+      "epoch": 0.6028692864499471,
+      "grad_norm": 0.1335555911064148,
+      "learning_rate": 0.00014910167226493562,
+      "loss": 2.7839,
+      "step": 10390
+    },
+    {
+      "epoch": 0.6034495263791578,
+      "grad_norm": 0.12958268821239471,
+      "learning_rate": 0.0001487306257600688,
+      "loss": 2.7837,
+      "step": 10400
+    },
+    {
+      "epoch": 0.6040297663083685,
+      "grad_norm": 0.13215544819831848,
+      "learning_rate": 0.00014835976803440886,
+      "loss": 2.7761,
+      "step": 10410
+    },
+    {
+      "epoch": 0.6046100062375792,
+      "grad_norm": 0.12011713534593582,
+      "learning_rate": 0.00014798910045349265,
+      "loss": 2.7899,
+      "step": 10420
+    },
+    {
+      "epoch": 0.60519024616679,
+      "grad_norm": 0.1265355348587036,
+      "learning_rate": 0.00014761862438215708,
+      "loss": 2.7856,
+      "step": 10430
+    },
+    {
+      "epoch": 0.6057704860960007,
+      "grad_norm": 0.12620458006858826,
+      "learning_rate": 0.0001472483411845339,
+      "loss": 2.795,
+      "step": 10440
+    },
+    {
+      "epoch": 0.6063507260252115,
+      "grad_norm": 0.1332314908504486,
+      "learning_rate": 0.0001468782522240446,
+      "loss": 2.7867,
+      "step": 10450
+    },
+    {
+      "epoch": 0.6069309659544222,
+      "grad_norm": 0.13342437148094177,
+      "learning_rate": 0.0001465083588633955,
+      "loss": 2.7831,
+      "step": 10460
+    },
+    {
+      "epoch": 0.6075112058836328,
+      "grad_norm": 0.13332657516002655,
+      "learning_rate": 0.00014613866246457265,
+      "loss": 2.7936,
+      "step": 10470
+    },
+    {
+      "epoch": 0.6080914458128436,
+      "grad_norm": 0.13002225756645203,
+      "learning_rate": 0.00014576916438883698,
+      "loss": 2.7875,
+      "step": 10480
+    },
+    {
+      "epoch": 0.6086716857420543,
+      "grad_norm": 0.13596920669078827,
+      "learning_rate": 0.0001453998659967192,
+      "loss": 2.7938,
+      "step": 10490
+    },
+    {
+      "epoch": 0.6092519256712651,
+      "grad_norm": 0.13465642929077148,
+      "learning_rate": 0.00014503076864801447,
+      "loss": 2.7886,
+      "step": 10500
+    },
+    {
+      "epoch": 0.6098321656004758,
+      "grad_norm": 0.13074541091918945,
+      "learning_rate": 0.00014466187370177806,
+      "loss": 2.7876,
+      "step": 10510
+    },
+    {
+      "epoch": 0.6104124055296866,
+      "grad_norm": 0.12734749913215637,
+      "learning_rate": 0.00014429318251631972,
+      "loss": 2.7756,
+      "step": 10520
+    },
+    {
+      "epoch": 0.6109926454588972,
+      "grad_norm": 0.12566278874874115,
+      "learning_rate": 0.0001439246964491991,
+      "loss": 2.7902,
+      "step": 10530
+    },
+    {
+      "epoch": 0.611572885388108,
+      "grad_norm": 0.13200882077217102,
+      "learning_rate": 0.0001435564168572204,
+      "loss": 2.781,
+      "step": 10540
+    },
+    {
+      "epoch": 0.6121531253173187,
+      "grad_norm": 0.13373680412769318,
+      "learning_rate": 0.00014318834509642766,
+      "loss": 2.7913,
+      "step": 10550
+    },
+    {
+      "epoch": 0.6127333652465294,
+      "grad_norm": 0.12818191945552826,
+      "learning_rate": 0.0001428204825220998,
+      "loss": 2.786,
+      "step": 10560
+    },
+    {
+      "epoch": 0.6133136051757402,
+      "grad_norm": 0.12577253580093384,
+      "learning_rate": 0.00014245283048874518,
+      "loss": 2.785,
+      "step": 10570
+    },
+    {
+      "epoch": 0.6138938451049509,
+      "grad_norm": 0.1309729665517807,
+      "learning_rate": 0.0001420853903500973,
+      "loss": 2.787,
+      "step": 10580
+    },
+    {
+      "epoch": 0.6144740850341617,
+      "grad_norm": 0.13818277418613434,
+      "learning_rate": 0.00014171816345910903,
+      "loss": 2.7856,
+      "step": 10590
+    },
+    {
+      "epoch": 0.6150543249633723,
+      "grad_norm": 0.13121308386325836,
+      "learning_rate": 0.00014135115116794834,
+      "loss": 2.7882,
+      "step": 10600
+    },
+    {
+      "epoch": 0.6156345648925831,
+      "grad_norm": 0.12498753517866135,
+      "learning_rate": 0.00014098435482799303,
+      "loss": 2.7865,
+      "step": 10610
+    },
+    {
+      "epoch": 0.6162148048217938,
+      "grad_norm": 0.12569987773895264,
+      "learning_rate": 0.00014061777578982547,
+      "loss": 2.7822,
+      "step": 10620
+    },
+    {
+      "epoch": 0.6167950447510046,
+      "grad_norm": 0.1260175108909607,
+      "learning_rate": 0.0001402514154032282,
+      "loss": 2.7785,
+      "step": 10630
+    },
+    {
+      "epoch": 0.6173752846802153,
+      "grad_norm": 0.12860171496868134,
+      "learning_rate": 0.00013988527501717848,
+      "loss": 2.787,
+      "step": 10640
+    },
+    {
+      "epoch": 0.617955524609426,
+      "grad_norm": 0.13064344227313995,
+      "learning_rate": 0.0001395193559798437,
+      "loss": 2.7959,
+      "step": 10650
+    },
+    {
+      "epoch": 0.6185357645386367,
+      "grad_norm": 0.12624002993106842,
+      "learning_rate": 0.0001391536596385759,
+      "loss": 2.7757,
+      "step": 10660
+    },
+    {
+      "epoch": 0.6191160044678474,
+      "grad_norm": 0.1249968633055687,
+      "learning_rate": 0.00013878818733990738,
+      "loss": 2.7928,
+      "step": 10670
+    },
+    {
+      "epoch": 0.6196962443970582,
+      "grad_norm": 0.1293274611234665,
+      "learning_rate": 0.00013842294042954554,
+      "loss": 2.7823,
+      "step": 10680
+    },
+    {
+      "epoch": 0.6202764843262689,
+      "grad_norm": 0.1331176459789276,
+      "learning_rate": 0.0001380579202523676,
+      "loss": 2.7898,
+      "step": 10690
+    },
+    {
+      "epoch": 0.6208567242554797,
+      "grad_norm": 0.12228766083717346,
+      "learning_rate": 0.00013769312815241626,
+      "loss": 2.7947,
+      "step": 10700
+    },
+    {
+      "epoch": 0.6214369641846904,
+      "grad_norm": 0.12410891056060791,
+      "learning_rate": 0.0001373285654728941,
+      "loss": 2.7706,
+      "step": 10710
+    },
+    {
+      "epoch": 0.622017204113901,
+      "grad_norm": 0.1261039823293686,
+      "learning_rate": 0.00013696423355615914,
+      "loss": 2.7705,
+      "step": 10720
+    },
+    {
+      "epoch": 0.6225974440431118,
+      "grad_norm": 0.13225802779197693,
+      "learning_rate": 0.00013660013374371973,
+      "loss": 2.7883,
+      "step": 10730
+    },
+    {
+      "epoch": 0.6231776839723225,
+      "grad_norm": 0.12750951945781708,
+      "learning_rate": 0.00013623626737622942,
+      "loss": 2.7895,
+      "step": 10740
+    },
+    {
+      "epoch": 0.6237579239015333,
+      "grad_norm": 0.1364651620388031,
+      "learning_rate": 0.00013587263579348239,
+      "loss": 2.7772,
+      "step": 10750
+    },
+    {
+      "epoch": 0.624338163830744,
+      "grad_norm": 0.13347889482975006,
+      "learning_rate": 0.00013550924033440813,
+      "loss": 2.787,
+      "step": 10760
+    },
+    {
+      "epoch": 0.6249184037599548,
+      "grad_norm": 0.1324913650751114,
+      "learning_rate": 0.0001351460823370669,
+      "loss": 2.786,
+      "step": 10770
+    },
+    {
+      "epoch": 0.6254986436891655,
+      "grad_norm": 0.1301848590373993,
+      "learning_rate": 0.00013478316313864433,
+      "loss": 2.7848,
+      "step": 10780
+    },
+    {
+      "epoch": 0.6260788836183762,
+      "grad_norm": 0.1281544417142868,
+      "learning_rate": 0.00013442048407544705,
+      "loss": 2.7803,
+      "step": 10790
+    },
+    {
+      "epoch": 0.6266591235475869,
+      "grad_norm": 0.1410907804965973,
+      "learning_rate": 0.0001340580464828974,
+      "loss": 2.7809,
+      "step": 10800
+    },
+    {
+      "epoch": 0.6272393634767977,
+      "grad_norm": 0.1318545639514923,
+      "learning_rate": 0.0001336958516955284,
+      "loss": 2.773,
+      "step": 10810
+    },
+    {
+      "epoch": 0.6278196034060084,
+      "grad_norm": 0.12339744716882706,
+      "learning_rate": 0.00013333390104697937,
+      "loss": 2.774,
+      "step": 10820
+    },
+    {
+      "epoch": 0.6283998433352191,
+      "grad_norm": 0.13089622557163239,
+      "learning_rate": 0.0001329721958699904,
+      "loss": 2.7764,
+      "step": 10830
+    },
+    {
+      "epoch": 0.6289800832644299,
+      "grad_norm": 0.1368187516927719,
+      "learning_rate": 0.00013261073749639785,
+      "loss": 2.7855,
+      "step": 10840
+    },
+    {
+      "epoch": 0.6295603231936405,
+      "grad_norm": 0.1409740447998047,
+      "learning_rate": 0.00013224952725712948,
+      "loss": 2.7771,
+      "step": 10850
+    },
+    {
+      "epoch": 0.6301405631228513,
+      "grad_norm": 0.1319727897644043,
+      "learning_rate": 0.000131888566482199,
+      "loss": 2.7745,
+      "step": 10860
+    },
+    {
+      "epoch": 0.630720803052062,
+      "grad_norm": 0.1281840205192566,
+      "learning_rate": 0.00013152785650070198,
+      "loss": 2.7799,
+      "step": 10870
+    },
+    {
+      "epoch": 0.6313010429812728,
+      "grad_norm": 0.1265205293893814,
+      "learning_rate": 0.00013116739864081018,
+      "loss": 2.7767,
+      "step": 10880
+    },
+    {
+      "epoch": 0.6318812829104835,
+      "grad_norm": 0.1335509866476059,
+      "learning_rate": 0.00013080719422976732,
+      "loss": 2.7734,
+      "step": 10890
+    },
+    {
+      "epoch": 0.6324615228396943,
+      "grad_norm": 0.1291695237159729,
+      "learning_rate": 0.00013044724459388375,
+      "loss": 2.7717,
+      "step": 10900
+    },
+    {
+      "epoch": 0.6330417627689049,
+      "grad_norm": 0.1341160535812378,
+      "learning_rate": 0.00013008755105853174,
+      "loss": 2.7797,
+      "step": 10910
+    },
+    {
+      "epoch": 0.6336220026981156,
+      "grad_norm": 0.13332979381084442,
+      "learning_rate": 0.00012972811494814062,
+      "loss": 2.7757,
+      "step": 10920
+    },
+    {
+      "epoch": 0.6342022426273264,
+      "grad_norm": 0.1305907964706421,
+      "learning_rate": 0.00012936893758619172,
+      "loss": 2.7826,
+      "step": 10930
+    },
+    {
+      "epoch": 0.6347824825565371,
+      "grad_norm": 0.1261076033115387,
+      "learning_rate": 0.00012901002029521377,
+      "loss": 2.7736,
+      "step": 10940
+    },
+    {
+      "epoch": 0.6353627224857479,
+      "grad_norm": 0.12599441409111023,
+      "learning_rate": 0.00012865136439677772,
+      "loss": 2.7678,
+      "step": 10950
+    },
+    {
+      "epoch": 0.6359429624149586,
+      "grad_norm": 0.13536348938941956,
+      "learning_rate": 0.0001282929712114923,
+      "loss": 2.7736,
+      "step": 10960
+    },
+    {
+      "epoch": 0.6365232023441693,
+      "grad_norm": 0.1305113434791565,
+      "learning_rate": 0.00012793484205899874,
+      "loss": 2.7856,
+      "step": 10970
+    },
+    {
+      "epoch": 0.63710344227338,
+      "grad_norm": 0.1301414519548416,
+      "learning_rate": 0.00012757697825796602,
+      "loss": 2.7801,
+      "step": 10980
+    },
+    {
+      "epoch": 0.6376836822025908,
+      "grad_norm": 0.12686558067798615,
+      "learning_rate": 0.00012721938112608623,
+      "loss": 2.7767,
+      "step": 10990
+    },
+    {
+      "epoch": 0.6382639221318015,
+      "grad_norm": 0.1347103714942932,
+      "learning_rate": 0.00012686205198006938,
+      "loss": 2.7718,
+      "step": 11000
+    },
+    {
+      "epoch": 0.6382639221318015,
+      "eval_loss": 2.7470815181732178,
+      "eval_runtime": 5.3884,
+      "eval_samples_per_second": 803.575,
+      "eval_steps_per_second": 1.67,
+      "step": 11000
+    },
+    {
+      "epoch": 0.6388441620610122,
+      "grad_norm": 0.1285264790058136,
+      "learning_rate": 0.00012650499213563894,
+      "loss": 2.7835,
+      "step": 11010
+    },
+    {
+      "epoch": 0.639424401990223,
+      "grad_norm": 0.13870897889137268,
+      "learning_rate": 0.00012614820290752653,
+      "loss": 2.771,
+      "step": 11020
+    },
+    {
+      "epoch": 0.6400046419194337,
+      "grad_norm": 0.12220434844493866,
+      "learning_rate": 0.0001257916856094675,
+      "loss": 2.7725,
+      "step": 11030
+    },
+    {
+      "epoch": 0.6405848818486444,
+      "grad_norm": 0.12159667909145355,
+      "learning_rate": 0.00012543544155419598,
+      "loss": 2.7679,
+      "step": 11040
+    },
+    {
+      "epoch": 0.6411651217778551,
+      "grad_norm": 0.1229986697435379,
+      "learning_rate": 0.0001250794720534398,
+      "loss": 2.7774,
+      "step": 11050
+    },
+    {
+      "epoch": 0.6417453617070659,
+      "grad_norm": 0.12473072856664658,
+      "learning_rate": 0.00012472377841791604,
+      "loss": 2.7729,
+      "step": 11060
+    },
+    {
+      "epoch": 0.6423256016362766,
+      "grad_norm": 0.12544026970863342,
+      "learning_rate": 0.0001243683619573258,
+      "loss": 2.7581,
+      "step": 11070
+    },
+    {
+      "epoch": 0.6429058415654874,
+      "grad_norm": 0.13708697259426117,
+      "learning_rate": 0.0001240132239803498,
+      "loss": 2.7734,
+      "step": 11080
+    },
+    {
+      "epoch": 0.6434860814946981,
+      "grad_norm": 0.13081100583076477,
+      "learning_rate": 0.00012365836579464332,
+      "loss": 2.7763,
+      "step": 11090
+    },
+    {
+      "epoch": 0.6440663214239087,
+      "grad_norm": 0.13071762025356293,
+      "learning_rate": 0.00012330378870683124,
+      "loss": 2.7725,
+      "step": 11100
+    },
+    {
+      "epoch": 0.6446465613531195,
+      "grad_norm": 0.12263841927051544,
+      "learning_rate": 0.00012294949402250378,
+      "loss": 2.7615,
+      "step": 11110
+    },
+    {
+      "epoch": 0.6452268012823302,
+      "grad_norm": 0.12744446098804474,
+      "learning_rate": 0.00012259548304621078,
+      "loss": 2.773,
+      "step": 11120
+    },
+    {
+      "epoch": 0.645807041211541,
+      "grad_norm": 0.12339978665113449,
+      "learning_rate": 0.00012224175708145797,
+      "loss": 2.7672,
+      "step": 11130
+    },
+    {
+      "epoch": 0.6463872811407517,
+      "grad_norm": 0.12739062309265137,
+      "learning_rate": 0.00012188831743070125,
+      "loss": 2.7768,
+      "step": 11140
+    },
+    {
+      "epoch": 0.6469675210699625,
+      "grad_norm": 0.1270364671945572,
+      "learning_rate": 0.00012153516539534253,
+      "loss": 2.7759,
+      "step": 11150
+    },
+    {
+      "epoch": 0.6475477609991731,
+      "grad_norm": 0.12549492716789246,
+      "learning_rate": 0.00012118230227572467,
+      "loss": 2.7727,
+      "step": 11160
+    },
+    {
+      "epoch": 0.6481280009283839,
+      "grad_norm": 0.13069987297058105,
+      "learning_rate": 0.00012082972937112646,
+      "loss": 2.7671,
+      "step": 11170
+    },
+    {
+      "epoch": 0.6487082408575946,
+      "grad_norm": 0.1319432556629181,
+      "learning_rate": 0.00012047744797975848,
+      "loss": 2.7725,
+      "step": 11180
+    },
+    {
+      "epoch": 0.6492884807868053,
+      "grad_norm": 0.1310606598854065,
+      "learning_rate": 0.00012012545939875747,
+      "loss": 2.7731,
+      "step": 11190
+    },
+    {
+      "epoch": 0.6498687207160161,
+      "grad_norm": 0.12737832963466644,
+      "learning_rate": 0.00011977376492418245,
+      "loss": 2.7681,
+      "step": 11200
+    },
+    {
+      "epoch": 0.6504489606452268,
+      "grad_norm": 0.12653803825378418,
+      "learning_rate": 0.00011942236585100926,
+      "loss": 2.7698,
+      "step": 11210
+    },
+    {
+      "epoch": 0.6510292005744376,
+      "grad_norm": 0.12586897611618042,
+      "learning_rate": 0.00011907126347312605,
+      "loss": 2.7767,
+      "step": 11220
+    },
+    {
+      "epoch": 0.6516094405036482,
+      "grad_norm": 0.12253769487142563,
+      "learning_rate": 0.0001187204590833287,
+      "loss": 2.7547,
+      "step": 11230
+    },
+    {
+      "epoch": 0.652189680432859,
+      "grad_norm": 0.12751977145671844,
+      "learning_rate": 0.00011836995397331554,
+      "loss": 2.7699,
+      "step": 11240
+    },
+    {
+      "epoch": 0.6527699203620697,
+      "grad_norm": 0.12828411161899567,
+      "learning_rate": 0.00011801974943368321,
+      "loss": 2.7704,
+      "step": 11250
+    },
+    {
+      "epoch": 0.6533501602912805,
+      "grad_norm": 0.12598510086536407,
+      "learning_rate": 0.00011766984675392147,
+      "loss": 2.7734,
+      "step": 11260
+    },
+    {
+      "epoch": 0.6539304002204912,
+      "grad_norm": 0.12886223196983337,
+      "learning_rate": 0.00011732024722240869,
+      "loss": 2.7621,
+      "step": 11270
+    },
+    {
+      "epoch": 0.654510640149702,
+      "grad_norm": 0.12585338950157166,
+      "learning_rate": 0.00011697095212640699,
+      "loss": 2.7658,
+      "step": 11280
+    },
+    {
+      "epoch": 0.6550908800789126,
+      "grad_norm": 0.1305324286222458,
+      "learning_rate": 0.00011662196275205736,
+      "loss": 2.7719,
+      "step": 11290
+    },
+    {
+      "epoch": 0.6556711200081233,
+      "grad_norm": 0.12806616723537445,
+      "learning_rate": 0.00011627328038437537,
+      "loss": 2.7749,
+      "step": 11300
+    },
+    {
+      "epoch": 0.6562513599373341,
+      "grad_norm": 0.13020043075084686,
+      "learning_rate": 0.00011592490630724602,
+      "loss": 2.7611,
+      "step": 11310
+    },
+    {
+      "epoch": 0.6568315998665448,
+      "grad_norm": 0.13302190601825714,
+      "learning_rate": 0.00011557684180341901,
+      "loss": 2.7708,
+      "step": 11320
+    },
+    {
+      "epoch": 0.6574118397957556,
+      "grad_norm": 0.1290377378463745,
+      "learning_rate": 0.00011522908815450448,
+      "loss": 2.7743,
+      "step": 11330
+    },
+    {
+      "epoch": 0.6579920797249663,
+      "grad_norm": 0.12511534988880157,
+      "learning_rate": 0.00011488164664096777,
+      "loss": 2.7712,
+      "step": 11340
+    },
+    {
+      "epoch": 0.658572319654177,
+      "grad_norm": 0.12530122697353363,
+      "learning_rate": 0.00011453451854212489,
+      "loss": 2.7757,
+      "step": 11350
+    },
+    {
+      "epoch": 0.6591525595833877,
+      "grad_norm": 0.13579903542995453,
+      "learning_rate": 0.00011418770513613783,
+      "loss": 2.7633,
+      "step": 11360
+    },
+    {
+      "epoch": 0.6597327995125984,
+      "grad_norm": 0.12218291312456131,
+      "learning_rate": 0.00011384120770000997,
+      "loss": 2.7704,
+      "step": 11370
+    },
+    {
+      "epoch": 0.6603130394418092,
+      "grad_norm": 0.12762351334095,
+      "learning_rate": 0.00011349502750958101,
+      "loss": 2.7602,
+      "step": 11380
+    },
+    {
+      "epoch": 0.6608932793710199,
+      "grad_norm": 0.1221364438533783,
+      "learning_rate": 0.00011314916583952287,
+      "loss": 2.7779,
+      "step": 11390
+    },
+    {
+      "epoch": 0.6614735193002307,
+      "grad_norm": 0.1309012919664383,
+      "learning_rate": 0.00011280362396333433,
+      "loss": 2.7678,
+      "step": 11400
+    },
+    {
+      "epoch": 0.6620537592294413,
+      "grad_norm": 0.12953191995620728,
+      "learning_rate": 0.00011245840315333685,
+      "loss": 2.7707,
+      "step": 11410
+    },
+    {
+      "epoch": 0.6626339991586521,
+      "grad_norm": 0.12538152933120728,
+      "learning_rate": 0.00011211350468066954,
+      "loss": 2.7684,
+      "step": 11420
+    },
+    {
+      "epoch": 0.6632142390878628,
+      "grad_norm": 0.1254589706659317,
+      "learning_rate": 0.00011176892981528478,
+      "loss": 2.7689,
+      "step": 11430
+    },
+    {
+      "epoch": 0.6637944790170736,
+      "grad_norm": 0.12923027575016022,
+      "learning_rate": 0.00011142467982594316,
+      "loss": 2.7776,
+      "step": 11440
+    },
+    {
+      "epoch": 0.6643747189462843,
+      "grad_norm": 0.13753275573253632,
+      "learning_rate": 0.00011108075598020944,
+      "loss": 2.7567,
+      "step": 11450
+    },
+    {
+      "epoch": 0.664954958875495,
+      "grad_norm": 0.1266188770532608,
+      "learning_rate": 0.00011073715954444712,
+      "loss": 2.7705,
+      "step": 11460
+    },
+    {
+      "epoch": 0.6655351988047058,
+      "grad_norm": 0.12759283185005188,
+      "learning_rate": 0.00011039389178381427,
+      "loss": 2.7604,
+      "step": 11470
+    },
+    {
+      "epoch": 0.6661154387339164,
+      "grad_norm": 0.1300540566444397,
+      "learning_rate": 0.0001100509539622588,
+      "loss": 2.7636,
+      "step": 11480
+    },
+    {
+      "epoch": 0.6666956786631272,
+      "grad_norm": 0.12630164623260498,
+      "learning_rate": 0.00010970834734251363,
+      "loss": 2.7766,
+      "step": 11490
+    },
+    {
+      "epoch": 0.6672759185923379,
+      "grad_norm": 0.1265067309141159,
+      "learning_rate": 0.00010936607318609218,
+      "loss": 2.7604,
+      "step": 11500
+    },
+    {
+      "epoch": 0.6678561585215487,
+      "grad_norm": 0.12710276246070862,
+      "learning_rate": 0.00010902413275328389,
+      "loss": 2.7562,
+      "step": 11510
+    },
+    {
+      "epoch": 0.6684363984507594,
+      "grad_norm": 0.12407544255256653,
+      "learning_rate": 0.00010868252730314918,
+      "loss": 2.7669,
+      "step": 11520
+    },
+    {
+      "epoch": 0.6690166383799702,
+      "grad_norm": 0.12599880993366241,
+      "learning_rate": 0.00010834125809351512,
+      "loss": 2.772,
+      "step": 11530
+    },
+    {
+      "epoch": 0.6695968783091808,
+      "grad_norm": 0.12807178497314453,
+      "learning_rate": 0.00010800032638097067,
+      "loss": 2.759,
+      "step": 11540
+    },
+    {
+      "epoch": 0.6701771182383915,
+      "grad_norm": 0.13285909593105316,
+      "learning_rate": 0.00010765973342086204,
+      "loss": 2.7591,
+      "step": 11550
+    },
+    {
+      "epoch": 0.6707573581676023,
+      "grad_norm": 0.1285742074251175,
+      "learning_rate": 0.00010731948046728834,
+      "loss": 2.7567,
+      "step": 11560
+    },
+    {
+      "epoch": 0.671337598096813,
+      "grad_norm": 0.1229885146021843,
+      "learning_rate": 0.00010697956877309651,
+      "loss": 2.7585,
+      "step": 11570
+    },
+    {
+      "epoch": 0.6719178380260238,
+      "grad_norm": 0.12948161363601685,
+      "learning_rate": 0.00010663999958987702,
+      "loss": 2.76,
+      "step": 11580
+    },
+    {
+      "epoch": 0.6724980779552345,
+      "grad_norm": 0.12574850022792816,
+      "learning_rate": 0.00010630077416795919,
+      "loss": 2.7581,
+      "step": 11590
+    },
+    {
+      "epoch": 0.6730783178844452,
+      "grad_norm": 0.12991534173488617,
+      "learning_rate": 0.00010596189375640646,
+      "loss": 2.7543,
+      "step": 11600
+    },
+    {
+      "epoch": 0.6736585578136559,
+      "grad_norm": 0.12388639152050018,
+      "learning_rate": 0.00010562335960301225,
+      "loss": 2.7503,
+      "step": 11610
+    },
+    {
+      "epoch": 0.6742387977428667,
+      "grad_norm": 0.13579106330871582,
+      "learning_rate": 0.00010528517295429445,
+      "loss": 2.7579,
+      "step": 11620
+    },
+    {
+      "epoch": 0.6748190376720774,
+      "grad_norm": 0.1378217339515686,
+      "learning_rate": 0.00010494733505549197,
+      "loss": 2.7699,
+      "step": 11630
+    },
+    {
+      "epoch": 0.6753992776012882,
+      "grad_norm": 0.13772232830524445,
+      "learning_rate": 0.0001046098471505593,
+      "loss": 2.7658,
+      "step": 11640
+    },
+    {
+      "epoch": 0.6759795175304989,
+      "grad_norm": 0.13989433646202087,
+      "learning_rate": 0.00010427271048216214,
+      "loss": 2.767,
+      "step": 11650
+    },
+    {
+      "epoch": 0.6765597574597096,
+      "grad_norm": 0.13066108524799347,
+      "learning_rate": 0.00010393592629167326,
+      "loss": 2.7671,
+      "step": 11660
+    },
+    {
+      "epoch": 0.6771399973889203,
+      "grad_norm": 0.12652446329593658,
+      "learning_rate": 0.00010359949581916701,
+      "loss": 2.7602,
+      "step": 11670
+    },
+    {
+      "epoch": 0.677720237318131,
+      "grad_norm": 0.12439344823360443,
+      "learning_rate": 0.00010326342030341591,
+      "loss": 2.7597,
+      "step": 11680
+    },
+    {
+      "epoch": 0.6783004772473418,
+      "grad_norm": 0.12505538761615753,
+      "learning_rate": 0.00010292770098188511,
+      "loss": 2.7552,
+      "step": 11690
+    },
+    {
+      "epoch": 0.6788807171765525,
+      "grad_norm": 0.12654612958431244,
+      "learning_rate": 0.00010259233909072823,
+      "loss": 2.7624,
+      "step": 11700
+    },
+    {
+      "epoch": 0.6794609571057633,
+      "grad_norm": 0.12148457765579224,
+      "learning_rate": 0.00010225733586478315,
+      "loss": 2.7619,
+      "step": 11710
+    },
+    {
+      "epoch": 0.680041197034974,
+      "grad_norm": 0.12959624826908112,
+      "learning_rate": 0.00010192269253756648,
+      "loss": 2.7636,
+      "step": 11720
+    },
+    {
+      "epoch": 0.6806214369641846,
+      "grad_norm": 0.13695235550403595,
+      "learning_rate": 0.00010158841034127035,
+      "loss": 2.7599,
+      "step": 11730
+    },
+    {
+      "epoch": 0.6812016768933954,
+      "grad_norm": 0.1335146725177765,
+      "learning_rate": 0.00010125449050675655,
+      "loss": 2.7629,
+      "step": 11740
+    },
+    {
+      "epoch": 0.6817819168226061,
+      "grad_norm": 0.1221490204334259,
+      "learning_rate": 0.00010092093426355307,
+      "loss": 2.7578,
+      "step": 11750
+    },
+    {
+      "epoch": 0.6823621567518169,
+      "grad_norm": 0.12168210744857788,
+      "learning_rate": 0.00010058774283984887,
+      "loss": 2.7605,
+      "step": 11760
+    },
+    {
+      "epoch": 0.6829423966810276,
+      "grad_norm": 0.12987020611763,
+      "learning_rate": 0.00010025491746248963,
+      "loss": 2.7601,
+      "step": 11770
+    },
+    {
+      "epoch": 0.6835226366102384,
+      "grad_norm": 0.12631021440029144,
+      "learning_rate": 9.992245935697346e-05,
+      "loss": 2.7466,
+      "step": 11780
+    },
+    {
+      "epoch": 0.684102876539449,
+      "grad_norm": 0.12719561159610748,
+      "learning_rate": 9.959036974744562e-05,
+      "loss": 2.7544,
+      "step": 11790
+    },
+    {
+      "epoch": 0.6846831164686598,
+      "grad_norm": 0.12334798276424408,
+      "learning_rate": 9.925864985669509e-05,
+      "loss": 2.7511,
+      "step": 11800
+    },
+    {
+      "epoch": 0.6852633563978705,
+      "grad_norm": 0.12944762408733368,
+      "learning_rate": 9.892730090614917e-05,
+      "loss": 2.7651,
+      "step": 11810
+    },
+    {
+      "epoch": 0.6858435963270813,
+      "grad_norm": 0.1283605396747589,
+      "learning_rate": 9.859632411586935e-05,
+      "loss": 2.7533,
+      "step": 11820
+    },
+    {
+      "epoch": 0.686423836256292,
+      "grad_norm": 0.12465015053749084,
+      "learning_rate": 9.826572070454702e-05,
+      "loss": 2.7572,
+      "step": 11830
+    },
+    {
+      "epoch": 0.6870040761855027,
+      "grad_norm": 0.13003583252429962,
+      "learning_rate": 9.793549188949835e-05,
+      "loss": 2.7584,
+      "step": 11840
+    },
+    {
+      "epoch": 0.6875843161147134,
+      "grad_norm": 0.12272375077009201,
+      "learning_rate": 9.760563888666059e-05,
+      "loss": 2.7473,
+      "step": 11850
+    },
+    {
+      "epoch": 0.6881645560439241,
+      "grad_norm": 0.12487037479877472,
+      "learning_rate": 9.7276162910587e-05,
+      "loss": 2.7501,
+      "step": 11860
+    },
+    {
+      "epoch": 0.6887447959731349,
+      "grad_norm": 0.12201700359582901,
+      "learning_rate": 9.694706517444256e-05,
+      "loss": 2.7487,
+      "step": 11870
+    },
+    {
+      "epoch": 0.6893250359023456,
+      "grad_norm": 0.1306881606578827,
+      "learning_rate": 9.661834688999987e-05,
+      "loss": 2.7551,
+      "step": 11880
+    },
+    {
+      "epoch": 0.6899052758315564,
+      "grad_norm": 0.123641736805439,
+      "learning_rate": 9.629000926763371e-05,
+      "loss": 2.7461,
+      "step": 11890
+    },
+    {
+      "epoch": 0.6904855157607671,
+      "grad_norm": 0.13387881219387054,
+      "learning_rate": 9.596205351631791e-05,
+      "loss": 2.7595,
+      "step": 11900
+    },
+    {
+      "epoch": 0.6910657556899779,
+      "grad_norm": 0.12325596064329147,
+      "learning_rate": 9.563448084361979e-05,
+      "loss": 2.7546,
+      "step": 11910
+    },
+    {
+      "epoch": 0.6916459956191885,
+      "grad_norm": 0.12377669662237167,
+      "learning_rate": 9.530729245569614e-05,
+      "loss": 2.7551,
+      "step": 11920
+    },
+    {
+      "epoch": 0.6922262355483992,
+      "grad_norm": 0.12587293982505798,
+      "learning_rate": 9.498048955728917e-05,
+      "loss": 2.7536,
+      "step": 11930
+    },
+    {
+      "epoch": 0.69280647547761,
+      "grad_norm": 0.12992674112319946,
+      "learning_rate": 9.465407335172102e-05,
+      "loss": 2.7633,
+      "step": 11940
+    },
+    {
+      "epoch": 0.6933867154068207,
+      "grad_norm": 0.1415330022573471,
+      "learning_rate": 9.432804504089065e-05,
+      "loss": 2.7563,
+      "step": 11950
+    },
+    {
+      "epoch": 0.6939669553360315,
+      "grad_norm": 0.13113632798194885,
+      "learning_rate": 9.400240582526834e-05,
+      "loss": 2.7571,
+      "step": 11960
+    },
+    {
+      "epoch": 0.6945471952652422,
+      "grad_norm": 0.12729544937610626,
+      "learning_rate": 9.367715690389178e-05,
+      "loss": 2.753,
+      "step": 11970
+    },
+    {
+      "epoch": 0.6951274351944529,
+      "grad_norm": 0.12055703997612,
+      "learning_rate": 9.335229947436157e-05,
+      "loss": 2.7618,
+      "step": 11980
+    },
+    {
+      "epoch": 0.6957076751236636,
+      "grad_norm": 0.12297140061855316,
+      "learning_rate": 9.302783473283676e-05,
+      "loss": 2.7526,
+      "step": 11990
+    },
+    {
+      "epoch": 0.6962879150528744,
+      "grad_norm": 0.12334892898797989,
+      "learning_rate": 9.270376387403073e-05,
+      "loss": 2.7557,
+      "step": 12000
+    },
+    {
+      "epoch": 0.6962879150528744,
+      "eval_loss": 2.7262585163116455,
+      "eval_runtime": 5.3974,
+      "eval_samples_per_second": 802.231,
+      "eval_steps_per_second": 1.667,
+      "step": 12000
+    },
+    {
+      "epoch": 0.6968681549820851,
+      "grad_norm": 0.12369856983423233,
+      "learning_rate": 9.238008809120602e-05,
+      "loss": 2.7586,
+      "step": 12010
+    },
+    {
+      "epoch": 0.6974483949112958,
+      "grad_norm": 0.12686264514923096,
+      "learning_rate": 9.205680857617099e-05,
+      "loss": 2.7587,
+      "step": 12020
+    },
+    {
+      "epoch": 0.6980286348405066,
+      "grad_norm": 0.12067441642284393,
+      "learning_rate": 9.173392651927462e-05,
+      "loss": 2.7581,
+      "step": 12030
+    },
+    {
+      "epoch": 0.6986088747697172,
+      "grad_norm": 0.12240596115589142,
+      "learning_rate": 9.141144310940237e-05,
+      "loss": 2.7504,
+      "step": 12040
+    },
+    {
+      "epoch": 0.699189114698928,
+      "grad_norm": 0.12386554479598999,
+      "learning_rate": 9.10893595339722e-05,
+      "loss": 2.7533,
+      "step": 12050
+    },
+    {
+      "epoch": 0.6997693546281387,
+      "grad_norm": 0.1218453124165535,
+      "learning_rate": 9.076767697892923e-05,
+      "loss": 2.7528,
+      "step": 12060
+    },
+    {
+      "epoch": 0.7003495945573495,
+      "grad_norm": 0.12844803929328918,
+      "learning_rate": 9.04463966287426e-05,
+      "loss": 2.7485,
+      "step": 12070
+    },
+    {
+      "epoch": 0.7009298344865602,
+      "grad_norm": 0.1264135241508484,
+      "learning_rate": 9.01255196664001e-05,
+      "loss": 2.7659,
+      "step": 12080
+    },
+    {
+      "epoch": 0.701510074415771,
+      "grad_norm": 0.12528979778289795,
+      "learning_rate": 8.980504727340433e-05,
+      "loss": 2.751,
+      "step": 12090
+    },
+    {
+      "epoch": 0.7020903143449817,
+      "grad_norm": 0.1233081966638565,
+      "learning_rate": 8.948498062976825e-05,
+      "loss": 2.7523,
+      "step": 12100
+    },
+    {
+      "epoch": 0.7026705542741923,
+      "grad_norm": 0.1261734664440155,
+      "learning_rate": 8.916532091401065e-05,
+      "loss": 2.7573,
+      "step": 12110
+    },
+    {
+      "epoch": 0.7032507942034031,
+      "grad_norm": 0.12680375576019287,
+      "learning_rate": 8.884606930315223e-05,
+      "loss": 2.7523,
+      "step": 12120
+    },
+    {
+      "epoch": 0.7038310341326138,
+      "grad_norm": 0.12935955822467804,
+      "learning_rate": 8.852722697271084e-05,
+      "loss": 2.7474,
+      "step": 12130
+    },
+    {
+      "epoch": 0.7044112740618246,
+      "grad_norm": 0.12751561403274536,
+      "learning_rate": 8.820879509669731e-05,
+      "loss": 2.7596,
+      "step": 12140
+    },
+    {
+      "epoch": 0.7049915139910353,
+      "grad_norm": 0.11992467194795609,
+      "learning_rate": 8.789077484761116e-05,
+      "loss": 2.7521,
+      "step": 12150
+    },
+    {
+      "epoch": 0.7055717539202461,
+      "grad_norm": 0.12174970656633377,
+      "learning_rate": 8.757316739643621e-05,
+      "loss": 2.749,
+      "step": 12160
+    },
+    {
+      "epoch": 0.7061519938494567,
+      "grad_norm": 0.12190863490104675,
+      "learning_rate": 8.725597391263651e-05,
+      "loss": 2.7512,
+      "step": 12170
+    },
+    {
+      "epoch": 0.7067322337786675,
+      "grad_norm": 0.1286507099866867,
+      "learning_rate": 8.69391955641516e-05,
+      "loss": 2.7448,
+      "step": 12180
+    },
+    {
+      "epoch": 0.7073124737078782,
+      "grad_norm": 0.12180124223232269,
+      "learning_rate": 8.662283351739257e-05,
+      "loss": 2.7594,
+      "step": 12190
+    },
+    {
+      "epoch": 0.7078927136370889,
+      "grad_norm": 0.12321013957262039,
+      "learning_rate": 8.630688893723762e-05,
+      "loss": 2.748,
+      "step": 12200
+    },
+    {
+      "epoch": 0.7084729535662997,
+      "grad_norm": 0.12682393193244934,
+      "learning_rate": 8.599136298702776e-05,
+      "loss": 2.7451,
+      "step": 12210
+    },
+    {
+      "epoch": 0.7090531934955104,
+      "grad_norm": 0.12241560220718384,
+      "learning_rate": 8.567625682856255e-05,
+      "loss": 2.7472,
+      "step": 12220
+    },
+    {
+      "epoch": 0.7096334334247211,
+      "grad_norm": 0.1252165287733078,
+      "learning_rate": 8.536157162209601e-05,
+      "loss": 2.7484,
+      "step": 12230
+    },
+    {
+      "epoch": 0.7102136733539318,
+      "grad_norm": 0.12328600883483887,
+      "learning_rate": 8.504730852633197e-05,
+      "loss": 2.7518,
+      "step": 12240
+    },
+    {
+      "epoch": 0.7107939132831426,
+      "grad_norm": 0.12212226539850235,
+      "learning_rate": 8.473346869842003e-05,
+      "loss": 2.7409,
+      "step": 12250
+    },
+    {
+      "epoch": 0.7113741532123533,
+      "grad_norm": 0.12457285821437836,
+      "learning_rate": 8.442005329395137e-05,
+      "loss": 2.7466,
+      "step": 12260
+    },
+    {
+      "epoch": 0.7119543931415641,
+      "grad_norm": 0.13215821981430054,
+      "learning_rate": 8.410706346695432e-05,
+      "loss": 2.7566,
+      "step": 12270
+    },
+    {
+      "epoch": 0.7125346330707748,
+      "grad_norm": 0.12475377321243286,
+      "learning_rate": 8.379450036989014e-05,
+      "loss": 2.7524,
+      "step": 12280
+    },
+    {
+      "epoch": 0.7131148729999854,
+      "grad_norm": 0.12629540264606476,
+      "learning_rate": 8.348236515364903e-05,
+      "loss": 2.7526,
+      "step": 12290
+    },
+    {
+      "epoch": 0.7136951129291962,
+      "grad_norm": 0.1277417689561844,
+      "learning_rate": 8.317065896754548e-05,
+      "loss": 2.745,
+      "step": 12300
+    },
+    {
+      "epoch": 0.7142753528584069,
+      "grad_norm": 0.136353999376297,
+      "learning_rate": 8.285938295931435e-05,
+      "loss": 2.7577,
+      "step": 12310
+    },
+    {
+      "epoch": 0.7148555927876177,
+      "grad_norm": 0.12383802980184555,
+      "learning_rate": 8.254853827510646e-05,
+      "loss": 2.7461,
+      "step": 12320
+    },
+    {
+      "epoch": 0.7154358327168284,
+      "grad_norm": 0.12012789398431778,
+      "learning_rate": 8.223812605948458e-05,
+      "loss": 2.7471,
+      "step": 12330
+    },
+    {
+      "epoch": 0.7160160726460392,
+      "grad_norm": 0.12090951204299927,
+      "learning_rate": 8.192814745541884e-05,
+      "loss": 2.743,
+      "step": 12340
+    },
+    {
+      "epoch": 0.7165963125752499,
+      "grad_norm": 0.12389807403087616,
+      "learning_rate": 8.161860360428315e-05,
+      "loss": 2.7397,
+      "step": 12350
+    },
+    {
+      "epoch": 0.7171765525044606,
+      "grad_norm": 0.12754105031490326,
+      "learning_rate": 8.130949564585028e-05,
+      "loss": 2.7405,
+      "step": 12360
+    },
+    {
+      "epoch": 0.7177567924336713,
+      "grad_norm": 0.12422367185354233,
+      "learning_rate": 8.100082471828813e-05,
+      "loss": 2.7499,
+      "step": 12370
+    },
+    {
+      "epoch": 0.718337032362882,
+      "grad_norm": 0.12758466601371765,
+      "learning_rate": 8.069259195815542e-05,
+      "loss": 2.749,
+      "step": 12380
+    },
+    {
+      "epoch": 0.7189172722920928,
+      "grad_norm": 0.12679055333137512,
+      "learning_rate": 8.038479850039735e-05,
+      "loss": 2.7423,
+      "step": 12390
+    },
+    {
+      "epoch": 0.7194975122213035,
+      "grad_norm": 0.12068577855825424,
+      "learning_rate": 8.007744547834182e-05,
+      "loss": 2.745,
+      "step": 12400
+    },
+    {
+      "epoch": 0.7200777521505143,
+      "grad_norm": 0.1262856423854828,
+      "learning_rate": 7.977053402369482e-05,
+      "loss": 2.7466,
+      "step": 12410
+    },
+    {
+      "epoch": 0.7206579920797249,
+      "grad_norm": 0.12377514690160751,
+      "learning_rate": 7.946406526653641e-05,
+      "loss": 2.738,
+      "step": 12420
+    },
+    {
+      "epoch": 0.7212382320089357,
+      "grad_norm": 0.12496384978294373,
+      "learning_rate": 7.915804033531673e-05,
+      "loss": 2.7468,
+      "step": 12430
+    },
+    {
+      "epoch": 0.7218184719381464,
+      "grad_norm": 0.12281250953674316,
+      "learning_rate": 7.885246035685153e-05,
+      "loss": 2.7477,
+      "step": 12440
+    },
+    {
+      "epoch": 0.7223987118673572,
+      "grad_norm": 0.12681566178798676,
+      "learning_rate": 7.85473264563185e-05,
+      "loss": 2.7382,
+      "step": 12450
+    },
+    {
+      "epoch": 0.7229789517965679,
+      "grad_norm": 0.12231052666902542,
+      "learning_rate": 7.824263975725238e-05,
+      "loss": 2.7501,
+      "step": 12460
+    },
+    {
+      "epoch": 0.7235591917257786,
+      "grad_norm": 0.1255701333284378,
+      "learning_rate": 7.793840138154172e-05,
+      "loss": 2.7444,
+      "step": 12470
+    },
+    {
+      "epoch": 0.7241394316549893,
+      "grad_norm": 0.11985606700181961,
+      "learning_rate": 7.763461244942398e-05,
+      "loss": 2.7464,
+      "step": 12480
+    },
+    {
+      "epoch": 0.7247196715842,
+      "grad_norm": 0.12225638329982758,
+      "learning_rate": 7.733127407948182e-05,
+      "loss": 2.7449,
+      "step": 12490
+    },
+    {
+      "epoch": 0.7252999115134108,
+      "grad_norm": 0.12188901752233505,
+      "learning_rate": 7.702838738863907e-05,
+      "loss": 2.7308,
+      "step": 12500
+    },
+    {
+      "epoch": 0.7258801514426215,
+      "grad_norm": 0.12092640995979309,
+      "learning_rate": 7.672595349215597e-05,
+      "loss": 2.7393,
+      "step": 12510
+    },
+    {
+      "epoch": 0.7264603913718323,
+      "grad_norm": 0.13984154164791107,
+      "learning_rate": 7.642397350362604e-05,
+      "loss": 2.7399,
+      "step": 12520
+    },
+    {
+      "epoch": 0.727040631301043,
+      "grad_norm": 0.12072645872831345,
+      "learning_rate": 7.612244853497114e-05,
+      "loss": 2.7361,
+      "step": 12530
+    },
+    {
+      "epoch": 0.7276208712302538,
+      "grad_norm": 0.12388517707586288,
+      "learning_rate": 7.582137969643775e-05,
+      "loss": 2.7512,
+      "step": 12540
+    },
+    {
+      "epoch": 0.7282011111594644,
+      "grad_norm": 0.1254212111234665,
+      "learning_rate": 7.552076809659308e-05,
+      "loss": 2.755,
+      "step": 12550
+    },
+    {
+      "epoch": 0.7287813510886751,
+      "grad_norm": 0.12324492633342743,
+      "learning_rate": 7.522061484232022e-05,
+      "loss": 2.7484,
+      "step": 12560
+    },
+    {
+      "epoch": 0.7293615910178859,
+      "grad_norm": 0.12253236025571823,
+      "learning_rate": 7.492092103881518e-05,
+      "loss": 2.7395,
+      "step": 12570
+    },
+    {
+      "epoch": 0.7299418309470966,
+      "grad_norm": 0.1230064406991005,
+      "learning_rate": 7.462168778958169e-05,
+      "loss": 2.7499,
+      "step": 12580
+    },
+    {
+      "epoch": 0.7305220708763074,
+      "grad_norm": 0.1225818321108818,
+      "learning_rate": 7.43229161964281e-05,
+      "loss": 2.7338,
+      "step": 12590
+    },
+    {
+      "epoch": 0.7311023108055181,
+      "grad_norm": 0.12051333487033844,
+      "learning_rate": 7.402460735946269e-05,
+      "loss": 2.742,
+      "step": 12600
+    },
+    {
+      "epoch": 0.7316825507347288,
+      "grad_norm": 0.12291798740625381,
+      "learning_rate": 7.372676237708973e-05,
+      "loss": 2.7379,
+      "step": 12610
+    },
+    {
+      "epoch": 0.7322627906639395,
+      "grad_norm": 0.12274395674467087,
+      "learning_rate": 7.342938234600587e-05,
+      "loss": 2.7474,
+      "step": 12620
+    },
+    {
+      "epoch": 0.7328430305931503,
+      "grad_norm": 0.12541697919368744,
+      "learning_rate": 7.313246836119525e-05,
+      "loss": 2.7451,
+      "step": 12630
+    },
+    {
+      "epoch": 0.733423270522361,
+      "grad_norm": 0.1245257779955864,
+      "learning_rate": 7.28360215159265e-05,
+      "loss": 2.7523,
+      "step": 12640
+    },
+    {
+      "epoch": 0.7340035104515718,
+      "grad_norm": 0.12861104309558868,
+      "learning_rate": 7.254004290174788e-05,
+      "loss": 2.7459,
+      "step": 12650
+    },
+    {
+      "epoch": 0.7345837503807825,
+      "grad_norm": 0.11994564533233643,
+      "learning_rate": 7.224453360848358e-05,
+      "loss": 2.7383,
+      "step": 12660
+    },
+    {
+      "epoch": 0.7351639903099931,
+      "grad_norm": 0.11946084350347519,
+      "learning_rate": 7.194949472422998e-05,
+      "loss": 2.7489,
+      "step": 12670
+    },
+    {
+      "epoch": 0.7357442302392039,
+      "grad_norm": 0.129042848944664,
+      "learning_rate": 7.165492733535086e-05,
+      "loss": 2.7329,
+      "step": 12680
+    },
+    {
+      "epoch": 0.7363244701684146,
+      "grad_norm": 0.12452303618192673,
+      "learning_rate": 7.136083252647447e-05,
+      "loss": 2.7441,
+      "step": 12690
+    },
+    {
+      "epoch": 0.7369047100976254,
+      "grad_norm": 0.12241894006729126,
+      "learning_rate": 7.10672113804886e-05,
+      "loss": 2.7388,
+      "step": 12700
+    },
+    {
+      "epoch": 0.7374849500268361,
+      "grad_norm": 0.12106358259916306,
+      "learning_rate": 7.077406497853698e-05,
+      "loss": 2.7303,
+      "step": 12710
+    },
+    {
+      "epoch": 0.7380651899560469,
+      "grad_norm": 0.11599821597337723,
+      "learning_rate": 7.04813944000156e-05,
+      "loss": 2.7378,
+      "step": 12720
+    },
+    {
+      "epoch": 0.7386454298852576,
+      "grad_norm": 0.12181167304515839,
+      "learning_rate": 7.018920072256792e-05,
+      "loss": 2.745,
+      "step": 12730
+    },
+    {
+      "epoch": 0.7392256698144682,
+      "grad_norm": 0.12115510553121567,
+      "learning_rate": 6.989748502208186e-05,
+      "loss": 2.7309,
+      "step": 12740
+    },
+    {
+      "epoch": 0.739805909743679,
+      "grad_norm": 0.12130045890808105,
+      "learning_rate": 6.960624837268514e-05,
+      "loss": 2.7432,
+      "step": 12750
+    },
+    {
+      "epoch": 0.7403861496728897,
+      "grad_norm": 0.13140781223773956,
+      "learning_rate": 6.931549184674153e-05,
+      "loss": 2.7451,
+      "step": 12760
+    },
+    {
+      "epoch": 0.7409663896021005,
+      "grad_norm": 0.1199527308344841,
+      "learning_rate": 6.902521651484724e-05,
+      "loss": 2.7439,
+      "step": 12770
+    },
+    {
+      "epoch": 0.7415466295313112,
+      "grad_norm": 0.1253666877746582,
+      "learning_rate": 6.873542344582616e-05,
+      "loss": 2.7375,
+      "step": 12780
+    },
+    {
+      "epoch": 0.742126869460522,
+      "grad_norm": 0.12600301206111908,
+      "learning_rate": 6.844611370672691e-05,
+      "loss": 2.7401,
+      "step": 12790
+    },
+    {
+      "epoch": 0.7427071093897326,
+      "grad_norm": 0.12067416310310364,
+      "learning_rate": 6.815728836281823e-05,
+      "loss": 2.7335,
+      "step": 12800
+    },
+    {
+      "epoch": 0.7432873493189434,
+      "grad_norm": 0.12302874028682709,
+      "learning_rate": 6.786894847758527e-05,
+      "loss": 2.7447,
+      "step": 12810
+    },
+    {
+      "epoch": 0.7438675892481541,
+      "grad_norm": 0.12169067561626434,
+      "learning_rate": 6.75810951127257e-05,
+      "loss": 2.7416,
+      "step": 12820
+    },
+    {
+      "epoch": 0.7444478291773649,
+      "grad_norm": 0.12148208916187286,
+      "learning_rate": 6.729372932814571e-05,
+      "loss": 2.7341,
+      "step": 12830
+    },
+    {
+      "epoch": 0.7450280691065756,
+      "grad_norm": 0.1290818154811859,
+      "learning_rate": 6.700685218195639e-05,
+      "loss": 2.7445,
+      "step": 12840
+    },
+    {
+      "epoch": 0.7456083090357863,
+      "grad_norm": 0.12071933597326279,
+      "learning_rate": 6.672046473046921e-05,
+      "loss": 2.7398,
+      "step": 12850
+    },
+    {
+      "epoch": 0.746188548964997,
+      "grad_norm": 0.12640851736068726,
+      "learning_rate": 6.643456802819294e-05,
+      "loss": 2.7411,
+      "step": 12860
+    },
+    {
+      "epoch": 0.7467687888942077,
+      "grad_norm": 0.13351675868034363,
+      "learning_rate": 6.614916312782915e-05,
+      "loss": 2.728,
+      "step": 12870
+    },
+    {
+      "epoch": 0.7473490288234185,
+      "grad_norm": 0.12772291898727417,
+      "learning_rate": 6.58642510802685e-05,
+      "loss": 2.7412,
+      "step": 12880
+    },
+    {
+      "epoch": 0.7479292687526292,
+      "grad_norm": 0.12297698110342026,
+      "learning_rate": 6.55798329345872e-05,
+      "loss": 2.7494,
+      "step": 12890
+    },
+    {
+      "epoch": 0.74850950868184,
+      "grad_norm": 0.12183728814125061,
+      "learning_rate": 6.529590973804238e-05,
+      "loss": 2.7399,
+      "step": 12900
+    },
+    {
+      "epoch": 0.7490897486110507,
+      "grad_norm": 0.12221384793519974,
+      "learning_rate": 6.50124825360692e-05,
+      "loss": 2.7397,
+      "step": 12910
+    },
+    {
+      "epoch": 0.7496699885402613,
+      "grad_norm": 0.11953077465295792,
+      "learning_rate": 6.472955237227625e-05,
+      "loss": 2.7346,
+      "step": 12920
+    },
+    {
+      "epoch": 0.7502502284694721,
+      "grad_norm": 0.1241031214594841,
+      "learning_rate": 6.444712028844202e-05,
+      "loss": 2.7376,
+      "step": 12930
+    },
+    {
+      "epoch": 0.7508304683986828,
+      "grad_norm": 0.12113183736801147,
+      "learning_rate": 6.416518732451103e-05,
+      "loss": 2.7425,
+      "step": 12940
+    },
+    {
+      "epoch": 0.7514107083278936,
+      "grad_norm": 0.12162219732999802,
+      "learning_rate": 6.388375451858993e-05,
+      "loss": 2.7403,
+      "step": 12950
+    },
+    {
+      "epoch": 0.7519909482571043,
+      "grad_norm": 0.12234422564506531,
+      "learning_rate": 6.36028229069439e-05,
+      "loss": 2.7377,
+      "step": 12960
+    },
+    {
+      "epoch": 0.7525711881863151,
+      "grad_norm": 0.123825304210186,
+      "learning_rate": 6.332239352399254e-05,
+      "loss": 2.7276,
+      "step": 12970
+    },
+    {
+      "epoch": 0.7531514281155258,
+      "grad_norm": 0.12295151501893997,
+      "learning_rate": 6.304246740230619e-05,
+      "loss": 2.7404,
+      "step": 12980
+    },
+    {
+      "epoch": 0.7537316680447365,
+      "grad_norm": 0.12254820764064789,
+      "learning_rate": 6.276304557260215e-05,
+      "loss": 2.7373,
+      "step": 12990
+    },
+    {
+      "epoch": 0.7543119079739472,
+      "grad_norm": 0.12362895160913467,
+      "learning_rate": 6.248412906374082e-05,
+      "loss": 2.7418,
+      "step": 13000
+    },
+    {
+      "epoch": 0.7543119079739472,
+      "eval_loss": 2.70877742767334,
+      "eval_runtime": 5.3922,
+      "eval_samples_per_second": 803.007,
+      "eval_steps_per_second": 1.669,
+      "step": 13000
+    },
+    {
+      "epoch": 0.754892147903158,
+      "grad_norm": 0.12260004132986069,
+      "learning_rate": 6.220571890272213e-05,
+      "loss": 2.7352,
+      "step": 13010
+    },
+    {
+      "epoch": 0.7554723878323687,
+      "grad_norm": 0.12325233221054077,
+      "learning_rate": 6.192781611468137e-05,
+      "loss": 2.7369,
+      "step": 13020
+    },
+    {
+      "epoch": 0.7560526277615794,
+      "grad_norm": 0.12207638472318649,
+      "learning_rate": 6.165042172288576e-05,
+      "loss": 2.7365,
+      "step": 13030
+    },
+    {
+      "epoch": 0.7566328676907902,
+      "grad_norm": 0.1201200932264328,
+      "learning_rate": 6.137353674873046e-05,
+      "loss": 2.736,
+      "step": 13040
+    },
+    {
+      "epoch": 0.7572131076200008,
+      "grad_norm": 0.11834630370140076,
+      "learning_rate": 6.109716221173499e-05,
+      "loss": 2.7374,
+      "step": 13050
+    },
+    {
+      "epoch": 0.7577933475492116,
+      "grad_norm": 0.12760621309280396,
+      "learning_rate": 6.0821299129539267e-05,
+      "loss": 2.7297,
+      "step": 13060
+    },
+    {
+      "epoch": 0.7583735874784223,
+      "grad_norm": 0.12132176011800766,
+      "learning_rate": 6.0545948517900186e-05,
+      "loss": 2.7318,
+      "step": 13070
+    },
+    {
+      "epoch": 0.7589538274076331,
+      "grad_norm": 0.1261221319437027,
+      "learning_rate": 6.0271111390687506e-05,
+      "loss": 2.734,
+      "step": 13080
+    },
+    {
+      "epoch": 0.7595340673368438,
+      "grad_norm": 0.11981641501188278,
+      "learning_rate": 5.9996788759880265e-05,
+      "loss": 2.7343,
+      "step": 13090
+    },
+    {
+      "epoch": 0.7601143072660546,
+      "grad_norm": 0.12327931821346283,
+      "learning_rate": 5.972298163556318e-05,
+      "loss": 2.7309,
+      "step": 13100
+    },
+    {
+      "epoch": 0.7606945471952652,
+      "grad_norm": 0.12560048699378967,
+      "learning_rate": 5.944969102592275e-05,
+      "loss": 2.7368,
+      "step": 13110
+    },
+    {
+      "epoch": 0.7612747871244759,
+      "grad_norm": 0.12151816487312317,
+      "learning_rate": 5.9176917937243534e-05,
+      "loss": 2.7315,
+      "step": 13120
+    },
+    {
+      "epoch": 0.7618550270536867,
+      "grad_norm": 0.127033993601799,
+      "learning_rate": 5.890466337390481e-05,
+      "loss": 2.7405,
+      "step": 13130
+    },
+    {
+      "epoch": 0.7624352669828974,
+      "grad_norm": 0.11884118616580963,
+      "learning_rate": 5.863292833837628e-05,
+      "loss": 2.7324,
+      "step": 13140
+    },
+    {
+      "epoch": 0.7630155069121082,
+      "grad_norm": 0.12274018675088882,
+      "learning_rate": 5.836171383121483e-05,
+      "loss": 2.7327,
+      "step": 13150
+    },
+    {
+      "epoch": 0.7635957468413189,
+      "grad_norm": 0.12369240075349808,
+      "learning_rate": 5.809102085106071e-05,
+      "loss": 2.7322,
+      "step": 13160
+    },
+    {
+      "epoch": 0.7641759867705297,
+      "grad_norm": 0.121719129383564,
+      "learning_rate": 5.7820850394633786e-05,
+      "loss": 2.7368,
+      "step": 13170
+    },
+    {
+      "epoch": 0.7647562266997403,
+      "grad_norm": 0.12220139056444168,
+      "learning_rate": 5.755120345672995e-05,
+      "loss": 2.7283,
+      "step": 13180
+    },
+    {
+      "epoch": 0.765336466628951,
+      "grad_norm": 0.12299592047929764,
+      "learning_rate": 5.7282081030217595e-05,
+      "loss": 2.7314,
+      "step": 13190
+    },
+    {
+      "epoch": 0.7659167065581618,
+      "grad_norm": 0.13223952054977417,
+      "learning_rate": 5.70134841060336e-05,
+      "loss": 2.7392,
+      "step": 13200
+    },
+    {
+      "epoch": 0.7664969464873725,
+      "grad_norm": 0.12204018235206604,
+      "learning_rate": 5.674541367318003e-05,
+      "loss": 2.7342,
+      "step": 13210
+    },
+    {
+      "epoch": 0.7670771864165833,
+      "grad_norm": 0.12159973382949829,
+      "learning_rate": 5.647787071872024e-05,
+      "loss": 2.7378,
+      "step": 13220
+    },
+    {
+      "epoch": 0.767657426345794,
+      "grad_norm": 0.1208658516407013,
+      "learning_rate": 5.62108562277754e-05,
+      "loss": 2.7387,
+      "step": 13230
+    },
+    {
+      "epoch": 0.7682376662750047,
+      "grad_norm": 0.12024756520986557,
+      "learning_rate": 5.5944371183520964e-05,
+      "loss": 2.734,
+      "step": 13240
+    },
+    {
+      "epoch": 0.7688179062042154,
+      "grad_norm": 0.12415524572134018,
+      "learning_rate": 5.567841656718267e-05,
+      "loss": 2.7347,
+      "step": 13250
+    },
+    {
+      "epoch": 0.7693981461334262,
+      "grad_norm": 0.1189967542886734,
+      "learning_rate": 5.541299335803332e-05,
+      "loss": 2.7382,
+      "step": 13260
+    },
+    {
+      "epoch": 0.7699783860626369,
+      "grad_norm": 0.12029008567333221,
+      "learning_rate": 5.514810253338896e-05,
+      "loss": 2.7386,
+      "step": 13270
+    },
+    {
+      "epoch": 0.7705586259918477,
+      "grad_norm": 0.11998321861028671,
+      "learning_rate": 5.48837450686053e-05,
+      "loss": 2.7352,
+      "step": 13280
+    },
+    {
+      "epoch": 0.7711388659210584,
+      "grad_norm": 0.12730252742767334,
+      "learning_rate": 5.461992193707439e-05,
+      "loss": 2.7345,
+      "step": 13290
+    },
+    {
+      "epoch": 0.771719105850269,
+      "grad_norm": 0.12133444845676422,
+      "learning_rate": 5.4356634110220386e-05,
+      "loss": 2.7282,
+      "step": 13300
+    },
+    {
+      "epoch": 0.7722993457794798,
+      "grad_norm": 0.12256559729576111,
+      "learning_rate": 5.409388255749688e-05,
+      "loss": 2.7312,
+      "step": 13310
+    },
+    {
+      "epoch": 0.7728795857086905,
+      "grad_norm": 0.12941910326480865,
+      "learning_rate": 5.3831668246382485e-05,
+      "loss": 2.7344,
+      "step": 13320
+    },
+    {
+      "epoch": 0.7734598256379013,
+      "grad_norm": 0.12094937264919281,
+      "learning_rate": 5.356999214237777e-05,
+      "loss": 2.7331,
+      "step": 13330
+    },
+    {
+      "epoch": 0.774040065567112,
+      "grad_norm": 0.12196173518896103,
+      "learning_rate": 5.3308855209001684e-05,
+      "loss": 2.737,
+      "step": 13340
+    },
+    {
+      "epoch": 0.7746203054963228,
+      "grad_norm": 0.12579815089702606,
+      "learning_rate": 5.304825840778758e-05,
+      "loss": 2.7229,
+      "step": 13350
+    },
+    {
+      "epoch": 0.7752005454255334,
+      "grad_norm": 0.1212158054113388,
+      "learning_rate": 5.278820269828031e-05,
+      "loss": 2.7314,
+      "step": 13360
+    },
+    {
+      "epoch": 0.7757807853547442,
+      "grad_norm": 0.12120276689529419,
+      "learning_rate": 5.252868903803223e-05,
+      "loss": 2.7444,
+      "step": 13370
+    },
+    {
+      "epoch": 0.7763610252839549,
+      "grad_norm": 0.12396235018968582,
+      "learning_rate": 5.2269718382599796e-05,
+      "loss": 2.7397,
+      "step": 13380
+    },
+    {
+      "epoch": 0.7769412652131656,
+      "grad_norm": 0.11907251924276352,
+      "learning_rate": 5.201129168554009e-05,
+      "loss": 2.7282,
+      "step": 13390
+    },
+    {
+      "epoch": 0.7775215051423764,
+      "grad_norm": 0.12151432782411575,
+      "learning_rate": 5.1753409898407226e-05,
+      "loss": 2.7332,
+      "step": 13400
+    },
+    {
+      "epoch": 0.7781017450715871,
+      "grad_norm": 0.11936885118484497,
+      "learning_rate": 5.149607397074911e-05,
+      "loss": 2.7374,
+      "step": 13410
+    },
+    {
+      "epoch": 0.7786819850007979,
+      "grad_norm": 0.12219711393117905,
+      "learning_rate": 5.1239284850103407e-05,
+      "loss": 2.7287,
+      "step": 13420
+    },
+    {
+      "epoch": 0.7792622249300085,
+      "grad_norm": 0.11801363527774811,
+      "learning_rate": 5.098304348199472e-05,
+      "loss": 2.7224,
+      "step": 13430
+    },
+    {
+      "epoch": 0.7798424648592193,
+      "grad_norm": 0.11896246671676636,
+      "learning_rate": 5.072735080993052e-05,
+      "loss": 2.7295,
+      "step": 13440
+    },
+    {
+      "epoch": 0.78042270478843,
+      "grad_norm": 0.11604252457618713,
+      "learning_rate": 5.047220777539796e-05,
+      "loss": 2.7228,
+      "step": 13450
+    },
+    {
+      "epoch": 0.7810029447176408,
+      "grad_norm": 0.12236449867486954,
+      "learning_rate": 5.021761531786062e-05,
+      "loss": 2.7322,
+      "step": 13460
+    },
+    {
+      "epoch": 0.7815831846468515,
+      "grad_norm": 0.12057973444461823,
+      "learning_rate": 4.996357437475434e-05,
+      "loss": 2.73,
+      "step": 13470
+    },
+    {
+      "epoch": 0.7821634245760622,
+      "grad_norm": 0.12264855206012726,
+      "learning_rate": 4.9710085881484694e-05,
+      "loss": 2.7251,
+      "step": 13480
+    },
+    {
+      "epoch": 0.7827436645052729,
+      "grad_norm": 0.12081897258758545,
+      "learning_rate": 4.945715077142277e-05,
+      "loss": 2.7271,
+      "step": 13490
+    },
+    {
+      "epoch": 0.7833239044344836,
+      "grad_norm": 0.12207679450511932,
+      "learning_rate": 4.920476997590211e-05,
+      "loss": 2.7319,
+      "step": 13500
+    },
+    {
+      "epoch": 0.7839041443636944,
+      "grad_norm": 0.120822474360466,
+      "learning_rate": 4.895294442421541e-05,
+      "loss": 2.7308,
+      "step": 13510
+    },
+    {
+      "epoch": 0.7844843842929051,
+      "grad_norm": 0.12106281518936157,
+      "learning_rate": 4.8701675043610474e-05,
+      "loss": 2.7258,
+      "step": 13520
+    },
+    {
+      "epoch": 0.7850646242221159,
+      "grad_norm": 0.12197288870811462,
+      "learning_rate": 4.845096275928769e-05,
+      "loss": 2.7309,
+      "step": 13530
+    },
+    {
+      "epoch": 0.7856448641513266,
+      "grad_norm": 0.12526744604110718,
+      "learning_rate": 4.82008084943959e-05,
+      "loss": 2.7293,
+      "step": 13540
+    },
+    {
+      "epoch": 0.7862251040805373,
+      "grad_norm": 0.1226550042629242,
+      "learning_rate": 4.795121317002922e-05,
+      "loss": 2.7231,
+      "step": 13550
+    },
+    {
+      "epoch": 0.786805344009748,
+      "grad_norm": 0.11998672783374786,
+      "learning_rate": 4.770217770522398e-05,
+      "loss": 2.7271,
+      "step": 13560
+    },
+    {
+      "epoch": 0.7873855839389587,
+      "grad_norm": 0.122990183532238,
+      "learning_rate": 4.745370301695462e-05,
+      "loss": 2.7322,
+      "step": 13570
+    },
+    {
+      "epoch": 0.7879658238681695,
+      "grad_norm": 0.12234378606081009,
+      "learning_rate": 4.720579002013115e-05,
+      "loss": 2.7224,
+      "step": 13580
+    },
+    {
+      "epoch": 0.7885460637973802,
+      "grad_norm": 0.12186608463525772,
+      "learning_rate": 4.69584396275951e-05,
+      "loss": 2.7183,
+      "step": 13590
+    },
+    {
+      "epoch": 0.789126303726591,
+      "grad_norm": 0.12713122367858887,
+      "learning_rate": 4.6711652750116505e-05,
+      "loss": 2.7299,
+      "step": 13600
+    },
+    {
+      "epoch": 0.7897065436558017,
+      "grad_norm": 0.12040334939956665,
+      "learning_rate": 4.646543029639068e-05,
+      "loss": 2.7274,
+      "step": 13610
+    },
+    {
+      "epoch": 0.7902867835850124,
+      "grad_norm": 0.11610428988933563,
+      "learning_rate": 4.621977317303423e-05,
+      "loss": 2.7225,
+      "step": 13620
+    },
+    {
+      "epoch": 0.7908670235142231,
+      "grad_norm": 0.12505796551704407,
+      "learning_rate": 4.5974682284582656e-05,
+      "loss": 2.7291,
+      "step": 13630
+    },
+    {
+      "epoch": 0.7914472634434339,
+      "grad_norm": 0.12457990646362305,
+      "learning_rate": 4.573015853348608e-05,
+      "loss": 2.7279,
+      "step": 13640
+    },
+    {
+      "epoch": 0.7920275033726446,
+      "grad_norm": 0.12435191869735718,
+      "learning_rate": 4.5486202820106695e-05,
+      "loss": 2.7387,
+      "step": 13650
+    },
+    {
+      "epoch": 0.7926077433018553,
+      "grad_norm": 0.11536847054958344,
+      "learning_rate": 4.524281604271499e-05,
+      "loss": 2.7206,
+      "step": 13660
+    },
+    {
+      "epoch": 0.7931879832310661,
+      "grad_norm": 0.12171947956085205,
+      "learning_rate": 4.499999909748649e-05,
+      "loss": 2.717,
+      "step": 13670
+    },
+    {
+      "epoch": 0.7937682231602767,
+      "grad_norm": 0.11659212410449982,
+      "learning_rate": 4.4757752878498794e-05,
+      "loss": 2.729,
+      "step": 13680
+    },
+    {
+      "epoch": 0.7943484630894875,
+      "grad_norm": 0.1195807233452797,
+      "learning_rate": 4.4516078277727635e-05,
+      "loss": 2.7286,
+      "step": 13690
+    },
+    {
+      "epoch": 0.7949287030186982,
+      "grad_norm": 0.12461701035499573,
+      "learning_rate": 4.427497618504439e-05,
+      "loss": 2.7313,
+      "step": 13700
+    },
+    {
+      "epoch": 0.795508942947909,
+      "grad_norm": 0.11976811289787292,
+      "learning_rate": 4.403444748821215e-05,
+      "loss": 2.7217,
+      "step": 13710
+    },
+    {
+      "epoch": 0.7960891828771197,
+      "grad_norm": 0.12124411016702652,
+      "learning_rate": 4.37944930728827e-05,
+      "loss": 2.7247,
+      "step": 13720
+    },
+    {
+      "epoch": 0.7966694228063305,
+      "grad_norm": 0.11961103975772858,
+      "learning_rate": 4.355511382259356e-05,
+      "loss": 2.7238,
+      "step": 13730
+    },
+    {
+      "epoch": 0.7972496627355411,
+      "grad_norm": 0.12003281712532043,
+      "learning_rate": 4.3316310618763936e-05,
+      "loss": 2.7336,
+      "step": 13740
+    },
+    {
+      "epoch": 0.7978299026647518,
+      "grad_norm": 0.11927200108766556,
+      "learning_rate": 4.3078084340692406e-05,
+      "loss": 2.7312,
+      "step": 13750
+    },
+    {
+      "epoch": 0.7984101425939626,
+      "grad_norm": 0.12134095281362534,
+      "learning_rate": 4.2840435865553065e-05,
+      "loss": 2.7319,
+      "step": 13760
+    },
+    {
+      "epoch": 0.7989903825231733,
+      "grad_norm": 0.12080392986536026,
+      "learning_rate": 4.2603366068392455e-05,
+      "loss": 2.7316,
+      "step": 13770
+    },
+    {
+      "epoch": 0.7995706224523841,
+      "grad_norm": 0.12257473915815353,
+      "learning_rate": 4.236687582212642e-05,
+      "loss": 2.7358,
+      "step": 13780
+    },
+    {
+      "epoch": 0.8001508623815948,
+      "grad_norm": 0.11869396269321442,
+      "learning_rate": 4.213096599753676e-05,
+      "loss": 2.7313,
+      "step": 13790
+    },
+    {
+      "epoch": 0.8007311023108055,
+      "grad_norm": 0.11939482390880585,
+      "learning_rate": 4.189563746326828e-05,
+      "loss": 2.7261,
+      "step": 13800
+    },
+    {
+      "epoch": 0.8013113422400162,
+      "grad_norm": 0.11985889077186584,
+      "learning_rate": 4.166089108582523e-05,
+      "loss": 2.7359,
+      "step": 13810
+    },
+    {
+      "epoch": 0.801891582169227,
+      "grad_norm": 0.12536443769931793,
+      "learning_rate": 4.142672772956837e-05,
+      "loss": 2.7209,
+      "step": 13820
+    },
+    {
+      "epoch": 0.8024718220984377,
+      "grad_norm": 0.1199636310338974,
+      "learning_rate": 4.119314825671172e-05,
+      "loss": 2.7336,
+      "step": 13830
+    },
+    {
+      "epoch": 0.8030520620276485,
+      "grad_norm": 0.12134099006652832,
+      "learning_rate": 4.0960153527319276e-05,
+      "loss": 2.7214,
+      "step": 13840
+    },
+    {
+      "epoch": 0.8036323019568592,
+      "grad_norm": 0.1173613891005516,
+      "learning_rate": 4.07277443993022e-05,
+      "loss": 2.7255,
+      "step": 13850
+    },
+    {
+      "epoch": 0.8042125418860699,
+      "grad_norm": 0.1184094101190567,
+      "learning_rate": 4.049592172841516e-05,
+      "loss": 2.7238,
+      "step": 13860
+    },
+    {
+      "epoch": 0.8047927818152806,
+      "grad_norm": 0.11648872494697571,
+      "learning_rate": 4.026468636825351e-05,
+      "loss": 2.7161,
+      "step": 13870
+    },
+    {
+      "epoch": 0.8053730217444913,
+      "grad_norm": 0.12471094727516174,
+      "learning_rate": 4.00340391702501e-05,
+      "loss": 2.7194,
+      "step": 13880
+    },
+    {
+      "epoch": 0.8059532616737021,
+      "grad_norm": 0.12097521126270294,
+      "learning_rate": 3.980398098367206e-05,
+      "loss": 2.7344,
+      "step": 13890
+    },
+    {
+      "epoch": 0.8065335016029128,
+      "grad_norm": 0.11875994503498077,
+      "learning_rate": 3.957451265561767e-05,
+      "loss": 2.7282,
+      "step": 13900
+    },
+    {
+      "epoch": 0.8071137415321236,
+      "grad_norm": 0.12130565941333771,
+      "learning_rate": 3.934563503101345e-05,
+      "loss": 2.7285,
+      "step": 13910
+    },
+    {
+      "epoch": 0.8076939814613343,
+      "grad_norm": 0.12270623445510864,
+      "learning_rate": 3.911734895261079e-05,
+      "loss": 2.7338,
+      "step": 13920
+    },
+    {
+      "epoch": 0.808274221390545,
+      "grad_norm": 0.12415996193885803,
+      "learning_rate": 3.888965526098287e-05,
+      "loss": 2.7266,
+      "step": 13930
+    },
+    {
+      "epoch": 0.8088544613197557,
+      "grad_norm": 0.11743751913309097,
+      "learning_rate": 3.866255479452177e-05,
+      "loss": 2.7303,
+      "step": 13940
+    },
+    {
+      "epoch": 0.8094347012489664,
+      "grad_norm": 0.11905871331691742,
+      "learning_rate": 3.8436048389435196e-05,
+      "loss": 2.7235,
+      "step": 13950
+    },
+    {
+      "epoch": 0.8100149411781772,
+      "grad_norm": 0.11726722121238708,
+      "learning_rate": 3.8210136879743375e-05,
+      "loss": 2.7297,
+      "step": 13960
+    },
+    {
+      "epoch": 0.8105951811073879,
+      "grad_norm": 0.12093175947666168,
+      "learning_rate": 3.798482109727628e-05,
+      "loss": 2.7272,
+      "step": 13970
+    },
+    {
+      "epoch": 0.8111754210365987,
+      "grad_norm": 0.11959103494882584,
+      "learning_rate": 3.776010187167016e-05,
+      "loss": 2.7226,
+      "step": 13980
+    },
+    {
+      "epoch": 0.8117556609658093,
+      "grad_norm": 0.12149166315793991,
+      "learning_rate": 3.753598003036476e-05,
+      "loss": 2.7244,
+      "step": 13990
+    },
+    {
+      "epoch": 0.8123359008950201,
+      "grad_norm": 0.12182975560426712,
+      "learning_rate": 3.731245639860017e-05,
+      "loss": 2.7167,
+      "step": 14000
+    },
+    {
+      "epoch": 0.8123359008950201,
+      "eval_loss": 2.696218490600586,
+      "eval_runtime": 5.3955,
+      "eval_samples_per_second": 802.521,
+      "eval_steps_per_second": 1.668,
+      "step": 14000
+    },
+    {
+      "epoch": 0.8129161408242308,
+      "grad_norm": 0.12197499722242355,
+      "learning_rate": 3.7089531799413815e-05,
+      "loss": 2.7234,
+      "step": 14010
+    },
+    {
+      "epoch": 0.8134963807534416,
+      "grad_norm": 0.11665521562099457,
+      "learning_rate": 3.6867207053637376e-05,
+      "loss": 2.7232,
+      "step": 14020
+    },
+    {
+      "epoch": 0.8140766206826523,
+      "grad_norm": 0.11618278175592422,
+      "learning_rate": 3.6645482979893966e-05,
+      "loss": 2.7195,
+      "step": 14030
+    },
+    {
+      "epoch": 0.814656860611863,
+      "grad_norm": 0.11773023754358292,
+      "learning_rate": 3.642436039459478e-05,
+      "loss": 2.7262,
+      "step": 14040
+    },
+    {
+      "epoch": 0.8152371005410738,
+      "grad_norm": 0.12087783962488174,
+      "learning_rate": 3.620384011193636e-05,
+      "loss": 2.7267,
+      "step": 14050
+    },
+    {
+      "epoch": 0.8158173404702844,
+      "grad_norm": 0.11980880051851273,
+      "learning_rate": 3.598392294389747e-05,
+      "loss": 2.723,
+      "step": 14060
+    },
+    {
+      "epoch": 0.8163975803994952,
+      "grad_norm": 0.11848059296607971,
+      "learning_rate": 3.576460970023614e-05,
+      "loss": 2.7269,
+      "step": 14070
+    },
+    {
+      "epoch": 0.8169778203287059,
+      "grad_norm": 0.11839718371629715,
+      "learning_rate": 3.5545901188486776e-05,
+      "loss": 2.7135,
+      "step": 14080
+    },
+    {
+      "epoch": 0.8175580602579167,
+      "grad_norm": 0.11868947744369507,
+      "learning_rate": 3.5327798213957e-05,
+      "loss": 2.7271,
+      "step": 14090
+    },
+    {
+      "epoch": 0.8181383001871274,
+      "grad_norm": 0.11662468314170837,
+      "learning_rate": 3.511030157972479e-05,
+      "loss": 2.7257,
+      "step": 14100
+    },
+    {
+      "epoch": 0.8187185401163382,
+      "grad_norm": 0.11749114841222763,
+      "learning_rate": 3.4893412086635566e-05,
+      "loss": 2.7173,
+      "step": 14110
+    },
+    {
+      "epoch": 0.8192987800455488,
+      "grad_norm": 0.11833590269088745,
+      "learning_rate": 3.467713053329911e-05,
+      "loss": 2.7217,
+      "step": 14120
+    },
+    {
+      "epoch": 0.8198790199747595,
+      "grad_norm": 0.11520706862211227,
+      "learning_rate": 3.446145771608689e-05,
+      "loss": 2.7179,
+      "step": 14130
+    },
+    {
+      "epoch": 0.8204592599039703,
+      "grad_norm": 0.12385083734989166,
+      "learning_rate": 3.4246394429128604e-05,
+      "loss": 2.7162,
+      "step": 14140
+    },
+    {
+      "epoch": 0.821039499833181,
+      "grad_norm": 0.12348052859306335,
+      "learning_rate": 3.403194146430997e-05,
+      "loss": 2.7184,
+      "step": 14150
+    },
+    {
+      "epoch": 0.8216197397623918,
+      "grad_norm": 0.11942430585622787,
+      "learning_rate": 3.381809961126925e-05,
+      "loss": 2.7215,
+      "step": 14160
+    },
+    {
+      "epoch": 0.8221999796916025,
+      "grad_norm": 0.12090287357568741,
+      "learning_rate": 3.360486965739444e-05,
+      "loss": 2.7094,
+      "step": 14170
+    },
+    {
+      "epoch": 0.8227802196208132,
+      "grad_norm": 0.11702247709035873,
+      "learning_rate": 3.3392252387820754e-05,
+      "loss": 2.7298,
+      "step": 14180
+    },
+    {
+      "epoch": 0.8233604595500239,
+      "grad_norm": 0.12156783044338226,
+      "learning_rate": 3.3180248585427054e-05,
+      "loss": 2.7238,
+      "step": 14190
+    },
+    {
+      "epoch": 0.8239406994792347,
+      "grad_norm": 0.11878366768360138,
+      "learning_rate": 3.296885903083366e-05,
+      "loss": 2.7175,
+      "step": 14200
+    },
+    {
+      "epoch": 0.8245209394084454,
+      "grad_norm": 0.11734297126531601,
+      "learning_rate": 3.275808450239908e-05,
+      "loss": 2.7271,
+      "step": 14210
+    },
+    {
+      "epoch": 0.8251011793376561,
+      "grad_norm": 0.11686153709888458,
+      "learning_rate": 3.2547925776217126e-05,
+      "loss": 2.7206,
+      "step": 14220
+    },
+    {
+      "epoch": 0.8256814192668669,
+      "grad_norm": 0.12111202627420425,
+      "learning_rate": 3.23383836261143e-05,
+      "loss": 2.7179,
+      "step": 14230
+    },
+    {
+      "epoch": 0.8262616591960775,
+      "grad_norm": 0.12011639773845673,
+      "learning_rate": 3.212945882364666e-05,
+      "loss": 2.7172,
+      "step": 14240
+    },
+    {
+      "epoch": 0.8268418991252883,
+      "grad_norm": 0.11470998078584671,
+      "learning_rate": 3.192115213809741e-05,
+      "loss": 2.7249,
+      "step": 14250
+    },
+    {
+      "epoch": 0.827422139054499,
+      "grad_norm": 0.12129820883274078,
+      "learning_rate": 3.171346433647335e-05,
+      "loss": 2.7175,
+      "step": 14260
+    },
+    {
+      "epoch": 0.8280023789837098,
+      "grad_norm": 0.11826759576797485,
+      "learning_rate": 3.150639618350289e-05,
+      "loss": 2.7346,
+      "step": 14270
+    },
+    {
+      "epoch": 0.8285826189129205,
+      "grad_norm": 0.12145520746707916,
+      "learning_rate": 3.12999484416326e-05,
+      "loss": 2.7271,
+      "step": 14280
+    },
+    {
+      "epoch": 0.8291628588421313,
+      "grad_norm": 0.11638975143432617,
+      "learning_rate": 3.1094121871024676e-05,
+      "loss": 2.7148,
+      "step": 14290
+    },
+    {
+      "epoch": 0.829743098771342,
+      "grad_norm": 0.11663084477186203,
+      "learning_rate": 3.0888917229554204e-05,
+      "loss": 2.7238,
+      "step": 14300
+    },
+    {
+      "epoch": 0.8303233387005526,
+      "grad_norm": 0.11826130747795105,
+      "learning_rate": 3.068433527280601e-05,
+      "loss": 2.7233,
+      "step": 14310
+    },
+    {
+      "epoch": 0.8309035786297634,
+      "grad_norm": 0.11905242502689362,
+      "learning_rate": 3.0480376754072448e-05,
+      "loss": 2.7111,
+      "step": 14320
+    },
+    {
+      "epoch": 0.8314838185589741,
+      "grad_norm": 0.11812193691730499,
+      "learning_rate": 3.0277042424350076e-05,
+      "loss": 2.7146,
+      "step": 14330
+    },
+    {
+      "epoch": 0.8320640584881849,
+      "grad_norm": 0.1194220557808876,
+      "learning_rate": 3.0074333032337154e-05,
+      "loss": 2.7197,
+      "step": 14340
+    },
+    {
+      "epoch": 0.8326442984173956,
+      "grad_norm": 0.12228541076183319,
+      "learning_rate": 2.9872249324431046e-05,
+      "loss": 2.7239,
+      "step": 14350
+    },
+    {
+      "epoch": 0.8332245383466064,
+      "grad_norm": 0.11680544167757034,
+      "learning_rate": 2.9670792044724937e-05,
+      "loss": 2.7152,
+      "step": 14360
+    },
+    {
+      "epoch": 0.833804778275817,
+      "grad_norm": 0.117955781519413,
+      "learning_rate": 2.9469961935005797e-05,
+      "loss": 2.7177,
+      "step": 14370
+    },
+    {
+      "epoch": 0.8343850182050278,
+      "grad_norm": 0.11889876425266266,
+      "learning_rate": 2.9269759734751056e-05,
+      "loss": 2.7169,
+      "step": 14380
+    },
+    {
+      "epoch": 0.8349652581342385,
+      "grad_norm": 0.11606640368700027,
+      "learning_rate": 2.907018618112618e-05,
+      "loss": 2.7232,
+      "step": 14390
+    },
+    {
+      "epoch": 0.8355454980634492,
+      "grad_norm": 0.11701922863721848,
+      "learning_rate": 2.8871242008981992e-05,
+      "loss": 2.7254,
+      "step": 14400
+    },
+    {
+      "epoch": 0.83612573799266,
+      "grad_norm": 0.11705347150564194,
+      "learning_rate": 2.8672927950851612e-05,
+      "loss": 2.7296,
+      "step": 14410
+    },
+    {
+      "epoch": 0.8367059779218707,
+      "grad_norm": 0.121455118060112,
+      "learning_rate": 2.8475244736948315e-05,
+      "loss": 2.717,
+      "step": 14420
+    },
+    {
+      "epoch": 0.8372862178510814,
+      "grad_norm": 0.1162460669875145,
+      "learning_rate": 2.8278193095162353e-05,
+      "loss": 2.7161,
+      "step": 14430
+    },
+    {
+      "epoch": 0.8378664577802921,
+      "grad_norm": 0.11314946413040161,
+      "learning_rate": 2.8081773751058516e-05,
+      "loss": 2.7217,
+      "step": 14440
+    },
+    {
+      "epoch": 0.8384466977095029,
+      "grad_norm": 0.11895614117383957,
+      "learning_rate": 2.7885987427873406e-05,
+      "loss": 2.7165,
+      "step": 14450
+    },
+    {
+      "epoch": 0.8390269376387136,
+      "grad_norm": 0.12004794180393219,
+      "learning_rate": 2.7690834846512736e-05,
+      "loss": 2.7311,
+      "step": 14460
+    },
+    {
+      "epoch": 0.8396071775679244,
+      "grad_norm": 0.11621776968240738,
+      "learning_rate": 2.7496316725548887e-05,
+      "loss": 2.7114,
+      "step": 14470
+    },
+    {
+      "epoch": 0.8401874174971351,
+      "grad_norm": 0.11720835417509079,
+      "learning_rate": 2.7302433781217774e-05,
+      "loss": 2.7241,
+      "step": 14480
+    },
+    {
+      "epoch": 0.8407676574263458,
+      "grad_norm": 0.11709107458591461,
+      "learning_rate": 2.7109186727416824e-05,
+      "loss": 2.7181,
+      "step": 14490
+    },
+    {
+      "epoch": 0.8413478973555565,
+      "grad_norm": 0.11878082901239395,
+      "learning_rate": 2.691657627570192e-05,
+      "loss": 2.7206,
+      "step": 14500
+    },
+    {
+      "epoch": 0.8419281372847672,
+      "grad_norm": 0.11816674470901489,
+      "learning_rate": 2.6724603135284887e-05,
+      "loss": 2.7191,
+      "step": 14510
+    },
+    {
+      "epoch": 0.842508377213978,
+      "grad_norm": 0.11694590002298355,
+      "learning_rate": 2.653326801303102e-05,
+      "loss": 2.7187,
+      "step": 14520
+    },
+    {
+      "epoch": 0.8430886171431887,
+      "grad_norm": 0.11519923061132431,
+      "learning_rate": 2.6342571613456146e-05,
+      "loss": 2.7212,
+      "step": 14530
+    },
+    {
+      "epoch": 0.8436688570723995,
+      "grad_norm": 0.11877317726612091,
+      "learning_rate": 2.6152514638724522e-05,
+      "loss": 2.7245,
+      "step": 14540
+    },
+    {
+      "epoch": 0.8442490970016102,
+      "grad_norm": 0.11796387284994125,
+      "learning_rate": 2.5963097788645764e-05,
+      "loss": 2.7243,
+      "step": 14550
+    },
+    {
+      "epoch": 0.8448293369308209,
+      "grad_norm": 0.117183618247509,
+      "learning_rate": 2.577432176067258e-05,
+      "loss": 2.7266,
+      "step": 14560
+    },
+    {
+      "epoch": 0.8454095768600316,
+      "grad_norm": 0.12094635516405106,
+      "learning_rate": 2.5586187249898074e-05,
+      "loss": 2.719,
+      "step": 14570
+    },
+    {
+      "epoch": 0.8459898167892423,
+      "grad_norm": 0.11904298514127731,
+      "learning_rate": 2.539869494905318e-05,
+      "loss": 2.716,
+      "step": 14580
+    },
+    {
+      "epoch": 0.8465700567184531,
+      "grad_norm": 0.11641751229763031,
+      "learning_rate": 2.5211845548504264e-05,
+      "loss": 2.7237,
+      "step": 14590
+    },
+    {
+      "epoch": 0.8471502966476638,
+      "grad_norm": 0.12003939598798752,
+      "learning_rate": 2.5025639736250382e-05,
+      "loss": 2.7194,
+      "step": 14600
+    },
+    {
+      "epoch": 0.8477305365768746,
+      "grad_norm": 0.12015482783317566,
+      "learning_rate": 2.48400781979208e-05,
+      "loss": 2.7133,
+      "step": 14610
+    },
+    {
+      "epoch": 0.8483107765060852,
+      "grad_norm": 0.11893987655639648,
+      "learning_rate": 2.4655161616772594e-05,
+      "loss": 2.7217,
+      "step": 14620
+    },
+    {
+      "epoch": 0.848891016435296,
+      "grad_norm": 0.11744572222232819,
+      "learning_rate": 2.4470890673687884e-05,
+      "loss": 2.7173,
+      "step": 14630
+    },
+    {
+      "epoch": 0.8494712563645067,
+      "grad_norm": 0.12006239593029022,
+      "learning_rate": 2.428726604717173e-05,
+      "loss": 2.7304,
+      "step": 14640
+    },
+    {
+      "epoch": 0.8500514962937175,
+      "grad_norm": 0.11663104593753815,
+      "learning_rate": 2.410428841334915e-05,
+      "loss": 2.7114,
+      "step": 14650
+    },
+    {
+      "epoch": 0.8506317362229282,
+      "grad_norm": 0.11702371388673782,
+      "learning_rate": 2.392195844596299e-05,
+      "loss": 2.7135,
+      "step": 14660
+    },
+    {
+      "epoch": 0.851211976152139,
+      "grad_norm": 0.11572497338056564,
+      "learning_rate": 2.3740276816371278e-05,
+      "loss": 2.726,
+      "step": 14670
+    },
+    {
+      "epoch": 0.8517922160813496,
+      "grad_norm": 0.11705095320940018,
+      "learning_rate": 2.3559244193544806e-05,
+      "loss": 2.7128,
+      "step": 14680
+    },
+    {
+      "epoch": 0.8523724560105603,
+      "grad_norm": 0.11780356615781784,
+      "learning_rate": 2.337886124406461e-05,
+      "loss": 2.7184,
+      "step": 14690
+    },
+    {
+      "epoch": 0.8529526959397711,
+      "grad_norm": 0.11437036097049713,
+      "learning_rate": 2.3199128632119705e-05,
+      "loss": 2.713,
+      "step": 14700
+    },
+    {
+      "epoch": 0.8535329358689818,
+      "grad_norm": 0.11990920454263687,
+      "learning_rate": 2.3020047019504355e-05,
+      "loss": 2.7161,
+      "step": 14710
+    },
+    {
+      "epoch": 0.8541131757981926,
+      "grad_norm": 0.11468629539012909,
+      "learning_rate": 2.2841617065615805e-05,
+      "loss": 2.7223,
+      "step": 14720
+    },
+    {
+      "epoch": 0.8546934157274033,
+      "grad_norm": 0.11571449041366577,
+      "learning_rate": 2.266383942745185e-05,
+      "loss": 2.7133,
+      "step": 14730
+    },
+    {
+      "epoch": 0.8552736556566141,
+      "grad_norm": 0.11584048718214035,
+      "learning_rate": 2.2486714759608306e-05,
+      "loss": 2.7218,
+      "step": 14740
+    },
+    {
+      "epoch": 0.8558538955858247,
+      "grad_norm": 0.11709973216056824,
+      "learning_rate": 2.231024371427688e-05,
+      "loss": 2.7084,
+      "step": 14750
+    },
+    {
+      "epoch": 0.8564341355150354,
+      "grad_norm": 0.11297730356454849,
+      "learning_rate": 2.213442694124239e-05,
+      "loss": 2.7061,
+      "step": 14760
+    },
+    {
+      "epoch": 0.8570143754442462,
+      "grad_norm": 0.11450552940368652,
+      "learning_rate": 2.19592650878806e-05,
+      "loss": 2.7242,
+      "step": 14770
+    },
+    {
+      "epoch": 0.8575946153734569,
+      "grad_norm": 0.11598943918943405,
+      "learning_rate": 2.1784758799155803e-05,
+      "loss": 2.7153,
+      "step": 14780
+    },
+    {
+      "epoch": 0.8581748553026677,
+      "grad_norm": 0.11803678423166275,
+      "learning_rate": 2.161090871761846e-05,
+      "loss": 2.7207,
+      "step": 14790
+    },
+    {
+      "epoch": 0.8587550952318784,
+      "grad_norm": 0.11763158440589905,
+      "learning_rate": 2.1437715483402764e-05,
+      "loss": 2.7191,
+      "step": 14800
+    },
+    {
+      "epoch": 0.8593353351610891,
+      "grad_norm": 0.12016081064939499,
+      "learning_rate": 2.1265179734224307e-05,
+      "loss": 2.719,
+      "step": 14810
+    },
+    {
+      "epoch": 0.8599155750902998,
+      "grad_norm": 0.11358082294464111,
+      "learning_rate": 2.1093302105377877e-05,
+      "loss": 2.7062,
+      "step": 14820
+    },
+    {
+      "epoch": 0.8604958150195106,
+      "grad_norm": 0.11611097306013107,
+      "learning_rate": 2.0922083229734855e-05,
+      "loss": 2.7122,
+      "step": 14830
+    },
+    {
+      "epoch": 0.8610760549487213,
+      "grad_norm": 0.11881374567747116,
+      "learning_rate": 2.0751523737741095e-05,
+      "loss": 2.718,
+      "step": 14840
+    },
+    {
+      "epoch": 0.861656294877932,
+      "grad_norm": 0.11680326610803604,
+      "learning_rate": 2.058162425741452e-05,
+      "loss": 2.7115,
+      "step": 14850
+    },
+    {
+      "epoch": 0.8622365348071428,
+      "grad_norm": 0.11934536695480347,
+      "learning_rate": 2.041238541434276e-05,
+      "loss": 2.7155,
+      "step": 14860
+    },
+    {
+      "epoch": 0.8628167747363534,
+      "grad_norm": 0.11599881947040558,
+      "learning_rate": 2.0243807831681027e-05,
+      "loss": 2.715,
+      "step": 14870
+    },
+    {
+      "epoch": 0.8633970146655642,
+      "grad_norm": 0.11855433881282806,
+      "learning_rate": 2.007589213014964e-05,
+      "loss": 2.719,
+      "step": 14880
+    },
+    {
+      "epoch": 0.8639772545947749,
+      "grad_norm": 0.1187380775809288,
+      "learning_rate": 1.9908638928031765e-05,
+      "loss": 2.7197,
+      "step": 14890
+    },
+    {
+      "epoch": 0.8645574945239857,
+      "grad_norm": 0.12009769678115845,
+      "learning_rate": 1.9742048841171255e-05,
+      "loss": 2.7124,
+      "step": 14900
+    },
+    {
+      "epoch": 0.8651377344531964,
+      "grad_norm": 0.1180645152926445,
+      "learning_rate": 1.9576122482970184e-05,
+      "loss": 2.7175,
+      "step": 14910
+    },
+    {
+      "epoch": 0.8657179743824072,
+      "grad_norm": 0.11391542106866837,
+      "learning_rate": 1.9410860464386916e-05,
+      "loss": 2.7138,
+      "step": 14920
+    },
+    {
+      "epoch": 0.8662982143116179,
+      "grad_norm": 0.11604123562574387,
+      "learning_rate": 1.924626339393336e-05,
+      "loss": 2.7171,
+      "step": 14930
+    },
+    {
+      "epoch": 0.8668784542408285,
+      "grad_norm": 0.1202244833111763,
+      "learning_rate": 1.9082331877673277e-05,
+      "loss": 2.7213,
+      "step": 14940
+    },
+    {
+      "epoch": 0.8674586941700393,
+      "grad_norm": 0.11324458569288254,
+      "learning_rate": 1.8919066519219664e-05,
+      "loss": 2.7155,
+      "step": 14950
+    },
+    {
+      "epoch": 0.86803893409925,
+      "grad_norm": 0.11484729498624802,
+      "learning_rate": 1.8756467919732645e-05,
+      "loss": 2.7248,
+      "step": 14960
+    },
+    {
+      "epoch": 0.8686191740284608,
+      "grad_norm": 0.11329852789640427,
+      "learning_rate": 1.8594536677917373e-05,
+      "loss": 2.717,
+      "step": 14970
+    },
+    {
+      "epoch": 0.8691994139576715,
+      "grad_norm": 0.11292688548564911,
+      "learning_rate": 1.8433273390021523e-05,
+      "loss": 2.7189,
+      "step": 14980
+    },
+    {
+      "epoch": 0.8697796538868823,
+      "grad_norm": 0.11284969747066498,
+      "learning_rate": 1.8272678649833508e-05,
+      "loss": 2.7048,
+      "step": 14990
+    },
+    {
+      "epoch": 0.8703598938160929,
+      "grad_norm": 0.11553023010492325,
+      "learning_rate": 1.8112753048679965e-05,
+      "loss": 2.7161,
+      "step": 15000
+    },
+    {
+      "epoch": 0.8703598938160929,
+      "eval_loss": 2.687220335006714,
+      "eval_runtime": 5.3901,
+      "eval_samples_per_second": 803.32,
+      "eval_steps_per_second": 1.67,
+      "step": 15000
+    },
+    {
+      "epoch": 0.8709401337453037,
+      "grad_norm": 0.1153511255979538,
+      "learning_rate": 1.7953497175423673e-05,
+      "loss": 2.7144,
+      "step": 15010
+    },
+    {
+      "epoch": 0.8715203736745144,
+      "grad_norm": 0.11592899262905121,
+      "learning_rate": 1.7794911616461517e-05,
+      "loss": 2.7263,
+      "step": 15020
+    },
+    {
+      "epoch": 0.8721006136037251,
+      "grad_norm": 0.11693062633275986,
+      "learning_rate": 1.763699695572203e-05,
+      "loss": 2.7125,
+      "step": 15030
+    },
+    {
+      "epoch": 0.8726808535329359,
+      "grad_norm": 0.11720691621303558,
+      "learning_rate": 1.747975377466369e-05,
+      "loss": 2.7207,
+      "step": 15040
+    },
+    {
+      "epoch": 0.8732610934621466,
+      "grad_norm": 0.11742518097162247,
+      "learning_rate": 1.7323182652272173e-05,
+      "loss": 2.7137,
+      "step": 15050
+    },
+    {
+      "epoch": 0.8738413333913573,
+      "grad_norm": 0.1138685792684555,
+      "learning_rate": 1.7167284165058885e-05,
+      "loss": 2.7092,
+      "step": 15060
+    },
+    {
+      "epoch": 0.874421573320568,
+      "grad_norm": 0.11519136279821396,
+      "learning_rate": 1.701205888705837e-05,
+      "loss": 2.7266,
+      "step": 15070
+    },
+    {
+      "epoch": 0.8750018132497788,
+      "grad_norm": 0.11361874639987946,
+      "learning_rate": 1.68575073898263e-05,
+      "loss": 2.7219,
+      "step": 15080
+    },
+    {
+      "epoch": 0.8755820531789895,
+      "grad_norm": 0.11410374194383621,
+      "learning_rate": 1.6703630242437573e-05,
+      "loss": 2.724,
+      "step": 15090
+    },
+    {
+      "epoch": 0.8761622931082003,
+      "grad_norm": 0.11599191278219223,
+      "learning_rate": 1.6550428011483876e-05,
+      "loss": 2.7236,
+      "step": 15100
+    },
+    {
+      "epoch": 0.876742533037411,
+      "grad_norm": 0.11562803387641907,
+      "learning_rate": 1.6397901261071923e-05,
+      "loss": 2.7228,
+      "step": 15110
+    },
+    {
+      "epoch": 0.8773227729666216,
+      "grad_norm": 0.11603621393442154,
+      "learning_rate": 1.624605055282118e-05,
+      "loss": 2.7152,
+      "step": 15120
+    },
+    {
+      "epoch": 0.8779030128958324,
+      "grad_norm": 0.11416131258010864,
+      "learning_rate": 1.6094876445861828e-05,
+      "loss": 2.7124,
+      "step": 15130
+    },
+    {
+      "epoch": 0.8784832528250431,
+      "grad_norm": 0.11274771392345428,
+      "learning_rate": 1.5944379496832873e-05,
+      "loss": 2.7235,
+      "step": 15140
+    },
+    {
+      "epoch": 0.8790634927542539,
+      "grad_norm": 0.1134885847568512,
+      "learning_rate": 1.5794560259879686e-05,
+      "loss": 2.701,
+      "step": 15150
+    },
+    {
+      "epoch": 0.8796437326834646,
+      "grad_norm": 0.11914920806884766,
+      "learning_rate": 1.5645419286652507e-05,
+      "loss": 2.7114,
+      "step": 15160
+    },
+    {
+      "epoch": 0.8802239726126754,
+      "grad_norm": 0.11518207937479019,
+      "learning_rate": 1.5496957126304013e-05,
+      "loss": 2.7111,
+      "step": 15170
+    },
+    {
+      "epoch": 0.8808042125418861,
+      "grad_norm": 0.1119842380285263,
+      "learning_rate": 1.534917432548735e-05,
+      "loss": 2.7136,
+      "step": 15180
+    },
+    {
+      "epoch": 0.8813844524710968,
+      "grad_norm": 0.11328744888305664,
+      "learning_rate": 1.5202071428354414e-05,
+      "loss": 2.7128,
+      "step": 15190
+    },
+    {
+      "epoch": 0.8819646924003075,
+      "grad_norm": 0.11526224762201309,
+      "learning_rate": 1.5055648976553338e-05,
+      "loss": 2.7206,
+      "step": 15200
+    },
+    {
+      "epoch": 0.8825449323295183,
+      "grad_norm": 0.11353620141744614,
+      "learning_rate": 1.4909907509227006e-05,
+      "loss": 2.7275,
+      "step": 15210
+    },
+    {
+      "epoch": 0.883125172258729,
+      "grad_norm": 0.11482030898332596,
+      "learning_rate": 1.4764847563010753e-05,
+      "loss": 2.7176,
+      "step": 15220
+    },
+    {
+      "epoch": 0.8837054121879397,
+      "grad_norm": 0.11562719196081161,
+      "learning_rate": 1.4620469672030479e-05,
+      "loss": 2.7166,
+      "step": 15230
+    },
+    {
+      "epoch": 0.8842856521171505,
+      "grad_norm": 0.11470736563205719,
+      "learning_rate": 1.447677436790078e-05,
+      "loss": 2.7194,
+      "step": 15240
+    },
+    {
+      "epoch": 0.8848658920463611,
+      "grad_norm": 0.1143270805478096,
+      "learning_rate": 1.4333762179722688e-05,
+      "loss": 2.7086,
+      "step": 15250
+    },
+    {
+      "epoch": 0.8854461319755719,
+      "grad_norm": 0.11537613719701767,
+      "learning_rate": 1.4191433634082152e-05,
+      "loss": 2.7165,
+      "step": 15260
+    },
+    {
+      "epoch": 0.8860263719047826,
+      "grad_norm": 0.11545541882514954,
+      "learning_rate": 1.4049789255047786e-05,
+      "loss": 2.7135,
+      "step": 15270
+    },
+    {
+      "epoch": 0.8866066118339934,
+      "grad_norm": 0.11168920993804932,
+      "learning_rate": 1.3908829564169013e-05,
+      "loss": 2.7101,
+      "step": 15280
+    },
+    {
+      "epoch": 0.8871868517632041,
+      "grad_norm": 0.11263593286275864,
+      "learning_rate": 1.3768555080474189e-05,
+      "loss": 2.7157,
+      "step": 15290
+    },
+    {
+      "epoch": 0.8877670916924149,
+      "grad_norm": 0.1139439269900322,
+      "learning_rate": 1.3628966320468595e-05,
+      "loss": 2.7095,
+      "step": 15300
+    },
+    {
+      "epoch": 0.8883473316216255,
+      "grad_norm": 0.11641982942819595,
+      "learning_rate": 1.3490063798132802e-05,
+      "loss": 2.7105,
+      "step": 15310
+    },
+    {
+      "epoch": 0.8889275715508362,
+      "grad_norm": 0.11295609176158905,
+      "learning_rate": 1.335184802492031e-05,
+      "loss": 2.7104,
+      "step": 15320
+    },
+    {
+      "epoch": 0.889507811480047,
+      "grad_norm": 0.11365869641304016,
+      "learning_rate": 1.3214319509756158e-05,
+      "loss": 2.7151,
+      "step": 15330
+    },
+    {
+      "epoch": 0.8900880514092577,
+      "grad_norm": 0.11353792250156403,
+      "learning_rate": 1.3077478759034733e-05,
+      "loss": 2.7207,
+      "step": 15340
+    },
+    {
+      "epoch": 0.8906682913384685,
+      "grad_norm": 0.11343677341938019,
+      "learning_rate": 1.294132627661797e-05,
+      "loss": 2.7095,
+      "step": 15350
+    },
+    {
+      "epoch": 0.8912485312676792,
+      "grad_norm": 0.11483877897262573,
+      "learning_rate": 1.280586256383367e-05,
+      "loss": 2.7138,
+      "step": 15360
+    },
+    {
+      "epoch": 0.89182877119689,
+      "grad_norm": 0.11700621247291565,
+      "learning_rate": 1.2671088119473284e-05,
+      "loss": 2.7164,
+      "step": 15370
+    },
+    {
+      "epoch": 0.8924090111261006,
+      "grad_norm": 0.11624756455421448,
+      "learning_rate": 1.253700343979054e-05,
+      "loss": 2.7064,
+      "step": 15380
+    },
+    {
+      "epoch": 0.8929892510553114,
+      "grad_norm": 0.1139611005783081,
+      "learning_rate": 1.2403609018499219e-05,
+      "loss": 2.7125,
+      "step": 15390
+    },
+    {
+      "epoch": 0.8935694909845221,
+      "grad_norm": 0.11446714401245117,
+      "learning_rate": 1.2270905346771577e-05,
+      "loss": 2.7072,
+      "step": 15400
+    },
+    {
+      "epoch": 0.8941497309137328,
+      "grad_norm": 0.11538238823413849,
+      "learning_rate": 1.2138892913236444e-05,
+      "loss": 2.718,
+      "step": 15410
+    },
+    {
+      "epoch": 0.8947299708429436,
+      "grad_norm": 0.11417897045612335,
+      "learning_rate": 1.2007572203977369e-05,
+      "loss": 2.7022,
+      "step": 15420
+    },
+    {
+      "epoch": 0.8953102107721543,
+      "grad_norm": 0.11221955716609955,
+      "learning_rate": 1.1876943702531052e-05,
+      "loss": 2.7063,
+      "step": 15430
+    },
+    {
+      "epoch": 0.895890450701365,
+      "grad_norm": 0.11339222639799118,
+      "learning_rate": 1.1747007889885252e-05,
+      "loss": 2.7063,
+      "step": 15440
+    },
+    {
+      "epoch": 0.8964706906305757,
+      "grad_norm": 0.11208420246839523,
+      "learning_rate": 1.1617765244477285e-05,
+      "loss": 2.7113,
+      "step": 15450
+    },
+    {
+      "epoch": 0.8970509305597865,
+      "grad_norm": 0.11443324387073517,
+      "learning_rate": 1.148921624219208e-05,
+      "loss": 2.7151,
+      "step": 15460
+    },
+    {
+      "epoch": 0.8976311704889972,
+      "grad_norm": 0.1121024414896965,
+      "learning_rate": 1.1361361356360523e-05,
+      "loss": 2.7105,
+      "step": 15470
+    },
+    {
+      "epoch": 0.898211410418208,
+      "grad_norm": 0.11098407953977585,
+      "learning_rate": 1.1234201057757743e-05,
+      "loss": 2.7157,
+      "step": 15480
+    },
+    {
+      "epoch": 0.8987916503474187,
+      "grad_norm": 0.11558841168880463,
+      "learning_rate": 1.110773581460125e-05,
+      "loss": 2.7207,
+      "step": 15490
+    },
+    {
+      "epoch": 0.8993718902766293,
+      "grad_norm": 0.11299290508031845,
+      "learning_rate": 1.0981966092549311e-05,
+      "loss": 2.7231,
+      "step": 15500
+    },
+    {
+      "epoch": 0.8999521302058401,
+      "grad_norm": 0.11334340274333954,
+      "learning_rate": 1.0856892354699222e-05,
+      "loss": 2.7113,
+      "step": 15510
+    },
+    {
+      "epoch": 0.9005323701350508,
+      "grad_norm": 0.11435014754533768,
+      "learning_rate": 1.0732515061585613e-05,
+      "loss": 2.7142,
+      "step": 15520
+    },
+    {
+      "epoch": 0.9011126100642616,
+      "grad_norm": 0.11135628074407578,
+      "learning_rate": 1.0608834671178635e-05,
+      "loss": 2.7064,
+      "step": 15530
+    },
+    {
+      "epoch": 0.9016928499934723,
+      "grad_norm": 0.11612440645694733,
+      "learning_rate": 1.0485851638882537e-05,
+      "loss": 2.7138,
+      "step": 15540
+    },
+    {
+      "epoch": 0.9022730899226831,
+      "grad_norm": 0.1127479076385498,
+      "learning_rate": 1.0363566417533687e-05,
+      "loss": 2.7149,
+      "step": 15550
+    },
+    {
+      "epoch": 0.9028533298518937,
+      "grad_norm": 0.11329977214336395,
+      "learning_rate": 1.0241979457399064e-05,
+      "loss": 2.7056,
+      "step": 15560
+    },
+    {
+      "epoch": 0.9034335697811045,
+      "grad_norm": 0.1118236631155014,
+      "learning_rate": 1.0121091206174615e-05,
+      "loss": 2.7131,
+      "step": 15570
+    },
+    {
+      "epoch": 0.9040138097103152,
+      "grad_norm": 0.11316058784723282,
+      "learning_rate": 1.0000902108983523e-05,
+      "loss": 2.7104,
+      "step": 15580
+    },
+    {
+      "epoch": 0.9045940496395259,
+      "grad_norm": 0.11303921043872833,
+      "learning_rate": 9.881412608374629e-06,
+      "loss": 2.7026,
+      "step": 15590
+    },
+    {
+      "epoch": 0.9051742895687367,
+      "grad_norm": 0.11121919751167297,
+      "learning_rate": 9.762623144320838e-06,
+      "loss": 2.7049,
+      "step": 15600
+    },
+    {
+      "epoch": 0.9057545294979474,
+      "grad_norm": 0.11181233078241348,
+      "learning_rate": 9.644534154217354e-06,
+      "loss": 2.7145,
+      "step": 15610
+    },
+    {
+      "epoch": 0.9063347694271582,
+      "grad_norm": 0.11229850351810455,
+      "learning_rate": 9.527146072880254e-06,
+      "loss": 2.7089,
+      "step": 15620
+    },
+    {
+      "epoch": 0.9069150093563688,
+      "grad_norm": 0.11836584657430649,
+      "learning_rate": 9.410459332544697e-06,
+      "loss": 2.7143,
+      "step": 15630
+    },
+    {
+      "epoch": 0.9074952492855796,
+      "grad_norm": 0.11599016189575195,
+      "learning_rate": 9.294474362863525e-06,
+      "loss": 2.7071,
+      "step": 15640
+    },
+    {
+      "epoch": 0.9080754892147903,
+      "grad_norm": 0.11280685663223267,
+      "learning_rate": 9.179191590905523e-06,
+      "loss": 2.7099,
+      "step": 15650
+    },
+    {
+      "epoch": 0.9086557291440011,
+      "grad_norm": 0.11349350959062576,
+      "learning_rate": 9.064611441153935e-06,
+      "loss": 2.7031,
+      "step": 15660
+    },
+    {
+      "epoch": 0.9092359690732118,
+      "grad_norm": 0.1137542724609375,
+      "learning_rate": 8.950734335504907e-06,
+      "loss": 2.6978,
+      "step": 15670
+    },
+    {
+      "epoch": 0.9098162090024225,
+      "grad_norm": 0.11099807173013687,
+      "learning_rate": 8.837560693265844e-06,
+      "loss": 2.7104,
+      "step": 15680
+    },
+    {
+      "epoch": 0.9103964489316332,
+      "grad_norm": 0.11295212060213089,
+      "learning_rate": 8.725090931153968e-06,
+      "loss": 2.7144,
+      "step": 15690
+    },
+    {
+      "epoch": 0.9109766888608439,
+      "grad_norm": 0.11119643598794937,
+      "learning_rate": 8.613325463294675e-06,
+      "loss": 2.7047,
+      "step": 15700
+    },
+    {
+      "epoch": 0.9115569287900547,
+      "grad_norm": 0.11346277594566345,
+      "learning_rate": 8.502264701220198e-06,
+      "loss": 2.7137,
+      "step": 15710
+    },
+    {
+      "epoch": 0.9121371687192654,
+      "grad_norm": 0.11389115452766418,
+      "learning_rate": 8.391909053867863e-06,
+      "loss": 2.7077,
+      "step": 15720
+    },
+    {
+      "epoch": 0.9127174086484762,
+      "grad_norm": 0.11094717681407928,
+      "learning_rate": 8.282258927578723e-06,
+      "loss": 2.707,
+      "step": 15730
+    },
+    {
+      "epoch": 0.9132976485776869,
+      "grad_norm": 0.1122366338968277,
+      "learning_rate": 8.173314726096038e-06,
+      "loss": 2.7167,
+      "step": 15740
+    },
+    {
+      "epoch": 0.9138778885068976,
+      "grad_norm": 0.11315654963254929,
+      "learning_rate": 8.065076850563746e-06,
+      "loss": 2.7104,
+      "step": 15750
+    },
+    {
+      "epoch": 0.9144581284361083,
+      "grad_norm": 0.1108250766992569,
+      "learning_rate": 7.957545699525093e-06,
+      "loss": 2.7144,
+      "step": 15760
+    },
+    {
+      "epoch": 0.915038368365319,
+      "grad_norm": 0.11187420040369034,
+      "learning_rate": 7.85072166892098e-06,
+      "loss": 2.7163,
+      "step": 15770
+    },
+    {
+      "epoch": 0.9156186082945298,
+      "grad_norm": 0.11263624578714371,
+      "learning_rate": 7.744605152088724e-06,
+      "loss": 2.7184,
+      "step": 15780
+    },
+    {
+      "epoch": 0.9161988482237405,
+      "grad_norm": 0.11211346089839935,
+      "learning_rate": 7.639196539760462e-06,
+      "loss": 2.7106,
+      "step": 15790
+    },
+    {
+      "epoch": 0.9167790881529513,
+      "grad_norm": 0.11310411244630814,
+      "learning_rate": 7.534496220061682e-06,
+      "loss": 2.711,
+      "step": 15800
+    },
+    {
+      "epoch": 0.917359328082162,
+      "grad_norm": 0.11148010939359665,
+      "learning_rate": 7.430504578510023e-06,
+      "loss": 2.716,
+      "step": 15810
+    },
+    {
+      "epoch": 0.9179395680113727,
+      "grad_norm": 0.11117308586835861,
+      "learning_rate": 7.327221998013522e-06,
+      "loss": 2.7111,
+      "step": 15820
+    },
+    {
+      "epoch": 0.9185198079405834,
+      "grad_norm": 0.1113533303141594,
+      "learning_rate": 7.224648858869487e-06,
+      "loss": 2.7152,
+      "step": 15830
+    },
+    {
+      "epoch": 0.9191000478697942,
+      "grad_norm": 0.1133064553141594,
+      "learning_rate": 7.122785538762999e-06,
+      "loss": 2.7071,
+      "step": 15840
+    },
+    {
+      "epoch": 0.9196802877990049,
+      "grad_norm": 0.11209236830472946,
+      "learning_rate": 7.021632412765411e-06,
+      "loss": 2.7113,
+      "step": 15850
+    },
+    {
+      "epoch": 0.9202605277282156,
+      "grad_norm": 0.11444966495037079,
+      "learning_rate": 6.9211898533331874e-06,
+      "loss": 2.7072,
+      "step": 15860
+    },
+    {
+      "epoch": 0.9208407676574264,
+      "grad_norm": 0.1107277199625969,
+      "learning_rate": 6.821458230306288e-06,
+      "loss": 2.7075,
+      "step": 15870
+    },
+    {
+      "epoch": 0.921421007586637,
+      "grad_norm": 0.11145395040512085,
+      "learning_rate": 6.722437910907098e-06,
+      "loss": 2.7063,
+      "step": 15880
+    },
+    {
+      "epoch": 0.9220012475158478,
+      "grad_norm": 0.11115839332342148,
+      "learning_rate": 6.6241292597386764e-06,
+      "loss": 2.7071,
+      "step": 15890
+    },
+    {
+      "epoch": 0.9225814874450585,
+      "grad_norm": 0.114626444876194,
+      "learning_rate": 6.5265326387838885e-06,
+      "loss": 2.7121,
+      "step": 15900
+    },
+    {
+      "epoch": 0.9231617273742693,
+      "grad_norm": 0.11295740306377411,
+      "learning_rate": 6.429648407403655e-06,
+      "loss": 2.7101,
+      "step": 15910
+    },
+    {
+      "epoch": 0.92374196730348,
+      "grad_norm": 0.11063504219055176,
+      "learning_rate": 6.333476922335857e-06,
+      "loss": 2.7043,
+      "step": 15920
+    },
+    {
+      "epoch": 0.9243222072326908,
+      "grad_norm": 0.11329693347215652,
+      "learning_rate": 6.238018537694057e-06,
+      "loss": 2.7148,
+      "step": 15930
+    },
+    {
+      "epoch": 0.9249024471619014,
+      "grad_norm": 0.11279025673866272,
+      "learning_rate": 6.143273604965915e-06,
+      "loss": 2.7225,
+      "step": 15940
+    },
+    {
+      "epoch": 0.9254826870911121,
+      "grad_norm": 0.11234597116708755,
+      "learning_rate": 6.049242473012284e-06,
+      "loss": 2.71,
+      "step": 15950
+    },
+    {
+      "epoch": 0.9260629270203229,
+      "grad_norm": 0.11071603745222092,
+      "learning_rate": 5.955925488065605e-06,
+      "loss": 2.7065,
+      "step": 15960
+    },
+    {
+      "epoch": 0.9266431669495336,
+      "grad_norm": 0.11233749240636826,
+      "learning_rate": 5.863322993728781e-06,
+      "loss": 2.7108,
+      "step": 15970
+    },
+    {
+      "epoch": 0.9272234068787444,
+      "grad_norm": 0.1108274981379509,
+      "learning_rate": 5.771435330973973e-06,
+      "loss": 2.6998,
+      "step": 15980
+    },
+    {
+      "epoch": 0.9278036468079551,
+      "grad_norm": 0.11140532791614532,
+      "learning_rate": 5.6802628381410705e-06,
+      "loss": 2.7102,
+      "step": 15990
+    },
+    {
+      "epoch": 0.9283838867371658,
+      "grad_norm": 0.110744908452034,
+      "learning_rate": 5.5898058509368245e-06,
+      "loss": 2.7094,
+      "step": 16000
+    },
+    {
+      "epoch": 0.9283838867371658,
+      "eval_loss": 2.6828300952911377,
+      "eval_runtime": 5.3865,
+      "eval_samples_per_second": 803.858,
+      "eval_steps_per_second": 1.671,
+      "step": 16000
+    },
+    {
+      "epoch": 0.9289641266663765,
+      "grad_norm": 0.11381204426288605,
+      "learning_rate": 5.500064702433294e-06,
+      "loss": 2.7111,
+      "step": 16010
+    },
+    {
+      "epoch": 0.9295443665955873,
+      "grad_norm": 0.11151892691850662,
+      "learning_rate": 5.411039723066802e-06,
+      "loss": 2.714,
+      "step": 16020
+    },
+    {
+      "epoch": 0.930124606524798,
+      "grad_norm": 0.1121894121170044,
+      "learning_rate": 5.3227312406366915e-06,
+      "loss": 2.7176,
+      "step": 16030
+    },
+    {
+      "epoch": 0.9307048464540087,
+      "grad_norm": 0.11182450503110886,
+      "learning_rate": 5.235139580303949e-06,
+      "loss": 2.7077,
+      "step": 16040
+    },
+    {
+      "epoch": 0.9312850863832195,
+      "grad_norm": 0.11129864305257797,
+      "learning_rate": 5.148265064590341e-06,
+      "loss": 2.7088,
+      "step": 16050
+    },
+    {
+      "epoch": 0.9318653263124302,
+      "grad_norm": 0.1094869002699852,
+      "learning_rate": 5.062108013376876e-06,
+      "loss": 2.7157,
+      "step": 16060
+    },
+    {
+      "epoch": 0.9324455662416409,
+      "grad_norm": 0.1108771562576294,
+      "learning_rate": 4.976668743902857e-06,
+      "loss": 2.7095,
+      "step": 16070
+    },
+    {
+      "epoch": 0.9330258061708516,
+      "grad_norm": 0.1098882406949997,
+      "learning_rate": 4.891947570764655e-06,
+      "loss": 2.7085,
+      "step": 16080
+    },
+    {
+      "epoch": 0.9336060461000624,
+      "grad_norm": 0.11074597388505936,
+      "learning_rate": 4.807944805914444e-06,
+      "loss": 2.7089,
+      "step": 16090
+    },
+    {
+      "epoch": 0.9341862860292731,
+      "grad_norm": 0.11346107721328735,
+      "learning_rate": 4.724660758659272e-06,
+      "loss": 2.6967,
+      "step": 16100
+    },
+    {
+      "epoch": 0.9347665259584839,
+      "grad_norm": 0.11357604712247849,
+      "learning_rate": 4.64209573565968e-06,
+      "loss": 2.7057,
+      "step": 16110
+    },
+    {
+      "epoch": 0.9353467658876946,
+      "grad_norm": 0.11271534860134125,
+      "learning_rate": 4.560250040928748e-06,
+      "loss": 2.7033,
+      "step": 16120
+    },
+    {
+      "epoch": 0.9359270058169052,
+      "grad_norm": 0.11321555078029633,
+      "learning_rate": 4.479123975830879e-06,
+      "loss": 2.7137,
+      "step": 16130
+    },
+    {
+      "epoch": 0.936507245746116,
+      "grad_norm": 0.11405043303966522,
+      "learning_rate": 4.398717839080746e-06,
+      "loss": 2.7191,
+      "step": 16140
+    },
+    {
+      "epoch": 0.9370874856753267,
+      "grad_norm": 0.11109951883554459,
+      "learning_rate": 4.319031926742234e-06,
+      "loss": 2.7126,
+      "step": 16150
+    },
+    {
+      "epoch": 0.9376677256045375,
+      "grad_norm": 0.11525629460811615,
+      "learning_rate": 4.240066532227105e-06,
+      "loss": 2.7033,
+      "step": 16160
+    },
+    {
+      "epoch": 0.9382479655337482,
+      "grad_norm": 0.1110944077372551,
+      "learning_rate": 4.161821946294309e-06,
+      "loss": 2.7133,
+      "step": 16170
+    },
+    {
+      "epoch": 0.938828205462959,
+      "grad_norm": 0.1128690242767334,
+      "learning_rate": 4.084298457048563e-06,
+      "loss": 2.7172,
+      "step": 16180
+    },
+    {
+      "epoch": 0.9394084453921696,
+      "grad_norm": 0.11161357909440994,
+      "learning_rate": 4.007496349939466e-06,
+      "loss": 2.7159,
+      "step": 16190
+    },
+    {
+      "epoch": 0.9399886853213804,
+      "grad_norm": 0.11112505942583084,
+      "learning_rate": 3.931415907760494e-06,
+      "loss": 2.7015,
+      "step": 16200
+    },
+    {
+      "epoch": 0.9405689252505911,
+      "grad_norm": 0.11174745112657547,
+      "learning_rate": 3.856057410647695e-06,
+      "loss": 2.7141,
+      "step": 16210
+    },
+    {
+      "epoch": 0.9411491651798018,
+      "grad_norm": 0.11169620603322983,
+      "learning_rate": 3.781421136079044e-06,
+      "loss": 2.7126,
+      "step": 16220
+    },
+    {
+      "epoch": 0.9417294051090126,
+      "grad_norm": 0.11062929034233093,
+      "learning_rate": 3.707507358873086e-06,
+      "loss": 2.7104,
+      "step": 16230
+    },
+    {
+      "epoch": 0.9423096450382233,
+      "grad_norm": 0.11094637215137482,
+      "learning_rate": 3.634316351188094e-06,
+      "loss": 2.7094,
+      "step": 16240
+    },
+    {
+      "epoch": 0.9428898849674341,
+      "grad_norm": 0.11300352960824966,
+      "learning_rate": 3.5618483825210048e-06,
+      "loss": 2.7006,
+      "step": 16250
+    },
+    {
+      "epoch": 0.9434701248966447,
+      "grad_norm": 0.11278436332941055,
+      "learning_rate": 3.4901037197064834e-06,
+      "loss": 2.7102,
+      "step": 16260
+    },
+    {
+      "epoch": 0.9440503648258555,
+      "grad_norm": 0.10999085009098053,
+      "learning_rate": 3.419082626915926e-06,
+      "loss": 2.7085,
+      "step": 16270
+    },
+    {
+      "epoch": 0.9446306047550662,
+      "grad_norm": 0.11130973696708679,
+      "learning_rate": 3.3487853656563927e-06,
+      "loss": 2.709,
+      "step": 16280
+    },
+    {
+      "epoch": 0.945210844684277,
+      "grad_norm": 0.11214598268270493,
+      "learning_rate": 3.279212194769787e-06,
+      "loss": 2.7168,
+      "step": 16290
+    },
+    {
+      "epoch": 0.9457910846134877,
+      "grad_norm": 0.11364555358886719,
+      "learning_rate": 3.2103633704318124e-06,
+      "loss": 2.7116,
+      "step": 16300
+    },
+    {
+      "epoch": 0.9463713245426985,
+      "grad_norm": 0.11130227893590927,
+      "learning_rate": 3.142239146151016e-06,
+      "loss": 2.7174,
+      "step": 16310
+    },
+    {
+      "epoch": 0.9469515644719091,
+      "grad_norm": 0.1102483943104744,
+      "learning_rate": 3.07483977276799e-06,
+      "loss": 2.71,
+      "step": 16320
+    },
+    {
+      "epoch": 0.9475318044011198,
+      "grad_norm": 0.11199984699487686,
+      "learning_rate": 3.0081654984542628e-06,
+      "loss": 2.7217,
+      "step": 16330
+    },
+    {
+      "epoch": 0.9481120443303306,
+      "grad_norm": 0.11214699596166611,
+      "learning_rate": 2.9422165687114754e-06,
+      "loss": 2.7177,
+      "step": 16340
+    },
+    {
+      "epoch": 0.9486922842595413,
+      "grad_norm": 0.11154871433973312,
+      "learning_rate": 2.8769932263705167e-06,
+      "loss": 2.7183,
+      "step": 16350
+    },
+    {
+      "epoch": 0.9492725241887521,
+      "grad_norm": 0.11083105206489563,
+      "learning_rate": 2.8124957115905683e-06,
+      "loss": 2.7145,
+      "step": 16360
+    },
+    {
+      "epoch": 0.9498527641179628,
+      "grad_norm": 0.11013077944517136,
+      "learning_rate": 2.7487242618581933e-06,
+      "loss": 2.7047,
+      "step": 16370
+    },
+    {
+      "epoch": 0.9504330040471735,
+      "grad_norm": 0.11090140789747238,
+      "learning_rate": 2.6856791119866275e-06,
+      "loss": 2.7061,
+      "step": 16380
+    },
+    {
+      "epoch": 0.9510132439763842,
+      "grad_norm": 0.11299975216388702,
+      "learning_rate": 2.623360494114646e-06,
+      "loss": 2.7003,
+      "step": 16390
+    },
+    {
+      "epoch": 0.951593483905595,
+      "grad_norm": 0.1101190522313118,
+      "learning_rate": 2.5617686377059637e-06,
+      "loss": 2.706,
+      "step": 16400
+    },
+    {
+      "epoch": 0.9521737238348057,
+      "grad_norm": 0.11057887226343155,
+      "learning_rate": 2.5009037695482574e-06,
+      "loss": 2.7179,
+      "step": 16410
+    },
+    {
+      "epoch": 0.9527539637640164,
+      "grad_norm": 0.1128411814570427,
+      "learning_rate": 2.4407661137523243e-06,
+      "loss": 2.7082,
+      "step": 16420
+    },
+    {
+      "epoch": 0.9533342036932272,
+      "grad_norm": 0.11165603250265121,
+      "learning_rate": 2.3813558917513025e-06,
+      "loss": 2.7253,
+      "step": 16430
+    },
+    {
+      "epoch": 0.9539144436224378,
+      "grad_norm": 0.11112351715564728,
+      "learning_rate": 2.322673322299873e-06,
+      "loss": 2.7214,
+      "step": 16440
+    },
+    {
+      "epoch": 0.9544946835516486,
+      "grad_norm": 0.11173354089260101,
+      "learning_rate": 2.2647186214734162e-06,
+      "loss": 2.709,
+      "step": 16450
+    },
+    {
+      "epoch": 0.9550749234808593,
+      "grad_norm": 0.1092383936047554,
+      "learning_rate": 2.207492002667211e-06,
+      "loss": 2.7124,
+      "step": 16460
+    },
+    {
+      "epoch": 0.9556551634100701,
+      "grad_norm": 0.11275044083595276,
+      "learning_rate": 2.150993676595614e-06,
+      "loss": 2.7105,
+      "step": 16470
+    },
+    {
+      "epoch": 0.9562354033392808,
+      "grad_norm": 0.11197572201490402,
+      "learning_rate": 2.095223851291439e-06,
+      "loss": 2.7034,
+      "step": 16480
+    },
+    {
+      "epoch": 0.9568156432684916,
+      "grad_norm": 0.11042193323373795,
+      "learning_rate": 2.0401827321049783e-06,
+      "loss": 2.7091,
+      "step": 16490
+    },
+    {
+      "epoch": 0.9573958831977023,
+      "grad_norm": 0.11012552678585052,
+      "learning_rate": 1.9858705217034478e-06,
+      "loss": 2.7145,
+      "step": 16500
+    },
+    {
+      "epoch": 0.9579761231269129,
+      "grad_norm": 0.11172161996364594,
+      "learning_rate": 1.9322874200700558e-06,
+      "loss": 2.7217,
+      "step": 16510
+    },
+    {
+      "epoch": 0.9585563630561237,
+      "grad_norm": 0.11069660633802414,
+      "learning_rate": 1.8794336245034238e-06,
+      "loss": 2.7084,
+      "step": 16520
+    },
+    {
+      "epoch": 0.9591366029853344,
+      "grad_norm": 0.11089422553777695,
+      "learning_rate": 1.8273093296167443e-06,
+      "loss": 2.71,
+      "step": 16530
+    },
+    {
+      "epoch": 0.9597168429145452,
+      "grad_norm": 0.11164766550064087,
+      "learning_rate": 1.7759147273371136e-06,
+      "loss": 2.7163,
+      "step": 16540
+    },
+    {
+      "epoch": 0.9602970828437559,
+      "grad_norm": 0.1093573048710823,
+      "learning_rate": 1.7252500069048882e-06,
+      "loss": 2.7069,
+      "step": 16550
+    },
+    {
+      "epoch": 0.9608773227729667,
+      "grad_norm": 0.11053937673568726,
+      "learning_rate": 1.6753153548728417e-06,
+      "loss": 2.7067,
+      "step": 16560
+    },
+    {
+      "epoch": 0.9614575627021773,
+      "grad_norm": 0.11065319180488586,
+      "learning_rate": 1.6261109551056307e-06,
+      "loss": 2.7054,
+      "step": 16570
+    },
+    {
+      "epoch": 0.962037802631388,
+      "grad_norm": 0.11450429260730743,
+      "learning_rate": 1.5776369887789521e-06,
+      "loss": 2.7046,
+      "step": 16580
+    },
+    {
+      "epoch": 0.9626180425605988,
+      "grad_norm": 0.1109624058008194,
+      "learning_rate": 1.529893634379076e-06,
+      "loss": 2.7088,
+      "step": 16590
+    },
+    {
+      "epoch": 0.9631982824898095,
+      "grad_norm": 0.1117752194404602,
+      "learning_rate": 1.4828810677020244e-06,
+      "loss": 2.7108,
+      "step": 16600
+    },
+    {
+      "epoch": 0.9637785224190203,
+      "grad_norm": 0.11164771765470505,
+      "learning_rate": 1.4365994618529499e-06,
+      "loss": 2.7099,
+      "step": 16610
+    },
+    {
+      "epoch": 0.964358762348231,
+      "grad_norm": 0.11132363975048065,
+      "learning_rate": 1.3910489872456468e-06,
+      "loss": 2.7074,
+      "step": 16620
+    },
+    {
+      "epoch": 0.9649390022774417,
+      "grad_norm": 0.11013256758451462,
+      "learning_rate": 1.3462298116016847e-06,
+      "loss": 2.7112,
+      "step": 16630
+    },
+    {
+      "epoch": 0.9655192422066524,
+      "grad_norm": 0.11211226135492325,
+      "learning_rate": 1.3021420999499656e-06,
+      "loss": 2.7194,
+      "step": 16640
+    },
+    {
+      "epoch": 0.9660994821358632,
+      "grad_norm": 0.11033762246370316,
+      "learning_rate": 1.258786014626101e-06,
+      "loss": 2.7058,
+      "step": 16650
+    },
+    {
+      "epoch": 0.9666797220650739,
+      "grad_norm": 0.11029258370399475,
+      "learning_rate": 1.216161715271702e-06,
+      "loss": 2.698,
+      "step": 16660
+    },
+    {
+      "epoch": 0.9672599619942847,
+      "grad_norm": 0.1115414947271347,
+      "learning_rate": 1.1742693588339126e-06,
+      "loss": 2.7096,
+      "step": 16670
+    },
+    {
+      "epoch": 0.9678402019234954,
+      "grad_norm": 0.11232610791921616,
+      "learning_rate": 1.1331090995647665e-06,
+      "loss": 2.7081,
+      "step": 16680
+    },
+    {
+      "epoch": 0.9684204418527061,
+      "grad_norm": 0.11218692362308502,
+      "learning_rate": 1.0926810890206528e-06,
+      "loss": 2.7094,
+      "step": 16690
+    },
+    {
+      "epoch": 0.9690006817819168,
+      "grad_norm": 0.1100686714053154,
+      "learning_rate": 1.0529854760617853e-06,
+      "loss": 2.7133,
+      "step": 16700
+    },
+    {
+      "epoch": 0.9695809217111275,
+      "grad_norm": 0.11020591855049133,
+      "learning_rate": 1.0140224068515113e-06,
+      "loss": 2.7086,
+      "step": 16710
+    },
+    {
+      "epoch": 0.9701611616403383,
+      "grad_norm": 0.1117192879319191,
+      "learning_rate": 9.757920248559815e-07,
+      "loss": 2.7066,
+      "step": 16720
+    },
+    {
+      "epoch": 0.970741401569549,
+      "grad_norm": 0.10976511240005493,
+      "learning_rate": 9.382944708434149e-07,
+      "loss": 2.7279,
+      "step": 16730
+    },
+    {
+      "epoch": 0.9713216414987598,
+      "grad_norm": 0.1115812286734581,
+      "learning_rate": 9.01529882883767e-07,
+      "loss": 2.7104,
+      "step": 16740
+    },
+    {
+      "epoch": 0.9719018814279705,
+      "grad_norm": 0.11191580444574356,
+      "learning_rate": 8.654983963481078e-07,
+      "loss": 2.7109,
+      "step": 16750
+    },
+    {
+      "epoch": 0.9724821213571812,
+      "grad_norm": 0.11055436730384827,
+      "learning_rate": 8.302001439081108e-07,
+      "loss": 2.7166,
+      "step": 16760
+    },
+    {
+      "epoch": 0.9730623612863919,
+      "grad_norm": 0.1103782057762146,
+      "learning_rate": 7.956352555356761e-07,
+      "loss": 2.7106,
+      "step": 16770
+    },
+    {
+      "epoch": 0.9736426012156026,
+      "grad_norm": 0.11125820130109787,
+      "learning_rate": 7.618038585023301e-07,
+      "loss": 2.7046,
+      "step": 16780
+    },
+    {
+      "epoch": 0.9742228411448134,
+      "grad_norm": 0.10956889390945435,
+      "learning_rate": 7.287060773788268e-07,
+      "loss": 2.7138,
+      "step": 16790
+    },
+    {
+      "epoch": 0.9748030810740241,
+      "grad_norm": 0.10879559069871902,
+      "learning_rate": 6.96342034034636e-07,
+      "loss": 2.7084,
+      "step": 16800
+    },
+    {
+      "epoch": 0.9753833210032349,
+      "grad_norm": 0.11071236431598663,
+      "learning_rate": 6.647118476375891e-07,
+      "loss": 2.708,
+      "step": 16810
+    },
+    {
+      "epoch": 0.9759635609324455,
+      "grad_norm": 0.1103687658905983,
+      "learning_rate": 6.338156346533452e-07,
+      "loss": 2.7076,
+      "step": 16820
+    },
+    {
+      "epoch": 0.9765438008616563,
+      "grad_norm": 0.11137609928846359,
+      "learning_rate": 6.036535088449702e-07,
+      "loss": 2.7063,
+      "step": 16830
+    },
+    {
+      "epoch": 0.977124040790867,
+      "grad_norm": 0.10921121388673782,
+      "learning_rate": 5.742255812726027e-07,
+      "loss": 2.7155,
+      "step": 16840
+    },
+    {
+      "epoch": 0.9777042807200778,
+      "grad_norm": 0.10973076522350311,
+      "learning_rate": 5.455319602929221e-07,
+      "loss": 2.7143,
+      "step": 16850
+    },
+    {
+      "epoch": 0.9782845206492885,
+      "grad_norm": 0.11031708121299744,
+      "learning_rate": 5.175727515588591e-07,
+      "loss": 2.7045,
+      "step": 16860
+    },
+    {
+      "epoch": 0.9788647605784992,
+      "grad_norm": 0.10978976637125015,
+      "learning_rate": 4.903480580191744e-07,
+      "loss": 2.7021,
+      "step": 16870
+    },
+    {
+      "epoch": 0.9794450005077099,
+      "grad_norm": 0.11084671318531036,
+      "learning_rate": 4.638579799179921e-07,
+      "loss": 2.7039,
+      "step": 16880
+    },
+    {
+      "epoch": 0.9800252404369206,
+      "grad_norm": 0.11046291887760162,
+      "learning_rate": 4.381026147945999e-07,
+      "loss": 2.7127,
+      "step": 16890
+    },
+    {
+      "epoch": 0.9806054803661314,
+      "grad_norm": 0.10937215387821198,
+      "learning_rate": 4.130820574829386e-07,
+      "loss": 2.7184,
+      "step": 16900
+    },
+    {
+      "epoch": 0.9811857202953421,
+      "grad_norm": 0.11020946502685547,
+      "learning_rate": 3.887964001113131e-07,
+      "loss": 2.717,
+      "step": 16910
+    },
+    {
+      "epoch": 0.9817659602245529,
+      "grad_norm": 0.11252225935459137,
+      "learning_rate": 3.652457321020597e-07,
+      "loss": 2.7121,
+      "step": 16920
+    },
+    {
+      "epoch": 0.9823462001537636,
+      "grad_norm": 0.11090285331010818,
+      "learning_rate": 3.4243014017119045e-07,
+      "loss": 2.7113,
+      "step": 16930
+    },
+    {
+      "epoch": 0.9829264400829744,
+      "grad_norm": 0.11007850617170334,
+      "learning_rate": 3.203497083281493e-07,
+      "loss": 2.7147,
+      "step": 16940
+    },
+    {
+      "epoch": 0.983506680012185,
+      "grad_norm": 0.11209884285926819,
+      "learning_rate": 2.9900451787534533e-07,
+      "loss": 2.7086,
+      "step": 16950
+    },
+    {
+      "epoch": 0.9840869199413957,
+      "grad_norm": 0.1093859076499939,
+      "learning_rate": 2.783946474080423e-07,
+      "loss": 2.708,
+      "step": 16960
+    },
+    {
+      "epoch": 0.9846671598706065,
+      "grad_norm": 0.11188683658838272,
+      "learning_rate": 2.5852017281393636e-07,
+      "loss": 2.7107,
+      "step": 16970
+    },
+    {
+      "epoch": 0.9852473997998172,
+      "grad_norm": 0.10943835973739624,
+      "learning_rate": 2.393811672729118e-07,
+      "loss": 2.7148,
+      "step": 16980
+    },
+    {
+      "epoch": 0.985827639729028,
+      "grad_norm": 0.10962080210447311,
+      "learning_rate": 2.2097770125679705e-07,
+      "loss": 2.7067,
+      "step": 16990
+    },
+    {
+      "epoch": 0.9864078796582387,
+      "grad_norm": 0.11074183881282806,
+      "learning_rate": 2.0330984252909801e-07,
+      "loss": 2.7068,
+      "step": 17000
+    },
+    {
+      "epoch": 0.9864078796582387,
+      "eval_loss": 2.681455612182617,
+      "eval_runtime": 5.3878,
+      "eval_samples_per_second": 803.669,
+      "eval_steps_per_second": 1.67,
+      "step": 17000
+    },
+    {
+      "epoch": 0.9869881195874494,
+      "grad_norm": 0.11070505529642105,
+      "learning_rate": 1.8637765614468728e-07,
+      "loss": 2.7131,
+      "step": 17010
+    },
+    {
+      "epoch": 0.9875683595166601,
+      "grad_norm": 0.10992126911878586,
+      "learning_rate": 1.701812044496931e-07,
+      "loss": 2.7069,
+      "step": 17020
+    },
+    {
+      "epoch": 0.9881485994458709,
+      "grad_norm": 0.10988050699234009,
+      "learning_rate": 1.5472054708112193e-07,
+      "loss": 2.7057,
+      "step": 17030
+    },
+    {
+      "epoch": 0.9887288393750816,
+      "grad_norm": 0.10954176634550095,
+      "learning_rate": 1.3999574096672518e-07,
+      "loss": 2.7031,
+      "step": 17040
+    },
+    {
+      "epoch": 0.9893090793042923,
+      "grad_norm": 0.10873284935951233,
+      "learning_rate": 1.2600684032479936e-07,
+      "loss": 2.7103,
+      "step": 17050
+    },
+    {
+      "epoch": 0.9898893192335031,
+      "grad_norm": 0.10949879884719849,
+      "learning_rate": 1.1275389666391967e-07,
+      "loss": 2.716,
+      "step": 17060
+    },
+    {
+      "epoch": 0.9904695591627137,
+      "grad_norm": 0.11077286303043365,
+      "learning_rate": 1.0023695878285111e-07,
+      "loss": 2.6973,
+      "step": 17070
+    },
+    {
+      "epoch": 0.9910497990919245,
+      "grad_norm": 0.10955335944890976,
+      "learning_rate": 8.845607277021551e-08,
+      "loss": 2.7106,
+      "step": 17080
+    },
+    {
+      "epoch": 0.9916300390211352,
+      "grad_norm": 0.11161547154188156,
+      "learning_rate": 7.741128200453584e-08,
+      "loss": 2.7117,
+      "step": 17090
+    },
+    {
+      "epoch": 0.992210278950346,
+      "grad_norm": 0.11127256602048874,
+      "learning_rate": 6.710262715383664e-08,
+      "loss": 2.721,
+      "step": 17100
+    },
+    {
+      "epoch": 0.9927905188795567,
+      "grad_norm": 0.10978476703166962,
+      "learning_rate": 5.7530146175688305e-08,
+      "loss": 2.7057,
+      "step": 17110
+    },
+    {
+      "epoch": 0.9933707588087675,
+      "grad_norm": 0.10954172164201736,
+      "learning_rate": 4.869387431696293e-08,
+      "loss": 2.7204,
+      "step": 17120
+    },
+    {
+      "epoch": 0.9939509987379782,
+      "grad_norm": 0.10985864698886871,
+      "learning_rate": 4.059384411372325e-08,
+      "loss": 2.7082,
+      "step": 17130
+    },
+    {
+      "epoch": 0.9945312386671888,
+      "grad_norm": 0.11031010746955872,
+      "learning_rate": 3.323008539115602e-08,
+      "loss": 2.7154,
+      "step": 17140
+    },
+    {
+      "epoch": 0.9951114785963996,
+      "grad_norm": 0.10947979241609573,
+      "learning_rate": 2.660262526339441e-08,
+      "loss": 2.7108,
+      "step": 17150
+    },
+    {
+      "epoch": 0.9956917185256103,
+      "grad_norm": 0.11240995675325394,
+      "learning_rate": 2.0711488133406954e-08,
+      "loss": 2.7082,
+      "step": 17160
+    },
+    {
+      "epoch": 0.9962719584548211,
+      "grad_norm": 0.1106034591794014,
+      "learning_rate": 1.5556695693019763e-08,
+      "loss": 2.7135,
+      "step": 17170
+    },
+    {
+      "epoch": 0.9968521983840318,
+      "grad_norm": 0.11040083318948746,
+      "learning_rate": 1.113826692267228e-08,
+      "loss": 2.706,
+      "step": 17180
+    },
+    {
+      "epoch": 0.9974324383132426,
+      "grad_norm": 0.1106574684381485,
+      "learning_rate": 7.4562180915283e-09,
+      "loss": 2.7083,
+      "step": 17190
+    },
+    {
+      "epoch": 0.9980126782424532,
+      "grad_norm": 0.10932475328445435,
+      "learning_rate": 4.510562757231718e-09,
+      "loss": 2.7048,
+      "step": 17200
+    },
+    {
+      "epoch": 0.998592918171664,
+      "grad_norm": 0.11014498770236969,
+      "learning_rate": 2.30131176603976e-09,
+      "loss": 2.7159,
+      "step": 17210
+    },
+    {
+      "epoch": 0.9991731581008747,
+      "grad_norm": 0.10927695780992508,
+      "learning_rate": 8.284732526231409e-10,
+      "loss": 2.709,
+      "step": 17220
+    },
+    {
+      "epoch": 0.9997533980300854,
+      "grad_norm": 0.11252807825803757,
+      "learning_rate": 9.205264011047376e-11,
+      "loss": 2.7063,
+      "step": 17230
+    },
+    {
+      "epoch": 0.9999854940017697,
+      "step": 17234,
+      "total_flos": 8.768782702139251e+19,
+      "train_loss": 3.0425821965374635,
+      "train_runtime": 37582.8145,
+      "train_samples_per_second": 234.783,
+      "train_steps_per_second": 0.459
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 17234,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.768782702139251e+19,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}