diff --git "a/contextlm_gpt2_large/trainer_state.json" "b/contextlm_gpt2_large/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/contextlm_gpt2_large/trainer_state.json"
@@ -0,0 +1,12240 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9999709884243814,
+  "eval_steps": 1000,
+  "global_step": 17234,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.000580231512373437,
+      "grad_norm": 4.4758100509643555,
+      "learning_rate": 6.264501160092807e-06,
+      "loss": 10.4749,
+      "step": 10
+    },
+    {
+      "epoch": 0.001160463024746874,
+      "grad_norm": 1.6773627996444702,
+      "learning_rate": 1.322505800464037e-05,
+      "loss": 9.159,
+      "step": 20
+    },
+    {
+      "epoch": 0.001740694537120311,
+      "grad_norm": 1.5999170541763306,
+      "learning_rate": 2.018561484918793e-05,
+      "loss": 8.8189,
+      "step": 30
+    },
+    {
+      "epoch": 0.002320926049493748,
+      "grad_norm": 1.9260104894638062,
+      "learning_rate": 2.7146171693735496e-05,
+      "loss": 8.4574,
+      "step": 40
+    },
+    {
+      "epoch": 0.002901157561867185,
+      "grad_norm": 2.173593282699585,
+      "learning_rate": 3.410672853828306e-05,
+      "loss": 8.0835,
+      "step": 50
+    },
+    {
+      "epoch": 0.003481389074240622,
+      "grad_norm": 1.5830281972885132,
+      "learning_rate": 4.1067285382830626e-05,
+      "loss": 7.7376,
+      "step": 60
+    },
+    {
+      "epoch": 0.004061620586614059,
+      "grad_norm": 2.772728443145752,
+      "learning_rate": 4.802784222737819e-05,
+      "loss": 7.4168,
+      "step": 70
+    },
+    {
+      "epoch": 0.004641852098987496,
+      "grad_norm": 1.511775016784668,
+      "learning_rate": 5.498839907192575e-05,
+      "loss": 7.1442,
+      "step": 80
+    },
+    {
+      "epoch": 0.005222083611360933,
+      "grad_norm": 1.9058183431625366,
+      "learning_rate": 6.194895591647331e-05,
+      "loss": 6.9324,
+      "step": 90
+    },
+    {
+      "epoch": 0.00580231512373437,
+      "grad_norm": 1.6976985931396484,
+      "learning_rate": 6.890951276102087e-05,
+      "loss": 6.8005,
+      "step": 100
+    },
+    {
+      "epoch": 0.006382546636107807,
+      "grad_norm": 1.4346176385879517,
+      "learning_rate": 7.587006960556844e-05,
+      "loss": 6.6814,
+      "step": 110
+    },
+    {
+      "epoch": 0.006962778148481244,
+      "grad_norm": 1.0364270210266113,
+      "learning_rate": 8.283062645011599e-05,
+      "loss": 6.5547,
+      "step": 120
+    },
+    {
+      "epoch": 0.007543009660854681,
+      "grad_norm": 0.6528536677360535,
+      "learning_rate": 8.979118329466357e-05,
+      "loss": 6.4482,
+      "step": 130
+    },
+    {
+      "epoch": 0.008123241173228117,
+      "grad_norm": 1.1468390226364136,
+      "learning_rate": 9.675174013921112e-05,
+      "loss": 6.3518,
+      "step": 140
+    },
+    {
+      "epoch": 0.008703472685601555,
+      "grad_norm": 0.6249582171440125,
+      "learning_rate": 0.0001037122969837587,
+      "loss": 6.2749,
+      "step": 150
+    },
+    {
+      "epoch": 0.009283704197974993,
+      "grad_norm": 0.9577043652534485,
+      "learning_rate": 0.00011067285382830626,
+      "loss": 6.2026,
+      "step": 160
+    },
+    {
+      "epoch": 0.009863935710348428,
+      "grad_norm": 1.156731367111206,
+      "learning_rate": 0.00011763341067285381,
+      "loss": 6.1482,
+      "step": 170
+    },
+    {
+      "epoch": 0.010444167222721866,
+      "grad_norm": 0.7919487357139587,
+      "learning_rate": 0.0001245939675174014,
+      "loss": 6.0907,
+      "step": 180
+    },
+    {
+      "epoch": 0.011024398735095304,
+      "grad_norm": 0.5902596712112427,
+      "learning_rate": 0.00013155452436194894,
+      "loss": 6.0469,
+      "step": 190
+    },
+    {
+      "epoch": 0.01160463024746874,
+      "grad_norm": 0.9712298512458801,
+      "learning_rate": 0.00013851508120649652,
+      "loss": 6.0128,
+      "step": 200
+    },
+    {
+      "epoch": 0.012184861759842177,
+      "grad_norm": 0.6487208008766174,
+      "learning_rate": 0.00014547563805104407,
+      "loss": 5.949,
+      "step": 210
+    },
+    {
+      "epoch": 0.012765093272215615,
+      "grad_norm": 0.6659431457519531,
+      "learning_rate": 0.00015243619489559162,
+      "loss": 5.9004,
+      "step": 220
+    },
+    {
+      "epoch": 0.01334532478458905,
+      "grad_norm": 0.9973188042640686,
+      "learning_rate": 0.0001593967517401392,
+      "loss": 5.8727,
+      "step": 230
+    },
+    {
+      "epoch": 0.013925556296962488,
+      "grad_norm": 0.592413067817688,
+      "learning_rate": 0.00016635730858468675,
+      "loss": 5.8594,
+      "step": 240
+    },
+    {
+      "epoch": 0.014505787809335926,
+      "grad_norm": 0.6143619418144226,
+      "learning_rate": 0.00017331786542923433,
+      "loss": 5.8114,
+      "step": 250
+    },
+    {
+      "epoch": 0.015086019321709361,
+      "grad_norm": 0.5780689120292664,
+      "learning_rate": 0.00018027842227378188,
+      "loss": 5.7829,
+      "step": 260
+    },
+    {
+      "epoch": 0.0156662508340828,
+      "grad_norm": 0.41307076811790466,
+      "learning_rate": 0.00018723897911832944,
+      "loss": 5.7197,
+      "step": 270
+    },
+    {
+      "epoch": 0.016246482346456235,
+      "grad_norm": 0.6880993247032166,
+      "learning_rate": 0.00019419953596287701,
+      "loss": 5.7168,
+      "step": 280
+    },
+    {
+      "epoch": 0.016826713858829674,
+      "grad_norm": 0.4273562431335449,
+      "learning_rate": 0.0002011600928074246,
+      "loss": 5.6639,
+      "step": 290
+    },
+    {
+      "epoch": 0.01740694537120311,
+      "grad_norm": 0.5025382041931152,
+      "learning_rate": 0.00020812064965197212,
+      "loss": 5.6305,
+      "step": 300
+    },
+    {
+      "epoch": 0.017987176883576546,
+      "grad_norm": 0.7127647995948792,
+      "learning_rate": 0.0002150812064965197,
+      "loss": 5.5991,
+      "step": 310
+    },
+    {
+      "epoch": 0.018567408395949985,
+      "grad_norm": 0.6494776010513306,
+      "learning_rate": 0.00022204176334106727,
+      "loss": 5.5961,
+      "step": 320
+    },
+    {
+      "epoch": 0.01914763990832342,
+      "grad_norm": 0.43809765577316284,
+      "learning_rate": 0.00022900232018561485,
+      "loss": 5.5242,
+      "step": 330
+    },
+    {
+      "epoch": 0.019727871420696857,
+      "grad_norm": 0.5514947175979614,
+      "learning_rate": 0.00023596287703016238,
+      "loss": 5.4885,
+      "step": 340
+    },
+    {
+      "epoch": 0.020308102933070296,
+      "grad_norm": 0.7086557745933533,
+      "learning_rate": 0.00024292343387470995,
+      "loss": 5.4558,
+      "step": 350
+    },
+    {
+      "epoch": 0.020888334445443732,
+      "grad_norm": 0.44333210587501526,
+      "learning_rate": 0.0002498839907192575,
+      "loss": 5.4249,
+      "step": 360
+    },
+    {
+      "epoch": 0.021468565957817168,
+      "grad_norm": 0.5971847772598267,
+      "learning_rate": 0.0002568445475638051,
+      "loss": 5.3896,
+      "step": 370
+    },
+    {
+      "epoch": 0.022048797470190607,
+      "grad_norm": 0.5358195900917053,
+      "learning_rate": 0.0002638051044083526,
+      "loss": 5.3647,
+      "step": 380
+    },
+    {
+      "epoch": 0.022629028982564043,
+      "grad_norm": 0.4231407046318054,
+      "learning_rate": 0.0002707656612529002,
+      "loss": 5.3325,
+      "step": 390
+    },
+    {
+      "epoch": 0.02320926049493748,
+      "grad_norm": 0.48789191246032715,
+      "learning_rate": 0.00027772621809744777,
+      "loss": 5.2922,
+      "step": 400
+    },
+    {
+      "epoch": 0.023789492007310918,
+      "grad_norm": 0.46154582500457764,
+      "learning_rate": 0.0002846867749419953,
+      "loss": 5.2881,
+      "step": 410
+    },
+    {
+      "epoch": 0.024369723519684354,
+      "grad_norm": 0.44972172379493713,
+      "learning_rate": 0.00029164733178654287,
+      "loss": 5.2397,
+      "step": 420
+    },
+    {
+      "epoch": 0.02494995503205779,
+      "grad_norm": 0.505415678024292,
+      "learning_rate": 0.0002986078886310905,
+      "loss": 5.1841,
+      "step": 430
+    },
+    {
+      "epoch": 0.02553018654443123,
+      "grad_norm": 0.42717623710632324,
+      "learning_rate": 0.0003055684454756381,
+      "loss": 5.1848,
+      "step": 440
+    },
+    {
+      "epoch": 0.026110418056804665,
+      "grad_norm": 0.4216056168079376,
+      "learning_rate": 0.0003125290023201856,
+      "loss": 5.1447,
+      "step": 450
+    },
+    {
+      "epoch": 0.0266906495691781,
+      "grad_norm": 0.5051509141921997,
+      "learning_rate": 0.00031948955916473313,
+      "loss": 5.1084,
+      "step": 460
+    },
+    {
+      "epoch": 0.02727088108155154,
+      "grad_norm": 0.5205376744270325,
+      "learning_rate": 0.0003264501160092807,
+      "loss": 5.0462,
+      "step": 470
+    },
+    {
+      "epoch": 0.027851112593924976,
+      "grad_norm": 0.5111084580421448,
+      "learning_rate": 0.0003334106728538283,
+      "loss": 5.0225,
+      "step": 480
+    },
+    {
+      "epoch": 0.028431344106298412,
+      "grad_norm": 0.4395337402820587,
+      "learning_rate": 0.00034037122969837584,
+      "loss": 4.991,
+      "step": 490
+    },
+    {
+      "epoch": 0.02901157561867185,
+      "grad_norm": 0.2879785895347595,
+      "learning_rate": 0.00034733178654292344,
+      "loss": 4.9628,
+      "step": 500
+    },
+    {
+      "epoch": 0.029591807131045287,
+      "grad_norm": 0.3356530964374542,
+      "learning_rate": 0.000354292343387471,
+      "loss": 4.9165,
+      "step": 510
+    },
+    {
+      "epoch": 0.030172038643418723,
+      "grad_norm": 0.39410287141799927,
+      "learning_rate": 0.00036125290023201855,
+      "loss": 4.8802,
+      "step": 520
+    },
+    {
+      "epoch": 0.030752270155792162,
+      "grad_norm": 0.4210626184940338,
+      "learning_rate": 0.00036821345707656604,
+      "loss": 4.8403,
+      "step": 530
+    },
+    {
+      "epoch": 0.0313325016681656,
+      "grad_norm": 0.4170067608356476,
+      "learning_rate": 0.00037517401392111365,
+      "loss": 4.8156,
+      "step": 540
+    },
+    {
+      "epoch": 0.031912733180539034,
+      "grad_norm": 0.40876781940460205,
+      "learning_rate": 0.0003821345707656612,
+      "loss": 4.7932,
+      "step": 550
+    },
+    {
+      "epoch": 0.03249296469291247,
+      "grad_norm": 0.3717671036720276,
+      "learning_rate": 0.0003890951276102088,
+      "loss": 4.7812,
+      "step": 560
+    },
+    {
+      "epoch": 0.03307319620528591,
+      "grad_norm": 0.37275081872940063,
+      "learning_rate": 0.00039605568445475636,
+      "loss": 4.7324,
+      "step": 570
+    },
+    {
+      "epoch": 0.03365342771765935,
+      "grad_norm": 0.32523536682128906,
+      "learning_rate": 0.0004030162412993039,
+      "loss": 4.6891,
+      "step": 580
+    },
+    {
+      "epoch": 0.034233659230032784,
+      "grad_norm": 0.2909957468509674,
+      "learning_rate": 0.0004099767981438515,
+      "loss": 4.6555,
+      "step": 590
+    },
+    {
+      "epoch": 0.03481389074240622,
+      "grad_norm": 0.40268951654434204,
+      "learning_rate": 0.00041693735498839906,
+      "loss": 4.622,
+      "step": 600
+    },
+    {
+      "epoch": 0.035394122254779656,
+      "grad_norm": 0.433383584022522,
+      "learning_rate": 0.00042389791183294656,
+      "loss": 4.6122,
+      "step": 610
+    },
+    {
+      "epoch": 0.03597435376715309,
+      "grad_norm": 0.3096088171005249,
+      "learning_rate": 0.0004308584686774941,
+      "loss": 4.5976,
+      "step": 620
+    },
+    {
+      "epoch": 0.036554585279526534,
+      "grad_norm": 0.30540433526039124,
+      "learning_rate": 0.0004378190255220417,
+      "loss": 4.5569,
+      "step": 630
+    },
+    {
+      "epoch": 0.03713481679189997,
+      "grad_norm": 0.3136671781539917,
+      "learning_rate": 0.00044477958236658927,
+      "loss": 4.5228,
+      "step": 640
+    },
+    {
+      "epoch": 0.037715048304273406,
+      "grad_norm": 0.332621693611145,
+      "learning_rate": 0.0004517401392111369,
+      "loss": 4.4901,
+      "step": 650
+    },
+    {
+      "epoch": 0.03829527981664684,
+      "grad_norm": 0.3817736804485321,
+      "learning_rate": 0.0004587006960556844,
+      "loss": 4.475,
+      "step": 660
+    },
+    {
+      "epoch": 0.03887551132902028,
+      "grad_norm": 0.458741158246994,
+      "learning_rate": 0.000465661252900232,
+      "loss": 4.4545,
+      "step": 670
+    },
+    {
+      "epoch": 0.039455742841393714,
+      "grad_norm": 0.27561265230178833,
+      "learning_rate": 0.0004726218097447796,
+      "loss": 4.4406,
+      "step": 680
+    },
+    {
+      "epoch": 0.040035974353767156,
+      "grad_norm": 0.380633145570755,
+      "learning_rate": 0.0004795823665893271,
+      "loss": 4.4027,
+      "step": 690
+    },
+    {
+      "epoch": 0.04061620586614059,
+      "grad_norm": 0.3662358820438385,
+      "learning_rate": 0.00048654292343387463,
+      "loss": 4.377,
+      "step": 700
+    },
+    {
+      "epoch": 0.04119643737851403,
+      "grad_norm": 0.31104594469070435,
+      "learning_rate": 0.0004935034802784222,
+      "loss": 4.3399,
+      "step": 710
+    },
+    {
+      "epoch": 0.041776668890887464,
+      "grad_norm": 0.43897074460983276,
+      "learning_rate": 0.0005004640371229698,
+      "loss": 4.3229,
+      "step": 720
+    },
+    {
+      "epoch": 0.0423569004032609,
+      "grad_norm": 0.2685506343841553,
+      "learning_rate": 0.0005074245939675173,
+      "loss": 4.302,
+      "step": 730
+    },
+    {
+      "epoch": 0.042937131915634336,
+      "grad_norm": 0.2662206292152405,
+      "learning_rate": 0.0005143851508120649,
+      "loss": 4.2533,
+      "step": 740
+    },
+    {
+      "epoch": 0.04351736342800778,
+      "grad_norm": 0.31665244698524475,
+      "learning_rate": 0.0005213457076566126,
+      "loss": 4.2463,
+      "step": 750
+    },
+    {
+      "epoch": 0.044097594940381214,
+      "grad_norm": 0.3573771119117737,
+      "learning_rate": 0.0005283062645011601,
+      "loss": 4.2177,
+      "step": 760
+    },
+    {
+      "epoch": 0.04467782645275465,
+      "grad_norm": 0.3051789402961731,
+      "learning_rate": 0.0005352668213457077,
+      "loss": 4.2098,
+      "step": 770
+    },
+    {
+      "epoch": 0.045258057965128086,
+      "grad_norm": 0.26946839690208435,
+      "learning_rate": 0.0005422273781902551,
+      "loss": 4.1739,
+      "step": 780
+    },
+    {
+      "epoch": 0.04583828947750152,
+      "grad_norm": 0.21327945590019226,
+      "learning_rate": 0.0005491879350348028,
+      "loss": 4.151,
+      "step": 790
+    },
+    {
+      "epoch": 0.04641852098987496,
+      "grad_norm": 0.28413307666778564,
+      "learning_rate": 0.0005561484918793503,
+      "loss": 4.1455,
+      "step": 800
+    },
+    {
+      "epoch": 0.0469987525022484,
+      "grad_norm": 0.2847752869129181,
+      "learning_rate": 0.0005631090487238979,
+      "loss": 4.1166,
+      "step": 810
+    },
+    {
+      "epoch": 0.047578984014621836,
+      "grad_norm": 0.25382527709007263,
+      "learning_rate": 0.0005700696055684454,
+      "loss": 4.0986,
+      "step": 820
+    },
+    {
+      "epoch": 0.04815921552699527,
+      "grad_norm": 0.2375078797340393,
+      "learning_rate": 0.000577030162412993,
+      "loss": 4.0765,
+      "step": 830
+    },
+    {
+      "epoch": 0.04873944703936871,
+      "grad_norm": 0.3032638430595398,
+      "learning_rate": 0.0005839907192575406,
+      "loss": 4.085,
+      "step": 840
+    },
+    {
+      "epoch": 0.049319678551742144,
+      "grad_norm": 0.2454582005739212,
+      "learning_rate": 0.0005909512761020882,
+      "loss": 4.0505,
+      "step": 850
+    },
+    {
+      "epoch": 0.04989991006411558,
+      "grad_norm": 0.23829826712608337,
+      "learning_rate": 0.0005979118329466356,
+      "loss": 4.0391,
+      "step": 860
+    },
+    {
+      "epoch": 0.05048014157648902,
+      "grad_norm": 0.29694074392318726,
+      "learning_rate": 0.0005999997293652579,
+      "loss": 4.0195,
+      "step": 870
+    },
+    {
+      "epoch": 0.05106037308886246,
+      "grad_norm": 0.20268426835536957,
+      "learning_rate": 0.0005999984038085133,
+      "loss": 4.0023,
+      "step": 880
+    },
+    {
+      "epoch": 0.051640604601235894,
+      "grad_norm": 0.2563273310661316,
+      "learning_rate": 0.000599995973626219,
+      "loss": 3.98,
+      "step": 890
+    },
+    {
+      "epoch": 0.05222083611360933,
+      "grad_norm": 0.26515451073646545,
+      "learning_rate": 0.0005999924388273229,
+      "loss": 3.9799,
+      "step": 900
+    },
+    {
+      "epoch": 0.052801067625982766,
+      "grad_norm": 0.23011842370033264,
+      "learning_rate": 0.0005999877994248407,
+      "loss": 3.9592,
+      "step": 910
+    },
+    {
+      "epoch": 0.0533812991383562,
+      "grad_norm": 0.21570523083209991,
+      "learning_rate": 0.0005999820554358552,
+      "loss": 3.9366,
+      "step": 920
+    },
+    {
+      "epoch": 0.053961530650729644,
+      "grad_norm": 0.24623119831085205,
+      "learning_rate": 0.0005999752068815162,
+      "loss": 3.923,
+      "step": 930
+    },
+    {
+      "epoch": 0.05454176216310308,
+      "grad_norm": 0.26557642221450806,
+      "learning_rate": 0.0005999672537870409,
+      "loss": 3.9114,
+      "step": 940
+    },
+    {
+      "epoch": 0.055121993675476516,
+      "grad_norm": 0.23711174726486206,
+      "learning_rate": 0.0005999581961817135,
+      "loss": 3.9021,
+      "step": 950
+    },
+    {
+      "epoch": 0.05570222518784995,
+      "grad_norm": 0.2636472284793854,
+      "learning_rate": 0.000599948034098885,
+      "loss": 3.8945,
+      "step": 960
+    },
+    {
+      "epoch": 0.05628245670022339,
+      "grad_norm": 0.2139461785554886,
+      "learning_rate": 0.000599936767575973,
+      "loss": 3.8742,
+      "step": 970
+    },
+    {
+      "epoch": 0.056862688212596824,
+      "grad_norm": 0.2411975860595703,
+      "learning_rate": 0.0005999243966544624,
+      "loss": 3.8627,
+      "step": 980
+    },
+    {
+      "epoch": 0.057442919724970266,
+      "grad_norm": 0.22522902488708496,
+      "learning_rate": 0.000599910921379904,
+      "loss": 3.8439,
+      "step": 990
+    },
+    {
+      "epoch": 0.0580231512373437,
+      "grad_norm": 0.2505146861076355,
+      "learning_rate": 0.0005998963418019153,
+      "loss": 3.8376,
+      "step": 1000
+    },
+    {
+      "epoch": 0.0580231512373437,
+      "eval_loss": 3.7977514266967773,
+      "eval_runtime": 3.2666,
+      "eval_samples_per_second": 1325.524,
+      "eval_steps_per_second": 2.755,
+      "step": 1000
+    },
+    {
+      "epoch": 0.05860338274971714,
+      "grad_norm": 0.21931585669517517,
+      "learning_rate": 0.0005998806579741798,
+      "loss": 3.8196,
+      "step": 1010
+    },
+    {
+      "epoch": 0.059183614262090574,
+      "grad_norm": 0.19973556697368622,
+      "learning_rate": 0.0005998638699544469,
+      "loss": 3.813,
+      "step": 1020
+    },
+    {
+      "epoch": 0.05976384577446401,
+      "grad_norm": 0.21615122258663177,
+      "learning_rate": 0.0005998459778045319,
+      "loss": 3.7993,
+      "step": 1030
+    },
+    {
+      "epoch": 0.060344077286837446,
+      "grad_norm": 0.18904747068881989,
+      "learning_rate": 0.0005998269815903156,
+      "loss": 3.8122,
+      "step": 1040
+    },
+    {
+      "epoch": 0.06092430879921089,
+      "grad_norm": 0.20379868149757385,
+      "learning_rate": 0.000599806881381744,
+      "loss": 3.7891,
+      "step": 1050
+    },
+    {
+      "epoch": 0.061504540311584324,
+      "grad_norm": 0.21616701781749725,
+      "learning_rate": 0.0005997856772528283,
+      "loss": 3.7768,
+      "step": 1060
+    },
+    {
+      "epoch": 0.06208477182395776,
+      "grad_norm": 0.1838783323764801,
+      "learning_rate": 0.0005997633692816442,
+      "loss": 3.7744,
+      "step": 1070
+    },
+    {
+      "epoch": 0.0626650033363312,
+      "grad_norm": 0.17894767224788666,
+      "learning_rate": 0.0005997399575503321,
+      "loss": 3.7667,
+      "step": 1080
+    },
+    {
+      "epoch": 0.06324523484870463,
+      "grad_norm": 0.20992882549762726,
+      "learning_rate": 0.0005997154421450963,
+      "loss": 3.7449,
+      "step": 1090
+    },
+    {
+      "epoch": 0.06382546636107807,
+      "grad_norm": 0.19586902856826782,
+      "learning_rate": 0.0005996898231562051,
+      "loss": 3.7423,
+      "step": 1100
+    },
+    {
+      "epoch": 0.0644056978734515,
+      "grad_norm": 0.24105612933635712,
+      "learning_rate": 0.0005996631006779903,
+      "loss": 3.7223,
+      "step": 1110
+    },
+    {
+      "epoch": 0.06498592938582494,
+      "grad_norm": 0.19526907801628113,
+      "learning_rate": 0.0005996352748088471,
+      "loss": 3.7189,
+      "step": 1120
+    },
+    {
+      "epoch": 0.06556616089819838,
+      "grad_norm": 0.16144131124019623,
+      "learning_rate": 0.000599606345651233,
+      "loss": 3.7118,
+      "step": 1130
+    },
+    {
+      "epoch": 0.06614639241057182,
+      "grad_norm": 0.167442187666893,
+      "learning_rate": 0.0005995763133116683,
+      "loss": 3.6986,
+      "step": 1140
+    },
+    {
+      "epoch": 0.06672662392294526,
+      "grad_norm": 0.23503893613815308,
+      "learning_rate": 0.0005995451779007352,
+      "loss": 3.7049,
+      "step": 1150
+    },
+    {
+      "epoch": 0.0673068554353187,
+      "grad_norm": 0.2096278965473175,
+      "learning_rate": 0.0005995129395330776,
+      "loss": 3.6865,
+      "step": 1160
+    },
+    {
+      "epoch": 0.06788708694769213,
+      "grad_norm": 0.19825097918510437,
+      "learning_rate": 0.0005994795983274004,
+      "loss": 3.6712,
+      "step": 1170
+    },
+    {
+      "epoch": 0.06846731846006557,
+      "grad_norm": 0.15405306220054626,
+      "learning_rate": 0.0005994451544064696,
+      "loss": 3.6711,
+      "step": 1180
+    },
+    {
+      "epoch": 0.069047549972439,
+      "grad_norm": 0.563884437084198,
+      "learning_rate": 0.0005994096078971111,
+      "loss": 3.677,
+      "step": 1190
+    },
+    {
+      "epoch": 0.06962778148481244,
+      "grad_norm": 0.1655234694480896,
+      "learning_rate": 0.0005993729589302111,
+      "loss": 3.7143,
+      "step": 1200
+    },
+    {
+      "epoch": 0.07020801299718588,
+      "grad_norm": 0.15598031878471375,
+      "learning_rate": 0.0005993352076407148,
+      "loss": 3.6689,
+      "step": 1210
+    },
+    {
+      "epoch": 0.07078824450955931,
+      "grad_norm": 0.14992448687553406,
+      "learning_rate": 0.0005992963541676265,
+      "loss": 3.6581,
+      "step": 1220
+    },
+    {
+      "epoch": 0.07136847602193275,
+      "grad_norm": 0.1618255376815796,
+      "learning_rate": 0.0005992563986540086,
+      "loss": 3.642,
+      "step": 1230
+    },
+    {
+      "epoch": 0.07194870753430618,
+      "grad_norm": 0.16188852488994598,
+      "learning_rate": 0.0005992153412469816,
+      "loss": 3.6399,
+      "step": 1240
+    },
+    {
+      "epoch": 0.07252893904667962,
+      "grad_norm": 0.17180649936199188,
+      "learning_rate": 0.0005991731820977231,
+      "loss": 3.6252,
+      "step": 1250
+    },
+    {
+      "epoch": 0.07310917055905307,
+      "grad_norm": 0.1691058874130249,
+      "learning_rate": 0.0005991299213614678,
+      "loss": 3.6244,
+      "step": 1260
+    },
+    {
+      "epoch": 0.0736894020714265,
+      "grad_norm": 0.19470703601837158,
+      "learning_rate": 0.0005990855591975059,
+      "loss": 3.6199,
+      "step": 1270
+    },
+    {
+      "epoch": 0.07426963358379994,
+      "grad_norm": 0.15482653677463531,
+      "learning_rate": 0.0005990400957691835,
+      "loss": 3.6176,
+      "step": 1280
+    },
+    {
+      "epoch": 0.07484986509617338,
+      "grad_norm": 0.18342998623847961,
+      "learning_rate": 0.000598993531243902,
+      "loss": 3.6082,
+      "step": 1290
+    },
+    {
+      "epoch": 0.07543009660854681,
+      "grad_norm": 0.17348110675811768,
+      "learning_rate": 0.0005989458657931167,
+      "loss": 3.6063,
+      "step": 1300
+    },
+    {
+      "epoch": 0.07601032812092025,
+      "grad_norm": 0.1687677949666977,
+      "learning_rate": 0.0005988970995923368,
+      "loss": 3.6015,
+      "step": 1310
+    },
+    {
+      "epoch": 0.07659055963329368,
+      "grad_norm": 0.19341568648815155,
+      "learning_rate": 0.0005988472328211246,
+      "loss": 3.5912,
+      "step": 1320
+    },
+    {
+      "epoch": 0.07717079114566712,
+      "grad_norm": 0.15345478057861328,
+      "learning_rate": 0.0005987962656630947,
+      "loss": 3.586,
+      "step": 1330
+    },
+    {
+      "epoch": 0.07775102265804056,
+      "grad_norm": 0.16126085817813873,
+      "learning_rate": 0.0005987441983059136,
+      "loss": 3.5797,
+      "step": 1340
+    },
+    {
+      "epoch": 0.07833125417041399,
+      "grad_norm": 0.1716892272233963,
+      "learning_rate": 0.0005986910309412986,
+      "loss": 3.5751,
+      "step": 1350
+    },
+    {
+      "epoch": 0.07891148568278743,
+      "grad_norm": 0.15669932961463928,
+      "learning_rate": 0.0005986367637650177,
+      "loss": 3.5799,
+      "step": 1360
+    },
+    {
+      "epoch": 0.07949171719516086,
+      "grad_norm": 0.19878168404102325,
+      "learning_rate": 0.0005985813969768884,
+      "loss": 3.572,
+      "step": 1370
+    },
+    {
+      "epoch": 0.08007194870753431,
+      "grad_norm": 0.1505119651556015,
+      "learning_rate": 0.0005985249307807767,
+      "loss": 3.567,
+      "step": 1380
+    },
+    {
+      "epoch": 0.08065218021990775,
+      "grad_norm": 0.1548507809638977,
+      "learning_rate": 0.0005984673653845972,
+      "loss": 3.5427,
+      "step": 1390
+    },
+    {
+      "epoch": 0.08123241173228118,
+      "grad_norm": 0.15786635875701904,
+      "learning_rate": 0.0005984087010003119,
+      "loss": 3.5637,
+      "step": 1400
+    },
+    {
+      "epoch": 0.08181264324465462,
+      "grad_norm": 0.15546779334545135,
+      "learning_rate": 0.0005983489378439289,
+      "loss": 3.5475,
+      "step": 1410
+    },
+    {
+      "epoch": 0.08239287475702806,
+      "grad_norm": 0.17267097532749176,
+      "learning_rate": 0.0005982880761355026,
+      "loss": 3.5519,
+      "step": 1420
+    },
+    {
+      "epoch": 0.08297310626940149,
+      "grad_norm": 0.2120850831270218,
+      "learning_rate": 0.0005982261160991321,
+      "loss": 3.545,
+      "step": 1430
+    },
+    {
+      "epoch": 0.08355333778177493,
+      "grad_norm": 0.1541440784931183,
+      "learning_rate": 0.0005981630579629609,
+      "loss": 3.5236,
+      "step": 1440
+    },
+    {
+      "epoch": 0.08413356929414836,
+      "grad_norm": 0.1610753834247589,
+      "learning_rate": 0.0005980989019591753,
+      "loss": 3.5153,
+      "step": 1450
+    },
+    {
+      "epoch": 0.0847138008065218,
+      "grad_norm": 0.1872093677520752,
+      "learning_rate": 0.0005980336483240048,
+      "loss": 3.5208,
+      "step": 1460
+    },
+    {
+      "epoch": 0.08529403231889524,
+      "grad_norm": 0.15793032944202423,
+      "learning_rate": 0.0005979672972977201,
+      "loss": 3.5294,
+      "step": 1470
+    },
+    {
+      "epoch": 0.08587426383126867,
+      "grad_norm": 0.1738296002149582,
+      "learning_rate": 0.0005978998491246324,
+      "loss": 3.5234,
+      "step": 1480
+    },
+    {
+      "epoch": 0.08645449534364211,
+      "grad_norm": 0.1644987314939499,
+      "learning_rate": 0.0005978313040530931,
+      "loss": 3.515,
+      "step": 1490
+    },
+    {
+      "epoch": 0.08703472685601556,
+      "grad_norm": 0.16707918047904968,
+      "learning_rate": 0.0005977616623354923,
+      "loss": 3.5014,
+      "step": 1500
+    },
+    {
+      "epoch": 0.08761495836838899,
+      "grad_norm": 0.14812146127223969,
+      "learning_rate": 0.0005976909242282581,
+      "loss": 3.4923,
+      "step": 1510
+    },
+    {
+      "epoch": 0.08819518988076243,
+      "grad_norm": 0.15653282403945923,
+      "learning_rate": 0.0005976190899918555,
+      "loss": 3.4899,
+      "step": 1520
+    },
+    {
+      "epoch": 0.08877542139313586,
+      "grad_norm": 0.1531265377998352,
+      "learning_rate": 0.0005975461598907858,
+      "loss": 3.4939,
+      "step": 1530
+    },
+    {
+      "epoch": 0.0893556529055093,
+      "grad_norm": 0.19499650597572327,
+      "learning_rate": 0.0005974721341935854,
+      "loss": 3.4776,
+      "step": 1540
+    },
+    {
+      "epoch": 0.08993588441788274,
+      "grad_norm": 0.16522051393985748,
+      "learning_rate": 0.0005973970131728245,
+      "loss": 3.4843,
+      "step": 1550
+    },
+    {
+      "epoch": 0.09051611593025617,
+      "grad_norm": 0.14911240339279175,
+      "learning_rate": 0.0005973207971051066,
+      "loss": 3.4854,
+      "step": 1560
+    },
+    {
+      "epoch": 0.09109634744262961,
+      "grad_norm": 0.1797751784324646,
+      "learning_rate": 0.0005972434862710673,
+      "loss": 3.4814,
+      "step": 1570
+    },
+    {
+      "epoch": 0.09167657895500304,
+      "grad_norm": 0.14958298206329346,
+      "learning_rate": 0.0005971650809553729,
+      "loss": 3.4791,
+      "step": 1580
+    },
+    {
+      "epoch": 0.09225681046737648,
+      "grad_norm": 0.17834265530109406,
+      "learning_rate": 0.0005970855814467205,
+      "loss": 3.4633,
+      "step": 1590
+    },
+    {
+      "epoch": 0.09283704197974992,
+      "grad_norm": 0.15738125145435333,
+      "learning_rate": 0.0005970049880378353,
+      "loss": 3.4676,
+      "step": 1600
+    },
+    {
+      "epoch": 0.09341727349212335,
+      "grad_norm": 0.14483994245529175,
+      "learning_rate": 0.0005969233010254707,
+      "loss": 3.4661,
+      "step": 1610
+    },
+    {
+      "epoch": 0.0939975050044968,
+      "grad_norm": 0.14126789569854736,
+      "learning_rate": 0.0005968405207104068,
+      "loss": 3.4571,
+      "step": 1620
+    },
+    {
+      "epoch": 0.09457773651687024,
+      "grad_norm": 0.1578633040189743,
+      "learning_rate": 0.0005967566473974495,
+      "loss": 3.4558,
+      "step": 1630
+    },
+    {
+      "epoch": 0.09515796802924367,
+      "grad_norm": 0.1565486639738083,
+      "learning_rate": 0.000596671681395429,
+      "loss": 3.4604,
+      "step": 1640
+    },
+    {
+      "epoch": 0.09573819954161711,
+      "grad_norm": 0.13866451382637024,
+      "learning_rate": 0.0005965856230171993,
+      "loss": 3.4552,
+      "step": 1650
+    },
+    {
+      "epoch": 0.09631843105399054,
+      "grad_norm": 0.2121124267578125,
+      "learning_rate": 0.0005964984725796359,
+      "loss": 3.4541,
+      "step": 1660
+    },
+    {
+      "epoch": 0.09689866256636398,
+      "grad_norm": 0.17082008719444275,
+      "learning_rate": 0.0005964102304036363,
+      "loss": 3.4382,
+      "step": 1670
+    },
+    {
+      "epoch": 0.09747889407873742,
+      "grad_norm": 0.20681622624397278,
+      "learning_rate": 0.0005963208968141172,
+      "loss": 3.4372,
+      "step": 1680
+    },
+    {
+      "epoch": 0.09805912559111085,
+      "grad_norm": 0.1384105086326599,
+      "learning_rate": 0.0005962304721400142,
+      "loss": 3.4484,
+      "step": 1690
+    },
+    {
+      "epoch": 0.09863935710348429,
+      "grad_norm": 0.16820856928825378,
+      "learning_rate": 0.0005961389567142806,
+      "loss": 3.4302,
+      "step": 1700
+    },
+    {
+      "epoch": 0.09921958861585772,
+      "grad_norm": 0.16617996990680695,
+      "learning_rate": 0.0005960463508738855,
+      "loss": 3.4328,
+      "step": 1710
+    },
+    {
+      "epoch": 0.09979982012823116,
+      "grad_norm": 0.16344214975833893,
+      "learning_rate": 0.0005959526549598137,
+      "loss": 3.4326,
+      "step": 1720
+    },
+    {
+      "epoch": 0.1003800516406046,
+      "grad_norm": 0.16235540807247162,
+      "learning_rate": 0.000595857869317063,
+      "loss": 3.4271,
+      "step": 1730
+    },
+    {
+      "epoch": 0.10096028315297804,
+      "grad_norm": 0.1524738371372223,
+      "learning_rate": 0.0005957619942946442,
+      "loss": 3.424,
+      "step": 1740
+    },
+    {
+      "epoch": 0.10154051466535148,
+      "grad_norm": 0.18023791909217834,
+      "learning_rate": 0.0005956650302455793,
+      "loss": 3.4266,
+      "step": 1750
+    },
+    {
+      "epoch": 0.10212074617772492,
+      "grad_norm": 0.17738115787506104,
+      "learning_rate": 0.0005955669775268999,
+      "loss": 3.4046,
+      "step": 1760
+    },
+    {
+      "epoch": 0.10270097769009835,
+      "grad_norm": 0.13939271867275238,
+      "learning_rate": 0.0005954678364996466,
+      "loss": 3.4177,
+      "step": 1770
+    },
+    {
+      "epoch": 0.10328120920247179,
+      "grad_norm": 0.18028447031974792,
+      "learning_rate": 0.0005953676075288668,
+      "loss": 3.4113,
+      "step": 1780
+    },
+    {
+      "epoch": 0.10386144071484522,
+      "grad_norm": 0.15911422669887543,
+      "learning_rate": 0.0005952662909836142,
+      "loss": 3.4191,
+      "step": 1790
+    },
+    {
+      "epoch": 0.10444167222721866,
+      "grad_norm": 0.15596607327461243,
+      "learning_rate": 0.0005951638872369469,
+      "loss": 3.3993,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1050219037395921,
+      "grad_norm": 0.15493981540203094,
+      "learning_rate": 0.0005950603966659264,
+      "loss": 3.4043,
+      "step": 1810
+    },
+    {
+      "epoch": 0.10560213525196553,
+      "grad_norm": 0.1727568507194519,
+      "learning_rate": 0.0005949558196516154,
+      "loss": 3.4028,
+      "step": 1820
+    },
+    {
+      "epoch": 0.10618236676433897,
+      "grad_norm": 0.1614874303340912,
+      "learning_rate": 0.0005948501565790779,
+      "loss": 3.3998,
+      "step": 1830
+    },
+    {
+      "epoch": 0.1067625982767124,
+      "grad_norm": 0.13620299100875854,
+      "learning_rate": 0.000594743407837376,
+      "loss": 3.3896,
+      "step": 1840
+    },
+    {
+      "epoch": 0.10734282978908584,
+      "grad_norm": 0.15391112864017487,
+      "learning_rate": 0.0005946355738195701,
+      "loss": 3.3823,
+      "step": 1850
+    },
+    {
+      "epoch": 0.10792306130145929,
+      "grad_norm": 0.15937426686286926,
+      "learning_rate": 0.0005945266549227162,
+      "loss": 3.3893,
+      "step": 1860
+    },
+    {
+      "epoch": 0.10850329281383272,
+      "grad_norm": 0.16253319382667542,
+      "learning_rate": 0.0005944166515478649,
+      "loss": 3.3905,
+      "step": 1870
+    },
+    {
+      "epoch": 0.10908352432620616,
+      "grad_norm": 0.14502382278442383,
+      "learning_rate": 0.0005943055641000604,
+      "loss": 3.3836,
+      "step": 1880
+    },
+    {
+      "epoch": 0.1096637558385796,
+      "grad_norm": 0.14128324389457703,
+      "learning_rate": 0.0005941933929883384,
+      "loss": 3.3854,
+      "step": 1890
+    },
+    {
+      "epoch": 0.11024398735095303,
+      "grad_norm": 0.19345618784427643,
+      "learning_rate": 0.0005940801386257244,
+      "loss": 3.3746,
+      "step": 1900
+    },
+    {
+      "epoch": 0.11082421886332647,
+      "grad_norm": 0.1499020904302597,
+      "learning_rate": 0.000593965801429233,
+      "loss": 3.3729,
+      "step": 1910
+    },
+    {
+      "epoch": 0.1114044503756999,
+      "grad_norm": 0.14975206553936005,
+      "learning_rate": 0.0005938503818198656,
+      "loss": 3.3676,
+      "step": 1920
+    },
+    {
+      "epoch": 0.11198468188807334,
+      "grad_norm": 0.13726426661014557,
+      "learning_rate": 0.0005937338802226094,
+      "loss": 3.373,
+      "step": 1930
+    },
+    {
+      "epoch": 0.11256491340044678,
+      "grad_norm": 0.1749139279127121,
+      "learning_rate": 0.0005936162970664355,
+      "loss": 3.3761,
+      "step": 1940
+    },
+    {
+      "epoch": 0.11314514491282021,
+      "grad_norm": 0.14197006821632385,
+      "learning_rate": 0.0005934976327842974,
+      "loss": 3.3513,
+      "step": 1950
+    },
+    {
+      "epoch": 0.11372537642519365,
+      "grad_norm": 0.15288510918617249,
+      "learning_rate": 0.0005933778878131294,
+      "loss": 3.357,
+      "step": 1960
+    },
+    {
+      "epoch": 0.11430560793756708,
+      "grad_norm": 0.1787514090538025,
+      "learning_rate": 0.000593257062593845,
+      "loss": 3.3642,
+      "step": 1970
+    },
+    {
+      "epoch": 0.11488583944994053,
+      "grad_norm": 0.13630741834640503,
+      "learning_rate": 0.0005931351575713353,
+      "loss": 3.3614,
+      "step": 1980
+    },
+    {
+      "epoch": 0.11546607096231397,
+      "grad_norm": 0.16102264821529388,
+      "learning_rate": 0.0005930121731944674,
+      "loss": 3.3523,
+      "step": 1990
+    },
+    {
+      "epoch": 0.1160463024746874,
+      "grad_norm": 0.16226573288440704,
+      "learning_rate": 0.0005928881099160826,
+      "loss": 3.3595,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1160463024746874,
+      "eval_loss": 3.3178560733795166,
+      "eval_runtime": 3.2576,
+      "eval_samples_per_second": 1329.214,
+      "eval_steps_per_second": 2.763,
+      "step": 2000
+    },
+    {
+      "epoch": 0.11662653398706084,
+      "grad_norm": 0.14609858393669128,
+      "learning_rate": 0.0005927629681929951,
+      "loss": 3.3585,
+      "step": 2010
+    },
+    {
+      "epoch": 0.11720676549943428,
+      "grad_norm": 0.14387281239032745,
+      "learning_rate": 0.0005926367484859896,
+      "loss": 3.3517,
+      "step": 2020
+    },
+    {
+      "epoch": 0.11778699701180771,
+      "grad_norm": 0.14605766534805298,
+      "learning_rate": 0.0005925094512598202,
+      "loss": 3.3524,
+      "step": 2030
+    },
+    {
+      "epoch": 0.11836722852418115,
+      "grad_norm": 0.22022885084152222,
+      "learning_rate": 0.000592381076983209,
+      "loss": 3.3356,
+      "step": 2040
+    },
+    {
+      "epoch": 0.11894746003655458,
+      "grad_norm": 0.1847839504480362,
+      "learning_rate": 0.0005922516261288431,
+      "loss": 3.3441,
+      "step": 2050
+    },
+    {
+      "epoch": 0.11952769154892802,
+      "grad_norm": 0.13915176689624786,
+      "learning_rate": 0.0005921210991733745,
+      "loss": 3.352,
+      "step": 2060
+    },
+    {
+      "epoch": 0.12010792306130146,
+      "grad_norm": 0.1398390680551529,
+      "learning_rate": 0.0005919894965974168,
+      "loss": 3.3455,
+      "step": 2070
+    },
+    {
+      "epoch": 0.12068815457367489,
+      "grad_norm": 0.1368722915649414,
+      "learning_rate": 0.0005918568188855447,
+      "loss": 3.3403,
+      "step": 2080
+    },
+    {
+      "epoch": 0.12126838608604833,
+      "grad_norm": 0.16239017248153687,
+      "learning_rate": 0.0005917230665262914,
+      "loss": 3.3334,
+      "step": 2090
+    },
+    {
+      "epoch": 0.12184861759842178,
+      "grad_norm": 0.14380386471748352,
+      "learning_rate": 0.000591588240012147,
+      "loss": 3.3294,
+      "step": 2100
+    },
+    {
+      "epoch": 0.12242884911079521,
+      "grad_norm": 0.16626037657260895,
+      "learning_rate": 0.0005914523398395569,
+      "loss": 3.3425,
+      "step": 2110
+    },
+    {
+      "epoch": 0.12300908062316865,
+      "grad_norm": 0.15981921553611755,
+      "learning_rate": 0.0005913153665089197,
+      "loss": 3.3403,
+      "step": 2120
+    },
+    {
+      "epoch": 0.12358931213554208,
+      "grad_norm": 0.15275150537490845,
+      "learning_rate": 0.0005911773205245857,
+      "loss": 3.3261,
+      "step": 2130
+    },
+    {
+      "epoch": 0.12416954364791552,
+      "grad_norm": 0.1598198413848877,
+      "learning_rate": 0.0005910382023948546,
+      "loss": 3.3264,
+      "step": 2140
+    },
+    {
+      "epoch": 0.12474977516028896,
+      "grad_norm": 0.138661190867424,
+      "learning_rate": 0.0005908980126319739,
+      "loss": 3.3216,
+      "step": 2150
+    },
+    {
+      "epoch": 0.1253300066726624,
+      "grad_norm": 0.15583263337612152,
+      "learning_rate": 0.000590756751752137,
+      "loss": 3.3204,
+      "step": 2160
+    },
+    {
+      "epoch": 0.12591023818503583,
+      "grad_norm": 0.15883944928646088,
+      "learning_rate": 0.0005906144202754813,
+      "loss": 3.3274,
+      "step": 2170
+    },
+    {
+      "epoch": 0.12649046969740926,
+      "grad_norm": 0.15031637251377106,
+      "learning_rate": 0.0005904710187260862,
+      "loss": 3.3224,
+      "step": 2180
+    },
+    {
+      "epoch": 0.1270707012097827,
+      "grad_norm": 0.1994715929031372,
+      "learning_rate": 0.0005903265476319712,
+      "loss": 3.3204,
+      "step": 2190
+    },
+    {
+      "epoch": 0.12765093272215614,
+      "grad_norm": 0.16986873745918274,
+      "learning_rate": 0.000590181007525094,
+      "loss": 3.327,
+      "step": 2200
+    },
+    {
+      "epoch": 0.12823116423452957,
+      "grad_norm": 0.147616907954216,
+      "learning_rate": 0.0005900343989413485,
+      "loss": 3.3063,
+      "step": 2210
+    },
+    {
+      "epoch": 0.128811395746903,
+      "grad_norm": 0.16532088816165924,
+      "learning_rate": 0.0005898867224205629,
+      "loss": 3.3198,
+      "step": 2220
+    },
+    {
+      "epoch": 0.12939162725927644,
+      "grad_norm": 0.16687408089637756,
+      "learning_rate": 0.0005897379785064977,
+      "loss": 3.3193,
+      "step": 2230
+    },
+    {
+      "epoch": 0.12997185877164988,
+      "grad_norm": 0.16683116555213928,
+      "learning_rate": 0.0005895881677468434,
+      "loss": 3.3078,
+      "step": 2240
+    },
+    {
+      "epoch": 0.13055209028402331,
+      "grad_norm": 0.15461483597755432,
+      "learning_rate": 0.000589437290693219,
+      "loss": 3.3126,
+      "step": 2250
+    },
+    {
+      "epoch": 0.13113232179639675,
+      "grad_norm": 0.1432589441537857,
+      "learning_rate": 0.0005892853479011696,
+      "loss": 3.3004,
+      "step": 2260
+    },
+    {
+      "epoch": 0.13171255330877019,
+      "grad_norm": 0.1792496293783188,
+      "learning_rate": 0.0005891323399301646,
+      "loss": 3.2946,
+      "step": 2270
+    },
+    {
+      "epoch": 0.13229278482114365,
+      "grad_norm": 0.15189994871616364,
+      "learning_rate": 0.0005889782673435952,
+      "loss": 3.3013,
+      "step": 2280
+    },
+    {
+      "epoch": 0.13287301633351709,
+      "grad_norm": 0.15026351809501648,
+      "learning_rate": 0.0005888231307087728,
+      "loss": 3.295,
+      "step": 2290
+    },
+    {
+      "epoch": 0.13345324784589052,
+      "grad_norm": 0.16199465095996857,
+      "learning_rate": 0.0005886669305969269,
+      "loss": 3.2955,
+      "step": 2300
+    },
+    {
+      "epoch": 0.13403347935826396,
+      "grad_norm": 0.16704988479614258,
+      "learning_rate": 0.0005885096675832027,
+      "loss": 3.3057,
+      "step": 2310
+    },
+    {
+      "epoch": 0.1346137108706374,
+      "grad_norm": 0.14401213824748993,
+      "learning_rate": 0.0005883513422466588,
+      "loss": 3.2876,
+      "step": 2320
+    },
+    {
+      "epoch": 0.13519394238301083,
+      "grad_norm": 0.15336865186691284,
+      "learning_rate": 0.000588191955170266,
+      "loss": 3.2903,
+      "step": 2330
+    },
+    {
+      "epoch": 0.13577417389538426,
+      "grad_norm": 0.16176366806030273,
+      "learning_rate": 0.0005880315069409039,
+      "loss": 3.2873,
+      "step": 2340
+    },
+    {
+      "epoch": 0.1363544054077577,
+      "grad_norm": 0.14728406071662903,
+      "learning_rate": 0.00058786999814936,
+      "loss": 3.2862,
+      "step": 2350
+    },
+    {
+      "epoch": 0.13693463692013114,
+      "grad_norm": 0.14426636695861816,
+      "learning_rate": 0.0005877074293903264,
+      "loss": 3.2786,
+      "step": 2360
+    },
+    {
+      "epoch": 0.13751486843250457,
+      "grad_norm": 0.15023665130138397,
+      "learning_rate": 0.0005875438012623984,
+      "loss": 3.2888,
+      "step": 2370
+    },
+    {
+      "epoch": 0.138095099944878,
+      "grad_norm": 0.1882687211036682,
+      "learning_rate": 0.0005873791143680718,
+      "loss": 3.2806,
+      "step": 2380
+    },
+    {
+      "epoch": 0.13867533145725144,
+      "grad_norm": 0.14847789704799652,
+      "learning_rate": 0.000587213369313741,
+      "loss": 3.2698,
+      "step": 2390
+    },
+    {
+      "epoch": 0.13925556296962488,
+      "grad_norm": 0.14070352911949158,
+      "learning_rate": 0.0005870465667096969,
+      "loss": 3.2782,
+      "step": 2400
+    },
+    {
+      "epoch": 0.13983579448199832,
+      "grad_norm": 0.19226056337356567,
+      "learning_rate": 0.0005868787071701238,
+      "loss": 3.2639,
+      "step": 2410
+    },
+    {
+      "epoch": 0.14041602599437175,
+      "grad_norm": 0.1776312291622162,
+      "learning_rate": 0.0005867097913130982,
+      "loss": 3.2792,
+      "step": 2420
+    },
+    {
+      "epoch": 0.1409962575067452,
+      "grad_norm": 0.13482613861560822,
+      "learning_rate": 0.0005865398197605863,
+      "loss": 3.2834,
+      "step": 2430
+    },
+    {
+      "epoch": 0.14157648901911862,
+      "grad_norm": 0.16731715202331543,
+      "learning_rate": 0.0005863687931384408,
+      "loss": 3.2773,
+      "step": 2440
+    },
+    {
+      "epoch": 0.14215672053149206,
+      "grad_norm": 0.14542406797409058,
+      "learning_rate": 0.0005861967120763997,
+      "loss": 3.2676,
+      "step": 2450
+    },
+    {
+      "epoch": 0.1427369520438655,
+      "grad_norm": 0.1490476280450821,
+      "learning_rate": 0.0005860235772080836,
+      "loss": 3.2783,
+      "step": 2460
+    },
+    {
+      "epoch": 0.14331718355623893,
+      "grad_norm": 0.1446717530488968,
+      "learning_rate": 0.0005858493891709932,
+      "loss": 3.283,
+      "step": 2470
+    },
+    {
+      "epoch": 0.14389741506861237,
+      "grad_norm": 0.1412891447544098,
+      "learning_rate": 0.0005856741486065071,
+      "loss": 3.2652,
+      "step": 2480
+    },
+    {
+      "epoch": 0.1444776465809858,
+      "grad_norm": 0.14674563705921173,
+      "learning_rate": 0.0005854978561598794,
+      "loss": 3.2613,
+      "step": 2490
+    },
+    {
+      "epoch": 0.14505787809335924,
+      "grad_norm": 0.14808981120586395,
+      "learning_rate": 0.0005853205124802374,
+      "loss": 3.2742,
+      "step": 2500
+    },
+    {
+      "epoch": 0.14563810960573267,
+      "grad_norm": 0.14043253660202026,
+      "learning_rate": 0.0005851421182205789,
+      "loss": 3.2685,
+      "step": 2510
+    },
+    {
+      "epoch": 0.14621834111810614,
+      "grad_norm": 0.1568257212638855,
+      "learning_rate": 0.0005849626740377705,
+      "loss": 3.2711,
+      "step": 2520
+    },
+    {
+      "epoch": 0.14679857263047957,
+      "grad_norm": 0.13545943796634674,
+      "learning_rate": 0.0005847821805925444,
+      "loss": 3.2573,
+      "step": 2530
+    },
+    {
+      "epoch": 0.147378804142853,
+      "grad_norm": 0.18863698840141296,
+      "learning_rate": 0.0005846006385494964,
+      "loss": 3.2526,
+      "step": 2540
+    },
+    {
+      "epoch": 0.14795903565522645,
+      "grad_norm": 0.14628858864307404,
+      "learning_rate": 0.0005844180485770832,
+      "loss": 3.2629,
+      "step": 2550
+    },
+    {
+      "epoch": 0.14853926716759988,
+      "grad_norm": 0.1624503880739212,
+      "learning_rate": 0.0005842344113476202,
+      "loss": 3.2529,
+      "step": 2560
+    },
+    {
+      "epoch": 0.14911949867997332,
+      "grad_norm": 0.16218945384025574,
+      "learning_rate": 0.0005840497275372792,
+      "loss": 3.2548,
+      "step": 2570
+    },
+    {
+      "epoch": 0.14969973019234675,
+      "grad_norm": 0.16516704857349396,
+      "learning_rate": 0.0005838639978260851,
+      "loss": 3.2501,
+      "step": 2580
+    },
+    {
+      "epoch": 0.1502799617047202,
+      "grad_norm": 0.1366761326789856,
+      "learning_rate": 0.0005836772228979142,
+      "loss": 3.2467,
+      "step": 2590
+    },
+    {
+      "epoch": 0.15086019321709362,
+      "grad_norm": 0.15526661276817322,
+      "learning_rate": 0.0005834894034404913,
+      "loss": 3.242,
+      "step": 2600
+    },
+    {
+      "epoch": 0.15144042472946706,
+      "grad_norm": 0.1441916972398758,
+      "learning_rate": 0.0005833005401453874,
+      "loss": 3.2399,
+      "step": 2610
+    },
+    {
+      "epoch": 0.1520206562418405,
+      "grad_norm": 0.1708252727985382,
+      "learning_rate": 0.0005831106337080169,
+      "loss": 3.2427,
+      "step": 2620
+    },
+    {
+      "epoch": 0.15260088775421393,
+      "grad_norm": 0.14945155382156372,
+      "learning_rate": 0.0005829196848276351,
+      "loss": 3.2449,
+      "step": 2630
+    },
+    {
+      "epoch": 0.15318111926658737,
+      "grad_norm": 0.1512700468301773,
+      "learning_rate": 0.000582727694207336,
+      "loss": 3.2438,
+      "step": 2640
+    },
+    {
+      "epoch": 0.1537613507789608,
+      "grad_norm": 0.15101619064807892,
+      "learning_rate": 0.0005825346625540491,
+      "loss": 3.2396,
+      "step": 2650
+    },
+    {
+      "epoch": 0.15434158229133424,
+      "grad_norm": 0.13658584654331207,
+      "learning_rate": 0.000582340590578537,
+      "loss": 3.2475,
+      "step": 2660
+    },
+    {
+      "epoch": 0.15492181380370768,
+      "grad_norm": 0.16723176836967468,
+      "learning_rate": 0.0005821454789953932,
+      "loss": 3.2385,
+      "step": 2670
+    },
+    {
+      "epoch": 0.1555020453160811,
+      "grad_norm": 0.16236084699630737,
+      "learning_rate": 0.000581949328523039,
+      "loss": 3.2287,
+      "step": 2680
+    },
+    {
+      "epoch": 0.15608227682845455,
+      "grad_norm": 0.1473713517189026,
+      "learning_rate": 0.0005817521398837209,
+      "loss": 3.2335,
+      "step": 2690
+    },
+    {
+      "epoch": 0.15666250834082798,
+      "grad_norm": 0.14422966539859772,
+      "learning_rate": 0.0005815539138035082,
+      "loss": 3.2217,
+      "step": 2700
+    },
+    {
+      "epoch": 0.15724273985320142,
+      "grad_norm": 0.1676100343465805,
+      "learning_rate": 0.00058135465101229,
+      "loss": 3.2329,
+      "step": 2710
+    },
+    {
+      "epoch": 0.15782297136557485,
+      "grad_norm": 0.14574168622493744,
+      "learning_rate": 0.000581154352243773,
+      "loss": 3.2278,
+      "step": 2720
+    },
+    {
+      "epoch": 0.1584032028779483,
+      "grad_norm": 0.16981543600559235,
+      "learning_rate": 0.000580953018235478,
+      "loss": 3.229,
+      "step": 2730
+    },
+    {
+      "epoch": 0.15898343439032173,
+      "grad_norm": 0.13945645093917847,
+      "learning_rate": 0.0005807506497287379,
+      "loss": 3.2297,
+      "step": 2740
+    },
+    {
+      "epoch": 0.15956366590269516,
+      "grad_norm": 0.17302276194095612,
+      "learning_rate": 0.0005805472474686949,
+      "loss": 3.2227,
+      "step": 2750
+    },
+    {
+      "epoch": 0.16014389741506863,
+      "grad_norm": 0.15059055387973785,
+      "learning_rate": 0.0005803428122042974,
+      "loss": 3.2288,
+      "step": 2760
+    },
+    {
+      "epoch": 0.16072412892744206,
+      "grad_norm": 0.14908020198345184,
+      "learning_rate": 0.0005801373446882973,
+      "loss": 3.2293,
+      "step": 2770
+    },
+    {
+      "epoch": 0.1613043604398155,
+      "grad_norm": 0.1653462052345276,
+      "learning_rate": 0.0005799308456772478,
+      "loss": 3.2189,
+      "step": 2780
+    },
+    {
+      "epoch": 0.16188459195218893,
+      "grad_norm": 0.14483293890953064,
+      "learning_rate": 0.0005797233159314997,
+      "loss": 3.2239,
+      "step": 2790
+    },
+    {
+      "epoch": 0.16246482346456237,
+      "grad_norm": 0.15277917683124542,
+      "learning_rate": 0.0005795147562151992,
+      "loss": 3.2155,
+      "step": 2800
+    },
+    {
+      "epoch": 0.1630450549769358,
+      "grad_norm": 0.13660204410552979,
+      "learning_rate": 0.0005793051672962852,
+      "loss": 3.2183,
+      "step": 2810
+    },
+    {
+      "epoch": 0.16362528648930924,
+      "grad_norm": 0.15595564246177673,
+      "learning_rate": 0.0005790945499464861,
+      "loss": 3.2163,
+      "step": 2820
+    },
+    {
+      "epoch": 0.16420551800168268,
+      "grad_norm": 0.14608708024024963,
+      "learning_rate": 0.0005788829049413167,
+      "loss": 3.2222,
+      "step": 2830
+    },
+    {
+      "epoch": 0.1647857495140561,
+      "grad_norm": 0.14129003882408142,
+      "learning_rate": 0.0005786702330600764,
+      "loss": 3.2115,
+      "step": 2840
+    },
+    {
+      "epoch": 0.16536598102642955,
+      "grad_norm": 0.13925908505916595,
+      "learning_rate": 0.0005784565350858453,
+      "loss": 3.2115,
+      "step": 2850
+    },
+    {
+      "epoch": 0.16594621253880298,
+      "grad_norm": 0.15094564855098724,
+      "learning_rate": 0.0005782418118054816,
+      "loss": 3.216,
+      "step": 2860
+    },
+    {
+      "epoch": 0.16652644405117642,
+      "grad_norm": 0.1384998857975006,
+      "learning_rate": 0.0005780260640096189,
+      "loss": 3.2084,
+      "step": 2870
+    },
+    {
+      "epoch": 0.16710667556354986,
+      "grad_norm": 0.15442876517772675,
+      "learning_rate": 0.0005778092924926634,
+      "loss": 3.2071,
+      "step": 2880
+    },
+    {
+      "epoch": 0.1676869070759233,
+      "grad_norm": 0.16494965553283691,
+      "learning_rate": 0.0005775914980527904,
+      "loss": 3.2101,
+      "step": 2890
+    },
+    {
+      "epoch": 0.16826713858829673,
+      "grad_norm": 0.16855239868164062,
+      "learning_rate": 0.0005773726814919419,
+      "loss": 3.2019,
+      "step": 2900
+    },
+    {
+      "epoch": 0.16884737010067016,
+      "grad_norm": 0.1579483449459076,
+      "learning_rate": 0.0005771528436158233,
+      "loss": 3.209,
+      "step": 2910
+    },
+    {
+      "epoch": 0.1694276016130436,
+      "grad_norm": 0.1417829543352127,
+      "learning_rate": 0.0005769319852339008,
+      "loss": 3.2019,
+      "step": 2920
+    },
+    {
+      "epoch": 0.17000783312541703,
+      "grad_norm": 0.14454993605613708,
+      "learning_rate": 0.0005767101071593979,
+      "loss": 3.2047,
+      "step": 2930
+    },
+    {
+      "epoch": 0.17058806463779047,
+      "grad_norm": 0.16087666153907776,
+      "learning_rate": 0.0005764872102092931,
+      "loss": 3.2062,
+      "step": 2940
+    },
+    {
+      "epoch": 0.1711682961501639,
+      "grad_norm": 0.139312744140625,
+      "learning_rate": 0.0005762632952043163,
+      "loss": 3.1988,
+      "step": 2950
+    },
+    {
+      "epoch": 0.17174852766253734,
+      "grad_norm": 0.15459179878234863,
+      "learning_rate": 0.000576038362968946,
+      "loss": 3.2002,
+      "step": 2960
+    },
+    {
+      "epoch": 0.17232875917491078,
+      "grad_norm": 0.18820500373840332,
+      "learning_rate": 0.0005758124143314062,
+      "loss": 3.2035,
+      "step": 2970
+    },
+    {
+      "epoch": 0.17290899068728421,
+      "grad_norm": 0.14626365900039673,
+      "learning_rate": 0.0005755854501236635,
+      "loss": 3.194,
+      "step": 2980
+    },
+    {
+      "epoch": 0.17348922219965765,
+      "grad_norm": 0.14270606637001038,
+      "learning_rate": 0.0005753574711814238,
+      "loss": 3.1879,
+      "step": 2990
+    },
+    {
+      "epoch": 0.1740694537120311,
+      "grad_norm": 0.15857936441898346,
+      "learning_rate": 0.0005751284783441297,
+      "loss": 3.207,
+      "step": 3000
+    },
+    {
+      "epoch": 0.1740694537120311,
+      "eval_loss": 3.158046245574951,
+      "eval_runtime": 3.2654,
+      "eval_samples_per_second": 1326.029,
+      "eval_steps_per_second": 2.756,
+      "step": 3000
+    },
+    {
+      "epoch": 0.17464968522440455,
+      "grad_norm": 0.14403465390205383,
+      "learning_rate": 0.0005748984724549565,
+      "loss": 3.1895,
+      "step": 3010
+    },
+    {
+      "epoch": 0.17522991673677799,
+      "grad_norm": 0.1392756998538971,
+      "learning_rate": 0.0005746674543608101,
+      "loss": 3.1942,
+      "step": 3020
+    },
+    {
+      "epoch": 0.17581014824915142,
+      "grad_norm": 0.13957557082176208,
+      "learning_rate": 0.0005744354249123234,
+      "loss": 3.1969,
+      "step": 3030
+    },
+    {
+      "epoch": 0.17639037976152486,
+      "grad_norm": 0.151198148727417,
+      "learning_rate": 0.0005742023849638531,
+      "loss": 3.1903,
+      "step": 3040
+    },
+    {
+      "epoch": 0.1769706112738983,
+      "grad_norm": 0.14607684314250946,
+      "learning_rate": 0.0005739683353734766,
+      "loss": 3.2003,
+      "step": 3050
+    },
+    {
+      "epoch": 0.17755084278627173,
+      "grad_norm": 0.13925622403621674,
+      "learning_rate": 0.0005737332770029891,
+      "loss": 3.1927,
+      "step": 3060
+    },
+    {
+      "epoch": 0.17813107429864516,
+      "grad_norm": 0.13125456869602203,
+      "learning_rate": 0.0005734972107179001,
+      "loss": 3.1849,
+      "step": 3070
+    },
+    {
+      "epoch": 0.1787113058110186,
+      "grad_norm": 0.16905735433101654,
+      "learning_rate": 0.0005732601373874306,
+      "loss": 3.187,
+      "step": 3080
+    },
+    {
+      "epoch": 0.17929153732339204,
+      "grad_norm": 0.13563838601112366,
+      "learning_rate": 0.0005730220578845091,
+      "loss": 3.1853,
+      "step": 3090
+    },
+    {
+      "epoch": 0.17987176883576547,
+      "grad_norm": 0.15470236539840698,
+      "learning_rate": 0.0005727829730857695,
+      "loss": 3.1906,
+      "step": 3100
+    },
+    {
+      "epoch": 0.1804520003481389,
+      "grad_norm": 0.160013347864151,
+      "learning_rate": 0.0005725428838715469,
+      "loss": 3.1705,
+      "step": 3110
+    },
+    {
+      "epoch": 0.18103223186051234,
+      "grad_norm": 0.14684250950813293,
+      "learning_rate": 0.0005723017911258752,
+      "loss": 3.1825,
+      "step": 3120
+    },
+    {
+      "epoch": 0.18161246337288578,
+      "grad_norm": 0.1529027372598648,
+      "learning_rate": 0.0005720596957364829,
+      "loss": 3.1817,
+      "step": 3130
+    },
+    {
+      "epoch": 0.18219269488525922,
+      "grad_norm": 0.13860736787319183,
+      "learning_rate": 0.0005718165985947907,
+      "loss": 3.1844,
+      "step": 3140
+    },
+    {
+      "epoch": 0.18277292639763265,
+      "grad_norm": 0.14795511960983276,
+      "learning_rate": 0.0005715725005959077,
+      "loss": 3.1741,
+      "step": 3150
+    },
+    {
+      "epoch": 0.1833531579100061,
+      "grad_norm": 0.1455545276403427,
+      "learning_rate": 0.0005713274026386283,
+      "loss": 3.1869,
+      "step": 3160
+    },
+    {
+      "epoch": 0.18393338942237952,
+      "grad_norm": 0.14845995604991913,
+      "learning_rate": 0.0005710813056254289,
+      "loss": 3.1735,
+      "step": 3170
+    },
+    {
+      "epoch": 0.18451362093475296,
+      "grad_norm": 0.14949209988117218,
+      "learning_rate": 0.0005708342104624645,
+      "loss": 3.178,
+      "step": 3180
+    },
+    {
+      "epoch": 0.1850938524471264,
+      "grad_norm": 0.16276435554027557,
+      "learning_rate": 0.0005705861180595653,
+      "loss": 3.1712,
+      "step": 3190
+    },
+    {
+      "epoch": 0.18567408395949983,
+      "grad_norm": 0.14152179658412933,
+      "learning_rate": 0.0005703370293302335,
+      "loss": 3.1752,
+      "step": 3200
+    },
+    {
+      "epoch": 0.18625431547187327,
+      "grad_norm": 0.1554255187511444,
+      "learning_rate": 0.00057008694519164,
+      "loss": 3.169,
+      "step": 3210
+    },
+    {
+      "epoch": 0.1868345469842467,
+      "grad_norm": 0.14890237152576447,
+      "learning_rate": 0.0005698358665646207,
+      "loss": 3.1706,
+      "step": 3220
+    },
+    {
+      "epoch": 0.18741477849662014,
+      "grad_norm": 0.15197904407978058,
+      "learning_rate": 0.0005695837943736735,
+      "loss": 3.1691,
+      "step": 3230
+    },
+    {
+      "epoch": 0.1879950100089936,
+      "grad_norm": 0.15369053184986115,
+      "learning_rate": 0.0005693307295469547,
+      "loss": 3.1678,
+      "step": 3240
+    },
+    {
+      "epoch": 0.18857524152136704,
+      "grad_norm": 0.19938114285469055,
+      "learning_rate": 0.0005690766730162752,
+      "loss": 3.1706,
+      "step": 3250
+    },
+    {
+      "epoch": 0.18915547303374047,
+      "grad_norm": 0.14962078630924225,
+      "learning_rate": 0.0005688216257170979,
+      "loss": 3.1665,
+      "step": 3260
+    },
+    {
+      "epoch": 0.1897357045461139,
+      "grad_norm": 0.14826686680316925,
+      "learning_rate": 0.0005685655885885337,
+      "loss": 3.1478,
+      "step": 3270
+    },
+    {
+      "epoch": 0.19031593605848734,
+      "grad_norm": 0.137392058968544,
+      "learning_rate": 0.0005683085625733382,
+      "loss": 3.1645,
+      "step": 3280
+    },
+    {
+      "epoch": 0.19089616757086078,
+      "grad_norm": 0.15559589862823486,
+      "learning_rate": 0.000568050548617908,
+      "loss": 3.1674,
+      "step": 3290
+    },
+    {
+      "epoch": 0.19147639908323422,
+      "grad_norm": 0.17506170272827148,
+      "learning_rate": 0.0005677915476722775,
+      "loss": 3.1606,
+      "step": 3300
+    },
+    {
+      "epoch": 0.19205663059560765,
+      "grad_norm": 0.1602877825498581,
+      "learning_rate": 0.0005675315606901155,
+      "loss": 3.1586,
+      "step": 3310
+    },
+    {
+      "epoch": 0.1926368621079811,
+      "grad_norm": 0.13343220949172974,
+      "learning_rate": 0.0005672705886287211,
+      "loss": 3.1553,
+      "step": 3320
+    },
+    {
+      "epoch": 0.19321709362035452,
+      "grad_norm": 0.15390737354755402,
+      "learning_rate": 0.0005670086324490208,
+      "loss": 3.1687,
+      "step": 3330
+    },
+    {
+      "epoch": 0.19379732513272796,
+      "grad_norm": 0.13513082265853882,
+      "learning_rate": 0.0005667456931155647,
+      "loss": 3.1543,
+      "step": 3340
+    },
+    {
+      "epoch": 0.1943775566451014,
+      "grad_norm": 0.1489078551530838,
+      "learning_rate": 0.0005664817715965231,
+      "loss": 3.1623,
+      "step": 3350
+    },
+    {
+      "epoch": 0.19495778815747483,
+      "grad_norm": 0.14149461686611176,
+      "learning_rate": 0.0005662168688636826,
+      "loss": 3.1487,
+      "step": 3360
+    },
+    {
+      "epoch": 0.19553801966984827,
+      "grad_norm": 0.150479257106781,
+      "learning_rate": 0.0005659509858924428,
+      "loss": 3.1588,
+      "step": 3370
+    },
+    {
+      "epoch": 0.1961182511822217,
+      "grad_norm": 0.15041102468967438,
+      "learning_rate": 0.0005656841236618127,
+      "loss": 3.155,
+      "step": 3380
+    },
+    {
+      "epoch": 0.19669848269459514,
+      "grad_norm": 0.14053913950920105,
+      "learning_rate": 0.0005654162831544068,
+      "loss": 3.1581,
+      "step": 3390
+    },
+    {
+      "epoch": 0.19727871420696858,
+      "grad_norm": 0.15485486388206482,
+      "learning_rate": 0.0005651474653564421,
+      "loss": 3.1465,
+      "step": 3400
+    },
+    {
+      "epoch": 0.197858945719342,
+      "grad_norm": 0.1425885111093521,
+      "learning_rate": 0.0005648776712577338,
+      "loss": 3.1535,
+      "step": 3410
+    },
+    {
+      "epoch": 0.19843917723171545,
+      "grad_norm": 0.1361316442489624,
+      "learning_rate": 0.0005646069018516921,
+      "loss": 3.1466,
+      "step": 3420
+    },
+    {
+      "epoch": 0.19901940874408888,
+      "grad_norm": 0.15521439909934998,
+      "learning_rate": 0.0005643351581353184,
+      "loss": 3.1415,
+      "step": 3430
+    },
+    {
+      "epoch": 0.19959964025646232,
+      "grad_norm": 0.14644280076026917,
+      "learning_rate": 0.0005640624411092014,
+      "loss": 3.1411,
+      "step": 3440
+    },
+    {
+      "epoch": 0.20017987176883575,
+      "grad_norm": 0.14116531610488892,
+      "learning_rate": 0.0005637887517775137,
+      "loss": 3.1542,
+      "step": 3450
+    },
+    {
+      "epoch": 0.2007601032812092,
+      "grad_norm": 0.1301729828119278,
+      "learning_rate": 0.0005635140911480082,
+      "loss": 3.1448,
+      "step": 3460
+    },
+    {
+      "epoch": 0.20134033479358263,
+      "grad_norm": 0.16307103633880615,
+      "learning_rate": 0.000563238460232014,
+      "loss": 3.1397,
+      "step": 3470
+    },
+    {
+      "epoch": 0.2019205663059561,
+      "grad_norm": 0.13141117990016937,
+      "learning_rate": 0.0005629618600444332,
+      "loss": 3.1469,
+      "step": 3480
+    },
+    {
+      "epoch": 0.20250079781832953,
+      "grad_norm": 0.13741467893123627,
+      "learning_rate": 0.0005626842916037365,
+      "loss": 3.1419,
+      "step": 3490
+    },
+    {
+      "epoch": 0.20308102933070296,
+      "grad_norm": 0.16112880408763885,
+      "learning_rate": 0.0005624057559319601,
+      "loss": 3.1449,
+      "step": 3500
+    },
+    {
+      "epoch": 0.2036612608430764,
+      "grad_norm": 0.153072327375412,
+      "learning_rate": 0.0005621262540547015,
+      "loss": 3.1365,
+      "step": 3510
+    },
+    {
+      "epoch": 0.20424149235544983,
+      "grad_norm": 0.1413891613483429,
+      "learning_rate": 0.0005618457870011158,
+      "loss": 3.1307,
+      "step": 3520
+    },
+    {
+      "epoch": 0.20482172386782327,
+      "grad_norm": 0.15589068830013275,
+      "learning_rate": 0.0005615643558039121,
+      "loss": 3.1418,
+      "step": 3530
+    },
+    {
+      "epoch": 0.2054019553801967,
+      "grad_norm": 0.12889379262924194,
+      "learning_rate": 0.0005612819614993496,
+      "loss": 3.1366,
+      "step": 3540
+    },
+    {
+      "epoch": 0.20598218689257014,
+      "grad_norm": 0.14375300705432892,
+      "learning_rate": 0.0005609986051272336,
+      "loss": 3.13,
+      "step": 3550
+    },
+    {
+      "epoch": 0.20656241840494358,
+      "grad_norm": 0.1587209552526474,
+      "learning_rate": 0.000560714287730912,
+      "loss": 3.1338,
+      "step": 3560
+    },
+    {
+      "epoch": 0.207142649917317,
+      "grad_norm": 0.15273341536521912,
+      "learning_rate": 0.0005604290103572714,
+      "loss": 3.1393,
+      "step": 3570
+    },
+    {
+      "epoch": 0.20772288142969045,
+      "grad_norm": 0.13435807824134827,
+      "learning_rate": 0.0005601427740567328,
+      "loss": 3.137,
+      "step": 3580
+    },
+    {
+      "epoch": 0.20830311294206388,
+      "grad_norm": 0.1391715109348297,
+      "learning_rate": 0.0005598555798832482,
+      "loss": 3.1347,
+      "step": 3590
+    },
+    {
+      "epoch": 0.20888334445443732,
+      "grad_norm": 0.16318084299564362,
+      "learning_rate": 0.0005595674288942969,
+      "loss": 3.1279,
+      "step": 3600
+    },
+    {
+      "epoch": 0.20946357596681076,
+      "grad_norm": 0.1386035829782486,
+      "learning_rate": 0.0005592783221508807,
+      "loss": 3.1335,
+      "step": 3610
+    },
+    {
+      "epoch": 0.2100438074791842,
+      "grad_norm": 0.14639577269554138,
+      "learning_rate": 0.000558988260717521,
+      "loss": 3.142,
+      "step": 3620
+    },
+    {
+      "epoch": 0.21062403899155763,
+      "grad_norm": 0.13666051626205444,
+      "learning_rate": 0.0005586972456622546,
+      "loss": 3.1287,
+      "step": 3630
+    },
+    {
+      "epoch": 0.21120427050393106,
+      "grad_norm": 0.14930284023284912,
+      "learning_rate": 0.0005584052780566293,
+      "loss": 3.1283,
+      "step": 3640
+    },
+    {
+      "epoch": 0.2117845020163045,
+      "grad_norm": 0.13987945020198822,
+      "learning_rate": 0.0005581123589757002,
+      "loss": 3.1329,
+      "step": 3650
+    },
+    {
+      "epoch": 0.21236473352867793,
+      "grad_norm": 0.1452946811914444,
+      "learning_rate": 0.0005578184894980263,
+      "loss": 3.1294,
+      "step": 3660
+    },
+    {
+      "epoch": 0.21294496504105137,
+      "grad_norm": 0.15192043781280518,
+      "learning_rate": 0.0005575236707056657,
+      "loss": 3.1206,
+      "step": 3670
+    },
+    {
+      "epoch": 0.2135251965534248,
+      "grad_norm": 0.16006827354431152,
+      "learning_rate": 0.0005572279036841721,
+      "loss": 3.1273,
+      "step": 3680
+    },
+    {
+      "epoch": 0.21410542806579824,
+      "grad_norm": 0.18141302466392517,
+      "learning_rate": 0.0005569311895225906,
+      "loss": 3.1245,
+      "step": 3690
+    },
+    {
+      "epoch": 0.21468565957817168,
+      "grad_norm": 0.14263153076171875,
+      "learning_rate": 0.0005566335293134539,
+      "loss": 3.1211,
+      "step": 3700
+    },
+    {
+      "epoch": 0.21526589109054511,
+      "grad_norm": 0.1435001790523529,
+      "learning_rate": 0.0005563349241527781,
+      "loss": 3.1258,
+      "step": 3710
+    },
+    {
+      "epoch": 0.21584612260291858,
+      "grad_norm": 0.15155887603759766,
+      "learning_rate": 0.0005560353751400585,
+      "loss": 3.1233,
+      "step": 3720
+    },
+    {
+      "epoch": 0.216426354115292,
+      "grad_norm": 0.1545734703540802,
+      "learning_rate": 0.0005557348833782663,
+      "loss": 3.1292,
+      "step": 3730
+    },
+    {
+      "epoch": 0.21700658562766545,
+      "grad_norm": 0.15549300611019135,
+      "learning_rate": 0.0005554334499738433,
+      "loss": 3.1142,
+      "step": 3740
+    },
+    {
+      "epoch": 0.21758681714003889,
+      "grad_norm": 0.15990693867206573,
+      "learning_rate": 0.000555131076036699,
+      "loss": 3.125,
+      "step": 3750
+    },
+    {
+      "epoch": 0.21816704865241232,
+      "grad_norm": 0.16630201041698456,
+      "learning_rate": 0.0005548277626802058,
+      "loss": 3.1216,
+      "step": 3760
+    },
+    {
+      "epoch": 0.21874728016478576,
+      "grad_norm": 0.1408713161945343,
+      "learning_rate": 0.0005545235110211954,
+      "loss": 3.1111,
+      "step": 3770
+    },
+    {
+      "epoch": 0.2193275116771592,
+      "grad_norm": 0.1488475650548935,
+      "learning_rate": 0.0005542183221799544,
+      "loss": 3.1253,
+      "step": 3780
+    },
+    {
+      "epoch": 0.21990774318953263,
+      "grad_norm": 0.14259935915470123,
+      "learning_rate": 0.0005539121972802198,
+      "loss": 3.1179,
+      "step": 3790
+    },
+    {
+      "epoch": 0.22048797470190606,
+      "grad_norm": 0.14055614173412323,
+      "learning_rate": 0.0005536051374491757,
+      "loss": 3.1113,
+      "step": 3800
+    },
+    {
+      "epoch": 0.2210682062142795,
+      "grad_norm": 0.1665177196264267,
+      "learning_rate": 0.0005532971438174485,
+      "loss": 3.1197,
+      "step": 3810
+    },
+    {
+      "epoch": 0.22164843772665294,
+      "grad_norm": 0.15349626541137695,
+      "learning_rate": 0.0005529882175191031,
+      "loss": 3.1086,
+      "step": 3820
+    },
+    {
+      "epoch": 0.22222866923902637,
+      "grad_norm": 0.14321498572826385,
+      "learning_rate": 0.0005526783596916385,
+      "loss": 3.1161,
+      "step": 3830
+    },
+    {
+      "epoch": 0.2228089007513998,
+      "grad_norm": 0.14768148958683014,
+      "learning_rate": 0.0005523675714759835,
+      "loss": 3.1164,
+      "step": 3840
+    },
+    {
+      "epoch": 0.22338913226377324,
+      "grad_norm": 0.1546637862920761,
+      "learning_rate": 0.000552055854016493,
+      "loss": 3.1185,
+      "step": 3850
+    },
+    {
+      "epoch": 0.22396936377614668,
+      "grad_norm": 0.16114896535873413,
+      "learning_rate": 0.0005517432084609434,
+      "loss": 3.1083,
+      "step": 3860
+    },
+    {
+      "epoch": 0.22454959528852012,
+      "grad_norm": 0.13796792924404144,
+      "learning_rate": 0.0005514296359605284,
+      "loss": 3.102,
+      "step": 3870
+    },
+    {
+      "epoch": 0.22512982680089355,
+      "grad_norm": 0.13948635756969452,
+      "learning_rate": 0.0005511151376698546,
+      "loss": 3.1079,
+      "step": 3880
+    },
+    {
+      "epoch": 0.225710058313267,
+      "grad_norm": 0.13826532661914825,
+      "learning_rate": 0.0005507997147469378,
+      "loss": 3.107,
+      "step": 3890
+    },
+    {
+      "epoch": 0.22629028982564042,
+      "grad_norm": 0.1437525451183319,
+      "learning_rate": 0.0005504833683531981,
+      "loss": 3.1076,
+      "step": 3900
+    },
+    {
+      "epoch": 0.22687052133801386,
+      "grad_norm": 0.14256474375724792,
+      "learning_rate": 0.0005501660996534563,
+      "loss": 3.1056,
+      "step": 3910
+    },
+    {
+      "epoch": 0.2274507528503873,
+      "grad_norm": 0.1531156748533249,
+      "learning_rate": 0.0005498479098159289,
+      "loss": 3.101,
+      "step": 3920
+    },
+    {
+      "epoch": 0.22803098436276073,
+      "grad_norm": 0.16901366412639618,
+      "learning_rate": 0.0005495288000122242,
+      "loss": 3.0981,
+      "step": 3930
+    },
+    {
+      "epoch": 0.22861121587513417,
+      "grad_norm": 0.1440243273973465,
+      "learning_rate": 0.0005492087714173378,
+      "loss": 3.1052,
+      "step": 3940
+    },
+    {
+      "epoch": 0.2291914473875076,
+      "grad_norm": 0.1603139340877533,
+      "learning_rate": 0.0005488878252096487,
+      "loss": 3.105,
+      "step": 3950
+    },
+    {
+      "epoch": 0.22977167889988107,
+      "grad_norm": 0.1588706523180008,
+      "learning_rate": 0.0005485659625709144,
+      "loss": 3.1107,
+      "step": 3960
+    },
+    {
+      "epoch": 0.2303519104122545,
+      "grad_norm": 0.1452343761920929,
+      "learning_rate": 0.0005482431846862667,
+      "loss": 3.1074,
+      "step": 3970
+    },
+    {
+      "epoch": 0.23093214192462794,
+      "grad_norm": 0.15799881517887115,
+      "learning_rate": 0.0005479194927442078,
+      "loss": 3.0985,
+      "step": 3980
+    },
+    {
+      "epoch": 0.23151237343700137,
+      "grad_norm": 0.12657681107521057,
+      "learning_rate": 0.0005475948879366053,
+      "loss": 3.0958,
+      "step": 3990
+    },
+    {
+      "epoch": 0.2320926049493748,
+      "grad_norm": 0.13606688380241394,
+      "learning_rate": 0.000547269371458688,
+      "loss": 3.0999,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2320926049493748,
+      "eval_loss": 3.0630993843078613,
+      "eval_runtime": 3.264,
+      "eval_samples_per_second": 1326.576,
+      "eval_steps_per_second": 2.757,
+      "step": 4000
+    },
+    {
+      "epoch": 0.23267283646174824,
+      "grad_norm": 0.16136619448661804,
+      "learning_rate": 0.0005469429445090417,
+      "loss": 3.1004,
+      "step": 4010
+    },
+    {
+      "epoch": 0.23325306797412168,
+      "grad_norm": 0.14767828583717346,
+      "learning_rate": 0.0005466156082896047,
+      "loss": 3.1075,
+      "step": 4020
+    },
+    {
+      "epoch": 0.23383329948649512,
+      "grad_norm": 0.1492021530866623,
+      "learning_rate": 0.0005462873640056632,
+      "loss": 3.1025,
+      "step": 4030
+    },
+    {
+      "epoch": 0.23441353099886855,
+      "grad_norm": 0.14654645323753357,
+      "learning_rate": 0.000545958212865847,
+      "loss": 3.0966,
+      "step": 4040
+    },
+    {
+      "epoch": 0.234993762511242,
+      "grad_norm": 0.15648731589317322,
+      "learning_rate": 0.0005456281560821252,
+      "loss": 3.0937,
+      "step": 4050
+    },
+    {
+      "epoch": 0.23557399402361542,
+      "grad_norm": 0.13584694266319275,
+      "learning_rate": 0.0005452971948698014,
+      "loss": 3.1052,
+      "step": 4060
+    },
+    {
+      "epoch": 0.23615422553598886,
+      "grad_norm": 0.13829472661018372,
+      "learning_rate": 0.0005449653304475094,
+      "loss": 3.0933,
+      "step": 4070
+    },
+    {
+      "epoch": 0.2367344570483623,
+      "grad_norm": 0.16889816522598267,
+      "learning_rate": 0.0005446325640372088,
+      "loss": 3.0949,
+      "step": 4080
+    },
+    {
+      "epoch": 0.23731468856073573,
+      "grad_norm": 0.12351599335670471,
+      "learning_rate": 0.0005442988968641804,
+      "loss": 3.0914,
+      "step": 4090
+    },
+    {
+      "epoch": 0.23789492007310917,
+      "grad_norm": 0.14327877759933472,
+      "learning_rate": 0.0005439643301570216,
+      "loss": 3.0814,
+      "step": 4100
+    },
+    {
+      "epoch": 0.2384751515854826,
+      "grad_norm": 0.15155468881130219,
+      "learning_rate": 0.0005436288651476421,
+      "loss": 3.0849,
+      "step": 4110
+    },
+    {
+      "epoch": 0.23905538309785604,
+      "grad_norm": 0.14292922616004944,
+      "learning_rate": 0.0005432925030712594,
+      "loss": 3.0887,
+      "step": 4120
+    },
+    {
+      "epoch": 0.23963561461022947,
+      "grad_norm": 0.14884264767169952,
+      "learning_rate": 0.0005429552451663936,
+      "loss": 3.0911,
+      "step": 4130
+    },
+    {
+      "epoch": 0.2402158461226029,
+      "grad_norm": 0.1403530389070511,
+      "learning_rate": 0.0005426170926748639,
+      "loss": 3.0926,
+      "step": 4140
+    },
+    {
+      "epoch": 0.24079607763497635,
+      "grad_norm": 0.14543718099594116,
+      "learning_rate": 0.0005422780468417829,
+      "loss": 3.0897,
+      "step": 4150
+    },
+    {
+      "epoch": 0.24137630914734978,
+      "grad_norm": 0.12813718616962433,
+      "learning_rate": 0.0005419381089155532,
+      "loss": 3.0902,
+      "step": 4160
+    },
+    {
+      "epoch": 0.24195654065972322,
+      "grad_norm": 0.13375824689865112,
+      "learning_rate": 0.0005415972801478617,
+      "loss": 3.0915,
+      "step": 4170
+    },
+    {
+      "epoch": 0.24253677217209665,
+      "grad_norm": 0.14347635209560394,
+      "learning_rate": 0.0005412555617936755,
+      "loss": 3.0892,
+      "step": 4180
+    },
+    {
+      "epoch": 0.2431170036844701,
+      "grad_norm": 0.14166522026062012,
+      "learning_rate": 0.0005409129551112377,
+      "loss": 3.0808,
+      "step": 4190
+    },
+    {
+      "epoch": 0.24369723519684355,
+      "grad_norm": 0.13924048840999603,
+      "learning_rate": 0.0005405694613620617,
+      "loss": 3.0854,
+      "step": 4200
+    },
+    {
+      "epoch": 0.244277466709217,
+      "grad_norm": 0.13338492810726166,
+      "learning_rate": 0.0005402250818109276,
+      "loss": 3.0836,
+      "step": 4210
+    },
+    {
+      "epoch": 0.24485769822159043,
+      "grad_norm": 0.14531342685222626,
+      "learning_rate": 0.0005398798177258768,
+      "loss": 3.0971,
+      "step": 4220
+    },
+    {
+      "epoch": 0.24543792973396386,
+      "grad_norm": 0.1432162970304489,
+      "learning_rate": 0.0005395336703782082,
+      "loss": 3.0838,
+      "step": 4230
+    },
+    {
+      "epoch": 0.2460181612463373,
+      "grad_norm": 0.15475274622440338,
+      "learning_rate": 0.0005391866410424722,
+      "loss": 3.0764,
+      "step": 4240
+    },
+    {
+      "epoch": 0.24659839275871073,
+      "grad_norm": 0.15521539747714996,
+      "learning_rate": 0.0005388387309964675,
+      "loss": 3.0837,
+      "step": 4250
+    },
+    {
+      "epoch": 0.24717862427108417,
+      "grad_norm": 0.1430870145559311,
+      "learning_rate": 0.0005384899415212351,
+      "loss": 3.0889,
+      "step": 4260
+    },
+    {
+      "epoch": 0.2477588557834576,
+      "grad_norm": 0.14807622134685516,
+      "learning_rate": 0.0005381402739010545,
+      "loss": 3.0769,
+      "step": 4270
+    },
+    {
+      "epoch": 0.24833908729583104,
+      "grad_norm": 0.1509249359369278,
+      "learning_rate": 0.0005377897294234385,
+      "loss": 3.0815,
+      "step": 4280
+    },
+    {
+      "epoch": 0.24891931880820448,
+      "grad_norm": 0.1451188027858734,
+      "learning_rate": 0.0005374383093791287,
+      "loss": 3.0766,
+      "step": 4290
+    },
+    {
+      "epoch": 0.2494995503205779,
+      "grad_norm": 0.130240797996521,
+      "learning_rate": 0.0005370860150620901,
+      "loss": 3.0824,
+      "step": 4300
+    },
+    {
+      "epoch": 0.2500797818329513,
+      "grad_norm": 0.14696471393108368,
+      "learning_rate": 0.0005367328477695077,
+      "loss": 3.0678,
+      "step": 4310
+    },
+    {
+      "epoch": 0.2506600133453248,
+      "grad_norm": 0.13198255002498627,
+      "learning_rate": 0.0005363788088017803,
+      "loss": 3.0759,
+      "step": 4320
+    },
+    {
+      "epoch": 0.25124024485769825,
+      "grad_norm": 0.1413690447807312,
+      "learning_rate": 0.0005360238994625166,
+      "loss": 3.0842,
+      "step": 4330
+    },
+    {
+      "epoch": 0.25182047637007166,
+      "grad_norm": 0.1560727059841156,
+      "learning_rate": 0.0005356681210585297,
+      "loss": 3.074,
+      "step": 4340
+    },
+    {
+      "epoch": 0.2524007078824451,
+      "grad_norm": 0.13727669417858124,
+      "learning_rate": 0.0005353114748998332,
+      "loss": 3.082,
+      "step": 4350
+    },
+    {
+      "epoch": 0.2529809393948185,
+      "grad_norm": 0.1479531228542328,
+      "learning_rate": 0.0005349539622996356,
+      "loss": 3.0804,
+      "step": 4360
+    },
+    {
+      "epoch": 0.253561170907192,
+      "grad_norm": 0.13756506145000458,
+      "learning_rate": 0.0005345955845743358,
+      "loss": 3.0829,
+      "step": 4370
+    },
+    {
+      "epoch": 0.2541414024195654,
+      "grad_norm": 0.14778585731983185,
+      "learning_rate": 0.0005342363430435177,
+      "loss": 3.0785,
+      "step": 4380
+    },
+    {
+      "epoch": 0.25472163393193886,
+      "grad_norm": 0.13227440416812897,
+      "learning_rate": 0.0005338762390299467,
+      "loss": 3.0776,
+      "step": 4390
+    },
+    {
+      "epoch": 0.25530186544431227,
+      "grad_norm": 0.14178766310214996,
+      "learning_rate": 0.0005335152738595634,
+      "loss": 3.0799,
+      "step": 4400
+    },
+    {
+      "epoch": 0.25588209695668573,
+      "grad_norm": 0.14833244681358337,
+      "learning_rate": 0.0005331534488614794,
+      "loss": 3.0674,
+      "step": 4410
+    },
+    {
+      "epoch": 0.25646232846905914,
+      "grad_norm": 0.13829241693019867,
+      "learning_rate": 0.0005327907653679721,
+      "loss": 3.0643,
+      "step": 4420
+    },
+    {
+      "epoch": 0.2570425599814326,
+      "grad_norm": 0.16908784210681915,
+      "learning_rate": 0.0005324272247144802,
+      "loss": 3.0649,
+      "step": 4430
+    },
+    {
+      "epoch": 0.257622791493806,
+      "grad_norm": 0.14392369985580444,
+      "learning_rate": 0.0005320628282395985,
+      "loss": 3.0761,
+      "step": 4440
+    },
+    {
+      "epoch": 0.2582030230061795,
+      "grad_norm": 0.16387993097305298,
+      "learning_rate": 0.0005316975772850729,
+      "loss": 3.0666,
+      "step": 4450
+    },
+    {
+      "epoch": 0.2587832545185529,
+      "grad_norm": 0.13506962358951569,
+      "learning_rate": 0.0005313314731957957,
+      "loss": 3.0672,
+      "step": 4460
+    },
+    {
+      "epoch": 0.25936348603092635,
+      "grad_norm": 0.1522989273071289,
+      "learning_rate": 0.0005309645173198007,
+      "loss": 3.0607,
+      "step": 4470
+    },
+    {
+      "epoch": 0.25994371754329976,
+      "grad_norm": 0.13824021816253662,
+      "learning_rate": 0.0005305967110082576,
+      "loss": 3.0627,
+      "step": 4480
+    },
+    {
+      "epoch": 0.2605239490556732,
+      "grad_norm": 0.13685718178749084,
+      "learning_rate": 0.000530228055615468,
+      "loss": 3.0612,
+      "step": 4490
+    },
+    {
+      "epoch": 0.26110418056804663,
+      "grad_norm": 0.13309134542942047,
+      "learning_rate": 0.0005298585524988594,
+      "loss": 3.0548,
+      "step": 4500
+    },
+    {
+      "epoch": 0.2616844120804201,
+      "grad_norm": 0.17121103405952454,
+      "learning_rate": 0.0005294882030189812,
+      "loss": 3.066,
+      "step": 4510
+    },
+    {
+      "epoch": 0.2622646435927935,
+      "grad_norm": 0.13467055559158325,
+      "learning_rate": 0.000529117008539499,
+      "loss": 3.0606,
+      "step": 4520
+    },
+    {
+      "epoch": 0.26284487510516696,
+      "grad_norm": 0.12970523536205292,
+      "learning_rate": 0.0005287449704271896,
+      "loss": 3.0553,
+      "step": 4530
+    },
+    {
+      "epoch": 0.26342510661754037,
+      "grad_norm": 0.1509917676448822,
+      "learning_rate": 0.0005283720900519365,
+      "loss": 3.0571,
+      "step": 4540
+    },
+    {
+      "epoch": 0.26400533812991384,
+      "grad_norm": 0.1372883915901184,
+      "learning_rate": 0.0005279983687867243,
+      "loss": 3.0635,
+      "step": 4550
+    },
+    {
+      "epoch": 0.2645855696422873,
+      "grad_norm": 0.1482354998588562,
+      "learning_rate": 0.0005276238080076335,
+      "loss": 3.0619,
+      "step": 4560
+    },
+    {
+      "epoch": 0.2651658011546607,
+      "grad_norm": 0.13884900510311127,
+      "learning_rate": 0.0005272484090938365,
+      "loss": 3.069,
+      "step": 4570
+    },
+    {
+      "epoch": 0.26574603266703417,
+      "grad_norm": 0.14500798285007477,
+      "learning_rate": 0.0005268721734275914,
+      "loss": 3.0715,
+      "step": 4580
+    },
+    {
+      "epoch": 0.2663262641794076,
+      "grad_norm": 0.1357218474149704,
+      "learning_rate": 0.000526495102394237,
+      "loss": 3.0584,
+      "step": 4590
+    },
+    {
+      "epoch": 0.26690649569178104,
+      "grad_norm": 0.14025723934173584,
+      "learning_rate": 0.0005261171973821887,
+      "loss": 3.0613,
+      "step": 4600
+    },
+    {
+      "epoch": 0.26748672720415445,
+      "grad_norm": 0.15253092348575592,
+      "learning_rate": 0.0005257384597829322,
+      "loss": 3.0584,
+      "step": 4610
+    },
+    {
+      "epoch": 0.2680669587165279,
+      "grad_norm": 0.14573270082473755,
+      "learning_rate": 0.0005253588909910191,
+      "loss": 3.0634,
+      "step": 4620
+    },
+    {
+      "epoch": 0.2686471902289013,
+      "grad_norm": 0.15005233883857727,
+      "learning_rate": 0.0005249784924040614,
+      "loss": 3.0526,
+      "step": 4630
+    },
+    {
+      "epoch": 0.2692274217412748,
+      "grad_norm": 0.15314225852489471,
+      "learning_rate": 0.0005245972654227265,
+      "loss": 3.0635,
+      "step": 4640
+    },
+    {
+      "epoch": 0.2698076532536482,
+      "grad_norm": 0.14412705600261688,
+      "learning_rate": 0.0005242152114507321,
+      "loss": 3.055,
+      "step": 4650
+    },
+    {
+      "epoch": 0.27038788476602166,
+      "grad_norm": 0.15046367049217224,
+      "learning_rate": 0.0005238323318948412,
+      "loss": 3.066,
+      "step": 4660
+    },
+    {
+      "epoch": 0.27096811627839507,
+      "grad_norm": 0.12618590891361237,
+      "learning_rate": 0.0005234486281648559,
+      "loss": 3.0433,
+      "step": 4670
+    },
+    {
+      "epoch": 0.27154834779076853,
+      "grad_norm": 0.14097653329372406,
+      "learning_rate": 0.000523064101673614,
+      "loss": 3.0593,
+      "step": 4680
+    },
+    {
+      "epoch": 0.27212857930314194,
+      "grad_norm": 0.14015048742294312,
+      "learning_rate": 0.0005226787538369821,
+      "loss": 3.057,
+      "step": 4690
+    },
+    {
+      "epoch": 0.2727088108155154,
+      "grad_norm": 0.1534152328968048,
+      "learning_rate": 0.0005222925860738513,
+      "loss": 3.06,
+      "step": 4700
+    },
+    {
+      "epoch": 0.2732890423278888,
+      "grad_norm": 0.1350966989994049,
+      "learning_rate": 0.0005219055998061319,
+      "loss": 3.0518,
+      "step": 4710
+    },
+    {
+      "epoch": 0.2738692738402623,
+      "grad_norm": 0.15589705109596252,
+      "learning_rate": 0.0005215177964587478,
+      "loss": 3.0468,
+      "step": 4720
+    },
+    {
+      "epoch": 0.2744495053526357,
+      "grad_norm": 0.14144299924373627,
+      "learning_rate": 0.0005211291774596316,
+      "loss": 3.0555,
+      "step": 4730
+    },
+    {
+      "epoch": 0.27502973686500914,
+      "grad_norm": 0.14553704857826233,
+      "learning_rate": 0.000520739744239719,
+      "loss": 3.0531,
+      "step": 4740
+    },
+    {
+      "epoch": 0.27560996837738255,
+      "grad_norm": 0.15157508850097656,
+      "learning_rate": 0.0005203494982329441,
+      "loss": 3.0504,
+      "step": 4750
+    },
+    {
+      "epoch": 0.276190199889756,
+      "grad_norm": 0.14391539990901947,
+      "learning_rate": 0.0005199584408762335,
+      "loss": 3.0512,
+      "step": 4760
+    },
+    {
+      "epoch": 0.2767704314021294,
+      "grad_norm": 0.1297539621591568,
+      "learning_rate": 0.0005195665736095013,
+      "loss": 3.036,
+      "step": 4770
+    },
+    {
+      "epoch": 0.2773506629145029,
+      "grad_norm": 0.13723768293857574,
+      "learning_rate": 0.0005191738978756439,
+      "loss": 3.0532,
+      "step": 4780
+    },
+    {
+      "epoch": 0.2779308944268763,
+      "grad_norm": 0.1422174870967865,
+      "learning_rate": 0.0005187804151205345,
+      "loss": 3.0605,
+      "step": 4790
+    },
+    {
+      "epoch": 0.27851112593924976,
+      "grad_norm": 0.137346088886261,
+      "learning_rate": 0.0005183861267930177,
+      "loss": 3.0552,
+      "step": 4800
+    },
+    {
+      "epoch": 0.2790913574516232,
+      "grad_norm": 0.13471810519695282,
+      "learning_rate": 0.0005179910343449046,
+      "loss": 3.0426,
+      "step": 4810
+    },
+    {
+      "epoch": 0.27967158896399663,
+      "grad_norm": 0.12727439403533936,
+      "learning_rate": 0.0005175951392309669,
+      "loss": 3.0448,
+      "step": 4820
+    },
+    {
+      "epoch": 0.2802518204763701,
+      "grad_norm": 0.13242101669311523,
+      "learning_rate": 0.0005171984429089318,
+      "loss": 3.0546,
+      "step": 4830
+    },
+    {
+      "epoch": 0.2808320519887435,
+      "grad_norm": 0.14276637136936188,
+      "learning_rate": 0.0005168009468394769,
+      "loss": 3.0392,
+      "step": 4840
+    },
+    {
+      "epoch": 0.28141228350111697,
+      "grad_norm": 0.1340208798646927,
+      "learning_rate": 0.0005164026524862242,
+      "loss": 3.0491,
+      "step": 4850
+    },
+    {
+      "epoch": 0.2819925150134904,
+      "grad_norm": 0.14000356197357178,
+      "learning_rate": 0.0005160035613157354,
+      "loss": 3.0396,
+      "step": 4860
+    },
+    {
+      "epoch": 0.28257274652586384,
+      "grad_norm": 0.15974439680576324,
+      "learning_rate": 0.0005156036747975059,
+      "loss": 3.0406,
+      "step": 4870
+    },
+    {
+      "epoch": 0.28315297803823725,
+      "grad_norm": 0.1382746398448944,
+      "learning_rate": 0.0005152029944039597,
+      "loss": 3.0449,
+      "step": 4880
+    },
+    {
+      "epoch": 0.2837332095506107,
+      "grad_norm": 0.14049001038074493,
+      "learning_rate": 0.000514801521610444,
+      "loss": 3.0463,
+      "step": 4890
+    },
+    {
+      "epoch": 0.2843134410629841,
+      "grad_norm": 0.13699445128440857,
+      "learning_rate": 0.0005143992578952238,
+      "loss": 3.0393,
+      "step": 4900
+    },
+    {
+      "epoch": 0.2848936725753576,
+      "grad_norm": 0.1515870988368988,
+      "learning_rate": 0.0005139962047394761,
+      "loss": 3.0399,
+      "step": 4910
+    },
+    {
+      "epoch": 0.285473904087731,
+      "grad_norm": 0.1437605917453766,
+      "learning_rate": 0.0005135923636272849,
+      "loss": 3.0378,
+      "step": 4920
+    },
+    {
+      "epoch": 0.28605413560010445,
+      "grad_norm": 0.13769088685512543,
+      "learning_rate": 0.0005131877360456355,
+      "loss": 3.0377,
+      "step": 4930
+    },
+    {
+      "epoch": 0.28663436711247786,
+      "grad_norm": 0.15194256603717804,
+      "learning_rate": 0.000512782323484409,
+      "loss": 3.0399,
+      "step": 4940
+    },
+    {
+      "epoch": 0.2872145986248513,
+      "grad_norm": 0.14672812819480896,
+      "learning_rate": 0.0005123761274363769,
+      "loss": 3.04,
+      "step": 4950
+    },
+    {
+      "epoch": 0.28779483013722473,
+      "grad_norm": 0.13162557780742645,
+      "learning_rate": 0.0005119691493971957,
+      "loss": 3.0317,
+      "step": 4960
+    },
+    {
+      "epoch": 0.2883750616495982,
+      "grad_norm": 0.13286751508712769,
+      "learning_rate": 0.0005115613908654011,
+      "loss": 3.0486,
+      "step": 4970
+    },
+    {
+      "epoch": 0.2889552931619716,
+      "grad_norm": 0.13034851849079132,
+      "learning_rate": 0.0005111528533424027,
+      "loss": 3.0399,
+      "step": 4980
+    },
+    {
+      "epoch": 0.28953552467434507,
+      "grad_norm": 0.1405908614397049,
+      "learning_rate": 0.0005107435383324786,
+      "loss": 3.0372,
+      "step": 4990
+    },
+    {
+      "epoch": 0.2901157561867185,
+      "grad_norm": 0.16415055096149445,
+      "learning_rate": 0.0005103334473427695,
+      "loss": 3.0333,
+      "step": 5000
+    },
+    {
+      "epoch": 0.2901157561867185,
+      "eval_loss": 2.9981322288513184,
+      "eval_runtime": 3.2581,
+      "eval_samples_per_second": 1329.001,
+      "eval_steps_per_second": 2.762,
+      "step": 5000
+    },
+    {
+      "epoch": 0.29069598769909194,
+      "grad_norm": 0.12301915884017944,
+      "learning_rate": 0.0005099225818832731,
+      "loss": 3.0312,
+      "step": 5010
+    },
+    {
+      "epoch": 0.29127621921146535,
+      "grad_norm": 0.16767041385173798,
+      "learning_rate": 0.0005095109434668395,
+      "loss": 3.0247,
+      "step": 5020
+    },
+    {
+      "epoch": 0.2918564507238388,
+      "grad_norm": 0.13234609365463257,
+      "learning_rate": 0.0005090985336091642,
+      "loss": 3.0348,
+      "step": 5030
+    },
+    {
+      "epoch": 0.2924366822362123,
+      "grad_norm": 0.14020933210849762,
+      "learning_rate": 0.0005086853538287835,
+      "loss": 3.0317,
+      "step": 5040
+    },
+    {
+      "epoch": 0.2930169137485857,
+      "grad_norm": 0.14580604434013367,
+      "learning_rate": 0.0005082714056470687,
+      "loss": 3.0321,
+      "step": 5050
+    },
+    {
+      "epoch": 0.29359714526095915,
+      "grad_norm": 0.13627541065216064,
+      "learning_rate": 0.0005078566905882205,
+      "loss": 3.0318,
+      "step": 5060
+    },
+    {
+      "epoch": 0.29417737677333256,
+      "grad_norm": 0.12629657983779907,
+      "learning_rate": 0.0005074412101792631,
+      "loss": 3.0284,
+      "step": 5070
+    },
+    {
+      "epoch": 0.294757608285706,
+      "grad_norm": 0.13409367203712463,
+      "learning_rate": 0.0005070249659500387,
+      "loss": 3.0381,
+      "step": 5080
+    },
+    {
+      "epoch": 0.2953378397980794,
+      "grad_norm": 0.1341470181941986,
+      "learning_rate": 0.0005066079594332023,
+      "loss": 3.0229,
+      "step": 5090
+    },
+    {
+      "epoch": 0.2959180713104529,
+      "grad_norm": 0.1630919873714447,
+      "learning_rate": 0.0005061901921642156,
+      "loss": 3.0315,
+      "step": 5100
+    },
+    {
+      "epoch": 0.2964983028228263,
+      "grad_norm": 0.12825888395309448,
+      "learning_rate": 0.0005057716656813416,
+      "loss": 3.0249,
+      "step": 5110
+    },
+    {
+      "epoch": 0.29707853433519976,
+      "grad_norm": 0.1613105833530426,
+      "learning_rate": 0.0005053523815256384,
+      "loss": 3.0238,
+      "step": 5120
+    },
+    {
+      "epoch": 0.29765876584757317,
+      "grad_norm": 0.14038483798503876,
+      "learning_rate": 0.0005049323412409542,
+      "loss": 3.0294,
+      "step": 5130
+    },
+    {
+      "epoch": 0.29823899735994663,
+      "grad_norm": 0.16509568691253662,
+      "learning_rate": 0.0005045115463739215,
+      "loss": 3.0356,
+      "step": 5140
+    },
+    {
+      "epoch": 0.29881922887232004,
+      "grad_norm": 0.14289237558841705,
+      "learning_rate": 0.0005040899984739509,
+      "loss": 3.0228,
+      "step": 5150
+    },
+    {
+      "epoch": 0.2993994603846935,
+      "grad_norm": 0.14584140479564667,
+      "learning_rate": 0.000503667699093226,
+      "loss": 3.0294,
+      "step": 5160
+    },
+    {
+      "epoch": 0.2999796918970669,
+      "grad_norm": 0.12970221042633057,
+      "learning_rate": 0.0005032446497866973,
+      "loss": 3.0321,
+      "step": 5170
+    },
+    {
+      "epoch": 0.3005599234094404,
+      "grad_norm": 0.13744401931762695,
+      "learning_rate": 0.0005028208521120769,
+      "loss": 3.0236,
+      "step": 5180
+    },
+    {
+      "epoch": 0.3011401549218138,
+      "grad_norm": 0.1317235380411148,
+      "learning_rate": 0.0005023963076298321,
+      "loss": 3.0254,
+      "step": 5190
+    },
+    {
+      "epoch": 0.30172038643418725,
+      "grad_norm": 0.14213494956493378,
+      "learning_rate": 0.0005019710179031801,
+      "loss": 3.0275,
+      "step": 5200
+    },
+    {
+      "epoch": 0.30230061794656066,
+      "grad_norm": 0.13712069392204285,
+      "learning_rate": 0.0005015449844980823,
+      "loss": 3.0249,
+      "step": 5210
+    },
+    {
+      "epoch": 0.3028808494589341,
+      "grad_norm": 0.14411009848117828,
+      "learning_rate": 0.0005011182089832381,
+      "loss": 3.0215,
+      "step": 5220
+    },
+    {
+      "epoch": 0.30346108097130753,
+      "grad_norm": 0.12583871185779572,
+      "learning_rate": 0.0005006906929300799,
+      "loss": 3.0275,
+      "step": 5230
+    },
+    {
+      "epoch": 0.304041312483681,
+      "grad_norm": 0.14499635994434357,
+      "learning_rate": 0.0005002624379127666,
+      "loss": 3.0258,
+      "step": 5240
+    },
+    {
+      "epoch": 0.3046215439960544,
+      "grad_norm": 0.14918765425682068,
+      "learning_rate": 0.0004998334455081779,
+      "loss": 3.0209,
+      "step": 5250
+    },
+    {
+      "epoch": 0.30520177550842786,
+      "grad_norm": 0.13245496153831482,
+      "learning_rate": 0.0004994037172959089,
+      "loss": 3.0212,
+      "step": 5260
+    },
+    {
+      "epoch": 0.3057820070208013,
+      "grad_norm": 0.12850724160671234,
+      "learning_rate": 0.0004989732548582638,
+      "loss": 3.0258,
+      "step": 5270
+    },
+    {
+      "epoch": 0.30636223853317474,
+      "grad_norm": 0.1346123367547989,
+      "learning_rate": 0.0004985420597802503,
+      "loss": 3.0138,
+      "step": 5280
+    },
+    {
+      "epoch": 0.3069424700455482,
+      "grad_norm": 0.14746621251106262,
+      "learning_rate": 0.0004981101336495741,
+      "loss": 3.0202,
+      "step": 5290
+    },
+    {
+      "epoch": 0.3075227015579216,
+      "grad_norm": 0.140406534075737,
+      "learning_rate": 0.0004976774780566324,
+      "loss": 3.0276,
+      "step": 5300
+    },
+    {
+      "epoch": 0.30810293307029507,
+      "grad_norm": 0.133416548371315,
+      "learning_rate": 0.0004972440945945083,
+      "loss": 3.0228,
+      "step": 5310
+    },
+    {
+      "epoch": 0.3086831645826685,
+      "grad_norm": 0.140433207154274,
+      "learning_rate": 0.0004968099848589651,
+      "loss": 3.0219,
+      "step": 5320
+    },
+    {
+      "epoch": 0.30926339609504194,
+      "grad_norm": 0.14963370561599731,
+      "learning_rate": 0.0004963751504484403,
+      "loss": 3.0119,
+      "step": 5330
+    },
+    {
+      "epoch": 0.30984362760741535,
+      "grad_norm": 0.12273452430963516,
+      "learning_rate": 0.0004959395929640401,
+      "loss": 3.0136,
+      "step": 5340
+    },
+    {
+      "epoch": 0.3104238591197888,
+      "grad_norm": 0.14232607185840607,
+      "learning_rate": 0.0004955033140095322,
+      "loss": 3.0088,
+      "step": 5350
+    },
+    {
+      "epoch": 0.3110040906321622,
+      "grad_norm": 0.15276071429252625,
+      "learning_rate": 0.0004950663151913419,
+      "loss": 3.0189,
+      "step": 5360
+    },
+    {
+      "epoch": 0.3115843221445357,
+      "grad_norm": 0.14110638201236725,
+      "learning_rate": 0.0004946285981185446,
+      "loss": 3.0273,
+      "step": 5370
+    },
+    {
+      "epoch": 0.3121645536569091,
+      "grad_norm": 0.12971307337284088,
+      "learning_rate": 0.0004941901644028601,
+      "loss": 3.0181,
+      "step": 5380
+    },
+    {
+      "epoch": 0.31274478516928256,
+      "grad_norm": 0.12775759398937225,
+      "learning_rate": 0.0004937510156586474,
+      "loss": 3.0108,
+      "step": 5390
+    },
+    {
+      "epoch": 0.31332501668165597,
+      "grad_norm": 0.15120139718055725,
+      "learning_rate": 0.0004933111535028983,
+      "loss": 3.0142,
+      "step": 5400
+    },
+    {
+      "epoch": 0.31390524819402943,
+      "grad_norm": 0.14965811371803284,
+      "learning_rate": 0.0004928705795552312,
+      "loss": 3.0137,
+      "step": 5410
+    },
+    {
+      "epoch": 0.31448547970640284,
+      "grad_norm": 0.1459018588066101,
+      "learning_rate": 0.0004924292954378856,
+      "loss": 3.0146,
+      "step": 5420
+    },
+    {
+      "epoch": 0.3150657112187763,
+      "grad_norm": 0.1286230981349945,
+      "learning_rate": 0.0004919873027757159,
+      "loss": 3.0162,
+      "step": 5430
+    },
+    {
+      "epoch": 0.3156459427311497,
+      "grad_norm": 0.13560357689857483,
+      "learning_rate": 0.0004915446031961854,
+      "loss": 3.0129,
+      "step": 5440
+    },
+    {
+      "epoch": 0.3162261742435232,
+      "grad_norm": 0.1419978141784668,
+      "learning_rate": 0.0004911011983293601,
+      "loss": 3.0115,
+      "step": 5450
+    },
+    {
+      "epoch": 0.3168064057558966,
+      "grad_norm": 0.12910611927509308,
+      "learning_rate": 0.0004906570898079032,
+      "loss": 3.0151,
+      "step": 5460
+    },
+    {
+      "epoch": 0.31738663726827004,
+      "grad_norm": 0.15491628646850586,
+      "learning_rate": 0.0004902122792670692,
+      "loss": 3.0118,
+      "step": 5470
+    },
+    {
+      "epoch": 0.31796686878064345,
+      "grad_norm": 0.12448934465646744,
+      "learning_rate": 0.0004897667683446967,
+      "loss": 3.0119,
+      "step": 5480
+    },
+    {
+      "epoch": 0.3185471002930169,
+      "grad_norm": 0.1288510411977768,
+      "learning_rate": 0.0004893205586812036,
+      "loss": 3.0078,
+      "step": 5490
+    },
+    {
+      "epoch": 0.3191273318053903,
+      "grad_norm": 0.12903016805648804,
+      "learning_rate": 0.000488873651919581,
+      "loss": 3.0085,
+      "step": 5500
+    },
+    {
+      "epoch": 0.3197075633177638,
+      "grad_norm": 0.14042973518371582,
+      "learning_rate": 0.0004884260497053859,
+      "loss": 3.0093,
+      "step": 5510
+    },
+    {
+      "epoch": 0.32028779483013725,
+      "grad_norm": 0.13995361328125,
+      "learning_rate": 0.0004879777536867369,
+      "loss": 3.0009,
+      "step": 5520
+    },
+    {
+      "epoch": 0.32086802634251066,
+      "grad_norm": 0.13979199528694153,
+      "learning_rate": 0.00048752876551430677,
+      "loss": 3.0089,
+      "step": 5530
+    },
+    {
+      "epoch": 0.3214482578548841,
+      "grad_norm": 0.130417600274086,
+      "learning_rate": 0.0004870790868413171,
+      "loss": 3.0087,
+      "step": 5540
+    },
+    {
+      "epoch": 0.32202848936725753,
+      "grad_norm": 0.13676275312900543,
+      "learning_rate": 0.00048662871932353164,
+      "loss": 3.0092,
+      "step": 5550
+    },
+    {
+      "epoch": 0.322608720879631,
+      "grad_norm": 0.12869158387184143,
+      "learning_rate": 0.00048617766461925104,
+      "loss": 3.0074,
+      "step": 5560
+    },
+    {
+      "epoch": 0.3231889523920044,
+      "grad_norm": 0.13846737146377563,
+      "learning_rate": 0.0004857259243893058,
+      "loss": 3.0079,
+      "step": 5570
+    },
+    {
+      "epoch": 0.32376918390437787,
+      "grad_norm": 0.1349971890449524,
+      "learning_rate": 0.0004852735002970509,
+      "loss": 2.9915,
+      "step": 5580
+    },
+    {
+      "epoch": 0.3243494154167513,
+      "grad_norm": 0.13398951292037964,
+      "learning_rate": 0.000484820394008359,
+      "loss": 2.9982,
+      "step": 5590
+    },
+    {
+      "epoch": 0.32492964692912474,
+      "grad_norm": 0.13627557456493378,
+      "learning_rate": 0.0004843666071916152,
+      "loss": 3.0019,
+      "step": 5600
+    },
+    {
+      "epoch": 0.32550987844149815,
+      "grad_norm": 0.13470283150672913,
+      "learning_rate": 0.00048391214151771,
+      "loss": 3.0015,
+      "step": 5610
+    },
+    {
+      "epoch": 0.3260901099538716,
+      "grad_norm": 0.14207038283348083,
+      "learning_rate": 0.0004834569986600336,
+      "loss": 3.0051,
+      "step": 5620
+    },
+    {
+      "epoch": 0.326670341466245,
+      "grad_norm": 0.13324964046478271,
+      "learning_rate": 0.00048300118029446967,
+      "loss": 2.9956,
+      "step": 5630
+    },
+    {
+      "epoch": 0.3272505729786185,
+      "grad_norm": 0.15288645029067993,
+      "learning_rate": 0.0004825446880993892,
+      "loss": 3.0087,
+      "step": 5640
+    },
+    {
+      "epoch": 0.3278308044909919,
+      "grad_norm": 0.13744772970676422,
+      "learning_rate": 0.00048208752375564424,
+      "loss": 3.0049,
+      "step": 5650
+    },
+    {
+      "epoch": 0.32841103600336535,
+      "grad_norm": 0.13114534318447113,
+      "learning_rate": 0.00048162968894656193,
+      "loss": 2.9993,
+      "step": 5660
+    },
+    {
+      "epoch": 0.32899126751573876,
+      "grad_norm": 0.1254429966211319,
+      "learning_rate": 0.00048117118535793773,
+      "loss": 2.9937,
+      "step": 5670
+    },
+    {
+      "epoch": 0.3295714990281122,
+      "grad_norm": 0.15155521035194397,
+      "learning_rate": 0.00048071201467803017,
+      "loss": 3.0017,
+      "step": 5680
+    },
+    {
+      "epoch": 0.33015173054048563,
+      "grad_norm": 0.1420249044895172,
+      "learning_rate": 0.00048025217859755365,
+      "loss": 3.017,
+      "step": 5690
+    },
+    {
+      "epoch": 0.3307319620528591,
+      "grad_norm": 0.14615775644779205,
+      "learning_rate": 0.0004797916788096728,
+      "loss": 3.0052,
+      "step": 5700
+    },
+    {
+      "epoch": 0.3313121935652325,
+      "grad_norm": 0.12851493060588837,
+      "learning_rate": 0.00047933051700999605,
+      "loss": 3.0041,
+      "step": 5710
+    },
+    {
+      "epoch": 0.33189242507760597,
+      "grad_norm": 0.13371190428733826,
+      "learning_rate": 0.00047886869489656956,
+      "loss": 2.9879,
+      "step": 5720
+    },
+    {
+      "epoch": 0.3324726565899794,
+      "grad_norm": 0.13223771750926971,
+      "learning_rate": 0.0004784062141698707,
+      "loss": 2.993,
+      "step": 5730
+    },
+    {
+      "epoch": 0.33305288810235284,
+      "grad_norm": 0.13460920751094818,
+      "learning_rate": 0.00047794307653280184,
+      "loss": 2.9928,
+      "step": 5740
+    },
+    {
+      "epoch": 0.3336331196147263,
+      "grad_norm": 0.12678171694278717,
+      "learning_rate": 0.0004774792836906844,
+      "loss": 3.0053,
+      "step": 5750
+    },
+    {
+      "epoch": 0.3342133511270997,
+      "grad_norm": 0.14595790207386017,
+      "learning_rate": 0.0004770148373512522,
+      "loss": 2.9974,
+      "step": 5760
+    },
+    {
+      "epoch": 0.3347935826394732,
+      "grad_norm": 0.1505734771490097,
+      "learning_rate": 0.00047654973922464525,
+      "loss": 3.0053,
+      "step": 5770
+    },
+    {
+      "epoch": 0.3353738141518466,
+      "grad_norm": 0.13636811077594757,
+      "learning_rate": 0.00047608399102340367,
+      "loss": 2.9984,
+      "step": 5780
+    },
+    {
+      "epoch": 0.33595404566422005,
+      "grad_norm": 0.14487333595752716,
+      "learning_rate": 0.000475617594462461,
+      "loss": 3.0013,
+      "step": 5790
+    },
+    {
+      "epoch": 0.33653427717659345,
+      "grad_norm": 0.13392585515975952,
+      "learning_rate": 0.00047515055125913825,
+      "loss": 2.9897,
+      "step": 5800
+    },
+    {
+      "epoch": 0.3371145086889669,
+      "grad_norm": 0.1241224929690361,
+      "learning_rate": 0.0004746828631331376,
+      "loss": 2.9918,
+      "step": 5810
+    },
+    {
+      "epoch": 0.3376947402013403,
+      "grad_norm": 0.1381169706583023,
+      "learning_rate": 0.00047421453180653553,
+      "loss": 2.9874,
+      "step": 5820
+    },
+    {
+      "epoch": 0.3382749717137138,
+      "grad_norm": 0.12413561344146729,
+      "learning_rate": 0.00047374555900377716,
+      "loss": 2.9928,
+      "step": 5830
+    },
+    {
+      "epoch": 0.3388552032260872,
+      "grad_norm": 0.13286706805229187,
+      "learning_rate": 0.0004732759464516694,
+      "loss": 2.9907,
+      "step": 5840
+    },
+    {
+      "epoch": 0.33943543473846066,
+      "grad_norm": 0.1558184027671814,
+      "learning_rate": 0.0004728056958793749,
+      "loss": 3.0036,
+      "step": 5850
+    },
+    {
+      "epoch": 0.34001566625083407,
+      "grad_norm": 0.13220670819282532,
+      "learning_rate": 0.0004723348090184056,
+      "loss": 2.9945,
+      "step": 5860
+    },
+    {
+      "epoch": 0.34059589776320753,
+      "grad_norm": 0.13015997409820557,
+      "learning_rate": 0.00047186328760261603,
+      "loss": 3.0005,
+      "step": 5870
+    },
+    {
+      "epoch": 0.34117612927558094,
+      "grad_norm": 0.146441251039505,
+      "learning_rate": 0.0004713911333681976,
+      "loss": 2.9984,
+      "step": 5880
+    },
+    {
+      "epoch": 0.3417563607879544,
+      "grad_norm": 0.12352869659662247,
+      "learning_rate": 0.0004709183480536718,
+      "loss": 2.9946,
+      "step": 5890
+    },
+    {
+      "epoch": 0.3423365923003278,
+      "grad_norm": 0.12516902387142181,
+      "learning_rate": 0.0004704449333998834,
+      "loss": 2.9918,
+      "step": 5900
+    },
+    {
+      "epoch": 0.3429168238127013,
+      "grad_norm": 0.14155182242393494,
+      "learning_rate": 0.00046997089114999494,
+      "loss": 2.9937,
+      "step": 5910
+    },
+    {
+      "epoch": 0.3434970553250747,
+      "grad_norm": 0.12636148929595947,
+      "learning_rate": 0.0004694962230494796,
+      "loss": 2.9869,
+      "step": 5920
+    },
+    {
+      "epoch": 0.34407728683744815,
+      "grad_norm": 0.14390048384666443,
+      "learning_rate": 0.000469020930846115,
+      "loss": 2.9759,
+      "step": 5930
+    },
+    {
+      "epoch": 0.34465751834982156,
+      "grad_norm": 0.14705798029899597,
+      "learning_rate": 0.0004685450162899768,
+      "loss": 2.9876,
+      "step": 5940
+    },
+    {
+      "epoch": 0.345237749862195,
+      "grad_norm": 0.13937653601169586,
+      "learning_rate": 0.00046806848113343234,
+      "loss": 2.9872,
+      "step": 5950
+    },
+    {
+      "epoch": 0.34581798137456843,
+      "grad_norm": 0.13351042568683624,
+      "learning_rate": 0.00046759132713113403,
+      "loss": 2.986,
+      "step": 5960
+    },
+    {
+      "epoch": 0.3463982128869419,
+      "grad_norm": 0.133000910282135,
+      "learning_rate": 0.0004671135560400127,
+      "loss": 2.9886,
+      "step": 5970
+    },
+    {
+      "epoch": 0.3469784443993153,
+      "grad_norm": 0.1261400580406189,
+      "learning_rate": 0.0004666351696192718,
+      "loss": 2.9811,
+      "step": 5980
+    },
+    {
+      "epoch": 0.34755867591168876,
+      "grad_norm": 0.13575439155101776,
+      "learning_rate": 0.00046615616963038007,
+      "loss": 2.9796,
+      "step": 5990
+    },
+    {
+      "epoch": 0.3481389074240622,
+      "grad_norm": 0.13202066719532013,
+      "learning_rate": 0.0004656765578370657,
+      "loss": 2.9958,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3481389074240622,
+      "eval_loss": 2.949599027633667,
+      "eval_runtime": 3.2655,
+      "eval_samples_per_second": 1325.986,
+      "eval_steps_per_second": 2.756,
+      "step": 6000
+    },
+    {
+      "epoch": 0.34871913893643564,
+      "grad_norm": 0.14002783596515656,
+      "learning_rate": 0.0004651963360053096,
+      "loss": 2.9811,
+      "step": 6010
+    },
+    {
+      "epoch": 0.3492993704488091,
+      "grad_norm": 0.1519598364830017,
+      "learning_rate": 0.00046471550590333874,
+      "loss": 2.9884,
+      "step": 6020
+    },
+    {
+      "epoch": 0.3498796019611825,
+      "grad_norm": 0.1435564160346985,
+      "learning_rate": 0.00046423406930162,
+      "loss": 2.9831,
+      "step": 6030
+    },
+    {
+      "epoch": 0.35045983347355597,
+      "grad_norm": 0.1241581067442894,
+      "learning_rate": 0.0004637520279728534,
+      "loss": 2.9801,
+      "step": 6040
+    },
+    {
+      "epoch": 0.3510400649859294,
+      "grad_norm": 0.124722421169281,
+      "learning_rate": 0.00046326938369196566,
+      "loss": 2.9872,
+      "step": 6050
+    },
+    {
+      "epoch": 0.35162029649830284,
+      "grad_norm": 0.12400694936513901,
+      "learning_rate": 0.0004627861382361034,
+      "loss": 2.9863,
+      "step": 6060
+    },
+    {
+      "epoch": 0.35220052801067625,
+      "grad_norm": 0.14388398826122284,
+      "learning_rate": 0.0004623022933846272,
+      "loss": 2.973,
+      "step": 6070
+    },
+    {
+      "epoch": 0.3527807595230497,
+      "grad_norm": 0.14111004769802094,
+      "learning_rate": 0.0004618178509191045,
+      "loss": 2.9902,
+      "step": 6080
+    },
+    {
+      "epoch": 0.3533609910354231,
+      "grad_norm": 0.1257510930299759,
+      "learning_rate": 0.000461332812623303,
+      "loss": 2.9877,
+      "step": 6090
+    },
+    {
+      "epoch": 0.3539412225477966,
+      "grad_norm": 0.1282566338777542,
+      "learning_rate": 0.00046084718028318466,
+      "loss": 2.9832,
+      "step": 6100
+    },
+    {
+      "epoch": 0.35452145406017,
+      "grad_norm": 0.14325213432312012,
+      "learning_rate": 0.00046036095568689864,
+      "loss": 2.9782,
+      "step": 6110
+    },
+    {
+      "epoch": 0.35510168557254346,
+      "grad_norm": 0.1563083529472351,
+      "learning_rate": 0.0004598741406247748,
+      "loss": 2.9793,
+      "step": 6120
+    },
+    {
+      "epoch": 0.35568191708491687,
+      "grad_norm": 0.1327456384897232,
+      "learning_rate": 0.0004593867368893172,
+      "loss": 2.9843,
+      "step": 6130
+    },
+    {
+      "epoch": 0.35626214859729033,
+      "grad_norm": 0.13930997252464294,
+      "learning_rate": 0.0004588987462751975,
+      "loss": 2.976,
+      "step": 6140
+    },
+    {
+      "epoch": 0.35684238010966374,
+      "grad_norm": 0.1295255720615387,
+      "learning_rate": 0.00045841017057924807,
+      "loss": 2.9801,
+      "step": 6150
+    },
+    {
+      "epoch": 0.3574226116220372,
+      "grad_norm": 0.1404607594013214,
+      "learning_rate": 0.00045792101160045613,
+      "loss": 2.9788,
+      "step": 6160
+    },
+    {
+      "epoch": 0.3580028431344106,
+      "grad_norm": 0.12297389656305313,
+      "learning_rate": 0.0004574312711399561,
+      "loss": 2.9853,
+      "step": 6170
+    },
+    {
+      "epoch": 0.3585830746467841,
+      "grad_norm": 0.15521986782550812,
+      "learning_rate": 0.0004569409510010236,
+      "loss": 2.9825,
+      "step": 6180
+    },
+    {
+      "epoch": 0.3591633061591575,
+      "grad_norm": 0.12915629148483276,
+      "learning_rate": 0.00045645005298906887,
+      "loss": 2.984,
+      "step": 6190
+    },
+    {
+      "epoch": 0.35974353767153094,
+      "grad_norm": 0.12852182984352112,
+      "learning_rate": 0.00045595857891162964,
+      "loss": 2.9703,
+      "step": 6200
+    },
+    {
+      "epoch": 0.36032376918390435,
+      "grad_norm": 0.1300152987241745,
+      "learning_rate": 0.00045546653057836517,
+      "loss": 2.971,
+      "step": 6210
+    },
+    {
+      "epoch": 0.3609040006962778,
+      "grad_norm": 0.13348935544490814,
+      "learning_rate": 0.00045497390980104885,
+      "loss": 2.9762,
+      "step": 6220
+    },
+    {
+      "epoch": 0.3614842322086513,
+      "grad_norm": 0.13476519286632538,
+      "learning_rate": 0.00045448071839356203,
+      "loss": 2.9756,
+      "step": 6230
+    },
+    {
+      "epoch": 0.3620644637210247,
+      "grad_norm": 0.13884297013282776,
+      "learning_rate": 0.000453986958171887,
+      "loss": 2.9829,
+      "step": 6240
+    },
+    {
+      "epoch": 0.36264469523339815,
+      "grad_norm": 0.12928573787212372,
+      "learning_rate": 0.00045349263095410087,
+      "loss": 2.9752,
+      "step": 6250
+    },
+    {
+      "epoch": 0.36322492674577156,
+      "grad_norm": 0.13350141048431396,
+      "learning_rate": 0.000452997738560368,
+      "loss": 2.9748,
+      "step": 6260
+    },
+    {
+      "epoch": 0.363805158258145,
+      "grad_norm": 0.13747799396514893,
+      "learning_rate": 0.00045250228281293423,
+      "loss": 2.9705,
+      "step": 6270
+    },
+    {
+      "epoch": 0.36438538977051843,
+      "grad_norm": 0.1344989687204361,
+      "learning_rate": 0.00045200626553611943,
+      "loss": 2.9801,
+      "step": 6280
+    },
+    {
+      "epoch": 0.3649656212828919,
+      "grad_norm": 0.1321888118982315,
+      "learning_rate": 0.00045150968855631104,
+      "loss": 2.9781,
+      "step": 6290
+    },
+    {
+      "epoch": 0.3655458527952653,
+      "grad_norm": 0.12561041116714478,
+      "learning_rate": 0.0004510125537019577,
+      "loss": 2.973,
+      "step": 6300
+    },
+    {
+      "epoch": 0.36612608430763877,
+      "grad_norm": 0.13948814570903778,
+      "learning_rate": 0.00045051486280356194,
+      "loss": 2.9731,
+      "step": 6310
+    },
+    {
+      "epoch": 0.3667063158200122,
+      "grad_norm": 0.12595129013061523,
+      "learning_rate": 0.0004500166176936739,
+      "loss": 2.9659,
+      "step": 6320
+    },
+    {
+      "epoch": 0.36728654733238564,
+      "grad_norm": 0.12941335141658783,
+      "learning_rate": 0.00044951782020688415,
+      "loss": 2.973,
+      "step": 6330
+    },
+    {
+      "epoch": 0.36786677884475905,
+      "grad_norm": 0.14215658605098724,
+      "learning_rate": 0.00044901847217981736,
+      "loss": 2.975,
+      "step": 6340
+    },
+    {
+      "epoch": 0.3684470103571325,
+      "grad_norm": 0.12309448421001434,
+      "learning_rate": 0.00044851857545112525,
+      "loss": 2.9749,
+      "step": 6350
+    },
+    {
+      "epoch": 0.3690272418695059,
+      "grad_norm": 0.12824192643165588,
+      "learning_rate": 0.00044801813186147986,
+      "loss": 2.9672,
+      "step": 6360
+    },
+    {
+      "epoch": 0.3696074733818794,
+      "grad_norm": 0.12063992768526077,
+      "learning_rate": 0.00044751714325356697,
+      "loss": 2.9708,
+      "step": 6370
+    },
+    {
+      "epoch": 0.3701877048942528,
+      "grad_norm": 0.12898465991020203,
+      "learning_rate": 0.0004470156114720792,
+      "loss": 2.9699,
+      "step": 6380
+    },
+    {
+      "epoch": 0.37076793640662625,
+      "grad_norm": 0.1321457326412201,
+      "learning_rate": 0.00044651353836370897,
+      "loss": 2.9661,
+      "step": 6390
+    },
+    {
+      "epoch": 0.37134816791899966,
+      "grad_norm": 0.13804246485233307,
+      "learning_rate": 0.0004460109257771422,
+      "loss": 2.9783,
+      "step": 6400
+    },
+    {
+      "epoch": 0.3719283994313731,
+      "grad_norm": 0.12447643280029297,
+      "learning_rate": 0.00044550777556305094,
+      "loss": 2.9691,
+      "step": 6410
+    },
+    {
+      "epoch": 0.37250863094374653,
+      "grad_norm": 0.1610770970582962,
+      "learning_rate": 0.00044500408957408706,
+      "loss": 2.972,
+      "step": 6420
+    },
+    {
+      "epoch": 0.37308886245612,
+      "grad_norm": 0.1278504580259323,
+      "learning_rate": 0.00044449986966487527,
+      "loss": 2.9694,
+      "step": 6430
+    },
+    {
+      "epoch": 0.3736690939684934,
+      "grad_norm": 0.13527578115463257,
+      "learning_rate": 0.0004439951176920059,
+      "loss": 2.9707,
+      "step": 6440
+    },
+    {
+      "epoch": 0.37424932548086687,
+      "grad_norm": 0.14050637185573578,
+      "learning_rate": 0.0004434898355140287,
+      "loss": 2.9712,
+      "step": 6450
+    },
+    {
+      "epoch": 0.3748295569932403,
+      "grad_norm": 0.1513315588235855,
+      "learning_rate": 0.00044298402499144554,
+      "loss": 2.9705,
+      "step": 6460
+    },
+    {
+      "epoch": 0.37540978850561374,
+      "grad_norm": 0.1299854964017868,
+      "learning_rate": 0.00044247768798670367,
+      "loss": 2.9662,
+      "step": 6470
+    },
+    {
+      "epoch": 0.3759900200179872,
+      "grad_norm": 0.1321675330400467,
+      "learning_rate": 0.00044197082636418907,
+      "loss": 2.9675,
+      "step": 6480
+    },
+    {
+      "epoch": 0.3765702515303606,
+      "grad_norm": 0.1453583687543869,
+      "learning_rate": 0.00044146344199021934,
+      "loss": 2.9639,
+      "step": 6490
+    },
+    {
+      "epoch": 0.3771504830427341,
+      "grad_norm": 0.13450521230697632,
+      "learning_rate": 0.00044095553673303685,
+      "loss": 2.9661,
+      "step": 6500
+    },
+    {
+      "epoch": 0.3777307145551075,
+      "grad_norm": 0.13579097390174866,
+      "learning_rate": 0.00044044711246280215,
+      "loss": 2.9608,
+      "step": 6510
+    },
+    {
+      "epoch": 0.37831094606748095,
+      "grad_norm": 0.1469910442829132,
+      "learning_rate": 0.00043993817105158627,
+      "loss": 2.9686,
+      "step": 6520
+    },
+    {
+      "epoch": 0.37889117757985435,
+      "grad_norm": 0.1311839371919632,
+      "learning_rate": 0.00043942871437336527,
+      "loss": 2.9636,
+      "step": 6530
+    },
+    {
+      "epoch": 0.3794714090922278,
+      "grad_norm": 0.15060357749462128,
+      "learning_rate": 0.0004389187443040116,
+      "loss": 2.9613,
+      "step": 6540
+    },
+    {
+      "epoch": 0.3800516406046012,
+      "grad_norm": 0.13408997654914856,
+      "learning_rate": 0.00043840826272128873,
+      "loss": 2.9626,
+      "step": 6550
+    },
+    {
+      "epoch": 0.3806318721169747,
+      "grad_norm": 0.1458410769701004,
+      "learning_rate": 0.0004378972715048434,
+      "loss": 2.9604,
+      "step": 6560
+    },
+    {
+      "epoch": 0.3812121036293481,
+      "grad_norm": 0.13342171907424927,
+      "learning_rate": 0.0004373857725361984,
+      "loss": 2.9602,
+      "step": 6570
+    },
+    {
+      "epoch": 0.38179233514172156,
+      "grad_norm": 0.12624911963939667,
+      "learning_rate": 0.00043687376769874686,
+      "loss": 2.9703,
+      "step": 6580
+    },
+    {
+      "epoch": 0.38237256665409497,
+      "grad_norm": 0.13120518624782562,
+      "learning_rate": 0.0004363612588777442,
+      "loss": 2.9601,
+      "step": 6590
+    },
+    {
+      "epoch": 0.38295279816646843,
+      "grad_norm": 0.1357596516609192,
+      "learning_rate": 0.00043584824796030145,
+      "loss": 2.9561,
+      "step": 6600
+    },
+    {
+      "epoch": 0.38353302967884184,
+      "grad_norm": 0.1270647495985031,
+      "learning_rate": 0.00043533473683537863,
+      "loss": 2.9522,
+      "step": 6610
+    },
+    {
+      "epoch": 0.3841132611912153,
+      "grad_norm": 0.1325126439332962,
+      "learning_rate": 0.0004348207273937776,
+      "loss": 2.9603,
+      "step": 6620
+    },
+    {
+      "epoch": 0.3846934927035887,
+      "grad_norm": 0.13015331327915192,
+      "learning_rate": 0.0004343062215281347,
+      "loss": 2.955,
+      "step": 6630
+    },
+    {
+      "epoch": 0.3852737242159622,
+      "grad_norm": 0.12867479026317596,
+      "learning_rate": 0.00043379122113291465,
+      "loss": 2.9692,
+      "step": 6640
+    },
+    {
+      "epoch": 0.3858539557283356,
+      "grad_norm": 0.14423881471157074,
+      "learning_rate": 0.00043327572810440283,
+      "loss": 2.9539,
+      "step": 6650
+    },
+    {
+      "epoch": 0.38643418724070905,
+      "grad_norm": 0.13097575306892395,
+      "learning_rate": 0.00043275974434069846,
+      "loss": 2.9576,
+      "step": 6660
+    },
+    {
+      "epoch": 0.38701441875308246,
+      "grad_norm": 0.129910409450531,
+      "learning_rate": 0.0004322432717417079,
+      "loss": 2.9617,
+      "step": 6670
+    },
+    {
+      "epoch": 0.3875946502654559,
+      "grad_norm": 0.13308489322662354,
+      "learning_rate": 0.00043172631220913735,
+      "loss": 2.9514,
+      "step": 6680
+    },
+    {
+      "epoch": 0.38817488177782933,
+      "grad_norm": 0.12263292074203491,
+      "learning_rate": 0.00043120886764648605,
+      "loss": 2.9557,
+      "step": 6690
+    },
+    {
+      "epoch": 0.3887551132902028,
+      "grad_norm": 0.1288110911846161,
+      "learning_rate": 0.0004306909399590389,
+      "loss": 2.9558,
+      "step": 6700
+    },
+    {
+      "epoch": 0.38933534480257626,
+      "grad_norm": 0.12322728335857391,
+      "learning_rate": 0.00043017253105386005,
+      "loss": 2.9551,
+      "step": 6710
+    },
+    {
+      "epoch": 0.38991557631494966,
+      "grad_norm": 0.1551227867603302,
+      "learning_rate": 0.0004296536428397853,
+      "loss": 2.9583,
+      "step": 6720
+    },
+    {
+      "epoch": 0.3904958078273231,
+      "grad_norm": 0.12883497774600983,
+      "learning_rate": 0.00042913427722741546,
+      "loss": 2.9495,
+      "step": 6730
+    },
+    {
+      "epoch": 0.39107603933969654,
+      "grad_norm": 0.12460558116436005,
+      "learning_rate": 0.00042861443612910913,
+      "loss": 2.9597,
+      "step": 6740
+    },
+    {
+      "epoch": 0.39165627085207,
+      "grad_norm": 0.122388556599617,
+      "learning_rate": 0.00042809412145897576,
+      "loss": 2.9557,
+      "step": 6750
+    },
+    {
+      "epoch": 0.3922365023644434,
+      "grad_norm": 0.12150498479604721,
+      "learning_rate": 0.00042757333513286834,
+      "loss": 2.9489,
+      "step": 6760
+    },
+    {
+      "epoch": 0.39281673387681687,
+      "grad_norm": 0.15273340046405792,
+      "learning_rate": 0.00042705207906837666,
+      "loss": 2.9503,
+      "step": 6770
+    },
+    {
+      "epoch": 0.3933969653891903,
+      "grad_norm": 0.13954737782478333,
+      "learning_rate": 0.00042653035518482025,
+      "loss": 2.9481,
+      "step": 6780
+    },
+    {
+      "epoch": 0.39397719690156374,
+      "grad_norm": 0.15386004745960236,
+      "learning_rate": 0.0004260081654032411,
+      "loss": 2.9596,
+      "step": 6790
+    },
+    {
+      "epoch": 0.39455742841393715,
+      "grad_norm": 0.1319696307182312,
+      "learning_rate": 0.0004254855116463966,
+      "loss": 2.9526,
+      "step": 6800
+    },
+    {
+      "epoch": 0.3951376599263106,
+      "grad_norm": 0.14486876130104065,
+      "learning_rate": 0.00042496239583875286,
+      "loss": 2.9501,
+      "step": 6810
+    },
+    {
+      "epoch": 0.395717891438684,
+      "grad_norm": 0.12461838871240616,
+      "learning_rate": 0.0004244388199064768,
+      "loss": 2.9519,
+      "step": 6820
+    },
+    {
+      "epoch": 0.3962981229510575,
+      "grad_norm": 0.14132647216320038,
+      "learning_rate": 0.00042391478577743006,
+      "loss": 2.9533,
+      "step": 6830
+    },
+    {
+      "epoch": 0.3968783544634309,
+      "grad_norm": 0.12907026708126068,
+      "learning_rate": 0.00042339029538116104,
+      "loss": 2.9451,
+      "step": 6840
+    },
+    {
+      "epoch": 0.39745858597580436,
+      "grad_norm": 0.13801275193691254,
+      "learning_rate": 0.0004228653506488984,
+      "loss": 2.9382,
+      "step": 6850
+    },
+    {
+      "epoch": 0.39803881748817777,
+      "grad_norm": 0.11962810158729553,
+      "learning_rate": 0.00042233995351354366,
+      "loss": 2.9501,
+      "step": 6860
+    },
+    {
+      "epoch": 0.39861904900055123,
+      "grad_norm": 0.12804014980793,
+      "learning_rate": 0.00042181410590966413,
+      "loss": 2.9556,
+      "step": 6870
+    },
+    {
+      "epoch": 0.39919928051292464,
+      "grad_norm": 0.1232592836022377,
+      "learning_rate": 0.0004212878097734857,
+      "loss": 2.9493,
+      "step": 6880
+    },
+    {
+      "epoch": 0.3997795120252981,
+      "grad_norm": 0.12467402964830399,
+      "learning_rate": 0.0004207610670428859,
+      "loss": 2.9518,
+      "step": 6890
+    },
+    {
+      "epoch": 0.4003597435376715,
+      "grad_norm": 0.13029509782791138,
+      "learning_rate": 0.0004202338796573866,
+      "loss": 2.9476,
+      "step": 6900
+    },
+    {
+      "epoch": 0.40093997505004497,
+      "grad_norm": 0.13504283130168915,
+      "learning_rate": 0.0004197062495581471,
+      "loss": 2.9457,
+      "step": 6910
+    },
+    {
+      "epoch": 0.4015202065624184,
+      "grad_norm": 0.12205976992845535,
+      "learning_rate": 0.00041917817868795666,
+      "loss": 2.9418,
+      "step": 6920
+    },
+    {
+      "epoch": 0.40210043807479184,
+      "grad_norm": 0.14173905551433563,
+      "learning_rate": 0.0004186496689912275,
+      "loss": 2.9401,
+      "step": 6930
+    },
+    {
+      "epoch": 0.40268066958716525,
+      "grad_norm": 0.131003275513649,
+      "learning_rate": 0.00041812072241398764,
+      "loss": 2.9416,
+      "step": 6940
+    },
+    {
+      "epoch": 0.4032609010995387,
+      "grad_norm": 0.1430942267179489,
+      "learning_rate": 0.00041759134090387396,
+      "loss": 2.9526,
+      "step": 6950
+    },
+    {
+      "epoch": 0.4038411326119122,
+      "grad_norm": 0.11908053606748581,
+      "learning_rate": 0.00041706152641012435,
+      "loss": 2.9457,
+      "step": 6960
+    },
+    {
+      "epoch": 0.4044213641242856,
+      "grad_norm": 0.12189971655607224,
+      "learning_rate": 0.0004165312808835716,
+      "loss": 2.9497,
+      "step": 6970
+    },
+    {
+      "epoch": 0.40500159563665905,
+      "grad_norm": 0.1238475888967514,
+      "learning_rate": 0.00041600060627663515,
+      "loss": 2.9426,
+      "step": 6980
+    },
+    {
+      "epoch": 0.40558182714903246,
+      "grad_norm": 0.13269031047821045,
+      "learning_rate": 0.00041546950454331437,
+      "loss": 2.9441,
+      "step": 6990
+    },
+    {
+      "epoch": 0.4061620586614059,
+      "grad_norm": 0.14216388761997223,
+      "learning_rate": 0.0004149379776391817,
+      "loss": 2.9443,
+      "step": 7000
+    },
+    {
+      "epoch": 0.4061620586614059,
+      "eval_loss": 2.910210609436035,
+      "eval_runtime": 3.2597,
+      "eval_samples_per_second": 1328.339,
+      "eval_steps_per_second": 2.761,
+      "step": 7000
+    },
+    {
+      "epoch": 0.40674229017377933,
+      "grad_norm": 0.13298869132995605,
+      "learning_rate": 0.0004144060275213747,
+      "loss": 2.946,
+      "step": 7010
+    },
+    {
+      "epoch": 0.4073225216861528,
+      "grad_norm": 0.14648084342479706,
+      "learning_rate": 0.00041387365614858955,
+      "loss": 2.9468,
+      "step": 7020
+    },
+    {
+      "epoch": 0.4079027531985262,
+      "grad_norm": 0.13918638229370117,
+      "learning_rate": 0.00041334086548107336,
+      "loss": 2.9561,
+      "step": 7030
+    },
+    {
+      "epoch": 0.40848298471089967,
+      "grad_norm": 0.1421622335910797,
+      "learning_rate": 0.00041280765748061727,
+      "loss": 2.9437,
+      "step": 7040
+    },
+    {
+      "epoch": 0.4090632162232731,
+      "grad_norm": 0.1364564597606659,
+      "learning_rate": 0.0004122740341105488,
+      "loss": 2.9354,
+      "step": 7050
+    },
+    {
+      "epoch": 0.40964344773564654,
+      "grad_norm": 0.1310495287179947,
+      "learning_rate": 0.00041173999733572523,
+      "loss": 2.9471,
+      "step": 7060
+    },
+    {
+      "epoch": 0.41022367924801995,
+      "grad_norm": 0.14024296402931213,
+      "learning_rate": 0.000411205549122526,
+      "loss": 2.9372,
+      "step": 7070
+    },
+    {
+      "epoch": 0.4108039107603934,
+      "grad_norm": 0.1430574357509613,
+      "learning_rate": 0.0004106706914388452,
+      "loss": 2.9468,
+      "step": 7080
+    },
+    {
+      "epoch": 0.4113841422727668,
+      "grad_norm": 0.12103896588087082,
+      "learning_rate": 0.00041013542625408504,
+      "loss": 2.9463,
+      "step": 7090
+    },
+    {
+      "epoch": 0.4119643737851403,
+      "grad_norm": 0.12720054388046265,
+      "learning_rate": 0.00040959975553914787,
+      "loss": 2.9427,
+      "step": 7100
+    },
+    {
+      "epoch": 0.4125446052975137,
+      "grad_norm": 0.14135150611400604,
+      "learning_rate": 0.0004090636812664295,
+      "loss": 2.9407,
+      "step": 7110
+    },
+    {
+      "epoch": 0.41312483680988715,
+      "grad_norm": 0.14666588604450226,
+      "learning_rate": 0.0004085272054098115,
+      "loss": 2.9435,
+      "step": 7120
+    },
+    {
+      "epoch": 0.41370506832226056,
+      "grad_norm": 0.13804596662521362,
+      "learning_rate": 0.0004079903299446541,
+      "loss": 2.9365,
+      "step": 7130
+    },
+    {
+      "epoch": 0.414285299834634,
+      "grad_norm": 0.1470736414194107,
+      "learning_rate": 0.00040745305684778907,
+      "loss": 2.9278,
+      "step": 7140
+    },
+    {
+      "epoch": 0.41486553134700743,
+      "grad_norm": 0.12926244735717773,
+      "learning_rate": 0.00040691538809751234,
+      "loss": 2.9354,
+      "step": 7150
+    },
+    {
+      "epoch": 0.4154457628593809,
+      "grad_norm": 0.1294509321451187,
+      "learning_rate": 0.00040637732567357635,
+      "loss": 2.9466,
+      "step": 7160
+    },
+    {
+      "epoch": 0.4160259943717543,
+      "grad_norm": 0.12196213006973267,
+      "learning_rate": 0.0004058388715571835,
+      "loss": 2.9322,
+      "step": 7170
+    },
+    {
+      "epoch": 0.41660622588412777,
+      "grad_norm": 0.15902066230773926,
+      "learning_rate": 0.00040530002773097825,
+      "loss": 2.9448,
+      "step": 7180
+    },
+    {
+      "epoch": 0.41718645739650123,
+      "grad_norm": 0.11859998106956482,
+      "learning_rate": 0.0004047607961790399,
+      "loss": 2.9428,
+      "step": 7190
+    },
+    {
+      "epoch": 0.41776668890887464,
+      "grad_norm": 0.13470393419265747,
+      "learning_rate": 0.00040422117888687555,
+      "loss": 2.942,
+      "step": 7200
+    },
+    {
+      "epoch": 0.4183469204212481,
+      "grad_norm": 0.1288190484046936,
+      "learning_rate": 0.0004036811778414125,
+      "loss": 2.9362,
+      "step": 7210
+    },
+    {
+      "epoch": 0.4189271519336215,
+      "grad_norm": 0.12759481370449066,
+      "learning_rate": 0.0004031407950309915,
+      "loss": 2.9447,
+      "step": 7220
+    },
+    {
+      "epoch": 0.419507383445995,
+      "grad_norm": 0.13468439877033234,
+      "learning_rate": 0.0004026000324453584,
+      "loss": 2.9313,
+      "step": 7230
+    },
+    {
+      "epoch": 0.4200876149583684,
+      "grad_norm": 0.12287794053554535,
+      "learning_rate": 0.0004020588920756577,
+      "loss": 2.9369,
+      "step": 7240
+    },
+    {
+      "epoch": 0.42066784647074185,
+      "grad_norm": 0.12006892263889313,
+      "learning_rate": 0.00040151737591442497,
+      "loss": 2.9329,
+      "step": 7250
+    },
+    {
+      "epoch": 0.42124807798311525,
+      "grad_norm": 0.13062633574008942,
+      "learning_rate": 0.00040097548595557935,
+      "loss": 2.9474,
+      "step": 7260
+    },
+    {
+      "epoch": 0.4218283094954887,
+      "grad_norm": 0.12141095846891403,
+      "learning_rate": 0.00040043322419441667,
+      "loss": 2.9386,
+      "step": 7270
+    },
+    {
+      "epoch": 0.4224085410078621,
+      "grad_norm": 0.13452979922294617,
+      "learning_rate": 0.0003998905926276014,
+      "loss": 2.9203,
+      "step": 7280
+    },
+    {
+      "epoch": 0.4229887725202356,
+      "grad_norm": 0.13672851026058197,
+      "learning_rate": 0.0003993475932531598,
+      "loss": 2.9353,
+      "step": 7290
+    },
+    {
+      "epoch": 0.423569004032609,
+      "grad_norm": 0.1266540139913559,
+      "learning_rate": 0.0003988042280704724,
+      "loss": 2.929,
+      "step": 7300
+    },
+    {
+      "epoch": 0.42414923554498246,
+      "grad_norm": 0.1192171648144722,
+      "learning_rate": 0.0003982604990802668,
+      "loss": 2.9314,
+      "step": 7310
+    },
+    {
+      "epoch": 0.42472946705735587,
+      "grad_norm": 0.11528236418962479,
+      "learning_rate": 0.0003977164082846101,
+      "loss": 2.9349,
+      "step": 7320
+    },
+    {
+      "epoch": 0.42530969856972933,
+      "grad_norm": 0.12837885320186615,
+      "learning_rate": 0.00039717195768690155,
+      "loss": 2.9211,
+      "step": 7330
+    },
+    {
+      "epoch": 0.42588993008210274,
+      "grad_norm": 0.1254536211490631,
+      "learning_rate": 0.0003966271492918654,
+      "loss": 2.9311,
+      "step": 7340
+    },
+    {
+      "epoch": 0.4264701615944762,
+      "grad_norm": 0.12365511804819107,
+      "learning_rate": 0.0003960819851055432,
+      "loss": 2.9411,
+      "step": 7350
+    },
+    {
+      "epoch": 0.4270503931068496,
+      "grad_norm": 0.14178220927715302,
+      "learning_rate": 0.00039553646713528644,
+      "loss": 2.9322,
+      "step": 7360
+    },
+    {
+      "epoch": 0.4276306246192231,
+      "grad_norm": 0.13220851123332977,
+      "learning_rate": 0.0003949905973897496,
+      "loss": 2.9397,
+      "step": 7370
+    },
+    {
+      "epoch": 0.4282108561315965,
+      "grad_norm": 0.12264362722635269,
+      "learning_rate": 0.00039444437787888224,
+      "loss": 2.9355,
+      "step": 7380
+    },
+    {
+      "epoch": 0.42879108764396995,
+      "grad_norm": 0.12907512485980988,
+      "learning_rate": 0.00039389781061392184,
+      "loss": 2.9259,
+      "step": 7390
+    },
+    {
+      "epoch": 0.42937131915634336,
+      "grad_norm": 0.1319524645805359,
+      "learning_rate": 0.00039335089760738625,
+      "loss": 2.9284,
+      "step": 7400
+    },
+    {
+      "epoch": 0.4299515506687168,
+      "grad_norm": 0.1404864490032196,
+      "learning_rate": 0.0003928036408730664,
+      "loss": 2.932,
+      "step": 7410
+    },
+    {
+      "epoch": 0.43053178218109023,
+      "grad_norm": 0.12499509751796722,
+      "learning_rate": 0.00039225604242601914,
+      "loss": 2.9313,
+      "step": 7420
+    },
+    {
+      "epoch": 0.4311120136934637,
+      "grad_norm": 0.13161097466945648,
+      "learning_rate": 0.0003917081042825591,
+      "loss": 2.9261,
+      "step": 7430
+    },
+    {
+      "epoch": 0.43169224520583716,
+      "grad_norm": 0.13262121379375458,
+      "learning_rate": 0.000391159828460252,
+      "loss": 2.9302,
+      "step": 7440
+    },
+    {
+      "epoch": 0.43227247671821056,
+      "grad_norm": 0.13169781863689423,
+      "learning_rate": 0.0003906112169779069,
+      "loss": 2.9247,
+      "step": 7450
+    },
+    {
+      "epoch": 0.432852708230584,
+      "grad_norm": 0.1297696828842163,
+      "learning_rate": 0.00039006227185556865,
+      "loss": 2.9422,
+      "step": 7460
+    },
+    {
+      "epoch": 0.43343293974295743,
+      "grad_norm": 0.1292199194431305,
+      "learning_rate": 0.00038951299511451077,
+      "loss": 2.9232,
+      "step": 7470
+    },
+    {
+      "epoch": 0.4340131712553309,
+      "grad_norm": 0.13055439293384552,
+      "learning_rate": 0.0003889633887772278,
+      "loss": 2.9246,
+      "step": 7480
+    },
+    {
+      "epoch": 0.4345934027677043,
+      "grad_norm": 0.1166820153594017,
+      "learning_rate": 0.0003884134548674278,
+      "loss": 2.9361,
+      "step": 7490
+    },
+    {
+      "epoch": 0.43517363428007777,
+      "grad_norm": 0.12382174283266068,
+      "learning_rate": 0.00038786319541002487,
+      "loss": 2.9221,
+      "step": 7500
+    },
+    {
+      "epoch": 0.4357538657924512,
+      "grad_norm": 0.12510880827903748,
+      "learning_rate": 0.0003873126124311323,
+      "loss": 2.9289,
+      "step": 7510
+    },
+    {
+      "epoch": 0.43633409730482464,
+      "grad_norm": 0.13196755945682526,
+      "learning_rate": 0.000386761707958054,
+      "loss": 2.9203,
+      "step": 7520
+    },
+    {
+      "epoch": 0.43691432881719805,
+      "grad_norm": 0.13719266653060913,
+      "learning_rate": 0.00038621048401927817,
+      "loss": 2.9319,
+      "step": 7530
+    },
+    {
+      "epoch": 0.4374945603295715,
+      "grad_norm": 0.13211804628372192,
+      "learning_rate": 0.000385658942644469,
+      "loss": 2.9326,
+      "step": 7540
+    },
+    {
+      "epoch": 0.4380747918419449,
+      "grad_norm": 0.12999597191810608,
+      "learning_rate": 0.0003851070858644596,
+      "loss": 2.9239,
+      "step": 7550
+    },
+    {
+      "epoch": 0.4386550233543184,
+      "grad_norm": 0.13165125250816345,
+      "learning_rate": 0.0003845549157112445,
+      "loss": 2.9312,
+      "step": 7560
+    },
+    {
+      "epoch": 0.4392352548666918,
+      "grad_norm": 0.13743376731872559,
+      "learning_rate": 0.00038400243421797206,
+      "loss": 2.9254,
+      "step": 7570
+    },
+    {
+      "epoch": 0.43981548637906526,
+      "grad_norm": 0.12621231377124786,
+      "learning_rate": 0.00038344964341893684,
+      "loss": 2.9203,
+      "step": 7580
+    },
+    {
+      "epoch": 0.44039571789143866,
+      "grad_norm": 0.12167075276374817,
+      "learning_rate": 0.00038289654534957266,
+      "loss": 2.9281,
+      "step": 7590
+    },
+    {
+      "epoch": 0.44097594940381213,
+      "grad_norm": 0.13523493707180023,
+      "learning_rate": 0.0003823431420464444,
+      "loss": 2.916,
+      "step": 7600
+    },
+    {
+      "epoch": 0.44155618091618554,
+      "grad_norm": 0.11718156933784485,
+      "learning_rate": 0.0003817894355472413,
+      "loss": 2.9145,
+      "step": 7610
+    },
+    {
+      "epoch": 0.442136412428559,
+      "grad_norm": 0.13470205664634705,
+      "learning_rate": 0.0003812354278907683,
+      "loss": 2.9173,
+      "step": 7620
+    },
+    {
+      "epoch": 0.4427166439409324,
+      "grad_norm": 0.1286102533340454,
+      "learning_rate": 0.00038068112111693984,
+      "loss": 2.9249,
+      "step": 7630
+    },
+    {
+      "epoch": 0.44329687545330587,
+      "grad_norm": 0.13669750094413757,
+      "learning_rate": 0.00038012651726677146,
+      "loss": 2.9239,
+      "step": 7640
+    },
+    {
+      "epoch": 0.4438771069656793,
+      "grad_norm": 0.14638318121433258,
+      "learning_rate": 0.0003795716183823728,
+      "loss": 2.9306,
+      "step": 7650
+    },
+    {
+      "epoch": 0.44445733847805274,
+      "grad_norm": 0.13569045066833496,
+      "learning_rate": 0.00037901642650693944,
+      "loss": 2.9168,
+      "step": 7660
+    },
+    {
+      "epoch": 0.4450375699904262,
+      "grad_norm": 0.1257532387971878,
+      "learning_rate": 0.00037846094368474613,
+      "loss": 2.9242,
+      "step": 7670
+    },
+    {
+      "epoch": 0.4456178015027996,
+      "grad_norm": 0.11852803826332092,
+      "learning_rate": 0.0003779051719611389,
+      "loss": 2.9209,
+      "step": 7680
+    },
+    {
+      "epoch": 0.4461980330151731,
+      "grad_norm": 0.12594154477119446,
+      "learning_rate": 0.0003773491133825273,
+      "loss": 2.929,
+      "step": 7690
+    },
+    {
+      "epoch": 0.4467782645275465,
+      "grad_norm": 0.12566526234149933,
+      "learning_rate": 0.00037679276999637746,
+      "loss": 2.9119,
+      "step": 7700
+    },
+    {
+      "epoch": 0.44735849603991995,
+      "grad_norm": 0.13207079470157623,
+      "learning_rate": 0.0003762361438512038,
+      "loss": 2.917,
+      "step": 7710
+    },
+    {
+      "epoch": 0.44793872755229336,
+      "grad_norm": 0.13788865506649017,
+      "learning_rate": 0.00037567923699656226,
+      "loss": 2.92,
+      "step": 7720
+    },
+    {
+      "epoch": 0.4485189590646668,
+      "grad_norm": 0.13110986351966858,
+      "learning_rate": 0.00037512205148304204,
+      "loss": 2.9249,
+      "step": 7730
+    },
+    {
+      "epoch": 0.44909919057704023,
+      "grad_norm": 0.1643168181180954,
+      "learning_rate": 0.00037456458936225873,
+      "loss": 2.9232,
+      "step": 7740
+    },
+    {
+      "epoch": 0.4496794220894137,
+      "grad_norm": 0.14076946675777435,
+      "learning_rate": 0.00037400685268684623,
+      "loss": 2.9252,
+      "step": 7750
+    },
+    {
+      "epoch": 0.4502596536017871,
+      "grad_norm": 0.1238834485411644,
+      "learning_rate": 0.0003734488435104494,
+      "loss": 2.9093,
+      "step": 7760
+    },
+    {
+      "epoch": 0.45083988511416057,
+      "grad_norm": 0.11924099922180176,
+      "learning_rate": 0.00037289056388771643,
+      "loss": 2.9324,
+      "step": 7770
+    },
+    {
+      "epoch": 0.451420116626534,
+      "grad_norm": 0.13720078766345978,
+      "learning_rate": 0.0003723320158742914,
+      "loss": 2.9154,
+      "step": 7780
+    },
+    {
+      "epoch": 0.45200034813890744,
+      "grad_norm": 0.12532520294189453,
+      "learning_rate": 0.00037177320152680663,
+      "loss": 2.9228,
+      "step": 7790
+    },
+    {
+      "epoch": 0.45258057965128085,
+      "grad_norm": 0.129350483417511,
+      "learning_rate": 0.0003712141229028751,
+      "loss": 2.9071,
+      "step": 7800
+    },
+    {
+      "epoch": 0.4531608111636543,
+      "grad_norm": 0.12484076619148254,
+      "learning_rate": 0.0003706547820610828,
+      "loss": 2.9107,
+      "step": 7810
+    },
+    {
+      "epoch": 0.4537410426760277,
+      "grad_norm": 0.12527912855148315,
+      "learning_rate": 0.0003700951810609815,
+      "loss": 2.9166,
+      "step": 7820
+    },
+    {
+      "epoch": 0.4543212741884012,
+      "grad_norm": 0.1453130692243576,
+      "learning_rate": 0.0003695353219630803,
+      "loss": 2.9195,
+      "step": 7830
+    },
+    {
+      "epoch": 0.4549015057007746,
+      "grad_norm": 0.1291913241147995,
+      "learning_rate": 0.0003689752068288395,
+      "loss": 2.9124,
+      "step": 7840
+    },
+    {
+      "epoch": 0.45548173721314805,
+      "grad_norm": 0.12470022588968277,
+      "learning_rate": 0.0003684148377206615,
+      "loss": 2.9241,
+      "step": 7850
+    },
+    {
+      "epoch": 0.45606196872552146,
+      "grad_norm": 0.1276790350675583,
+      "learning_rate": 0.00036785421670188395,
+      "loss": 2.9178,
+      "step": 7860
+    },
+    {
+      "epoch": 0.4566422002378949,
+      "grad_norm": 0.15164950489997864,
+      "learning_rate": 0.0003672933458367724,
+      "loss": 2.9072,
+      "step": 7870
+    },
+    {
+      "epoch": 0.45722243175026833,
+      "grad_norm": 0.14891022443771362,
+      "learning_rate": 0.00036673222719051194,
+      "loss": 2.9235,
+      "step": 7880
+    },
+    {
+      "epoch": 0.4578026632626418,
+      "grad_norm": 0.1266569346189499,
+      "learning_rate": 0.0003661708628292003,
+      "loss": 2.9159,
+      "step": 7890
+    },
+    {
+      "epoch": 0.4583828947750152,
+      "grad_norm": 0.12030439078807831,
+      "learning_rate": 0.0003656092548198399,
+      "loss": 2.912,
+      "step": 7900
+    },
+    {
+      "epoch": 0.45896312628738867,
+      "grad_norm": 0.12590278685092926,
+      "learning_rate": 0.00036504740523033016,
+      "loss": 2.91,
+      "step": 7910
+    },
+    {
+      "epoch": 0.45954335779976213,
+      "grad_norm": 0.1255042403936386,
+      "learning_rate": 0.0003644853161294601,
+      "loss": 2.9127,
+      "step": 7920
+    },
+    {
+      "epoch": 0.46012358931213554,
+      "grad_norm": 0.1253713071346283,
+      "learning_rate": 0.0003639229895869009,
+      "loss": 2.9242,
+      "step": 7930
+    },
+    {
+      "epoch": 0.460703820824509,
+      "grad_norm": 0.1254982203245163,
+      "learning_rate": 0.0003633604276731975,
+      "loss": 2.9115,
+      "step": 7940
+    },
+    {
+      "epoch": 0.4612840523368824,
+      "grad_norm": 0.12157725542783737,
+      "learning_rate": 0.00036279763245976207,
+      "loss": 2.9114,
+      "step": 7950
+    },
+    {
+      "epoch": 0.4618642838492559,
+      "grad_norm": 0.12421195954084396,
+      "learning_rate": 0.00036223460601886537,
+      "loss": 2.9083,
+      "step": 7960
+    },
+    {
+      "epoch": 0.4624445153616293,
+      "grad_norm": 0.11870937049388885,
+      "learning_rate": 0.00036167135042362977,
+      "loss": 2.907,
+      "step": 7970
+    },
+    {
+      "epoch": 0.46302474687400275,
+      "grad_norm": 0.12460967898368835,
+      "learning_rate": 0.00036110786774802133,
+      "loss": 2.9088,
+      "step": 7980
+    },
+    {
+      "epoch": 0.46360497838637615,
+      "grad_norm": 0.1310334950685501,
+      "learning_rate": 0.00036054416006684245,
+      "loss": 2.9102,
+      "step": 7990
+    },
+    {
+      "epoch": 0.4641852098987496,
+      "grad_norm": 0.12560488283634186,
+      "learning_rate": 0.00035998022945572366,
+      "loss": 2.9097,
+      "step": 8000
+    },
+    {
+      "epoch": 0.4641852098987496,
+      "eval_loss": 2.875955820083618,
+      "eval_runtime": 3.2545,
+      "eval_samples_per_second": 1330.484,
+      "eval_steps_per_second": 2.765,
+      "step": 8000
+    },
+    {
+      "epoch": 0.464765441411123,
+      "grad_norm": 0.12761953473091125,
+      "learning_rate": 0.00035941607799111675,
+      "loss": 2.91,
+      "step": 8010
+    },
+    {
+      "epoch": 0.4653456729234965,
+      "grad_norm": 0.1247384324669838,
+      "learning_rate": 0.0003588517077502864,
+      "loss": 2.9149,
+      "step": 8020
+    },
+    {
+      "epoch": 0.4659259044358699,
+      "grad_norm": 0.14209751784801483,
+      "learning_rate": 0.00035828712081130296,
+      "loss": 2.9083,
+      "step": 8030
+    },
+    {
+      "epoch": 0.46650613594824336,
+      "grad_norm": 0.12985317409038544,
+      "learning_rate": 0.00035772231925303464,
+      "loss": 2.9046,
+      "step": 8040
+    },
+    {
+      "epoch": 0.46708636746061677,
+      "grad_norm": 0.14672869443893433,
+      "learning_rate": 0.00035715730515514,
+      "loss": 2.9113,
+      "step": 8050
+    },
+    {
+      "epoch": 0.46766659897299023,
+      "grad_norm": 0.13361111283302307,
+      "learning_rate": 0.0003565920805980602,
+      "loss": 2.913,
+      "step": 8060
+    },
+    {
+      "epoch": 0.46824683048536364,
+      "grad_norm": 0.12082985788583755,
+      "learning_rate": 0.0003560266476630112,
+      "loss": 2.9138,
+      "step": 8070
+    },
+    {
+      "epoch": 0.4688270619977371,
+      "grad_norm": 0.1150035560131073,
+      "learning_rate": 0.0003554610084319763,
+      "loss": 2.9048,
+      "step": 8080
+    },
+    {
+      "epoch": 0.4694072935101105,
+      "grad_norm": 0.1214471235871315,
+      "learning_rate": 0.0003548951649876984,
+      "loss": 2.9123,
+      "step": 8090
+    },
+    {
+      "epoch": 0.469987525022484,
+      "grad_norm": 0.12934035062789917,
+      "learning_rate": 0.0003543291194136723,
+      "loss": 2.9028,
+      "step": 8100
+    },
+    {
+      "epoch": 0.4705677565348574,
+      "grad_norm": 0.15276013314723969,
+      "learning_rate": 0.00035376287379413723,
+      "loss": 2.9031,
+      "step": 8110
+    },
+    {
+      "epoch": 0.47114798804723085,
+      "grad_norm": 0.1335725337266922,
+      "learning_rate": 0.00035319643021406886,
+      "loss": 2.9124,
+      "step": 8120
+    },
+    {
+      "epoch": 0.47172821955960426,
+      "grad_norm": 0.12289181351661682,
+      "learning_rate": 0.00035262979075917166,
+      "loss": 2.9053,
+      "step": 8130
+    },
+    {
+      "epoch": 0.4723084510719777,
+      "grad_norm": 0.11827896535396576,
+      "learning_rate": 0.0003520629575158715,
+      "loss": 2.9138,
+      "step": 8140
+    },
+    {
+      "epoch": 0.4728886825843512,
+      "grad_norm": 0.12505313754081726,
+      "learning_rate": 0.0003514959325713078,
+      "loss": 2.909,
+      "step": 8150
+    },
+    {
+      "epoch": 0.4734689140967246,
+      "grad_norm": 0.1321611851453781,
+      "learning_rate": 0.00035092871801332574,
+      "loss": 2.9075,
+      "step": 8160
+    },
+    {
+      "epoch": 0.47404914560909805,
+      "grad_norm": 0.12144722044467926,
+      "learning_rate": 0.00035036131593046895,
+      "loss": 2.9046,
+      "step": 8170
+    },
+    {
+      "epoch": 0.47462937712147146,
+      "grad_norm": 0.11893021315336227,
+      "learning_rate": 0.0003497937284119711,
+      "loss": 2.9021,
+      "step": 8180
+    },
+    {
+      "epoch": 0.4752096086338449,
+      "grad_norm": 0.13043691217899323,
+      "learning_rate": 0.0003492259575477491,
+      "loss": 2.9052,
+      "step": 8190
+    },
+    {
+      "epoch": 0.47578984014621833,
+      "grad_norm": 0.12443230301141739,
+      "learning_rate": 0.00034865800542839445,
+      "loss": 2.9003,
+      "step": 8200
+    },
+    {
+      "epoch": 0.4763700716585918,
+      "grad_norm": 0.1350659728050232,
+      "learning_rate": 0.0003480898741451667,
+      "loss": 2.9077,
+      "step": 8210
+    },
+    {
+      "epoch": 0.4769503031709652,
+      "grad_norm": 0.13212652504444122,
+      "learning_rate": 0.0003475215657899844,
+      "loss": 2.8955,
+      "step": 8220
+    },
+    {
+      "epoch": 0.47753053468333867,
+      "grad_norm": 0.13865076005458832,
+      "learning_rate": 0.0003469530824554188,
+      "loss": 2.9015,
+      "step": 8230
+    },
+    {
+      "epoch": 0.4781107661957121,
+      "grad_norm": 0.1313691884279251,
+      "learning_rate": 0.00034638442623468484,
+      "loss": 2.9014,
+      "step": 8240
+    },
+    {
+      "epoch": 0.47869099770808554,
+      "grad_norm": 0.13368923962116241,
+      "learning_rate": 0.00034581559922163447,
+      "loss": 2.8962,
+      "step": 8250
+    },
+    {
+      "epoch": 0.47927122922045895,
+      "grad_norm": 0.12228936702013016,
+      "learning_rate": 0.0003452466035107481,
+      "loss": 2.8997,
+      "step": 8260
+    },
+    {
+      "epoch": 0.4798514607328324,
+      "grad_norm": 0.12648892402648926,
+      "learning_rate": 0.00034467744119712787,
+      "loss": 2.9052,
+      "step": 8270
+    },
+    {
+      "epoch": 0.4804316922452058,
+      "grad_norm": 0.12937045097351074,
+      "learning_rate": 0.00034410811437648873,
+      "loss": 2.9037,
+      "step": 8280
+    },
+    {
+      "epoch": 0.4810119237575793,
+      "grad_norm": 0.12095940858125687,
+      "learning_rate": 0.00034353862514515185,
+      "loss": 2.9002,
+      "step": 8290
+    },
+    {
+      "epoch": 0.4815921552699527,
+      "grad_norm": 0.11992644518613815,
+      "learning_rate": 0.0003429689756000362,
+      "loss": 2.9051,
+      "step": 8300
+    },
+    {
+      "epoch": 0.48217238678232616,
+      "grad_norm": 0.1110587939620018,
+      "learning_rate": 0.0003423991678386511,
+      "loss": 2.9046,
+      "step": 8310
+    },
+    {
+      "epoch": 0.48275261829469956,
+      "grad_norm": 0.11831989139318466,
+      "learning_rate": 0.00034182920395908837,
+      "loss": 2.9001,
+      "step": 8320
+    },
+    {
+      "epoch": 0.48333284980707303,
+      "grad_norm": 0.11492130905389786,
+      "learning_rate": 0.0003412590860600148,
+      "loss": 2.8944,
+      "step": 8330
+    },
+    {
+      "epoch": 0.48391308131944644,
+      "grad_norm": 0.12855441868305206,
+      "learning_rate": 0.00034068881624066405,
+      "loss": 2.8941,
+      "step": 8340
+    },
+    {
+      "epoch": 0.4844933128318199,
+      "grad_norm": 0.12829254567623138,
+      "learning_rate": 0.0003401183966008296,
+      "loss": 2.8989,
+      "step": 8350
+    },
+    {
+      "epoch": 0.4850735443441933,
+      "grad_norm": 0.1167573556303978,
+      "learning_rate": 0.00033954782924085604,
+      "loss": 2.9027,
+      "step": 8360
+    },
+    {
+      "epoch": 0.48565377585656677,
+      "grad_norm": 0.12906575202941895,
+      "learning_rate": 0.0003389771162616324,
+      "loss": 2.893,
+      "step": 8370
+    },
+    {
+      "epoch": 0.4862340073689402,
+      "grad_norm": 0.12219451367855072,
+      "learning_rate": 0.00033840625976458357,
+      "loss": 2.8971,
+      "step": 8380
+    },
+    {
+      "epoch": 0.48681423888131364,
+      "grad_norm": 0.1430503875017166,
+      "learning_rate": 0.00033783526185166295,
+      "loss": 2.8945,
+      "step": 8390
+    },
+    {
+      "epoch": 0.4873944703936871,
+      "grad_norm": 0.1279267519712448,
+      "learning_rate": 0.00033726412462534454,
+      "loss": 2.8969,
+      "step": 8400
+    },
+    {
+      "epoch": 0.4879747019060605,
+      "grad_norm": 0.1239406168460846,
+      "learning_rate": 0.00033669285018861567,
+      "loss": 2.8994,
+      "step": 8410
+    },
+    {
+      "epoch": 0.488554933418434,
+      "grad_norm": 0.1379164159297943,
+      "learning_rate": 0.00033612144064496853,
+      "loss": 2.8949,
+      "step": 8420
+    },
+    {
+      "epoch": 0.4891351649308074,
+      "grad_norm": 0.12819483876228333,
+      "learning_rate": 0.00033554989809839294,
+      "loss": 2.897,
+      "step": 8430
+    },
+    {
+      "epoch": 0.48971539644318085,
+      "grad_norm": 0.12451434880495071,
+      "learning_rate": 0.00033497822465336854,
+      "loss": 2.903,
+      "step": 8440
+    },
+    {
+      "epoch": 0.49029562795555426,
+      "grad_norm": 0.1466275155544281,
+      "learning_rate": 0.0003344064224148567,
+      "loss": 2.8912,
+      "step": 8450
+    },
+    {
+      "epoch": 0.4908758594679277,
+      "grad_norm": 0.12186205387115479,
+      "learning_rate": 0.0003338344934882932,
+      "loss": 2.8998,
+      "step": 8460
+    },
+    {
+      "epoch": 0.49145609098030113,
+      "grad_norm": 0.12687867879867554,
+      "learning_rate": 0.00033326243997958014,
+      "loss": 2.8983,
+      "step": 8470
+    },
+    {
+      "epoch": 0.4920363224926746,
+      "grad_norm": 0.12620693445205688,
+      "learning_rate": 0.00033269026399507874,
+      "loss": 2.895,
+      "step": 8480
+    },
+    {
+      "epoch": 0.492616554005048,
+      "grad_norm": 0.1362224668264389,
+      "learning_rate": 0.00033211796764160074,
+      "loss": 2.9007,
+      "step": 8490
+    },
+    {
+      "epoch": 0.49319678551742147,
+      "grad_norm": 0.1300470530986786,
+      "learning_rate": 0.00033154555302640135,
+      "loss": 2.8914,
+      "step": 8500
+    },
+    {
+      "epoch": 0.4937770170297949,
+      "grad_norm": 0.12057654559612274,
+      "learning_rate": 0.00033097302225717096,
+      "loss": 2.8971,
+      "step": 8510
+    },
+    {
+      "epoch": 0.49435724854216834,
+      "grad_norm": 0.13263335824012756,
+      "learning_rate": 0.00033040037744202805,
+      "loss": 2.8971,
+      "step": 8520
+    },
+    {
+      "epoch": 0.49493748005454175,
+      "grad_norm": 0.12660051882266998,
+      "learning_rate": 0.00032982762068951073,
+      "loss": 2.8914,
+      "step": 8530
+    },
+    {
+      "epoch": 0.4955177115669152,
+      "grad_norm": 0.12398383021354675,
+      "learning_rate": 0.0003292547541085694,
+      "loss": 2.8936,
+      "step": 8540
+    },
+    {
+      "epoch": 0.4960979430792886,
+      "grad_norm": 0.1229000836610794,
+      "learning_rate": 0.00032868177980855876,
+      "loss": 2.888,
+      "step": 8550
+    },
+    {
+      "epoch": 0.4966781745916621,
+      "grad_norm": 0.11801040917634964,
+      "learning_rate": 0.0003281086998992303,
+      "loss": 2.8909,
+      "step": 8560
+    },
+    {
+      "epoch": 0.4972584061040355,
+      "grad_norm": 0.12945981323719025,
+      "learning_rate": 0.0003275355164907241,
+      "loss": 2.8878,
+      "step": 8570
+    },
+    {
+      "epoch": 0.49783863761640895,
+      "grad_norm": 0.12002068758010864,
+      "learning_rate": 0.0003269622316935618,
+      "loss": 2.892,
+      "step": 8580
+    },
+    {
+      "epoch": 0.49841886912878236,
+      "grad_norm": 0.12449994683265686,
+      "learning_rate": 0.0003263888476186377,
+      "loss": 2.8912,
+      "step": 8590
+    },
+    {
+      "epoch": 0.4989991006411558,
+      "grad_norm": 0.13638156652450562,
+      "learning_rate": 0.0003258153663772124,
+      "loss": 2.8877,
+      "step": 8600
+    },
+    {
+      "epoch": 0.49957933215352923,
+      "grad_norm": 0.12280316650867462,
+      "learning_rate": 0.0003252417900809038,
+      "loss": 2.8879,
+      "step": 8610
+    },
+    {
+      "epoch": 0.5001595636659026,
+      "grad_norm": 0.12275322526693344,
+      "learning_rate": 0.0003246681208416797,
+      "loss": 2.8906,
+      "step": 8620
+    },
+    {
+      "epoch": 0.5007397951782762,
+      "grad_norm": 0.1220172718167305,
+      "learning_rate": 0.0003240943607718506,
+      "loss": 2.8952,
+      "step": 8630
+    },
+    {
+      "epoch": 0.5013200266906496,
+      "grad_norm": 0.11458177119493484,
+      "learning_rate": 0.00032352051198406104,
+      "loss": 2.902,
+      "step": 8640
+    },
+    {
+      "epoch": 0.501900258203023,
+      "grad_norm": 0.12652765214443207,
+      "learning_rate": 0.0003229465765912824,
+      "loss": 2.9038,
+      "step": 8650
+    },
+    {
+      "epoch": 0.5024804897153965,
+      "grad_norm": 0.12456042319536209,
+      "learning_rate": 0.000322372556706805,
+      "loss": 2.8844,
+      "step": 8660
+    },
+    {
+      "epoch": 0.5030607212277699,
+      "grad_norm": 0.13799023628234863,
+      "learning_rate": 0.0003217984544442301,
+      "loss": 2.8987,
+      "step": 8670
+    },
+    {
+      "epoch": 0.5036409527401433,
+      "grad_norm": 0.12474406510591507,
+      "learning_rate": 0.00032122427191746234,
+      "loss": 2.8976,
+      "step": 8680
+    },
+    {
+      "epoch": 0.5042211842525167,
+      "grad_norm": 0.12724703550338745,
+      "learning_rate": 0.00032065001124070207,
+      "loss": 2.8862,
+      "step": 8690
+    },
+    {
+      "epoch": 0.5048014157648902,
+      "grad_norm": 0.11946358531713486,
+      "learning_rate": 0.0003200756745284371,
+      "loss": 2.8926,
+      "step": 8700
+    },
+    {
+      "epoch": 0.5053816472772636,
+      "grad_norm": 0.1258503645658493,
+      "learning_rate": 0.0003195012638954354,
+      "loss": 2.8932,
+      "step": 8710
+    },
+    {
+      "epoch": 0.505961878789637,
+      "grad_norm": 0.12079302221536636,
+      "learning_rate": 0.00031892678145673724,
+      "loss": 2.8914,
+      "step": 8720
+    },
+    {
+      "epoch": 0.5065421103020105,
+      "grad_norm": 0.12168605625629425,
+      "learning_rate": 0.000318352229327647,
+      "loss": 2.8867,
+      "step": 8730
+    },
+    {
+      "epoch": 0.507122341814384,
+      "grad_norm": 0.13427579402923584,
+      "learning_rate": 0.00031777760962372584,
+      "loss": 2.8893,
+      "step": 8740
+    },
+    {
+      "epoch": 0.5077025733267574,
+      "grad_norm": 0.1176985576748848,
+      "learning_rate": 0.00031720292446078374,
+      "loss": 2.8887,
+      "step": 8750
+    },
+    {
+      "epoch": 0.5082828048391308,
+      "grad_norm": 0.12351604551076889,
+      "learning_rate": 0.00031662817595487166,
+      "loss": 2.8915,
+      "step": 8760
+    },
+    {
+      "epoch": 0.5088630363515042,
+      "grad_norm": 0.1390778124332428,
+      "learning_rate": 0.00031605336622227365,
+      "loss": 2.8737,
+      "step": 8770
+    },
+    {
+      "epoch": 0.5094432678638777,
+      "grad_norm": 0.11954103410243988,
+      "learning_rate": 0.00031547849737949957,
+      "loss": 2.8888,
+      "step": 8780
+    },
+    {
+      "epoch": 0.5100234993762511,
+      "grad_norm": 0.12293373793363571,
+      "learning_rate": 0.00031490357154327674,
+      "loss": 2.8814,
+      "step": 8790
+    },
+    {
+      "epoch": 0.5106037308886245,
+      "grad_norm": 0.12284509837627411,
+      "learning_rate": 0.0003143285908305422,
+      "loss": 2.8874,
+      "step": 8800
+    },
+    {
+      "epoch": 0.511183962400998,
+      "grad_norm": 0.11924895644187927,
+      "learning_rate": 0.00031375355735843523,
+      "loss": 2.8813,
+      "step": 8810
+    },
+    {
+      "epoch": 0.5117641939133715,
+      "grad_norm": 0.12003005295991898,
+      "learning_rate": 0.00031317847324428924,
+      "loss": 2.8836,
+      "step": 8820
+    },
+    {
+      "epoch": 0.5123444254257449,
+      "grad_norm": 0.13070861995220184,
+      "learning_rate": 0.00031260334060562416,
+      "loss": 2.8851,
+      "step": 8830
+    },
+    {
+      "epoch": 0.5129246569381183,
+      "grad_norm": 0.11900255084037781,
+      "learning_rate": 0.0003120281615601387,
+      "loss": 2.8827,
+      "step": 8840
+    },
+    {
+      "epoch": 0.5135048884504917,
+      "grad_norm": 0.12470702081918716,
+      "learning_rate": 0.0003114529382257024,
+      "loss": 2.8916,
+      "step": 8850
+    },
+    {
+      "epoch": 0.5140851199628652,
+      "grad_norm": 0.1312616765499115,
+      "learning_rate": 0.0003108776727203478,
+      "loss": 2.897,
+      "step": 8860
+    },
+    {
+      "epoch": 0.5146653514752386,
+      "grad_norm": 0.13872870802879333,
+      "learning_rate": 0.00031030236716226265,
+      "loss": 2.8836,
+      "step": 8870
+    },
+    {
+      "epoch": 0.515245582987612,
+      "grad_norm": 0.11608674377202988,
+      "learning_rate": 0.00030972702366978237,
+      "loss": 2.8875,
+      "step": 8880
+    },
+    {
+      "epoch": 0.5158258144999855,
+      "grad_norm": 0.12205769121646881,
+      "learning_rate": 0.000309151644361382,
+      "loss": 2.8862,
+      "step": 8890
+    },
+    {
+      "epoch": 0.516406046012359,
+      "grad_norm": 0.12009671330451965,
+      "learning_rate": 0.0003085762313556683,
+      "loss": 2.8797,
+      "step": 8900
+    },
+    {
+      "epoch": 0.5169862775247324,
+      "grad_norm": 0.12120591104030609,
+      "learning_rate": 0.0003080007867713724,
+      "loss": 2.8905,
+      "step": 8910
+    },
+    {
+      "epoch": 0.5175665090371058,
+      "grad_norm": 0.12842518091201782,
+      "learning_rate": 0.00030742531272734153,
+      "loss": 2.8747,
+      "step": 8920
+    },
+    {
+      "epoch": 0.5181467405494793,
+      "grad_norm": 0.12532438337802887,
+      "learning_rate": 0.00030684981134253123,
+      "loss": 2.8892,
+      "step": 8930
+    },
+    {
+      "epoch": 0.5187269720618527,
+      "grad_norm": 0.1295221596956253,
+      "learning_rate": 0.0003062742847359981,
+      "loss": 2.8842,
+      "step": 8940
+    },
+    {
+      "epoch": 0.5193072035742261,
+      "grad_norm": 0.1296953707933426,
+      "learning_rate": 0.00030569873502689116,
+      "loss": 2.878,
+      "step": 8950
+    },
+    {
+      "epoch": 0.5198874350865995,
+      "grad_norm": 0.14120282232761383,
+      "learning_rate": 0.00030512316433444495,
+      "loss": 2.8809,
+      "step": 8960
+    },
+    {
+      "epoch": 0.520467666598973,
+      "grad_norm": 0.12610268592834473,
+      "learning_rate": 0.000304547574777971,
+      "loss": 2.8794,
+      "step": 8970
+    },
+    {
+      "epoch": 0.5210478981113464,
+      "grad_norm": 0.11908390372991562,
+      "learning_rate": 0.0003039719684768503,
+      "loss": 2.8839,
+      "step": 8980
+    },
+    {
+      "epoch": 0.5216281296237198,
+      "grad_norm": 0.13508306443691254,
+      "learning_rate": 0.0003033963475505256,
+      "loss": 2.8782,
+      "step": 8990
+    },
+    {
+      "epoch": 0.5222083611360933,
+      "grad_norm": 0.12108524888753891,
+      "learning_rate": 0.00030282071411849343,
+      "loss": 2.879,
+      "step": 9000
+    },
+    {
+      "epoch": 0.5222083611360933,
+      "eval_loss": 2.845144271850586,
+      "eval_runtime": 3.2553,
+      "eval_samples_per_second": 1330.14,
+      "eval_steps_per_second": 2.765,
+      "step": 9000
+    },
+    {
+      "epoch": 0.5227885926484668,
+      "grad_norm": 0.13046176731586456,
+      "learning_rate": 0.00030224507030029627,
+      "loss": 2.8809,
+      "step": 9010
+    },
+    {
+      "epoch": 0.5233688241608402,
+      "grad_norm": 0.12113803625106812,
+      "learning_rate": 0.0003016694182155152,
+      "loss": 2.8839,
+      "step": 9020
+    },
+    {
+      "epoch": 0.5239490556732136,
+      "grad_norm": 0.12337899953126907,
+      "learning_rate": 0.0003010937599837613,
+      "loss": 2.8821,
+      "step": 9030
+    },
+    {
+      "epoch": 0.524529287185587,
+      "grad_norm": 0.11981160938739777,
+      "learning_rate": 0.0003005180977246686,
+      "loss": 2.888,
+      "step": 9040
+    },
+    {
+      "epoch": 0.5251095186979605,
+      "grad_norm": 0.12357629835605621,
+      "learning_rate": 0.0002999424335578858,
+      "loss": 2.8804,
+      "step": 9050
+    },
+    {
+      "epoch": 0.5256897502103339,
+      "grad_norm": 0.11688230186700821,
+      "learning_rate": 0.00029936676960306863,
+      "loss": 2.8891,
+      "step": 9060
+    },
+    {
+      "epoch": 0.5262699817227073,
+      "grad_norm": 0.11743608117103577,
+      "learning_rate": 0.0002987911079798723,
+      "loss": 2.8685,
+      "step": 9070
+    },
+    {
+      "epoch": 0.5268502132350807,
+      "grad_norm": 0.1338096410036087,
+      "learning_rate": 0.0002982154508079428,
+      "loss": 2.8758,
+      "step": 9080
+    },
+    {
+      "epoch": 0.5274304447474543,
+      "grad_norm": 0.13182982802391052,
+      "learning_rate": 0.0002976398002069105,
+      "loss": 2.882,
+      "step": 9090
+    },
+    {
+      "epoch": 0.5280106762598277,
+      "grad_norm": 0.12470164895057678,
+      "learning_rate": 0.000297064158296381,
+      "loss": 2.8817,
+      "step": 9100
+    },
+    {
+      "epoch": 0.5285909077722011,
+      "grad_norm": 0.11741513013839722,
+      "learning_rate": 0.0002964885271959282,
+      "loss": 2.8768,
+      "step": 9110
+    },
+    {
+      "epoch": 0.5291711392845746,
+      "grad_norm": 0.1364392340183258,
+      "learning_rate": 0.0002959129090250863,
+      "loss": 2.8822,
+      "step": 9120
+    },
+    {
+      "epoch": 0.529751370796948,
+      "grad_norm": 0.12005024403333664,
+      "learning_rate": 0.0002953373059033413,
+      "loss": 2.8789,
+      "step": 9130
+    },
+    {
+      "epoch": 0.5303316023093214,
+      "grad_norm": 0.1239180713891983,
+      "learning_rate": 0.0002947617199501245,
+      "loss": 2.8754,
+      "step": 9140
+    },
+    {
+      "epoch": 0.5309118338216948,
+      "grad_norm": 0.12774530053138733,
+      "learning_rate": 0.00029418615328480357,
+      "loss": 2.8773,
+      "step": 9150
+    },
+    {
+      "epoch": 0.5314920653340683,
+      "grad_norm": 0.11815381795167923,
+      "learning_rate": 0.00029361060802667526,
+      "loss": 2.8711,
+      "step": 9160
+    },
+    {
+      "epoch": 0.5320722968464418,
+      "grad_norm": 0.12450312077999115,
+      "learning_rate": 0.0002930350862949577,
+      "loss": 2.8743,
+      "step": 9170
+    },
+    {
+      "epoch": 0.5326525283588152,
+      "grad_norm": 0.12741632759571075,
+      "learning_rate": 0.00029245959020878187,
+      "loss": 2.8846,
+      "step": 9180
+    },
+    {
+      "epoch": 0.5332327598711886,
+      "grad_norm": 0.12712997198104858,
+      "learning_rate": 0.0002918841218871848,
+      "loss": 2.8774,
+      "step": 9190
+    },
+    {
+      "epoch": 0.5338129913835621,
+      "grad_norm": 0.11238303780555725,
+      "learning_rate": 0.0002913086834491012,
+      "loss": 2.8782,
+      "step": 9200
+    },
+    {
+      "epoch": 0.5343932228959355,
+      "grad_norm": 0.1266774982213974,
+      "learning_rate": 0.00029073327701335566,
+      "loss": 2.883,
+      "step": 9210
+    },
+    {
+      "epoch": 0.5349734544083089,
+      "grad_norm": 0.12266207486391068,
+      "learning_rate": 0.00029015790469865484,
+      "loss": 2.8735,
+      "step": 9220
+    },
+    {
+      "epoch": 0.5355536859206823,
+      "grad_norm": 0.10979332774877548,
+      "learning_rate": 0.0002895825686235799,
+      "loss": 2.8791,
+      "step": 9230
+    },
+    {
+      "epoch": 0.5361339174330558,
+      "grad_norm": 0.11939531564712524,
+      "learning_rate": 0.0002890072709065787,
+      "loss": 2.8745,
+      "step": 9240
+    },
+    {
+      "epoch": 0.5367141489454292,
+      "grad_norm": 0.12080537527799606,
+      "learning_rate": 0.0002884320136659575,
+      "loss": 2.8775,
+      "step": 9250
+    },
+    {
+      "epoch": 0.5372943804578026,
+      "grad_norm": 0.12394317239522934,
+      "learning_rate": 0.00028785679901987394,
+      "loss": 2.8734,
+      "step": 9260
+    },
+    {
+      "epoch": 0.537874611970176,
+      "grad_norm": 0.12320924550294876,
+      "learning_rate": 0.0002872816290863283,
+      "loss": 2.8703,
+      "step": 9270
+    },
+    {
+      "epoch": 0.5384548434825496,
+      "grad_norm": 0.12183520197868347,
+      "learning_rate": 0.0002867065059831568,
+      "loss": 2.8731,
+      "step": 9280
+    },
+    {
+      "epoch": 0.539035074994923,
+      "grad_norm": 0.13638751208782196,
+      "learning_rate": 0.0002861314318280229,
+      "loss": 2.8725,
+      "step": 9290
+    },
+    {
+      "epoch": 0.5396153065072964,
+      "grad_norm": 0.12684093415737152,
+      "learning_rate": 0.0002855564087384098,
+      "loss": 2.8714,
+      "step": 9300
+    },
+    {
+      "epoch": 0.5401955380196698,
+      "grad_norm": 0.11322664469480515,
+      "learning_rate": 0.00028498143883161277,
+      "loss": 2.8693,
+      "step": 9310
+    },
+    {
+      "epoch": 0.5407757695320433,
+      "grad_norm": 0.11759771406650543,
+      "learning_rate": 0.00028440652422473124,
+      "loss": 2.8679,
+      "step": 9320
+    },
+    {
+      "epoch": 0.5413560010444167,
+      "grad_norm": 0.12511123716831207,
+      "learning_rate": 0.0002838316670346612,
+      "loss": 2.8744,
+      "step": 9330
+    },
+    {
+      "epoch": 0.5419362325567901,
+      "grad_norm": 0.1160508468747139,
+      "learning_rate": 0.00028325686937808673,
+      "loss": 2.874,
+      "step": 9340
+    },
+    {
+      "epoch": 0.5425164640691637,
+      "grad_norm": 0.11813979595899582,
+      "learning_rate": 0.0002826821333714732,
+      "loss": 2.8691,
+      "step": 9350
+    },
+    {
+      "epoch": 0.5430966955815371,
+      "grad_norm": 0.11728700250387192,
+      "learning_rate": 0.0002821074611310588,
+      "loss": 2.8717,
+      "step": 9360
+    },
+    {
+      "epoch": 0.5436769270939105,
+      "grad_norm": 0.12824493646621704,
+      "learning_rate": 0.0002815328547728469,
+      "loss": 2.875,
+      "step": 9370
+    },
+    {
+      "epoch": 0.5442571586062839,
+      "grad_norm": 0.12653270363807678,
+      "learning_rate": 0.0002809583164125983,
+      "loss": 2.8682,
+      "step": 9380
+    },
+    {
+      "epoch": 0.5448373901186574,
+      "grad_norm": 0.13113363087177277,
+      "learning_rate": 0.00028038384816582337,
+      "loss": 2.8583,
+      "step": 9390
+    },
+    {
+      "epoch": 0.5454176216310308,
+      "grad_norm": 0.11145169287919998,
+      "learning_rate": 0.0002798094521477744,
+      "loss": 2.8714,
+      "step": 9400
+    },
+    {
+      "epoch": 0.5459978531434042,
+      "grad_norm": 0.12025914341211319,
+      "learning_rate": 0.0002792351304734378,
+      "loss": 2.8689,
+      "step": 9410
+    },
+    {
+      "epoch": 0.5465780846557776,
+      "grad_norm": 0.1347450315952301,
+      "learning_rate": 0.000278660885257526,
+      "loss": 2.8803,
+      "step": 9420
+    },
+    {
+      "epoch": 0.5471583161681511,
+      "grad_norm": 0.11728854477405548,
+      "learning_rate": 0.0002780867186144703,
+      "loss": 2.8614,
+      "step": 9430
+    },
+    {
+      "epoch": 0.5477385476805245,
+      "grad_norm": 0.1399793028831482,
+      "learning_rate": 0.00027751263265841204,
+      "loss": 2.8777,
+      "step": 9440
+    },
+    {
+      "epoch": 0.548318779192898,
+      "grad_norm": 0.13229645788669586,
+      "learning_rate": 0.0002769386295031961,
+      "loss": 2.8723,
+      "step": 9450
+    },
+    {
+      "epoch": 0.5488990107052714,
+      "grad_norm": 0.12199070304632187,
+      "learning_rate": 0.00027636471126236213,
+      "loss": 2.8577,
+      "step": 9460
+    },
+    {
+      "epoch": 0.5494792422176449,
+      "grad_norm": 0.14131730794906616,
+      "learning_rate": 0.0002757908800491373,
+      "loss": 2.857,
+      "step": 9470
+    },
+    {
+      "epoch": 0.5500594737300183,
+      "grad_norm": 0.1343252956867218,
+      "learning_rate": 0.0002752171379764283,
+      "loss": 2.8689,
+      "step": 9480
+    },
+    {
+      "epoch": 0.5506397052423917,
+      "grad_norm": 0.1338685154914856,
+      "learning_rate": 0.0002746434871568133,
+      "loss": 2.8775,
+      "step": 9490
+    },
+    {
+      "epoch": 0.5512199367547651,
+      "grad_norm": 0.12388128787279129,
+      "learning_rate": 0.00027406992970253506,
+      "loss": 2.8761,
+      "step": 9500
+    },
+    {
+      "epoch": 0.5518001682671386,
+      "grad_norm": 0.12272147834300995,
+      "learning_rate": 0.0002734964677254918,
+      "loss": 2.8722,
+      "step": 9510
+    },
+    {
+      "epoch": 0.552380399779512,
+      "grad_norm": 0.12000911682844162,
+      "learning_rate": 0.00027292310333723086,
+      "loss": 2.8743,
+      "step": 9520
+    },
+    {
+      "epoch": 0.5529606312918854,
+      "grad_norm": 0.13635672628879547,
+      "learning_rate": 0.00027234983864894,
+      "loss": 2.8657,
+      "step": 9530
+    },
+    {
+      "epoch": 0.5535408628042588,
+      "grad_norm": 0.12129581719636917,
+      "learning_rate": 0.0002717766757714398,
+      "loss": 2.8661,
+      "step": 9540
+    },
+    {
+      "epoch": 0.5541210943166324,
+      "grad_norm": 0.11717355996370316,
+      "learning_rate": 0.00027120361681517606,
+      "loss": 2.8707,
+      "step": 9550
+    },
+    {
+      "epoch": 0.5547013258290058,
+      "grad_norm": 0.12199341505765915,
+      "learning_rate": 0.0002706306638902117,
+      "loss": 2.8555,
+      "step": 9560
+    },
+    {
+      "epoch": 0.5552815573413792,
+      "grad_norm": 0.1175154522061348,
+      "learning_rate": 0.0002700578191062196,
+      "loss": 2.8721,
+      "step": 9570
+    },
+    {
+      "epoch": 0.5558617888537526,
+      "grad_norm": 0.12546683847904205,
+      "learning_rate": 0.00026948508457247416,
+      "loss": 2.8689,
+      "step": 9580
+    },
+    {
+      "epoch": 0.5564420203661261,
+      "grad_norm": 0.11439734697341919,
+      "learning_rate": 0.000268912462397844,
+      "loss": 2.8552,
+      "step": 9590
+    },
+    {
+      "epoch": 0.5570222518784995,
+      "grad_norm": 0.13139833509922028,
+      "learning_rate": 0.00026833995469078404,
+      "loss": 2.8728,
+      "step": 9600
+    },
+    {
+      "epoch": 0.5576024833908729,
+      "grad_norm": 0.14722158014774323,
+      "learning_rate": 0.00026776756355932743,
+      "loss": 2.8594,
+      "step": 9610
+    },
+    {
+      "epoch": 0.5581827149032464,
+      "grad_norm": 0.12206868082284927,
+      "learning_rate": 0.00026719529111107846,
+      "loss": 2.8713,
+      "step": 9620
+    },
+    {
+      "epoch": 0.5587629464156199,
+      "grad_norm": 0.11777371913194656,
+      "learning_rate": 0.00026662313945320404,
+      "loss": 2.8656,
+      "step": 9630
+    },
+    {
+      "epoch": 0.5593431779279933,
+      "grad_norm": 0.12058188021183014,
+      "learning_rate": 0.00026605111069242664,
+      "loss": 2.8712,
+      "step": 9640
+    },
+    {
+      "epoch": 0.5599234094403667,
+      "grad_norm": 0.1278459131717682,
+      "learning_rate": 0.00026547920693501616,
+      "loss": 2.8686,
+      "step": 9650
+    },
+    {
+      "epoch": 0.5605036409527402,
+      "grad_norm": 0.12272592633962631,
+      "learning_rate": 0.00026490743028678194,
+      "loss": 2.8636,
+      "step": 9660
+    },
+    {
+      "epoch": 0.5610838724651136,
+      "grad_norm": 0.11543965339660645,
+      "learning_rate": 0.00026433578285306567,
+      "loss": 2.8592,
+      "step": 9670
+    },
+    {
+      "epoch": 0.561664103977487,
+      "grad_norm": 0.11765621602535248,
+      "learning_rate": 0.0002637642667387329,
+      "loss": 2.867,
+      "step": 9680
+    },
+    {
+      "epoch": 0.5622443354898604,
+      "grad_norm": 0.12996822595596313,
+      "learning_rate": 0.0002631928840481662,
+      "loss": 2.8669,
+      "step": 9690
+    },
+    {
+      "epoch": 0.5628245670022339,
+      "grad_norm": 0.11992313712835312,
+      "learning_rate": 0.00026262163688525606,
+      "loss": 2.8576,
+      "step": 9700
+    },
+    {
+      "epoch": 0.5634047985146073,
+      "grad_norm": 0.1216612309217453,
+      "learning_rate": 0.00026205052735339457,
+      "loss": 2.8656,
+      "step": 9710
+    },
+    {
+      "epoch": 0.5639850300269807,
+      "grad_norm": 0.11923664063215256,
+      "learning_rate": 0.00026147955755546686,
+      "loss": 2.8625,
+      "step": 9720
+    },
+    {
+      "epoch": 0.5645652615393542,
+      "grad_norm": 0.1174679845571518,
+      "learning_rate": 0.00026090872959384353,
+      "loss": 2.8589,
+      "step": 9730
+    },
+    {
+      "epoch": 0.5651454930517277,
+      "grad_norm": 0.12439408898353577,
+      "learning_rate": 0.00026033804557037304,
+      "loss": 2.8573,
+      "step": 9740
+    },
+    {
+      "epoch": 0.5657257245641011,
+      "grad_norm": 0.12268688529729843,
+      "learning_rate": 0.0002597675075863735,
+      "loss": 2.8612,
+      "step": 9750
+    },
+    {
+      "epoch": 0.5663059560764745,
+      "grad_norm": 0.11994469910860062,
+      "learning_rate": 0.0002591971177426256,
+      "loss": 2.8667,
+      "step": 9760
+    },
+    {
+      "epoch": 0.5668861875888479,
+      "grad_norm": 0.12739793956279755,
+      "learning_rate": 0.0002586268781393648,
+      "loss": 2.8657,
+      "step": 9770
+    },
+    {
+      "epoch": 0.5674664191012214,
+      "grad_norm": 0.12942016124725342,
+      "learning_rate": 0.00025805679087627267,
+      "loss": 2.863,
+      "step": 9780
+    },
+    {
+      "epoch": 0.5680466506135948,
+      "grad_norm": 0.12867708504199982,
+      "learning_rate": 0.00025748685805247046,
+      "loss": 2.8596,
+      "step": 9790
+    },
+    {
+      "epoch": 0.5686268821259682,
+      "grad_norm": 0.1384700983762741,
+      "learning_rate": 0.00025691708176651034,
+      "loss": 2.8612,
+      "step": 9800
+    },
+    {
+      "epoch": 0.5692071136383416,
+      "grad_norm": 0.11695626378059387,
+      "learning_rate": 0.0002563474641163686,
+      "loss": 2.8613,
+      "step": 9810
+    },
+    {
+      "epoch": 0.5697873451507152,
+      "grad_norm": 0.12379258126020432,
+      "learning_rate": 0.0002557780071994367,
+      "loss": 2.8637,
+      "step": 9820
+    },
+    {
+      "epoch": 0.5703675766630886,
+      "grad_norm": 0.13220758736133575,
+      "learning_rate": 0.00025520871311251493,
+      "loss": 2.8572,
+      "step": 9830
+    },
+    {
+      "epoch": 0.570947808175462,
+      "grad_norm": 0.12004509568214417,
+      "learning_rate": 0.00025463958395180377,
+      "loss": 2.8614,
+      "step": 9840
+    },
+    {
+      "epoch": 0.5715280396878355,
+      "grad_norm": 0.12457242608070374,
+      "learning_rate": 0.0002540706218128962,
+      "loss": 2.8606,
+      "step": 9850
+    },
+    {
+      "epoch": 0.5721082712002089,
+      "grad_norm": 0.125260129570961,
+      "learning_rate": 0.0002535018287907707,
+      "loss": 2.8606,
+      "step": 9860
+    },
+    {
+      "epoch": 0.5726885027125823,
+      "grad_norm": 0.11718660593032837,
+      "learning_rate": 0.00025293320697978254,
+      "loss": 2.86,
+      "step": 9870
+    },
+    {
+      "epoch": 0.5732687342249557,
+      "grad_norm": 0.1096329316496849,
+      "learning_rate": 0.0002523647584736568,
+      "loss": 2.8743,
+      "step": 9880
+    },
+    {
+      "epoch": 0.5738489657373292,
+      "grad_norm": 0.11327598243951797,
+      "learning_rate": 0.0002517964853654806,
+      "loss": 2.8492,
+      "step": 9890
+    },
+    {
+      "epoch": 0.5744291972497026,
+      "grad_norm": 0.1237105280160904,
+      "learning_rate": 0.0002512283897476949,
+      "loss": 2.852,
+      "step": 9900
+    },
+    {
+      "epoch": 0.5750094287620761,
+      "grad_norm": 0.11739984154701233,
+      "learning_rate": 0.0002506604737120874,
+      "loss": 2.8535,
+      "step": 9910
+    },
+    {
+      "epoch": 0.5755896602744495,
+      "grad_norm": 0.12682320177555084,
+      "learning_rate": 0.00025009273934978424,
+      "loss": 2.8575,
+      "step": 9920
+    },
+    {
+      "epoch": 0.576169891786823,
+      "grad_norm": 0.12347414344549179,
+      "learning_rate": 0.00024952518875124305,
+      "loss": 2.8596,
+      "step": 9930
+    },
+    {
+      "epoch": 0.5767501232991964,
+      "grad_norm": 0.11207421123981476,
+      "learning_rate": 0.0002489578240062444,
+      "loss": 2.8563,
+      "step": 9940
+    },
+    {
+      "epoch": 0.5773303548115698,
+      "grad_norm": 0.12151192873716354,
+      "learning_rate": 0.0002483906472038848,
+      "loss": 2.8513,
+      "step": 9950
+    },
+    {
+      "epoch": 0.5779105863239432,
+      "grad_norm": 0.11661417037248611,
+      "learning_rate": 0.00024782366043256876,
+      "loss": 2.8538,
+      "step": 9960
+    },
+    {
+      "epoch": 0.5784908178363167,
+      "grad_norm": 0.11908597499132156,
+      "learning_rate": 0.0002472568657800007,
+      "loss": 2.8549,
+      "step": 9970
+    },
+    {
+      "epoch": 0.5790710493486901,
+      "grad_norm": 0.12369140982627869,
+      "learning_rate": 0.00024669026533317816,
+      "loss": 2.859,
+      "step": 9980
+    },
+    {
+      "epoch": 0.5796512808610635,
+      "grad_norm": 0.12169597297906876,
+      "learning_rate": 0.0002461238611783832,
+      "loss": 2.8516,
+      "step": 9990
+    },
+    {
+      "epoch": 0.580231512373437,
+      "grad_norm": 0.1137092188000679,
+      "learning_rate": 0.0002455576554011753,
+      "loss": 2.8506,
+      "step": 10000
+    },
+    {
+      "epoch": 0.580231512373437,
+      "eval_loss": 2.8198139667510986,
+      "eval_runtime": 3.2544,
+      "eval_samples_per_second": 1330.504,
+      "eval_steps_per_second": 2.765,
+      "step": 10000
+    },
+    {
+      "epoch": 0.5808117438858105,
+      "grad_norm": 0.11945224553346634,
+      "learning_rate": 0.00024499165008638355,
+      "loss": 2.8527,
+      "step": 10010
+    },
+    {
+      "epoch": 0.5813919753981839,
+      "grad_norm": 0.12194681167602539,
+      "learning_rate": 0.0002444258473180986,
+      "loss": 2.8676,
+      "step": 10020
+    },
+    {
+      "epoch": 0.5819722069105573,
+      "grad_norm": 0.12587039172649384,
+      "learning_rate": 0.00024386024917966563,
+      "loss": 2.8468,
+      "step": 10030
+    },
+    {
+      "epoch": 0.5825524384229307,
+      "grad_norm": 0.12192162871360779,
+      "learning_rate": 0.0002432948577536762,
+      "loss": 2.8484,
+      "step": 10040
+    },
+    {
+      "epoch": 0.5831326699353042,
+      "grad_norm": 0.11401449888944626,
+      "learning_rate": 0.00024272967512196093,
+      "loss": 2.8636,
+      "step": 10050
+    },
+    {
+      "epoch": 0.5837129014476776,
+      "grad_norm": 0.12227935343980789,
+      "learning_rate": 0.0002421647033655812,
+      "loss": 2.8497,
+      "step": 10060
+    },
+    {
+      "epoch": 0.584293132960051,
+      "grad_norm": 0.11773716658353806,
+      "learning_rate": 0.00024159994456482233,
+      "loss": 2.857,
+      "step": 10070
+    },
+    {
+      "epoch": 0.5848733644724246,
+      "grad_norm": 0.124253049492836,
+      "learning_rate": 0.00024103540079918555,
+      "loss": 2.8499,
+      "step": 10080
+    },
+    {
+      "epoch": 0.585453595984798,
+      "grad_norm": 0.11704014986753464,
+      "learning_rate": 0.00024047107414737985,
+      "loss": 2.8522,
+      "step": 10090
+    },
+    {
+      "epoch": 0.5860338274971714,
+      "grad_norm": 0.11885286867618561,
+      "learning_rate": 0.0002399069666873153,
+      "loss": 2.855,
+      "step": 10100
+    },
+    {
+      "epoch": 0.5866140590095448,
+      "grad_norm": 0.12006965279579163,
+      "learning_rate": 0.00023934308049609453,
+      "loss": 2.8488,
+      "step": 10110
+    },
+    {
+      "epoch": 0.5871942905219183,
+      "grad_norm": 0.12023113667964935,
+      "learning_rate": 0.00023877941765000564,
+      "loss": 2.8542,
+      "step": 10120
+    },
+    {
+      "epoch": 0.5877745220342917,
+      "grad_norm": 0.12737338244915009,
+      "learning_rate": 0.00023821598022451436,
+      "loss": 2.8588,
+      "step": 10130
+    },
+    {
+      "epoch": 0.5883547535466651,
+      "grad_norm": 0.11698620766401291,
+      "learning_rate": 0.00023765277029425607,
+      "loss": 2.8544,
+      "step": 10140
+    },
+    {
+      "epoch": 0.5889349850590385,
+      "grad_norm": 0.12589864432811737,
+      "learning_rate": 0.000237089789933029,
+      "loss": 2.8448,
+      "step": 10150
+    },
+    {
+      "epoch": 0.589515216571412,
+      "grad_norm": 0.11532309651374817,
+      "learning_rate": 0.0002365270412137856,
+      "loss": 2.8618,
+      "step": 10160
+    },
+    {
+      "epoch": 0.5900954480837854,
+      "grad_norm": 0.10937913507223129,
+      "learning_rate": 0.00023596452620862585,
+      "loss": 2.8527,
+      "step": 10170
+    },
+    {
+      "epoch": 0.5906756795961589,
+      "grad_norm": 0.11980416625738144,
+      "learning_rate": 0.00023540224698878861,
+      "loss": 2.8553,
+      "step": 10180
+    },
+    {
+      "epoch": 0.5912559111085323,
+      "grad_norm": 0.11810686439275742,
+      "learning_rate": 0.00023484020562464507,
+      "loss": 2.8545,
+      "step": 10190
+    },
+    {
+      "epoch": 0.5918361426209058,
+      "grad_norm": 0.11651547253131866,
+      "learning_rate": 0.00023427840418569043,
+      "loss": 2.8522,
+      "step": 10200
+    },
+    {
+      "epoch": 0.5924163741332792,
+      "grad_norm": 0.11145967990159988,
+      "learning_rate": 0.00023371684474053633,
+      "loss": 2.8564,
+      "step": 10210
+    },
+    {
+      "epoch": 0.5929966056456526,
+      "grad_norm": 0.11742381006479263,
+      "learning_rate": 0.0002331555293569037,
+      "loss": 2.8529,
+      "step": 10220
+    },
+    {
+      "epoch": 0.593576837158026,
+      "grad_norm": 0.1287650465965271,
+      "learning_rate": 0.00023259446010161425,
+      "loss": 2.847,
+      "step": 10230
+    },
+    {
+      "epoch": 0.5941570686703995,
+      "grad_norm": 0.12560808658599854,
+      "learning_rate": 0.00023203363904058394,
+      "loss": 2.8424,
+      "step": 10240
+    },
+    {
+      "epoch": 0.5947373001827729,
+      "grad_norm": 0.13144509494304657,
+      "learning_rate": 0.0002314730682388147,
+      "loss": 2.8497,
+      "step": 10250
+    },
+    {
+      "epoch": 0.5953175316951463,
+      "grad_norm": 0.11483640223741531,
+      "learning_rate": 0.00023091274976038686,
+      "loss": 2.8525,
+      "step": 10260
+    },
+    {
+      "epoch": 0.5958977632075197,
+      "grad_norm": 0.12085619568824768,
+      "learning_rate": 0.0002303526856684519,
+      "loss": 2.846,
+      "step": 10270
+    },
+    {
+      "epoch": 0.5964779947198933,
+      "grad_norm": 0.13581375777721405,
+      "learning_rate": 0.00022979287802522423,
+      "loss": 2.8471,
+      "step": 10280
+    },
+    {
+      "epoch": 0.5970582262322667,
+      "grad_norm": 0.11522037535905838,
+      "learning_rate": 0.00022923332889197447,
+      "loss": 2.841,
+      "step": 10290
+    },
+    {
+      "epoch": 0.5976384577446401,
+      "grad_norm": 0.1114853248000145,
+      "learning_rate": 0.00022867404032902097,
+      "loss": 2.8507,
+      "step": 10300
+    },
+    {
+      "epoch": 0.5982186892570136,
+      "grad_norm": 0.1106984093785286,
+      "learning_rate": 0.00022811501439572288,
+      "loss": 2.8501,
+      "step": 10310
+    },
+    {
+      "epoch": 0.598798920769387,
+      "grad_norm": 0.12095363438129425,
+      "learning_rate": 0.0002275562531504724,
+      "loss": 2.8392,
+      "step": 10320
+    },
+    {
+      "epoch": 0.5993791522817604,
+      "grad_norm": 0.11527710407972336,
+      "learning_rate": 0.00022699775865068667,
+      "loss": 2.8498,
+      "step": 10330
+    },
+    {
+      "epoch": 0.5999593837941338,
+      "grad_norm": 0.11631615459918976,
+      "learning_rate": 0.00022643953295280127,
+      "loss": 2.8526,
+      "step": 10340
+    },
+    {
+      "epoch": 0.6005396153065073,
+      "grad_norm": 0.1107979491353035,
+      "learning_rate": 0.0002258815781122614,
+      "loss": 2.8488,
+      "step": 10350
+    },
+    {
+      "epoch": 0.6011198468188808,
+      "grad_norm": 0.1126491129398346,
+      "learning_rate": 0.00022532389618351532,
+      "loss": 2.8404,
+      "step": 10360
+    },
+    {
+      "epoch": 0.6017000783312542,
+      "grad_norm": 0.11740950495004654,
+      "learning_rate": 0.00022476648922000646,
+      "loss": 2.8499,
+      "step": 10370
+    },
+    {
+      "epoch": 0.6022803098436276,
+      "grad_norm": 0.11938904970884323,
+      "learning_rate": 0.00022420935927416547,
+      "loss": 2.8547,
+      "step": 10380
+    },
+    {
+      "epoch": 0.6028605413560011,
+      "grad_norm": 0.11484769731760025,
+      "learning_rate": 0.00022365250839740338,
+      "loss": 2.8392,
+      "step": 10390
+    },
+    {
+      "epoch": 0.6034407728683745,
+      "grad_norm": 0.12051428109407425,
+      "learning_rate": 0.0002230959386401032,
+      "loss": 2.8416,
+      "step": 10400
+    },
+    {
+      "epoch": 0.6040210043807479,
+      "grad_norm": 0.12364054471254349,
+      "learning_rate": 0.00022253965205161326,
+      "loss": 2.8343,
+      "step": 10410
+    },
+    {
+      "epoch": 0.6046012358931213,
+      "grad_norm": 0.1125280112028122,
+      "learning_rate": 0.00022198365068023892,
+      "loss": 2.8441,
+      "step": 10420
+    },
+    {
+      "epoch": 0.6051814674054948,
+      "grad_norm": 0.11715447157621384,
+      "learning_rate": 0.00022142793657323558,
+      "loss": 2.8391,
+      "step": 10430
+    },
+    {
+      "epoch": 0.6057616989178682,
+      "grad_norm": 0.11433437466621399,
+      "learning_rate": 0.00022087251177680086,
+      "loss": 2.8549,
+      "step": 10440
+    },
+    {
+      "epoch": 0.6063419304302416,
+      "grad_norm": 0.1222948208451271,
+      "learning_rate": 0.00022031737833606686,
+      "loss": 2.8406,
+      "step": 10450
+    },
+    {
+      "epoch": 0.6069221619426151,
+      "grad_norm": 0.11805406212806702,
+      "learning_rate": 0.0002197625382950932,
+      "loss": 2.8415,
+      "step": 10460
+    },
+    {
+      "epoch": 0.6075023934549886,
+      "grad_norm": 0.13002602756023407,
+      "learning_rate": 0.00021920799369685892,
+      "loss": 2.851,
+      "step": 10470
+    },
+    {
+      "epoch": 0.608082624967362,
+      "grad_norm": 0.11929357796907425,
+      "learning_rate": 0.00021865374658325544,
+      "loss": 2.8437,
+      "step": 10480
+    },
+    {
+      "epoch": 0.6086628564797354,
+      "grad_norm": 0.11752030998468399,
+      "learning_rate": 0.00021809979899507876,
+      "loss": 2.8532,
+      "step": 10490
+    },
+    {
+      "epoch": 0.6092430879921088,
+      "grad_norm": 0.12201694399118423,
+      "learning_rate": 0.00021754615297202168,
+      "loss": 2.8474,
+      "step": 10500
+    },
+    {
+      "epoch": 0.6098233195044823,
+      "grad_norm": 0.12019883096218109,
+      "learning_rate": 0.00021699281055266706,
+      "loss": 2.8422,
+      "step": 10510
+    },
+    {
+      "epoch": 0.6104035510168557,
+      "grad_norm": 0.12413442134857178,
+      "learning_rate": 0.00021643977377447954,
+      "loss": 2.8316,
+      "step": 10520
+    },
+    {
+      "epoch": 0.6109837825292291,
+      "grad_norm": 0.11983013898134232,
+      "learning_rate": 0.00021588704467379862,
+      "loss": 2.8448,
+      "step": 10530
+    },
+    {
+      "epoch": 0.6115640140416027,
+      "grad_norm": 0.13365738093852997,
+      "learning_rate": 0.0002153346252858306,
+      "loss": 2.837,
+      "step": 10540
+    },
+    {
+      "epoch": 0.6121442455539761,
+      "grad_norm": 0.13185539841651917,
+      "learning_rate": 0.00021478251764464148,
+      "loss": 2.8468,
+      "step": 10550
+    },
+    {
+      "epoch": 0.6127244770663495,
+      "grad_norm": 0.1213960349559784,
+      "learning_rate": 0.00021423072378314964,
+      "loss": 2.8444,
+      "step": 10560
+    },
+    {
+      "epoch": 0.6133047085787229,
+      "grad_norm": 0.12037312239408493,
+      "learning_rate": 0.00021367924573311773,
+      "loss": 2.8438,
+      "step": 10570
+    },
+    {
+      "epoch": 0.6138849400910964,
+      "grad_norm": 0.12542636692523956,
+      "learning_rate": 0.00021312808552514592,
+      "loss": 2.8424,
+      "step": 10580
+    },
+    {
+      "epoch": 0.6144651716034698,
+      "grad_norm": 0.14415085315704346,
+      "learning_rate": 0.00021257724518866352,
+      "loss": 2.8417,
+      "step": 10590
+    },
+    {
+      "epoch": 0.6150454031158432,
+      "grad_norm": 0.1150176003575325,
+      "learning_rate": 0.00021202672675192248,
+      "loss": 2.8435,
+      "step": 10600
+    },
+    {
+      "epoch": 0.6156256346282166,
+      "grad_norm": 0.11662835627794266,
+      "learning_rate": 0.00021147653224198951,
+      "loss": 2.8441,
+      "step": 10610
+    },
+    {
+      "epoch": 0.6162058661405901,
+      "grad_norm": 0.11693531274795532,
+      "learning_rate": 0.00021092666368473817,
+      "loss": 2.8391,
+      "step": 10620
+    },
+    {
+      "epoch": 0.6167860976529635,
+      "grad_norm": 0.11077579110860825,
+      "learning_rate": 0.0002103771231048423,
+      "loss": 2.8345,
+      "step": 10630
+    },
+    {
+      "epoch": 0.617366329165337,
+      "grad_norm": 0.11653861403465271,
+      "learning_rate": 0.00020982791252576773,
+      "loss": 2.8448,
+      "step": 10640
+    },
+    {
+      "epoch": 0.6179465606777104,
+      "grad_norm": 0.11749275773763657,
+      "learning_rate": 0.00020927903396976552,
+      "loss": 2.8558,
+      "step": 10650
+    },
+    {
+      "epoch": 0.6185267921900839,
+      "grad_norm": 0.11677636206150055,
+      "learning_rate": 0.00020873048945786382,
+      "loss": 2.8353,
+      "step": 10660
+    },
+    {
+      "epoch": 0.6191070237024573,
+      "grad_norm": 0.11745753139257431,
+      "learning_rate": 0.00020818228100986106,
+      "loss": 2.8494,
+      "step": 10670
+    },
+    {
+      "epoch": 0.6196872552148307,
+      "grad_norm": 0.11747489869594574,
+      "learning_rate": 0.00020763441064431827,
+      "loss": 2.8397,
+      "step": 10680
+    },
+    {
+      "epoch": 0.6202674867272041,
+      "grad_norm": 0.11356910318136215,
+      "learning_rate": 0.00020708688037855138,
+      "loss": 2.8472,
+      "step": 10690
+    },
+    {
+      "epoch": 0.6208477182395776,
+      "grad_norm": 0.11063719540834427,
+      "learning_rate": 0.00020653969222862435,
+      "loss": 2.8508,
+      "step": 10700
+    },
+    {
+      "epoch": 0.621427949751951,
+      "grad_norm": 0.10978058725595474,
+      "learning_rate": 0.00020599284820934112,
+      "loss": 2.8308,
+      "step": 10710
+    },
+    {
+      "epoch": 0.6220081812643244,
+      "grad_norm": 0.11860186606645584,
+      "learning_rate": 0.00020544635033423867,
+      "loss": 2.8263,
+      "step": 10720
+    },
+    {
+      "epoch": 0.6225884127766979,
+      "grad_norm": 0.1312050074338913,
+      "learning_rate": 0.00020490020061557953,
+      "loss": 2.8455,
+      "step": 10730
+    },
+    {
+      "epoch": 0.6231686442890714,
+      "grad_norm": 0.13181331753730774,
+      "learning_rate": 0.00020435440106434408,
+      "loss": 2.8489,
+      "step": 10740
+    },
+    {
+      "epoch": 0.6237488758014448,
+      "grad_norm": 0.1471181958913803,
+      "learning_rate": 0.00020380895369022357,
+      "loss": 2.8285,
+      "step": 10750
+    },
+    {
+      "epoch": 0.6243291073138182,
+      "grad_norm": 0.12075991183519363,
+      "learning_rate": 0.00020326386050161215,
+      "loss": 2.8402,
+      "step": 10760
+    },
+    {
+      "epoch": 0.6249093388261916,
+      "grad_norm": 0.1117480993270874,
+      "learning_rate": 0.0002027191235056003,
+      "loss": 2.8426,
+      "step": 10770
+    },
+    {
+      "epoch": 0.6254895703385651,
+      "grad_norm": 0.11622477322816849,
+      "learning_rate": 0.0002021747447079665,
+      "loss": 2.8423,
+      "step": 10780
+    },
+    {
+      "epoch": 0.6260698018509385,
+      "grad_norm": 0.11475232988595963,
+      "learning_rate": 0.00020163072611317055,
+      "loss": 2.835,
+      "step": 10790
+    },
+    {
+      "epoch": 0.6266500333633119,
+      "grad_norm": 0.12252891808748245,
+      "learning_rate": 0.00020108706972434606,
+      "loss": 2.8381,
+      "step": 10800
+    },
+    {
+      "epoch": 0.6272302648756855,
+      "grad_norm": 0.11319098621606827,
+      "learning_rate": 0.00020054377754329258,
+      "loss": 2.8326,
+      "step": 10810
+    },
+    {
+      "epoch": 0.6278104963880589,
+      "grad_norm": 0.11103735119104385,
+      "learning_rate": 0.00020000085157046902,
+      "loss": 2.8292,
+      "step": 10820
+    },
+    {
+      "epoch": 0.6283907279004323,
+      "grad_norm": 0.12254971265792847,
+      "learning_rate": 0.00019945829380498556,
+      "loss": 2.8379,
+      "step": 10830
+    },
+    {
+      "epoch": 0.6289709594128057,
+      "grad_norm": 0.1253294050693512,
+      "learning_rate": 0.00019891610624459674,
+      "loss": 2.8404,
+      "step": 10840
+    },
+    {
+      "epoch": 0.6295511909251792,
+      "grad_norm": 0.12701797485351562,
+      "learning_rate": 0.0001983742908856942,
+      "loss": 2.8331,
+      "step": 10850
+    },
+    {
+      "epoch": 0.6301314224375526,
+      "grad_norm": 0.1351822167634964,
+      "learning_rate": 0.00019783284972329845,
+      "loss": 2.831,
+      "step": 10860
+    },
+    {
+      "epoch": 0.630711653949926,
+      "grad_norm": 0.11504077911376953,
+      "learning_rate": 0.00019729178475105292,
+      "loss": 2.8397,
+      "step": 10870
+    },
+    {
+      "epoch": 0.6312918854622994,
+      "grad_norm": 0.11900710314512253,
+      "learning_rate": 0.00019675109796121523,
+      "loss": 2.8328,
+      "step": 10880
+    },
+    {
+      "epoch": 0.6318721169746729,
+      "grad_norm": 0.11879398673772812,
+      "learning_rate": 0.00019621079134465096,
+      "loss": 2.8275,
+      "step": 10890
+    },
+    {
+      "epoch": 0.6324523484870463,
+      "grad_norm": 0.11795203387737274,
+      "learning_rate": 0.00019567086689082562,
+      "loss": 2.828,
+      "step": 10900
+    },
+    {
+      "epoch": 0.6330325799994198,
+      "grad_norm": 0.1163572296500206,
+      "learning_rate": 0.00019513132658779758,
+      "loss": 2.8387,
+      "step": 10910
+    },
+    {
+      "epoch": 0.6336128115117932,
+      "grad_norm": 0.11812139302492142,
+      "learning_rate": 0.00019459217242221092,
+      "loss": 2.8336,
+      "step": 10920
+    },
+    {
+      "epoch": 0.6341930430241667,
+      "grad_norm": 0.11195320636034012,
+      "learning_rate": 0.00019405340637928755,
+      "loss": 2.8427,
+      "step": 10930
+    },
+    {
+      "epoch": 0.6347732745365401,
+      "grad_norm": 0.11674754321575165,
+      "learning_rate": 0.0001935150304428206,
+      "loss": 2.8279,
+      "step": 10940
+    },
+    {
+      "epoch": 0.6353535060489135,
+      "grad_norm": 0.11432943493127823,
+      "learning_rate": 0.00019297704659516655,
+      "loss": 2.8267,
+      "step": 10950
+    },
+    {
+      "epoch": 0.6359337375612869,
+      "grad_norm": 0.12507887184619904,
+      "learning_rate": 0.0001924394568172384,
+      "loss": 2.8309,
+      "step": 10960
+    },
+    {
+      "epoch": 0.6365139690736604,
+      "grad_norm": 0.12057894468307495,
+      "learning_rate": 0.0001919022630884981,
+      "loss": 2.8422,
+      "step": 10970
+    },
+    {
+      "epoch": 0.6370942005860338,
+      "grad_norm": 0.11377721279859543,
+      "learning_rate": 0.000191365467386949,
+      "loss": 2.8381,
+      "step": 10980
+    },
+    {
+      "epoch": 0.6376744320984072,
+      "grad_norm": 0.11800755560398102,
+      "learning_rate": 0.00019082907168912932,
+      "loss": 2.8331,
+      "step": 10990
+    },
+    {
+      "epoch": 0.6382546636107806,
+      "grad_norm": 0.12301038950681686,
+      "learning_rate": 0.00019029307797010402,
+      "loss": 2.831,
+      "step": 11000
+    },
+    {
+      "epoch": 0.6382546636107806,
+      "eval_loss": 2.796895742416382,
+      "eval_runtime": 3.2627,
+      "eval_samples_per_second": 1327.123,
+      "eval_steps_per_second": 2.758,
+      "step": 11000
+    },
+    {
+      "epoch": 0.6388348951231542,
+      "grad_norm": 0.1179603561758995,
+      "learning_rate": 0.00018975748820345838,
+      "loss": 2.8436,
+      "step": 11010
+    },
+    {
+      "epoch": 0.6394151266355276,
+      "grad_norm": 0.13155020773410797,
+      "learning_rate": 0.0001892223043612898,
+      "loss": 2.8317,
+      "step": 11020
+    },
+    {
+      "epoch": 0.639995358147901,
+      "grad_norm": 0.11468763649463654,
+      "learning_rate": 0.00018868752841420122,
+      "loss": 2.8284,
+      "step": 11030
+    },
+    {
+      "epoch": 0.6405755896602745,
+      "grad_norm": 0.10960279405117035,
+      "learning_rate": 0.00018815316233129393,
+      "loss": 2.8286,
+      "step": 11040
+    },
+    {
+      "epoch": 0.6411558211726479,
+      "grad_norm": 0.1298363208770752,
+      "learning_rate": 0.00018761920808015966,
+      "loss": 2.8326,
+      "step": 11050
+    },
+    {
+      "epoch": 0.6417360526850213,
+      "grad_norm": 0.11535240709781647,
+      "learning_rate": 0.00018708566762687403,
+      "loss": 2.8281,
+      "step": 11060
+    },
+    {
+      "epoch": 0.6423162841973947,
+      "grad_norm": 0.12528617680072784,
+      "learning_rate": 0.00018655254293598866,
+      "loss": 2.8179,
+      "step": 11070
+    },
+    {
+      "epoch": 0.6428965157097682,
+      "grad_norm": 0.11952237784862518,
+      "learning_rate": 0.00018601983597052468,
+      "loss": 2.8294,
+      "step": 11080
+    },
+    {
+      "epoch": 0.6434767472221417,
+      "grad_norm": 0.12121649086475372,
+      "learning_rate": 0.00018548754869196496,
+      "loss": 2.8336,
+      "step": 11090
+    },
+    {
+      "epoch": 0.6440569787345151,
+      "grad_norm": 0.12465447187423706,
+      "learning_rate": 0.00018495568306024687,
+      "loss": 2.8314,
+      "step": 11100
+    },
+    {
+      "epoch": 0.6446372102468885,
+      "grad_norm": 0.10858411341905594,
+      "learning_rate": 0.00018442424103375563,
+      "loss": 2.8191,
+      "step": 11110
+    },
+    {
+      "epoch": 0.645217441759262,
+      "grad_norm": 0.1240803673863411,
+      "learning_rate": 0.00018389322456931616,
+      "loss": 2.8334,
+      "step": 11120
+    },
+    {
+      "epoch": 0.6457976732716354,
+      "grad_norm": 0.11604313552379608,
+      "learning_rate": 0.00018336263562218695,
+      "loss": 2.8241,
+      "step": 11130
+    },
+    {
+      "epoch": 0.6463779047840088,
+      "grad_norm": 0.10764401406049728,
+      "learning_rate": 0.00018283247614605185,
+      "loss": 2.8343,
+      "step": 11140
+    },
+    {
+      "epoch": 0.6469581362963822,
+      "grad_norm": 0.11341771483421326,
+      "learning_rate": 0.00018230274809301377,
+      "loss": 2.8323,
+      "step": 11150
+    },
+    {
+      "epoch": 0.6475383678087557,
+      "grad_norm": 0.11618595570325851,
+      "learning_rate": 0.00018177345341358699,
+      "loss": 2.8295,
+      "step": 11160
+    },
+    {
+      "epoch": 0.6481185993211291,
+      "grad_norm": 0.11492364853620529,
+      "learning_rate": 0.00018124459405668967,
+      "loss": 2.8253,
+      "step": 11170
+    },
+    {
+      "epoch": 0.6486988308335025,
+      "grad_norm": 0.12541726231575012,
+      "learning_rate": 0.0001807161719696377,
+      "loss": 2.8305,
+      "step": 11180
+    },
+    {
+      "epoch": 0.649279062345876,
+      "grad_norm": 0.1240224838256836,
+      "learning_rate": 0.0001801881890981362,
+      "loss": 2.832,
+      "step": 11190
+    },
+    {
+      "epoch": 0.6498592938582495,
+      "grad_norm": 0.12260005623102188,
+      "learning_rate": 0.00017966064738627363,
+      "loss": 2.8274,
+      "step": 11200
+    },
+    {
+      "epoch": 0.6504395253706229,
+      "grad_norm": 0.11284399777650833,
+      "learning_rate": 0.00017913354877651386,
+      "loss": 2.8291,
+      "step": 11210
+    },
+    {
+      "epoch": 0.6510197568829963,
+      "grad_norm": 0.11993937194347382,
+      "learning_rate": 0.00017860689520968906,
+      "loss": 2.8357,
+      "step": 11220
+    },
+    {
+      "epoch": 0.6515999883953697,
+      "grad_norm": 0.11259515583515167,
+      "learning_rate": 0.00017808068862499302,
+      "loss": 2.8134,
+      "step": 11230
+    },
+    {
+      "epoch": 0.6521802199077432,
+      "grad_norm": 0.1146656796336174,
+      "learning_rate": 0.0001775549309599733,
+      "loss": 2.8275,
+      "step": 11240
+    },
+    {
+      "epoch": 0.6527604514201166,
+      "grad_norm": 0.11118417978286743,
+      "learning_rate": 0.0001770296241505248,
+      "loss": 2.8276,
+      "step": 11250
+    },
+    {
+      "epoch": 0.65334068293249,
+      "grad_norm": 0.1155654564499855,
+      "learning_rate": 0.00017650477013088218,
+      "loss": 2.8333,
+      "step": 11260
+    },
+    {
+      "epoch": 0.6539209144448636,
+      "grad_norm": 0.12370238453149796,
+      "learning_rate": 0.000175980370833613,
+      "loss": 2.8209,
+      "step": 11270
+    },
+    {
+      "epoch": 0.654501145957237,
+      "grad_norm": 0.11332956701517105,
+      "learning_rate": 0.00017545642818961045,
+      "loss": 2.824,
+      "step": 11280
+    },
+    {
+      "epoch": 0.6550813774696104,
+      "grad_norm": 0.11696597188711166,
+      "learning_rate": 0.00017493294412808603,
+      "loss": 2.8285,
+      "step": 11290
+    },
+    {
+      "epoch": 0.6556616089819838,
+      "grad_norm": 0.11556991934776306,
+      "learning_rate": 0.00017440992057656302,
+      "loss": 2.833,
+      "step": 11300
+    },
+    {
+      "epoch": 0.6562418404943573,
+      "grad_norm": 0.11072834581136703,
+      "learning_rate": 0.000173887359460869,
+      "loss": 2.8202,
+      "step": 11310
+    },
+    {
+      "epoch": 0.6568220720067307,
+      "grad_norm": 0.12139474600553513,
+      "learning_rate": 0.0001733652627051285,
+      "loss": 2.8323,
+      "step": 11320
+    },
+    {
+      "epoch": 0.6574023035191041,
+      "grad_norm": 0.11882605403661728,
+      "learning_rate": 0.0001728436322317567,
+      "loss": 2.8325,
+      "step": 11330
+    },
+    {
+      "epoch": 0.6579825350314775,
+      "grad_norm": 0.10851707309484482,
+      "learning_rate": 0.00017232246996145163,
+      "loss": 2.8304,
+      "step": 11340
+    },
+    {
+      "epoch": 0.658562766543851,
+      "grad_norm": 0.11566723883152008,
+      "learning_rate": 0.0001718017778131873,
+      "loss": 2.8359,
+      "step": 11350
+    },
+    {
+      "epoch": 0.6591429980562244,
+      "grad_norm": 0.1224483922123909,
+      "learning_rate": 0.00017128155770420673,
+      "loss": 2.8246,
+      "step": 11360
+    },
+    {
+      "epoch": 0.6597232295685979,
+      "grad_norm": 0.11472085118293762,
+      "learning_rate": 0.00017076181155001492,
+      "loss": 2.8274,
+      "step": 11370
+    },
+    {
+      "epoch": 0.6603034610809713,
+      "grad_norm": 0.11463634669780731,
+      "learning_rate": 0.00017024254126437149,
+      "loss": 2.8208,
+      "step": 11380
+    },
+    {
+      "epoch": 0.6608836925933448,
+      "grad_norm": 0.11640073359012604,
+      "learning_rate": 0.00016972374875928427,
+      "loss": 2.8351,
+      "step": 11390
+    },
+    {
+      "epoch": 0.6614639241057182,
+      "grad_norm": 0.12146312743425369,
+      "learning_rate": 0.00016920543594500147,
+      "loss": 2.8249,
+      "step": 11400
+    },
+    {
+      "epoch": 0.6620441556180916,
+      "grad_norm": 0.11683548241853714,
+      "learning_rate": 0.00016868760473000524,
+      "loss": 2.8281,
+      "step": 11410
+    },
+    {
+      "epoch": 0.662624387130465,
+      "grad_norm": 0.11443763226270676,
+      "learning_rate": 0.0001681702570210043,
+      "loss": 2.8239,
+      "step": 11420
+    },
+    {
+      "epoch": 0.6632046186428385,
+      "grad_norm": 0.1136617586016655,
+      "learning_rate": 0.00016765339472292714,
+      "loss": 2.827,
+      "step": 11430
+    },
+    {
+      "epoch": 0.6637848501552119,
+      "grad_norm": 0.11093004792928696,
+      "learning_rate": 0.00016713701973891472,
+      "loss": 2.8359,
+      "step": 11440
+    },
+    {
+      "epoch": 0.6643650816675853,
+      "grad_norm": 0.12110643088817596,
+      "learning_rate": 0.00016662113397031413,
+      "loss": 2.8164,
+      "step": 11450
+    },
+    {
+      "epoch": 0.6649453131799588,
+      "grad_norm": 0.12236957252025604,
+      "learning_rate": 0.00016610573931667065,
+      "loss": 2.8295,
+      "step": 11460
+    },
+    {
+      "epoch": 0.6655255446923323,
+      "grad_norm": 0.11643628776073456,
+      "learning_rate": 0.0001655908376757214,
+      "loss": 2.8199,
+      "step": 11470
+    },
+    {
+      "epoch": 0.6661057762047057,
+      "grad_norm": 0.12198419123888016,
+      "learning_rate": 0.00016507643094338818,
+      "loss": 2.8234,
+      "step": 11480
+    },
+    {
+      "epoch": 0.6666860077170791,
+      "grad_norm": 0.11697736382484436,
+      "learning_rate": 0.00016456252101377042,
+      "loss": 2.8309,
+      "step": 11490
+    },
+    {
+      "epoch": 0.6672662392294526,
+      "grad_norm": 0.11377154290676117,
+      "learning_rate": 0.00016404910977913824,
+      "loss": 2.8174,
+      "step": 11500
+    },
+    {
+      "epoch": 0.667846470741826,
+      "grad_norm": 0.1169874370098114,
+      "learning_rate": 0.0001635361991299258,
+      "loss": 2.8174,
+      "step": 11510
+    },
+    {
+      "epoch": 0.6684267022541994,
+      "grad_norm": 0.11022408306598663,
+      "learning_rate": 0.00016302379095472374,
+      "loss": 2.8251,
+      "step": 11520
+    },
+    {
+      "epoch": 0.6690069337665728,
+      "grad_norm": 0.11143022775650024,
+      "learning_rate": 0.00016251188714027265,
+      "loss": 2.832,
+      "step": 11530
+    },
+    {
+      "epoch": 0.6695871652789464,
+      "grad_norm": 0.11829391121864319,
+      "learning_rate": 0.00016200048957145597,
+      "loss": 2.8181,
+      "step": 11540
+    },
+    {
+      "epoch": 0.6701673967913198,
+      "grad_norm": 0.11668332666158676,
+      "learning_rate": 0.00016148960013129303,
+      "loss": 2.8163,
+      "step": 11550
+    },
+    {
+      "epoch": 0.6707476283036932,
+      "grad_norm": 0.11444656550884247,
+      "learning_rate": 0.0001609792207009325,
+      "loss": 2.8171,
+      "step": 11560
+    },
+    {
+      "epoch": 0.6713278598160666,
+      "grad_norm": 0.11538255959749222,
+      "learning_rate": 0.00016046935315964476,
+      "loss": 2.8192,
+      "step": 11570
+    },
+    {
+      "epoch": 0.6719080913284401,
+      "grad_norm": 0.13890443742275238,
+      "learning_rate": 0.0001599599993848155,
+      "loss": 2.814,
+      "step": 11580
+    },
+    {
+      "epoch": 0.6724883228408135,
+      "grad_norm": 0.10878733545541763,
+      "learning_rate": 0.00015945116125193876,
+      "loss": 2.8161,
+      "step": 11590
+    },
+    {
+      "epoch": 0.6730685543531869,
+      "grad_norm": 0.11337769776582718,
+      "learning_rate": 0.00015894284063460966,
+      "loss": 2.8161,
+      "step": 11600
+    },
+    {
+      "epoch": 0.6736487858655603,
+      "grad_norm": 0.1095629557967186,
+      "learning_rate": 0.00015843503940451834,
+      "loss": 2.8087,
+      "step": 11610
+    },
+    {
+      "epoch": 0.6742290173779338,
+      "grad_norm": 0.1378069370985031,
+      "learning_rate": 0.00015792775943144165,
+      "loss": 2.8151,
+      "step": 11620
+    },
+    {
+      "epoch": 0.6748092488903072,
+      "grad_norm": 0.1202809140086174,
+      "learning_rate": 0.00015742100258323794,
+      "loss": 2.831,
+      "step": 11630
+    },
+    {
+      "epoch": 0.6753894804026807,
+      "grad_norm": 0.12298610061407089,
+      "learning_rate": 0.00015691477072583894,
+      "loss": 2.8247,
+      "step": 11640
+    },
+    {
+      "epoch": 0.6759697119150541,
+      "grad_norm": 0.11947082728147507,
+      "learning_rate": 0.00015640906572324319,
+      "loss": 2.8238,
+      "step": 11650
+    },
+    {
+      "epoch": 0.6765499434274276,
+      "grad_norm": 0.11039472371339798,
+      "learning_rate": 0.00015590388943750988,
+      "loss": 2.8267,
+      "step": 11660
+    },
+    {
+      "epoch": 0.677130174939801,
+      "grad_norm": 0.11807908117771149,
+      "learning_rate": 0.0001553992437287505,
+      "loss": 2.8222,
+      "step": 11670
+    },
+    {
+      "epoch": 0.6777104064521744,
+      "grad_norm": 0.11934113502502441,
+      "learning_rate": 0.00015489513045512386,
+      "loss": 2.8193,
+      "step": 11680
+    },
+    {
+      "epoch": 0.6782906379645478,
+      "grad_norm": 0.11163033545017242,
+      "learning_rate": 0.00015439155147282764,
+      "loss": 2.8137,
+      "step": 11690
+    },
+    {
+      "epoch": 0.6788708694769213,
+      "grad_norm": 0.11381068080663681,
+      "learning_rate": 0.0001538885086360923,
+      "loss": 2.8202,
+      "step": 11700
+    },
+    {
+      "epoch": 0.6794511009892947,
+      "grad_norm": 0.11011006683111191,
+      "learning_rate": 0.0001533860037971747,
+      "loss": 2.8213,
+      "step": 11710
+    },
+    {
+      "epoch": 0.6800313325016681,
+      "grad_norm": 0.11611464619636536,
+      "learning_rate": 0.0001528840388063497,
+      "loss": 2.8216,
+      "step": 11720
+    },
+    {
+      "epoch": 0.6806115640140415,
+      "grad_norm": 0.10734301805496216,
+      "learning_rate": 0.0001523826155119055,
+      "loss": 2.8188,
+      "step": 11730
+    },
+    {
+      "epoch": 0.6811917955264151,
+      "grad_norm": 0.12189003825187683,
+      "learning_rate": 0.00015188173576013482,
+      "loss": 2.8206,
+      "step": 11740
+    },
+    {
+      "epoch": 0.6817720270387885,
+      "grad_norm": 0.11146776378154755,
+      "learning_rate": 0.0001513814013953296,
+      "loss": 2.8176,
+      "step": 11750
+    },
+    {
+      "epoch": 0.6823522585511619,
+      "grad_norm": 0.11531021445989609,
+      "learning_rate": 0.0001508816142597733,
+      "loss": 2.8192,
+      "step": 11760
+    },
+    {
+      "epoch": 0.6829324900635354,
+      "grad_norm": 0.11541693657636642,
+      "learning_rate": 0.00015038237619373443,
+      "loss": 2.8219,
+      "step": 11770
+    },
+    {
+      "epoch": 0.6835127215759088,
+      "grad_norm": 0.11345332115888596,
+      "learning_rate": 0.0001498836890354602,
+      "loss": 2.8024,
+      "step": 11780
+    },
+    {
+      "epoch": 0.6840929530882822,
+      "grad_norm": 0.10796009749174118,
+      "learning_rate": 0.00014938555462116842,
+      "loss": 2.8119,
+      "step": 11790
+    },
+    {
+      "epoch": 0.6846731846006556,
+      "grad_norm": 0.11463455855846405,
+      "learning_rate": 0.00014888797478504261,
+      "loss": 2.8119,
+      "step": 11800
+    },
+    {
+      "epoch": 0.6852534161130291,
+      "grad_norm": 0.11192594468593597,
+      "learning_rate": 0.00014839095135922372,
+      "loss": 2.8252,
+      "step": 11810
+    },
+    {
+      "epoch": 0.6858336476254026,
+      "grad_norm": 0.11805829405784607,
+      "learning_rate": 0.000147894486173804,
+      "loss": 2.8095,
+      "step": 11820
+    },
+    {
+      "epoch": 0.686413879137776,
+      "grad_norm": 0.11721805483102798,
+      "learning_rate": 0.00014739858105682053,
+      "loss": 2.8123,
+      "step": 11830
+    },
+    {
+      "epoch": 0.6869941106501494,
+      "grad_norm": 0.11619780957698822,
+      "learning_rate": 0.0001469032378342475,
+      "loss": 2.8177,
+      "step": 11840
+    },
+    {
+      "epoch": 0.6875743421625229,
+      "grad_norm": 0.10933215916156769,
+      "learning_rate": 0.00014640845832999087,
+      "loss": 2.8078,
+      "step": 11850
+    },
+    {
+      "epoch": 0.6881545736748963,
+      "grad_norm": 0.11362309753894806,
+      "learning_rate": 0.0001459142443658805,
+      "loss": 2.8103,
+      "step": 11860
+    },
+    {
+      "epoch": 0.6887348051872697,
+      "grad_norm": 0.10805781930685043,
+      "learning_rate": 0.00014542059776166382,
+      "loss": 2.8073,
+      "step": 11870
+    },
+    {
+      "epoch": 0.6893150366996431,
+      "grad_norm": 0.124758280813694,
+      "learning_rate": 0.00014492752033499977,
+      "loss": 2.8133,
+      "step": 11880
+    },
+    {
+      "epoch": 0.6898952682120166,
+      "grad_norm": 0.11096182465553284,
+      "learning_rate": 0.00014443501390145057,
+      "loss": 2.8061,
+      "step": 11890
+    },
+    {
+      "epoch": 0.69047549972439,
+      "grad_norm": 0.1132817193865776,
+      "learning_rate": 0.00014394308027447685,
+      "loss": 2.8209,
+      "step": 11900
+    },
+    {
+      "epoch": 0.6910557312367634,
+      "grad_norm": 0.10996360331773758,
+      "learning_rate": 0.00014345172126542966,
+      "loss": 2.8161,
+      "step": 11910
+    },
+    {
+      "epoch": 0.6916359627491369,
+      "grad_norm": 0.11297384649515152,
+      "learning_rate": 0.0001429609386835442,
+      "loss": 2.8116,
+      "step": 11920
+    },
+    {
+      "epoch": 0.6922161942615104,
+      "grad_norm": 0.12191120535135269,
+      "learning_rate": 0.00014247073433593373,
+      "loss": 2.8156,
+      "step": 11930
+    },
+    {
+      "epoch": 0.6927964257738838,
+      "grad_norm": 0.11631318181753159,
+      "learning_rate": 0.00014198111002758154,
+      "loss": 2.8225,
+      "step": 11940
+    },
+    {
+      "epoch": 0.6933766572862572,
+      "grad_norm": 0.14487071335315704,
+      "learning_rate": 0.00014149206756133595,
+      "loss": 2.8153,
+      "step": 11950
+    },
+    {
+      "epoch": 0.6939568887986306,
+      "grad_norm": 0.11780226230621338,
+      "learning_rate": 0.00014100360873790248,
+      "loss": 2.8163,
+      "step": 11960
+    },
+    {
+      "epoch": 0.6945371203110041,
+      "grad_norm": 0.11396613717079163,
+      "learning_rate": 0.00014051573535583766,
+      "loss": 2.8101,
+      "step": 11970
+    },
+    {
+      "epoch": 0.6951173518233775,
+      "grad_norm": 0.11514125019311905,
+      "learning_rate": 0.00014002844921154233,
+      "loss": 2.819,
+      "step": 11980
+    },
+    {
+      "epoch": 0.6956975833357509,
+      "grad_norm": 0.11687569320201874,
+      "learning_rate": 0.00013954175209925513,
+      "loss": 2.8106,
+      "step": 11990
+    },
+    {
+      "epoch": 0.6962778148481245,
+      "grad_norm": 0.11218845099210739,
+      "learning_rate": 0.00013905564581104607,
+      "loss": 2.8156,
+      "step": 12000
+    },
+    {
+      "epoch": 0.6962778148481245,
+      "eval_loss": 2.778130531311035,
+      "eval_runtime": 3.2555,
+      "eval_samples_per_second": 1330.053,
+      "eval_steps_per_second": 2.765,
+      "step": 12000
+    },
+    {
+      "epoch": 0.6968580463604979,
+      "grad_norm": 0.11513704061508179,
+      "learning_rate": 0.000138570132136809,
+      "loss": 2.8185,
+      "step": 12010
+    },
+    {
+      "epoch": 0.6974382778728713,
+      "grad_norm": 0.12384956330060959,
+      "learning_rate": 0.00013808521286425644,
+      "loss": 2.8159,
+      "step": 12020
+    },
+    {
+      "epoch": 0.6980185093852447,
+      "grad_norm": 0.11136494576931,
+      "learning_rate": 0.0001376008897789119,
+      "loss": 2.8196,
+      "step": 12030
+    },
+    {
+      "epoch": 0.6985987408976182,
+      "grad_norm": 0.11704517900943756,
+      "learning_rate": 0.00013711716466410353,
+      "loss": 2.8118,
+      "step": 12040
+    },
+    {
+      "epoch": 0.6991789724099916,
+      "grad_norm": 0.11521551758050919,
+      "learning_rate": 0.00013663403930095827,
+      "loss": 2.8131,
+      "step": 12050
+    },
+    {
+      "epoch": 0.699759203922365,
+      "grad_norm": 0.10568945109844208,
+      "learning_rate": 0.00013615151546839382,
+      "loss": 2.8098,
+      "step": 12060
+    },
+    {
+      "epoch": 0.7003394354347384,
+      "grad_norm": 0.1213884949684143,
+      "learning_rate": 0.00013566959494311386,
+      "loss": 2.8091,
+      "step": 12070
+    },
+    {
+      "epoch": 0.7009196669471119,
+      "grad_norm": 0.11004059761762619,
+      "learning_rate": 0.00013518827949960015,
+      "loss": 2.8238,
+      "step": 12080
+    },
+    {
+      "epoch": 0.7014998984594853,
+      "grad_norm": 0.11095508933067322,
+      "learning_rate": 0.00013470757091010649,
+      "loss": 2.8116,
+      "step": 12090
+    },
+    {
+      "epoch": 0.7020801299718588,
+      "grad_norm": 0.11275944113731384,
+      "learning_rate": 0.00013422747094465234,
+      "loss": 2.8109,
+      "step": 12100
+    },
+    {
+      "epoch": 0.7026603614842322,
+      "grad_norm": 0.11312493681907654,
+      "learning_rate": 0.00013374798137101595,
+      "loss": 2.814,
+      "step": 12110
+    },
+    {
+      "epoch": 0.7032405929966057,
+      "grad_norm": 0.10738647729158401,
+      "learning_rate": 0.00013326910395472833,
+      "loss": 2.8111,
+      "step": 12120
+    },
+    {
+      "epoch": 0.7038208245089791,
+      "grad_norm": 0.11198966205120087,
+      "learning_rate": 0.00013279084045906623,
+      "loss": 2.806,
+      "step": 12130
+    },
+    {
+      "epoch": 0.7044010560213525,
+      "grad_norm": 0.11718153953552246,
+      "learning_rate": 0.00013231319264504594,
+      "loss": 2.8186,
+      "step": 12140
+    },
+    {
+      "epoch": 0.7049812875337259,
+      "grad_norm": 0.11054380983114243,
+      "learning_rate": 0.00013183616227141674,
+      "loss": 2.8144,
+      "step": 12150
+    },
+    {
+      "epoch": 0.7055615190460994,
+      "grad_norm": 0.11579257249832153,
+      "learning_rate": 0.0001313597510946543,
+      "loss": 2.8101,
+      "step": 12160
+    },
+    {
+      "epoch": 0.7061417505584728,
+      "grad_norm": 0.10710903257131577,
+      "learning_rate": 0.00013088396086895476,
+      "loss": 2.8104,
+      "step": 12170
+    },
+    {
+      "epoch": 0.7067219820708462,
+      "grad_norm": 0.11220473051071167,
+      "learning_rate": 0.00013040879334622738,
+      "loss": 2.8049,
+      "step": 12180
+    },
+    {
+      "epoch": 0.7073022135832197,
+      "grad_norm": 0.10872667282819748,
+      "learning_rate": 0.00012993425027608884,
+      "loss": 2.8175,
+      "step": 12190
+    },
+    {
+      "epoch": 0.7078824450955932,
+      "grad_norm": 0.10861840099096298,
+      "learning_rate": 0.00012946033340585641,
+      "loss": 2.8072,
+      "step": 12200
+    },
+    {
+      "epoch": 0.7084626766079666,
+      "grad_norm": 0.11558268964290619,
+      "learning_rate": 0.00012898704448054162,
+      "loss": 2.8034,
+      "step": 12210
+    },
+    {
+      "epoch": 0.70904290812034,
+      "grad_norm": 0.11709378659725189,
+      "learning_rate": 0.00012851438524284382,
+      "loss": 2.8047,
+      "step": 12220
+    },
+    {
+      "epoch": 0.7096231396327135,
+      "grad_norm": 0.12139759957790375,
+      "learning_rate": 0.00012804235743314401,
+      "loss": 2.8056,
+      "step": 12230
+    },
+    {
+      "epoch": 0.7102033711450869,
+      "grad_norm": 0.11130308359861374,
+      "learning_rate": 0.00012757096278949792,
+      "loss": 2.8138,
+      "step": 12240
+    },
+    {
+      "epoch": 0.7107836026574603,
+      "grad_norm": 0.1112653836607933,
+      "learning_rate": 0.00012710020304763003,
+      "loss": 2.8004,
+      "step": 12250
+    },
+    {
+      "epoch": 0.7113638341698337,
+      "grad_norm": 0.11182957142591476,
+      "learning_rate": 0.00012663007994092703,
+      "loss": 2.8064,
+      "step": 12260
+    },
+    {
+      "epoch": 0.7119440656822072,
+      "grad_norm": 0.13386094570159912,
+      "learning_rate": 0.00012616059520043145,
+      "loss": 2.8148,
+      "step": 12270
+    },
+    {
+      "epoch": 0.7125242971945807,
+      "grad_norm": 0.11641652137041092,
+      "learning_rate": 0.0001256917505548352,
+      "loss": 2.8102,
+      "step": 12280
+    },
+    {
+      "epoch": 0.7131045287069541,
+      "grad_norm": 0.10916447639465332,
+      "learning_rate": 0.00012522354773047352,
+      "loss": 2.8148,
+      "step": 12290
+    },
+    {
+      "epoch": 0.7136847602193275,
+      "grad_norm": 0.10887318104505539,
+      "learning_rate": 0.0001247559884513182,
+      "loss": 2.8047,
+      "step": 12300
+    },
+    {
+      "epoch": 0.714264991731701,
+      "grad_norm": 0.11701834946870804,
+      "learning_rate": 0.0001242890744389715,
+      "loss": 2.8144,
+      "step": 12310
+    },
+    {
+      "epoch": 0.7148452232440744,
+      "grad_norm": 0.10473381727933884,
+      "learning_rate": 0.00012382280741265968,
+      "loss": 2.8057,
+      "step": 12320
+    },
+    {
+      "epoch": 0.7154254547564478,
+      "grad_norm": 0.10586260259151459,
+      "learning_rate": 0.00012335718908922685,
+      "loss": 2.8032,
+      "step": 12330
+    },
+    {
+      "epoch": 0.7160056862688212,
+      "grad_norm": 0.10688824206590652,
+      "learning_rate": 0.00012289222118312822,
+      "loss": 2.8054,
+      "step": 12340
+    },
+    {
+      "epoch": 0.7165859177811947,
+      "grad_norm": 0.11233460903167725,
+      "learning_rate": 0.0001224279054064247,
+      "loss": 2.801,
+      "step": 12350
+    },
+    {
+      "epoch": 0.7171661492935681,
+      "grad_norm": 0.10600557923316956,
+      "learning_rate": 0.00012196424346877541,
+      "loss": 2.8035,
+      "step": 12360
+    },
+    {
+      "epoch": 0.7177463808059416,
+      "grad_norm": 0.11300963163375854,
+      "learning_rate": 0.00012150123707743219,
+      "loss": 2.8098,
+      "step": 12370
+    },
+    {
+      "epoch": 0.718326612318315,
+      "grad_norm": 0.11773265898227692,
+      "learning_rate": 0.00012103888793723312,
+      "loss": 2.8103,
+      "step": 12380
+    },
+    {
+      "epoch": 0.7189068438306885,
+      "grad_norm": 0.11092250049114227,
+      "learning_rate": 0.00012057719775059602,
+      "loss": 2.8028,
+      "step": 12390
+    },
+    {
+      "epoch": 0.7194870753430619,
+      "grad_norm": 0.10554751008749008,
+      "learning_rate": 0.00012011616821751271,
+      "loss": 2.8044,
+      "step": 12400
+    },
+    {
+      "epoch": 0.7200673068554353,
+      "grad_norm": 0.1148175522685051,
+      "learning_rate": 0.0001196558010355422,
+      "loss": 2.8099,
+      "step": 12410
+    },
+    {
+      "epoch": 0.7206475383678087,
+      "grad_norm": 0.10981535166501999,
+      "learning_rate": 0.00011919609789980458,
+      "loss": 2.7991,
+      "step": 12420
+    },
+    {
+      "epoch": 0.7212277698801822,
+      "grad_norm": 0.11188452690839767,
+      "learning_rate": 0.00011873706050297508,
+      "loss": 2.8067,
+      "step": 12430
+    },
+    {
+      "epoch": 0.7218080013925556,
+      "grad_norm": 0.11328940838575363,
+      "learning_rate": 0.00011827869053527727,
+      "loss": 2.8049,
+      "step": 12440
+    },
+    {
+      "epoch": 0.722388232904929,
+      "grad_norm": 0.11542364954948425,
+      "learning_rate": 0.00011782098968447774,
+      "loss": 2.7988,
+      "step": 12450
+    },
+    {
+      "epoch": 0.7229684644173026,
+      "grad_norm": 0.11087549477815628,
+      "learning_rate": 0.00011736395963587857,
+      "loss": 2.8102,
+      "step": 12460
+    },
+    {
+      "epoch": 0.723548695929676,
+      "grad_norm": 0.11298040300607681,
+      "learning_rate": 0.00011690760207231256,
+      "loss": 2.8063,
+      "step": 12470
+    },
+    {
+      "epoch": 0.7241289274420494,
+      "grad_norm": 0.10775293409824371,
+      "learning_rate": 0.00011645191867413596,
+      "loss": 2.8065,
+      "step": 12480
+    },
+    {
+      "epoch": 0.7247091589544228,
+      "grad_norm": 0.11240221560001373,
+      "learning_rate": 0.00011599691111922272,
+      "loss": 2.8062,
+      "step": 12490
+    },
+    {
+      "epoch": 0.7252893904667963,
+      "grad_norm": 0.1069854348897934,
+      "learning_rate": 0.00011554258108295859,
+      "loss": 2.79,
+      "step": 12500
+    },
+    {
+      "epoch": 0.7258696219791697,
+      "grad_norm": 0.11566832661628723,
+      "learning_rate": 0.00011508893023823393,
+      "loss": 2.7977,
+      "step": 12510
+    },
+    {
+      "epoch": 0.7264498534915431,
+      "grad_norm": 0.11771980673074722,
+      "learning_rate": 0.00011463596025543905,
+      "loss": 2.803,
+      "step": 12520
+    },
+    {
+      "epoch": 0.7270300850039165,
+      "grad_norm": 0.11435101926326752,
+      "learning_rate": 0.0001141836728024567,
+      "loss": 2.7985,
+      "step": 12530
+    },
+    {
+      "epoch": 0.72761031651629,
+      "grad_norm": 0.10902056097984314,
+      "learning_rate": 0.0001137320695446566,
+      "loss": 2.8096,
+      "step": 12540
+    },
+    {
+      "epoch": 0.7281905480286635,
+      "grad_norm": 0.10939980298280716,
+      "learning_rate": 0.0001132811521448896,
+      "loss": 2.8121,
+      "step": 12550
+    },
+    {
+      "epoch": 0.7287707795410369,
+      "grad_norm": 0.10922636091709137,
+      "learning_rate": 0.00011283092226348031,
+      "loss": 2.8093,
+      "step": 12560
+    },
+    {
+      "epoch": 0.7293510110534103,
+      "grad_norm": 0.10520195960998535,
+      "learning_rate": 0.00011238138155822275,
+      "loss": 2.8031,
+      "step": 12570
+    },
+    {
+      "epoch": 0.7299312425657838,
+      "grad_norm": 0.10655706375837326,
+      "learning_rate": 0.00011193253168437253,
+      "loss": 2.8083,
+      "step": 12580
+    },
+    {
+      "epoch": 0.7305114740781572,
+      "grad_norm": 0.11627507954835892,
+      "learning_rate": 0.00011148437429464215,
+      "loss": 2.7994,
+      "step": 12590
+    },
+    {
+      "epoch": 0.7310917055905306,
+      "grad_norm": 0.1093965470790863,
+      "learning_rate": 0.00011103691103919401,
+      "loss": 2.8054,
+      "step": 12600
+    },
+    {
+      "epoch": 0.731671937102904,
+      "grad_norm": 0.113887257874012,
+      "learning_rate": 0.00011059014356563458,
+      "loss": 2.7963,
+      "step": 12610
+    },
+    {
+      "epoch": 0.7322521686152775,
+      "grad_norm": 0.10929399728775024,
+      "learning_rate": 0.00011014407351900879,
+      "loss": 2.8033,
+      "step": 12620
+    },
+    {
+      "epoch": 0.7328324001276509,
+      "grad_norm": 0.11176785826683044,
+      "learning_rate": 0.00010969870254179285,
+      "loss": 2.8061,
+      "step": 12630
+    },
+    {
+      "epoch": 0.7334126316400243,
+      "grad_norm": 0.10631275177001953,
+      "learning_rate": 0.00010925403227388973,
+      "loss": 2.8107,
+      "step": 12640
+    },
+    {
+      "epoch": 0.7339928631523978,
+      "grad_norm": 0.11108485609292984,
+      "learning_rate": 0.00010881006435262179,
+      "loss": 2.8059,
+      "step": 12650
+    },
+    {
+      "epoch": 0.7345730946647713,
+      "grad_norm": 0.10749488323926926,
+      "learning_rate": 0.00010836680041272536,
+      "loss": 2.8004,
+      "step": 12660
+    },
+    {
+      "epoch": 0.7351533261771447,
+      "grad_norm": 0.10994744300842285,
+      "learning_rate": 0.00010792424208634495,
+      "loss": 2.8093,
+      "step": 12670
+    },
+    {
+      "epoch": 0.7357335576895181,
+      "grad_norm": 0.10910103470087051,
+      "learning_rate": 0.00010748239100302627,
+      "loss": 2.7928,
+      "step": 12680
+    },
+    {
+      "epoch": 0.7363137892018915,
+      "grad_norm": 0.10835743695497513,
+      "learning_rate": 0.0001070412487897117,
+      "loss": 2.8077,
+      "step": 12690
+    },
+    {
+      "epoch": 0.736894020714265,
+      "grad_norm": 0.10580655187368393,
+      "learning_rate": 0.00010660081707073288,
+      "loss": 2.7991,
+      "step": 12700
+    },
+    {
+      "epoch": 0.7374742522266384,
+      "grad_norm": 0.10928157716989517,
+      "learning_rate": 0.00010616109746780546,
+      "loss": 2.7905,
+      "step": 12710
+    },
+    {
+      "epoch": 0.7380544837390118,
+      "grad_norm": 0.10654684156179428,
+      "learning_rate": 0.00010572209160002339,
+      "loss": 2.8021,
+      "step": 12720
+    },
+    {
+      "epoch": 0.7386347152513854,
+      "grad_norm": 0.10834140330553055,
+      "learning_rate": 0.00010528380108385186,
+      "loss": 2.805,
+      "step": 12730
+    },
+    {
+      "epoch": 0.7392149467637588,
+      "grad_norm": 0.1152142882347107,
+      "learning_rate": 0.00010484622753312279,
+      "loss": 2.7916,
+      "step": 12740
+    },
+    {
+      "epoch": 0.7397951782761322,
+      "grad_norm": 0.10981319844722748,
+      "learning_rate": 0.0001044093725590277,
+      "loss": 2.8029,
+      "step": 12750
+    },
+    {
+      "epoch": 0.7403754097885056,
+      "grad_norm": 0.1065368577837944,
+      "learning_rate": 0.00010397323777011229,
+      "loss": 2.8048,
+      "step": 12760
+    },
+    {
+      "epoch": 0.7409556413008791,
+      "grad_norm": 0.10563939809799194,
+      "learning_rate": 0.00010353782477227083,
+      "loss": 2.8058,
+      "step": 12770
+    },
+    {
+      "epoch": 0.7415358728132525,
+      "grad_norm": 0.11117275804281235,
+      "learning_rate": 0.00010310313516873922,
+      "loss": 2.7985,
+      "step": 12780
+    },
+    {
+      "epoch": 0.7421161043256259,
+      "grad_norm": 0.11544723808765411,
+      "learning_rate": 0.00010266917056009036,
+      "loss": 2.8001,
+      "step": 12790
+    },
+    {
+      "epoch": 0.7426963358379993,
+      "grad_norm": 0.11005005240440369,
+      "learning_rate": 0.00010223593254422733,
+      "loss": 2.7954,
+      "step": 12800
+    },
+    {
+      "epoch": 0.7432765673503728,
+      "grad_norm": 0.11374104768037796,
+      "learning_rate": 0.0001018034227163779,
+      "loss": 2.8053,
+      "step": 12810
+    },
+    {
+      "epoch": 0.7438567988627462,
+      "grad_norm": 0.11264318227767944,
+      "learning_rate": 0.00010137164266908854,
+      "loss": 2.8029,
+      "step": 12820
+    },
+    {
+      "epoch": 0.7444370303751197,
+      "grad_norm": 0.10718287527561188,
+      "learning_rate": 0.00010094059399221855,
+      "loss": 2.7964,
+      "step": 12830
+    },
+    {
+      "epoch": 0.7450172618874931,
+      "grad_norm": 0.11395127326250076,
+      "learning_rate": 0.00010051027827293457,
+      "loss": 2.8057,
+      "step": 12840
+    },
+    {
+      "epoch": 0.7455974933998666,
+      "grad_norm": 0.11251317709684372,
+      "learning_rate": 0.00010008069709570378,
+      "loss": 2.8036,
+      "step": 12850
+    },
+    {
+      "epoch": 0.74617772491224,
+      "grad_norm": 0.1180030032992363,
+      "learning_rate": 9.965185204228941e-05,
+      "loss": 2.8016,
+      "step": 12860
+    },
+    {
+      "epoch": 0.7467579564246134,
+      "grad_norm": 0.12361141294240952,
+      "learning_rate": 9.922374469174372e-05,
+      "loss": 2.7891,
+      "step": 12870
+    },
+    {
+      "epoch": 0.7473381879369868,
+      "grad_norm": 0.11456003040075302,
+      "learning_rate": 9.879637662040275e-05,
+      "loss": 2.8028,
+      "step": 12880
+    },
+    {
+      "epoch": 0.7479184194493603,
+      "grad_norm": 0.11008987575769424,
+      "learning_rate": 9.83697494018808e-05,
+      "loss": 2.8093,
+      "step": 12890
+    },
+    {
+      "epoch": 0.7484986509617337,
+      "grad_norm": 0.11017616838216782,
+      "learning_rate": 9.794386460706356e-05,
+      "loss": 2.8005,
+      "step": 12900
+    },
+    {
+      "epoch": 0.7490788824741071,
+      "grad_norm": 0.11627316474914551,
+      "learning_rate": 9.751872380410378e-05,
+      "loss": 2.799,
+      "step": 12910
+    },
+    {
+      "epoch": 0.7496591139864806,
+      "grad_norm": 0.11369270831346512,
+      "learning_rate": 9.709432855841436e-05,
+      "loss": 2.7941,
+      "step": 12920
+    },
+    {
+      "epoch": 0.7502393454988541,
+      "grad_norm": 0.10983362793922424,
+      "learning_rate": 9.667068043266302e-05,
+      "loss": 2.7996,
+      "step": 12930
+    },
+    {
+      "epoch": 0.7508195770112275,
+      "grad_norm": 0.10419350117444992,
+      "learning_rate": 9.624778098676652e-05,
+      "loss": 2.8052,
+      "step": 12940
+    },
+    {
+      "epoch": 0.7513998085236009,
+      "grad_norm": 0.10500075668096542,
+      "learning_rate": 9.582563177788487e-05,
+      "loss": 2.7993,
+      "step": 12950
+    },
+    {
+      "epoch": 0.7519800400359744,
+      "grad_norm": 0.10765775293111801,
+      "learning_rate": 9.540423436041585e-05,
+      "loss": 2.7964,
+      "step": 12960
+    },
+    {
+      "epoch": 0.7525602715483478,
+      "grad_norm": 0.10872151702642441,
+      "learning_rate": 9.49835902859888e-05,
+      "loss": 2.7876,
+      "step": 12970
+    },
+    {
+      "epoch": 0.7531405030607212,
+      "grad_norm": 0.10935165733098984,
+      "learning_rate": 9.456370110345927e-05,
+      "loss": 2.8003,
+      "step": 12980
+    },
+    {
+      "epoch": 0.7537207345730946,
+      "grad_norm": 0.1083398386836052,
+      "learning_rate": 9.414456835890322e-05,
+      "loss": 2.7945,
+      "step": 12990
+    },
+    {
+      "epoch": 0.7543009660854681,
+      "grad_norm": 0.10846253484487534,
+      "learning_rate": 9.372619359561121e-05,
+      "loss": 2.799,
+      "step": 13000
+    },
+    {
+      "epoch": 0.7543009660854681,
+      "eval_loss": 2.7616169452667236,
+      "eval_runtime": 3.2768,
+      "eval_samples_per_second": 1321.408,
+      "eval_steps_per_second": 2.747,
+      "step": 13000
+    },
+    {
+      "epoch": 0.7548811975978416,
+      "grad_norm": 0.10937865823507309,
+      "learning_rate": 9.330857835408318e-05,
+      "loss": 2.7962,
+      "step": 13010
+    },
+    {
+      "epoch": 0.755461429110215,
+      "grad_norm": 0.10633205622434616,
+      "learning_rate": 9.289172417202205e-05,
+      "loss": 2.7989,
+      "step": 13020
+    },
+    {
+      "epoch": 0.7560416606225884,
+      "grad_norm": 0.11001235246658325,
+      "learning_rate": 9.247563258432861e-05,
+      "loss": 2.7955,
+      "step": 13030
+    },
+    {
+      "epoch": 0.7566218921349619,
+      "grad_norm": 0.10847952216863632,
+      "learning_rate": 9.206030512309566e-05,
+      "loss": 2.7959,
+      "step": 13040
+    },
+    {
+      "epoch": 0.7572021236473353,
+      "grad_norm": 0.10858704149723053,
+      "learning_rate": 9.164574331760246e-05,
+      "loss": 2.7965,
+      "step": 13050
+    },
+    {
+      "epoch": 0.7577823551597087,
+      "grad_norm": 0.10710106790065765,
+      "learning_rate": 9.123194869430888e-05,
+      "loss": 2.7921,
+      "step": 13060
+    },
+    {
+      "epoch": 0.7583625866720821,
+      "grad_norm": 0.10932508111000061,
+      "learning_rate": 9.081892277685026e-05,
+      "loss": 2.7921,
+      "step": 13070
+    },
+    {
+      "epoch": 0.7589428181844556,
+      "grad_norm": 0.11362321674823761,
+      "learning_rate": 9.040666708603125e-05,
+      "loss": 2.7981,
+      "step": 13080
+    },
+    {
+      "epoch": 0.759523049696829,
+      "grad_norm": 0.10791613906621933,
+      "learning_rate": 8.999518313982039e-05,
+      "loss": 2.7993,
+      "step": 13090
+    },
+    {
+      "epoch": 0.7601032812092025,
+      "grad_norm": 0.11038652807474136,
+      "learning_rate": 8.958447245334476e-05,
+      "loss": 2.7922,
+      "step": 13100
+    },
+    {
+      "epoch": 0.7606835127215759,
+      "grad_norm": 0.11153964698314667,
+      "learning_rate": 8.91745365388841e-05,
+      "loss": 2.8016,
+      "step": 13110
+    },
+    {
+      "epoch": 0.7612637442339494,
+      "grad_norm": 0.10748942941427231,
+      "learning_rate": 8.876537690586529e-05,
+      "loss": 2.791,
+      "step": 13120
+    },
+    {
+      "epoch": 0.7618439757463228,
+      "grad_norm": 0.1106482520699501,
+      "learning_rate": 8.83569950608572e-05,
+      "loss": 2.8008,
+      "step": 13130
+    },
+    {
+      "epoch": 0.7624242072586962,
+      "grad_norm": 0.10443028807640076,
+      "learning_rate": 8.794939250756441e-05,
+      "loss": 2.7936,
+      "step": 13140
+    },
+    {
+      "epoch": 0.7630044387710696,
+      "grad_norm": 0.11383570730686188,
+      "learning_rate": 8.754257074682222e-05,
+      "loss": 2.7912,
+      "step": 13150
+    },
+    {
+      "epoch": 0.7635846702834431,
+      "grad_norm": 0.10836578160524368,
+      "learning_rate": 8.713653127659105e-05,
+      "loss": 2.7939,
+      "step": 13160
+    },
+    {
+      "epoch": 0.7641649017958165,
+      "grad_norm": 0.10870825499296188,
+      "learning_rate": 8.673127559195066e-05,
+      "loss": 2.7991,
+      "step": 13170
+    },
+    {
+      "epoch": 0.7647451333081899,
+      "grad_norm": 0.10718671977519989,
+      "learning_rate": 8.632680518509492e-05,
+      "loss": 2.7879,
+      "step": 13180
+    },
+    {
+      "epoch": 0.7653253648205635,
+      "grad_norm": 0.11277935653924942,
+      "learning_rate": 8.592312154532637e-05,
+      "loss": 2.7947,
+      "step": 13190
+    },
+    {
+      "epoch": 0.7659055963329369,
+      "grad_norm": 0.11088382452726364,
+      "learning_rate": 8.552022615905038e-05,
+      "loss": 2.7996,
+      "step": 13200
+    },
+    {
+      "epoch": 0.7664858278453103,
+      "grad_norm": 0.10912182927131653,
+      "learning_rate": 8.511812050977003e-05,
+      "loss": 2.7943,
+      "step": 13210
+    },
+    {
+      "epoch": 0.7670660593576837,
+      "grad_norm": 0.10919041931629181,
+      "learning_rate": 8.471680607808035e-05,
+      "loss": 2.7992,
+      "step": 13220
+    },
+    {
+      "epoch": 0.7676462908700572,
+      "grad_norm": 0.10616286844015121,
+      "learning_rate": 8.431628434166309e-05,
+      "loss": 2.7977,
+      "step": 13230
+    },
+    {
+      "epoch": 0.7682265223824306,
+      "grad_norm": 0.10572168231010437,
+      "learning_rate": 8.391655677528143e-05,
+      "loss": 2.7959,
+      "step": 13240
+    },
+    {
+      "epoch": 0.768806753894804,
+      "grad_norm": 0.10937794297933578,
+      "learning_rate": 8.3517624850774e-05,
+      "loss": 2.793,
+      "step": 13250
+    },
+    {
+      "epoch": 0.7693869854071774,
+      "grad_norm": 0.10820769518613815,
+      "learning_rate": 8.311949003704996e-05,
+      "loss": 2.7991,
+      "step": 13260
+    },
+    {
+      "epoch": 0.769967216919551,
+      "grad_norm": 0.10802992433309555,
+      "learning_rate": 8.272215380008343e-05,
+      "loss": 2.7965,
+      "step": 13270
+    },
+    {
+      "epoch": 0.7705474484319244,
+      "grad_norm": 0.10747858881950378,
+      "learning_rate": 8.232561760290794e-05,
+      "loss": 2.7957,
+      "step": 13280
+    },
+    {
+      "epoch": 0.7711276799442978,
+      "grad_norm": 0.11238089948892593,
+      "learning_rate": 8.192988290561157e-05,
+      "loss": 2.7922,
+      "step": 13290
+    },
+    {
+      "epoch": 0.7717079114566712,
+      "grad_norm": 0.1034981980919838,
+      "learning_rate": 8.153495116533056e-05,
+      "loss": 2.789,
+      "step": 13300
+    },
+    {
+      "epoch": 0.7722881429690447,
+      "grad_norm": 0.10910629481077194,
+      "learning_rate": 8.11408238362453e-05,
+      "loss": 2.7899,
+      "step": 13310
+    },
+    {
+      "epoch": 0.7728683744814181,
+      "grad_norm": 0.11309719830751419,
+      "learning_rate": 8.07475023695737e-05,
+      "loss": 2.7978,
+      "step": 13320
+    },
+    {
+      "epoch": 0.7734486059937915,
+      "grad_norm": 0.10908596217632294,
+      "learning_rate": 8.035498821356664e-05,
+      "loss": 2.7938,
+      "step": 13330
+    },
+    {
+      "epoch": 0.7740288375061649,
+      "grad_norm": 0.11714279651641846,
+      "learning_rate": 7.996328281350252e-05,
+      "loss": 2.7967,
+      "step": 13340
+    },
+    {
+      "epoch": 0.7746090690185384,
+      "grad_norm": 0.10943669080734253,
+      "learning_rate": 7.957238761168135e-05,
+      "loss": 2.7803,
+      "step": 13350
+    },
+    {
+      "epoch": 0.7751893005309118,
+      "grad_norm": 0.11171719431877136,
+      "learning_rate": 7.918230404742045e-05,
+      "loss": 2.7941,
+      "step": 13360
+    },
+    {
+      "epoch": 0.7757695320432852,
+      "grad_norm": 0.10363152623176575,
+      "learning_rate": 7.879303355704834e-05,
+      "loss": 2.8043,
+      "step": 13370
+    },
+    {
+      "epoch": 0.7763497635556587,
+      "grad_norm": 0.1147744432091713,
+      "learning_rate": 7.840457757389968e-05,
+      "loss": 2.8022,
+      "step": 13380
+    },
+    {
+      "epoch": 0.7769299950680322,
+      "grad_norm": 0.10682083666324615,
+      "learning_rate": 7.801693752831012e-05,
+      "loss": 2.7914,
+      "step": 13390
+    },
+    {
+      "epoch": 0.7775102265804056,
+      "grad_norm": 0.11352023482322693,
+      "learning_rate": 7.763011484761082e-05,
+      "loss": 2.7958,
+      "step": 13400
+    },
+    {
+      "epoch": 0.778090458092779,
+      "grad_norm": 0.10785870254039764,
+      "learning_rate": 7.724411095612366e-05,
+      "loss": 2.7971,
+      "step": 13410
+    },
+    {
+      "epoch": 0.7786706896051525,
+      "grad_norm": 0.10762759298086166,
+      "learning_rate": 7.68589272751551e-05,
+      "loss": 2.7916,
+      "step": 13420
+    },
+    {
+      "epoch": 0.7792509211175259,
+      "grad_norm": 0.10556434839963913,
+      "learning_rate": 7.647456522299207e-05,
+      "loss": 2.784,
+      "step": 13430
+    },
+    {
+      "epoch": 0.7798311526298993,
+      "grad_norm": 0.1077750101685524,
+      "learning_rate": 7.609102621489577e-05,
+      "loss": 2.7906,
+      "step": 13440
+    },
+    {
+      "epoch": 0.7804113841422727,
+      "grad_norm": 0.10472170263528824,
+      "learning_rate": 7.570831166309693e-05,
+      "loss": 2.7833,
+      "step": 13450
+    },
+    {
+      "epoch": 0.7809916156546463,
+      "grad_norm": 0.1061674952507019,
+      "learning_rate": 7.532642297679093e-05,
+      "loss": 2.796,
+      "step": 13460
+    },
+    {
+      "epoch": 0.7815718471670197,
+      "grad_norm": 0.10716653615236282,
+      "learning_rate": 7.494536156213151e-05,
+      "loss": 2.791,
+      "step": 13470
+    },
+    {
+      "epoch": 0.7821520786793931,
+      "grad_norm": 0.11008104681968689,
+      "learning_rate": 7.456512882222703e-05,
+      "loss": 2.7874,
+      "step": 13480
+    },
+    {
+      "epoch": 0.7827323101917665,
+      "grad_norm": 0.11095033586025238,
+      "learning_rate": 7.418572615713413e-05,
+      "loss": 2.7874,
+      "step": 13490
+    },
+    {
+      "epoch": 0.78331254170414,
+      "grad_norm": 0.10690274834632874,
+      "learning_rate": 7.380715496385316e-05,
+      "loss": 2.7897,
+      "step": 13500
+    },
+    {
+      "epoch": 0.7838927732165134,
+      "grad_norm": 0.10463336110115051,
+      "learning_rate": 7.34294166363231e-05,
+      "loss": 2.7965,
+      "step": 13510
+    },
+    {
+      "epoch": 0.7844730047288868,
+      "grad_norm": 0.10628803819417953,
+      "learning_rate": 7.30525125654157e-05,
+      "loss": 2.7878,
+      "step": 13520
+    },
+    {
+      "epoch": 0.7850532362412602,
+      "grad_norm": 0.10758186876773834,
+      "learning_rate": 7.267644413893152e-05,
+      "loss": 2.7893,
+      "step": 13530
+    },
+    {
+      "epoch": 0.7856334677536337,
+      "grad_norm": 0.10785481333732605,
+      "learning_rate": 7.230121274159384e-05,
+      "loss": 2.7896,
+      "step": 13540
+    },
+    {
+      "epoch": 0.7862136992660071,
+      "grad_norm": 0.10700030624866486,
+      "learning_rate": 7.192681975504382e-05,
+      "loss": 2.786,
+      "step": 13550
+    },
+    {
+      "epoch": 0.7867939307783806,
+      "grad_norm": 0.10182949900627136,
+      "learning_rate": 7.155326655783597e-05,
+      "loss": 2.7889,
+      "step": 13560
+    },
+    {
+      "epoch": 0.787374162290754,
+      "grad_norm": 0.10802864283323288,
+      "learning_rate": 7.118055452543193e-05,
+      "loss": 2.7946,
+      "step": 13570
+    },
+    {
+      "epoch": 0.7879543938031275,
+      "grad_norm": 0.10849913954734802,
+      "learning_rate": 7.080868503019672e-05,
+      "loss": 2.786,
+      "step": 13580
+    },
+    {
+      "epoch": 0.7885346253155009,
+      "grad_norm": 0.10770730674266815,
+      "learning_rate": 7.043765944139264e-05,
+      "loss": 2.7804,
+      "step": 13590
+    },
+    {
+      "epoch": 0.7891148568278743,
+      "grad_norm": 0.11441770195960999,
+      "learning_rate": 7.006747912517475e-05,
+      "loss": 2.79,
+      "step": 13600
+    },
+    {
+      "epoch": 0.7896950883402477,
+      "grad_norm": 0.10908571630716324,
+      "learning_rate": 6.9698145444586e-05,
+      "loss": 2.7897,
+      "step": 13610
+    },
+    {
+      "epoch": 0.7902753198526212,
+      "grad_norm": 0.10705877095460892,
+      "learning_rate": 6.932965975955134e-05,
+      "loss": 2.7857,
+      "step": 13620
+    },
+    {
+      "epoch": 0.7908555513649946,
+      "grad_norm": 0.11635982990264893,
+      "learning_rate": 6.896202342687397e-05,
+      "loss": 2.7888,
+      "step": 13630
+    },
+    {
+      "epoch": 0.791435782877368,
+      "grad_norm": 0.1107436865568161,
+      "learning_rate": 6.859523780022911e-05,
+      "loss": 2.7902,
+      "step": 13640
+    },
+    {
+      "epoch": 0.7920160143897415,
+      "grad_norm": 0.11131720244884491,
+      "learning_rate": 6.822930423016003e-05,
+      "loss": 2.7982,
+      "step": 13650
+    },
+    {
+      "epoch": 0.792596245902115,
+      "grad_norm": 0.10535065829753876,
+      "learning_rate": 6.786422406407247e-05,
+      "loss": 2.7838,
+      "step": 13660
+    },
+    {
+      "epoch": 0.7931764774144884,
+      "grad_norm": 0.10784085094928741,
+      "learning_rate": 6.749999864622973e-05,
+      "loss": 2.7778,
+      "step": 13670
+    },
+    {
+      "epoch": 0.7937567089268618,
+      "grad_norm": 0.10266363620758057,
+      "learning_rate": 6.713662931774818e-05,
+      "loss": 2.7929,
+      "step": 13680
+    },
+    {
+      "epoch": 0.7943369404392353,
+      "grad_norm": 0.11121921241283417,
+      "learning_rate": 6.677411741659145e-05,
+      "loss": 2.787,
+      "step": 13690
+    },
+    {
+      "epoch": 0.7949171719516087,
+      "grad_norm": 0.10687406361103058,
+      "learning_rate": 6.641246427756657e-05,
+      "loss": 2.7915,
+      "step": 13700
+    },
+    {
+      "epoch": 0.7954974034639821,
+      "grad_norm": 0.10604474693536758,
+      "learning_rate": 6.605167123231822e-05,
+      "loss": 2.7816,
+      "step": 13710
+    },
+    {
+      "epoch": 0.7960776349763555,
+      "grad_norm": 0.10484491288661957,
+      "learning_rate": 6.569173960932404e-05,
+      "loss": 2.7844,
+      "step": 13720
+    },
+    {
+      "epoch": 0.796657866488729,
+      "grad_norm": 0.10788851231336594,
+      "learning_rate": 6.533267073389034e-05,
+      "loss": 2.7815,
+      "step": 13730
+    },
+    {
+      "epoch": 0.7972380980011025,
+      "grad_norm": 0.10421809554100037,
+      "learning_rate": 6.49744659281459e-05,
+      "loss": 2.7953,
+      "step": 13740
+    },
+    {
+      "epoch": 0.7978183295134759,
+      "grad_norm": 0.10567434132099152,
+      "learning_rate": 6.461712651103859e-05,
+      "loss": 2.7898,
+      "step": 13750
+    },
+    {
+      "epoch": 0.7983985610258493,
+      "grad_norm": 0.10381162911653519,
+      "learning_rate": 6.426065379832959e-05,
+      "loss": 2.7902,
+      "step": 13760
+    },
+    {
+      "epoch": 0.7989787925382228,
+      "grad_norm": 0.10707089304924011,
+      "learning_rate": 6.390504910258867e-05,
+      "loss": 2.7923,
+      "step": 13770
+    },
+    {
+      "epoch": 0.7995590240505962,
+      "grad_norm": 0.10568366944789886,
+      "learning_rate": 6.355031373318961e-05,
+      "loss": 2.793,
+      "step": 13780
+    },
+    {
+      "epoch": 0.8001392555629696,
+      "grad_norm": 0.10662976652383804,
+      "learning_rate": 6.319644899630514e-05,
+      "loss": 2.7954,
+      "step": 13790
+    },
+    {
+      "epoch": 0.800719487075343,
+      "grad_norm": 0.10822783410549164,
+      "learning_rate": 6.28434561949024e-05,
+      "loss": 2.7875,
+      "step": 13800
+    },
+    {
+      "epoch": 0.8012997185877165,
+      "grad_norm": 0.10903995484113693,
+      "learning_rate": 6.249133662873783e-05,
+      "loss": 2.7952,
+      "step": 13810
+    },
+    {
+      "epoch": 0.8018799501000899,
+      "grad_norm": 0.11016574501991272,
+      "learning_rate": 6.214009159435254e-05,
+      "loss": 2.7833,
+      "step": 13820
+    },
+    {
+      "epoch": 0.8024601816124634,
+      "grad_norm": 0.10669629275798798,
+      "learning_rate": 6.178972238506758e-05,
+      "loss": 2.7966,
+      "step": 13830
+    },
+    {
+      "epoch": 0.8030404131248368,
+      "grad_norm": 0.10725666582584381,
+      "learning_rate": 6.144023029097891e-05,
+      "loss": 2.781,
+      "step": 13840
+    },
+    {
+      "epoch": 0.8036206446372103,
+      "grad_norm": 0.10259473323822021,
+      "learning_rate": 6.10916165989533e-05,
+      "loss": 2.7858,
+      "step": 13850
+    },
+    {
+      "epoch": 0.8042008761495837,
+      "grad_norm": 0.10819372534751892,
+      "learning_rate": 6.0743882592622736e-05,
+      "loss": 2.782,
+      "step": 13860
+    },
+    {
+      "epoch": 0.8047811076619571,
+      "grad_norm": 0.09982424229383469,
+      "learning_rate": 6.039702955238026e-05,
+      "loss": 2.7767,
+      "step": 13870
+    },
+    {
+      "epoch": 0.8053613391743305,
+      "grad_norm": 0.11254626512527466,
+      "learning_rate": 6.005105875537515e-05,
+      "loss": 2.7773,
+      "step": 13880
+    },
+    {
+      "epoch": 0.805941570686704,
+      "grad_norm": 0.10880761593580246,
+      "learning_rate": 5.970597147550808e-05,
+      "loss": 2.7925,
+      "step": 13890
+    },
+    {
+      "epoch": 0.8065218021990774,
+      "grad_norm": 0.10454876720905304,
+      "learning_rate": 5.936176898342649e-05,
+      "loss": 2.7887,
+      "step": 13900
+    },
+    {
+      "epoch": 0.8071020337114508,
+      "grad_norm": 0.10871117562055588,
+      "learning_rate": 5.9018452546520165e-05,
+      "loss": 2.7914,
+      "step": 13910
+    },
+    {
+      "epoch": 0.8076822652238244,
+      "grad_norm": 0.10645408183336258,
+      "learning_rate": 5.8676023428916175e-05,
+      "loss": 2.7946,
+      "step": 13920
+    },
+    {
+      "epoch": 0.8082624967361978,
+      "grad_norm": 0.11597729474306107,
+      "learning_rate": 5.83344828914743e-05,
+      "loss": 2.7917,
+      "step": 13930
+    },
+    {
+      "epoch": 0.8088427282485712,
+      "grad_norm": 0.1034785658121109,
+      "learning_rate": 5.799383219178264e-05,
+      "loss": 2.7912,
+      "step": 13940
+    },
+    {
+      "epoch": 0.8094229597609446,
+      "grad_norm": 0.10739534348249435,
+      "learning_rate": 5.7654072584152787e-05,
+      "loss": 2.7848,
+      "step": 13950
+    },
+    {
+      "epoch": 0.8100031912733181,
+      "grad_norm": 0.10825861990451813,
+      "learning_rate": 5.731520531961505e-05,
+      "loss": 2.7908,
+      "step": 13960
+    },
+    {
+      "epoch": 0.8105834227856915,
+      "grad_norm": 0.10880185663700104,
+      "learning_rate": 5.697723164591441e-05,
+      "loss": 2.7904,
+      "step": 13970
+    },
+    {
+      "epoch": 0.8111636542980649,
+      "grad_norm": 0.1085624098777771,
+      "learning_rate": 5.6640152807505236e-05,
+      "loss": 2.7839,
+      "step": 13980
+    },
+    {
+      "epoch": 0.8117438858104383,
+      "grad_norm": 0.10740832984447479,
+      "learning_rate": 5.630397004554713e-05,
+      "loss": 2.7858,
+      "step": 13990
+    },
+    {
+      "epoch": 0.8123241173228118,
+      "grad_norm": 0.10401804000139236,
+      "learning_rate": 5.596868459790025e-05,
+      "loss": 2.7802,
+      "step": 14000
+    },
+    {
+      "epoch": 0.8123241173228118,
+      "eval_loss": 2.749423027038574,
+      "eval_runtime": 3.2586,
+      "eval_samples_per_second": 1328.792,
+      "eval_steps_per_second": 2.762,
+      "step": 14000
+    },
+    {
+      "epoch": 0.8129043488351853,
+      "grad_norm": 0.10784956812858582,
+      "learning_rate": 5.563429769912071e-05,
+      "loss": 2.7852,
+      "step": 14010
+    },
+    {
+      "epoch": 0.8134845803475587,
+      "grad_norm": 0.10523492097854614,
+      "learning_rate": 5.530081058045606e-05,
+      "loss": 2.7856,
+      "step": 14020
+    },
+    {
+      "epoch": 0.8140648118599321,
+      "grad_norm": 0.10354667156934738,
+      "learning_rate": 5.4968224469840935e-05,
+      "loss": 2.7826,
+      "step": 14030
+    },
+    {
+      "epoch": 0.8146450433723056,
+      "grad_norm": 0.10460636019706726,
+      "learning_rate": 5.4636540591892164e-05,
+      "loss": 2.7844,
+      "step": 14040
+    },
+    {
+      "epoch": 0.815225274884679,
+      "grad_norm": 0.11116158217191696,
+      "learning_rate": 5.430576016790453e-05,
+      "loss": 2.7879,
+      "step": 14050
+    },
+    {
+      "epoch": 0.8158055063970524,
+      "grad_norm": 0.11445162445306778,
+      "learning_rate": 5.3975884415846206e-05,
+      "loss": 2.7847,
+      "step": 14060
+    },
+    {
+      "epoch": 0.8163857379094258,
+      "grad_norm": 0.10757939517498016,
+      "learning_rate": 5.3646914550354204e-05,
+      "loss": 2.7884,
+      "step": 14070
+    },
+    {
+      "epoch": 0.8169659694217993,
+      "grad_norm": 0.10770777612924576,
+      "learning_rate": 5.331885178273015e-05,
+      "loss": 2.775,
+      "step": 14080
+    },
+    {
+      "epoch": 0.8175462009341727,
+      "grad_norm": 0.10863149166107178,
+      "learning_rate": 5.2991697320935486e-05,
+      "loss": 2.7883,
+      "step": 14090
+    },
+    {
+      "epoch": 0.8181264324465461,
+      "grad_norm": 0.10049009323120117,
+      "learning_rate": 5.266545236958718e-05,
+      "loss": 2.7878,
+      "step": 14100
+    },
+    {
+      "epoch": 0.8187066639589196,
+      "grad_norm": 0.104975625872612,
+      "learning_rate": 5.2340118129953346e-05,
+      "loss": 2.7806,
+      "step": 14110
+    },
+    {
+      "epoch": 0.8192868954712931,
+      "grad_norm": 0.10563846677541733,
+      "learning_rate": 5.201569579994865e-05,
+      "loss": 2.7807,
+      "step": 14120
+    },
+    {
+      "epoch": 0.8198671269836665,
+      "grad_norm": 0.10182633996009827,
+      "learning_rate": 5.1692186574130324e-05,
+      "loss": 2.7782,
+      "step": 14130
+    },
+    {
+      "epoch": 0.8204473584960399,
+      "grad_norm": 0.10903611779212952,
+      "learning_rate": 5.1369591643692896e-05,
+      "loss": 2.7792,
+      "step": 14140
+    },
+    {
+      "epoch": 0.8210275900084134,
+      "grad_norm": 0.10453125089406967,
+      "learning_rate": 5.1047912196464944e-05,
+      "loss": 2.7814,
+      "step": 14150
+    },
+    {
+      "epoch": 0.8216078215207868,
+      "grad_norm": 0.11026264727115631,
+      "learning_rate": 5.072714941690387e-05,
+      "loss": 2.7847,
+      "step": 14160
+    },
+    {
+      "epoch": 0.8221880530331602,
+      "grad_norm": 0.10732634365558624,
+      "learning_rate": 5.040730448609166e-05,
+      "loss": 2.7716,
+      "step": 14170
+    },
+    {
+      "epoch": 0.8227682845455336,
+      "grad_norm": 0.10351432114839554,
+      "learning_rate": 5.008837858173113e-05,
+      "loss": 2.7883,
+      "step": 14180
+    },
+    {
+      "epoch": 0.8233485160579072,
+      "grad_norm": 0.10946208238601685,
+      "learning_rate": 4.9770372878140575e-05,
+      "loss": 2.786,
+      "step": 14190
+    },
+    {
+      "epoch": 0.8239287475702806,
+      "grad_norm": 0.1038416251540184,
+      "learning_rate": 4.9453288546250494e-05,
+      "loss": 2.7799,
+      "step": 14200
+    },
+    {
+      "epoch": 0.824508979082654,
+      "grad_norm": 0.10568647086620331,
+      "learning_rate": 4.913712675359861e-05,
+      "loss": 2.7874,
+      "step": 14210
+    },
+    {
+      "epoch": 0.8250892105950274,
+      "grad_norm": 0.10334275662899017,
+      "learning_rate": 4.882188866432568e-05,
+      "loss": 2.7835,
+      "step": 14220
+    },
+    {
+      "epoch": 0.8256694421074009,
+      "grad_norm": 0.10559739917516708,
+      "learning_rate": 4.850757543917144e-05,
+      "loss": 2.7791,
+      "step": 14230
+    },
+    {
+      "epoch": 0.8262496736197743,
+      "grad_norm": 0.1026688888669014,
+      "learning_rate": 4.819418823546999e-05,
+      "loss": 2.7777,
+      "step": 14240
+    },
+    {
+      "epoch": 0.8268299051321477,
+      "grad_norm": 0.10159046947956085,
+      "learning_rate": 4.788172820714611e-05,
+      "loss": 2.7876,
+      "step": 14250
+    },
+    {
+      "epoch": 0.8274101366445211,
+      "grad_norm": 0.114133320748806,
+      "learning_rate": 4.7570196504710026e-05,
+      "loss": 2.7777,
+      "step": 14260
+    },
+    {
+      "epoch": 0.8279903681568946,
+      "grad_norm": 0.10327325016260147,
+      "learning_rate": 4.725959427525432e-05,
+      "loss": 2.7976,
+      "step": 14270
+    },
+    {
+      "epoch": 0.828570599669268,
+      "grad_norm": 0.10618502646684647,
+      "learning_rate": 4.694992266244889e-05,
+      "loss": 2.7904,
+      "step": 14280
+    },
+    {
+      "epoch": 0.8291508311816415,
+      "grad_norm": 0.10732074081897736,
+      "learning_rate": 4.6641182806537e-05,
+      "loss": 2.7724,
+      "step": 14290
+    },
+    {
+      "epoch": 0.8297310626940149,
+      "grad_norm": 0.10467931628227234,
+      "learning_rate": 4.63333758443313e-05,
+      "loss": 2.7843,
+      "step": 14300
+    },
+    {
+      "epoch": 0.8303112942063884,
+      "grad_norm": 0.10281146317720413,
+      "learning_rate": 4.6026502909209004e-05,
+      "loss": 2.7842,
+      "step": 14310
+    },
+    {
+      "epoch": 0.8308915257187618,
+      "grad_norm": 0.1023208498954773,
+      "learning_rate": 4.572056513110867e-05,
+      "loss": 2.774,
+      "step": 14320
+    },
+    {
+      "epoch": 0.8314717572311352,
+      "grad_norm": 0.10323374718427658,
+      "learning_rate": 4.541556363652511e-05,
+      "loss": 2.7755,
+      "step": 14330
+    },
+    {
+      "epoch": 0.8320519887435086,
+      "grad_norm": 0.10136920213699341,
+      "learning_rate": 4.5111499548505727e-05,
+      "loss": 2.7814,
+      "step": 14340
+    },
+    {
+      "epoch": 0.8326322202558821,
+      "grad_norm": 0.10571028292179108,
+      "learning_rate": 4.4808373986646565e-05,
+      "loss": 2.7878,
+      "step": 14350
+    },
+    {
+      "epoch": 0.8332124517682555,
+      "grad_norm": 0.10252848267555237,
+      "learning_rate": 4.45061880670874e-05,
+      "loss": 2.7754,
+      "step": 14360
+    },
+    {
+      "epoch": 0.8337926832806289,
+      "grad_norm": 0.10471548140048981,
+      "learning_rate": 4.420494290250869e-05,
+      "loss": 2.7767,
+      "step": 14370
+    },
+    {
+      "epoch": 0.8343729147930025,
+      "grad_norm": 0.10701679438352585,
+      "learning_rate": 4.390463960212658e-05,
+      "loss": 2.7792,
+      "step": 14380
+    },
+    {
+      "epoch": 0.8349531463053759,
+      "grad_norm": 0.10377515107393265,
+      "learning_rate": 4.3605279271689264e-05,
+      "loss": 2.7829,
+      "step": 14390
+    },
+    {
+      "epoch": 0.8355333778177493,
+      "grad_norm": 0.10350141674280167,
+      "learning_rate": 4.330686301347298e-05,
+      "loss": 2.7861,
+      "step": 14400
+    },
+    {
+      "epoch": 0.8361136093301227,
+      "grad_norm": 0.10299152880907059,
+      "learning_rate": 4.300939192627742e-05,
+      "loss": 2.7891,
+      "step": 14410
+    },
+    {
+      "epoch": 0.8366938408424962,
+      "grad_norm": 0.1038345992565155,
+      "learning_rate": 4.2712867105422465e-05,
+      "loss": 2.7812,
+      "step": 14420
+    },
+    {
+      "epoch": 0.8372740723548696,
+      "grad_norm": 0.10262761265039444,
+      "learning_rate": 4.241728964274352e-05,
+      "loss": 2.7784,
+      "step": 14430
+    },
+    {
+      "epoch": 0.837854303867243,
+      "grad_norm": 0.10034337639808655,
+      "learning_rate": 4.212266062658777e-05,
+      "loss": 2.7857,
+      "step": 14440
+    },
+    {
+      "epoch": 0.8384345353796164,
+      "grad_norm": 0.10054679960012436,
+      "learning_rate": 4.1828981141810104e-05,
+      "loss": 2.7783,
+      "step": 14450
+    },
+    {
+      "epoch": 0.83901476689199,
+      "grad_norm": 0.10352133959531784,
+      "learning_rate": 4.15362522697691e-05,
+      "loss": 2.7936,
+      "step": 14460
+    },
+    {
+      "epoch": 0.8395949984043634,
+      "grad_norm": 0.10465723276138306,
+      "learning_rate": 4.124447508832332e-05,
+      "loss": 2.7692,
+      "step": 14470
+    },
+    {
+      "epoch": 0.8401752299167368,
+      "grad_norm": 0.10384640097618103,
+      "learning_rate": 4.095365067182665e-05,
+      "loss": 2.781,
+      "step": 14480
+    },
+    {
+      "epoch": 0.8407554614291102,
+      "grad_norm": 0.10312188416719437,
+      "learning_rate": 4.066378009112523e-05,
+      "loss": 2.7767,
+      "step": 14490
+    },
+    {
+      "epoch": 0.8413356929414837,
+      "grad_norm": 0.10447024554014206,
+      "learning_rate": 4.037486441355288e-05,
+      "loss": 2.7832,
+      "step": 14500
+    },
+    {
+      "epoch": 0.8419159244538571,
+      "grad_norm": 0.10162138938903809,
+      "learning_rate": 4.008690470292732e-05,
+      "loss": 2.7786,
+      "step": 14510
+    },
+    {
+      "epoch": 0.8424961559662305,
+      "grad_norm": 0.09777431935071945,
+      "learning_rate": 3.979990201954653e-05,
+      "loss": 2.7792,
+      "step": 14520
+    },
+    {
+      "epoch": 0.8430763874786039,
+      "grad_norm": 0.10050346702337265,
+      "learning_rate": 3.9513857420184216e-05,
+      "loss": 2.7866,
+      "step": 14530
+    },
+    {
+      "epoch": 0.8436566189909774,
+      "grad_norm": 0.10209480673074722,
+      "learning_rate": 3.922877195808678e-05,
+      "loss": 2.7886,
+      "step": 14540
+    },
+    {
+      "epoch": 0.8442368505033508,
+      "grad_norm": 0.10496553033590317,
+      "learning_rate": 3.894464668296864e-05,
+      "loss": 2.7854,
+      "step": 14550
+    },
+    {
+      "epoch": 0.8448170820157243,
+      "grad_norm": 0.10205195099115372,
+      "learning_rate": 3.8661482641008866e-05,
+      "loss": 2.7869,
+      "step": 14560
+    },
+    {
+      "epoch": 0.8453973135280977,
+      "grad_norm": 0.10940441489219666,
+      "learning_rate": 3.837928087484711e-05,
+      "loss": 2.7799,
+      "step": 14570
+    },
+    {
+      "epoch": 0.8459775450404712,
+      "grad_norm": 0.10287832468748093,
+      "learning_rate": 3.8098042423579766e-05,
+      "loss": 2.7804,
+      "step": 14580
+    },
+    {
+      "epoch": 0.8465577765528446,
+      "grad_norm": 0.0999421551823616,
+      "learning_rate": 3.781776832275639e-05,
+      "loss": 2.7835,
+      "step": 14590
+    },
+    {
+      "epoch": 0.847138008065218,
+      "grad_norm": 0.10340355336666107,
+      "learning_rate": 3.753845960437557e-05,
+      "loss": 2.7831,
+      "step": 14600
+    },
+    {
+      "epoch": 0.8477182395775914,
+      "grad_norm": 0.10355892032384872,
+      "learning_rate": 3.72601172968812e-05,
+      "loss": 2.7749,
+      "step": 14610
+    },
+    {
+      "epoch": 0.8482984710899649,
+      "grad_norm": 0.10467097908258438,
+      "learning_rate": 3.6982742425158886e-05,
+      "loss": 2.7834,
+      "step": 14620
+    },
+    {
+      "epoch": 0.8488787026023383,
+      "grad_norm": 0.1060672402381897,
+      "learning_rate": 3.670633601053182e-05,
+      "loss": 2.7801,
+      "step": 14630
+    },
+    {
+      "epoch": 0.8494589341147117,
+      "grad_norm": 0.10443491488695145,
+      "learning_rate": 3.643089907075759e-05,
+      "loss": 2.7896,
+      "step": 14640
+    },
+    {
+      "epoch": 0.8500391656270853,
+      "grad_norm": 0.1023486852645874,
+      "learning_rate": 3.6156432620023726e-05,
+      "loss": 2.7691,
+      "step": 14650
+    },
+    {
+      "epoch": 0.8506193971394587,
+      "grad_norm": 0.10417921096086502,
+      "learning_rate": 3.5882937668944476e-05,
+      "loss": 2.7703,
+      "step": 14660
+    },
+    {
+      "epoch": 0.8511996286518321,
+      "grad_norm": 0.10138606280088425,
+      "learning_rate": 3.561041522455691e-05,
+      "loss": 2.7885,
+      "step": 14670
+    },
+    {
+      "epoch": 0.8517798601642055,
+      "grad_norm": 0.10121186077594757,
+      "learning_rate": 3.5338866290317204e-05,
+      "loss": 2.7721,
+      "step": 14680
+    },
+    {
+      "epoch": 0.852360091676579,
+      "grad_norm": 0.10391680151224136,
+      "learning_rate": 3.506829186609691e-05,
+      "loss": 2.7818,
+      "step": 14690
+    },
+    {
+      "epoch": 0.8529403231889524,
+      "grad_norm": 0.10207725316286087,
+      "learning_rate": 3.479869294817955e-05,
+      "loss": 2.775,
+      "step": 14700
+    },
+    {
+      "epoch": 0.8535205547013258,
+      "grad_norm": 0.10676626861095428,
+      "learning_rate": 3.4530070529256524e-05,
+      "loss": 2.7759,
+      "step": 14710
+    },
+    {
+      "epoch": 0.8541007862136992,
+      "grad_norm": 0.10105539858341217,
+      "learning_rate": 3.42624255984237e-05,
+      "loss": 2.7855,
+      "step": 14720
+    },
+    {
+      "epoch": 0.8546810177260727,
+      "grad_norm": 0.10040144622325897,
+      "learning_rate": 3.399575914117777e-05,
+      "loss": 2.7736,
+      "step": 14730
+    },
+    {
+      "epoch": 0.8552612492384462,
+      "grad_norm": 0.10322125256061554,
+      "learning_rate": 3.3730072139412456e-05,
+      "loss": 2.7834,
+      "step": 14740
+    },
+    {
+      "epoch": 0.8558414807508196,
+      "grad_norm": 0.10220754891633987,
+      "learning_rate": 3.3465365571415315e-05,
+      "loss": 2.7692,
+      "step": 14750
+    },
+    {
+      "epoch": 0.856421712263193,
+      "grad_norm": 0.10107099264860153,
+      "learning_rate": 3.3201640411863584e-05,
+      "loss": 2.7672,
+      "step": 14760
+    },
+    {
+      "epoch": 0.8570019437755665,
+      "grad_norm": 0.10284842550754547,
+      "learning_rate": 3.293889763182089e-05,
+      "loss": 2.7851,
+      "step": 14770
+    },
+    {
+      "epoch": 0.8575821752879399,
+      "grad_norm": 0.10386528819799423,
+      "learning_rate": 3.26771381987337e-05,
+      "loss": 2.7787,
+      "step": 14780
+    },
+    {
+      "epoch": 0.8581624068003133,
+      "grad_norm": 0.1039406880736351,
+      "learning_rate": 3.241636307642769e-05,
+      "loss": 2.7838,
+      "step": 14790
+    },
+    {
+      "epoch": 0.8587426383126867,
+      "grad_norm": 0.1034376472234726,
+      "learning_rate": 3.2156573225104145e-05,
+      "loss": 2.7794,
+      "step": 14800
+    },
+    {
+      "epoch": 0.8593228698250602,
+      "grad_norm": 0.10199546813964844,
+      "learning_rate": 3.189776960133645e-05,
+      "loss": 2.7806,
+      "step": 14810
+    },
+    {
+      "epoch": 0.8599031013374336,
+      "grad_norm": 0.10086624324321747,
+      "learning_rate": 3.163995315806681e-05,
+      "loss": 2.7666,
+      "step": 14820
+    },
+    {
+      "epoch": 0.860483332849807,
+      "grad_norm": 0.10021676123142242,
+      "learning_rate": 3.138312484460228e-05,
+      "loss": 2.7738,
+      "step": 14830
+    },
+    {
+      "epoch": 0.8610635643621805,
+      "grad_norm": 0.10465867072343826,
+      "learning_rate": 3.112728560661164e-05,
+      "loss": 2.7786,
+      "step": 14840
+    },
+    {
+      "epoch": 0.861643795874554,
+      "grad_norm": 0.10076703131198883,
+      "learning_rate": 3.0872436386121776e-05,
+      "loss": 2.7705,
+      "step": 14850
+    },
+    {
+      "epoch": 0.8622240273869274,
+      "grad_norm": 0.10121941566467285,
+      "learning_rate": 3.061857812151414e-05,
+      "loss": 2.7737,
+      "step": 14860
+    },
+    {
+      "epoch": 0.8628042588993008,
+      "grad_norm": 0.10309196263551712,
+      "learning_rate": 3.0365711747521538e-05,
+      "loss": 2.7783,
+      "step": 14870
+    },
+    {
+      "epoch": 0.8633844904116743,
+      "grad_norm": 0.10456740111112595,
+      "learning_rate": 3.011383819522446e-05,
+      "loss": 2.7809,
+      "step": 14880
+    },
+    {
+      "epoch": 0.8639647219240477,
+      "grad_norm": 0.1025143563747406,
+      "learning_rate": 2.986295839204764e-05,
+      "loss": 2.7813,
+      "step": 14890
+    },
+    {
+      "epoch": 0.8645449534364211,
+      "grad_norm": 0.10585116595029831,
+      "learning_rate": 2.961307326175688e-05,
+      "loss": 2.7738,
+      "step": 14900
+    },
+    {
+      "epoch": 0.8651251849487945,
+      "grad_norm": 0.10203658789396286,
+      "learning_rate": 2.936418372445527e-05,
+      "loss": 2.7777,
+      "step": 14910
+    },
+    {
+      "epoch": 0.865705416461168,
+      "grad_norm": 0.10538860410451889,
+      "learning_rate": 2.911629069658037e-05,
+      "loss": 2.7757,
+      "step": 14920
+    },
+    {
+      "epoch": 0.8662856479735415,
+      "grad_norm": 0.10184674710035324,
+      "learning_rate": 2.8869395090900037e-05,
+      "loss": 2.7797,
+      "step": 14930
+    },
+    {
+      "epoch": 0.8668658794859149,
+      "grad_norm": 0.10757064819335938,
+      "learning_rate": 2.862349781650991e-05,
+      "loss": 2.7837,
+      "step": 14940
+    },
+    {
+      "epoch": 0.8674461109982883,
+      "grad_norm": 0.09947676211595535,
+      "learning_rate": 2.8378599778829492e-05,
+      "loss": 2.7764,
+      "step": 14950
+    },
+    {
+      "epoch": 0.8680263425106618,
+      "grad_norm": 0.0980169028043747,
+      "learning_rate": 2.8134701879598965e-05,
+      "loss": 2.7877,
+      "step": 14960
+    },
+    {
+      "epoch": 0.8686065740230352,
+      "grad_norm": 0.09837668389081955,
+      "learning_rate": 2.7891805016876057e-05,
+      "loss": 2.7806,
+      "step": 14970
+    },
+    {
+      "epoch": 0.8691868055354086,
+      "grad_norm": 0.09911120682954788,
+      "learning_rate": 2.7649910085032277e-05,
+      "loss": 2.7807,
+      "step": 14980
+    },
+    {
+      "epoch": 0.869767037047782,
+      "grad_norm": 0.09837288409471512,
+      "learning_rate": 2.7409017974750257e-05,
+      "loss": 2.7677,
+      "step": 14990
+    },
+    {
+      "epoch": 0.8703472685601555,
+      "grad_norm": 0.10560393333435059,
+      "learning_rate": 2.7169129573019943e-05,
+      "loss": 2.7785,
+      "step": 15000
+    },
+    {
+      "epoch": 0.8703472685601555,
+      "eval_loss": 2.7414441108703613,
+      "eval_runtime": 3.2661,
+      "eval_samples_per_second": 1325.755,
+      "eval_steps_per_second": 2.756,
+      "step": 15000
+    },
+    {
+      "epoch": 0.870927500072529,
+      "grad_norm": 0.09839779883623123,
+      "learning_rate": 2.6930245763135504e-05,
+      "loss": 2.7759,
+      "step": 15010
+    },
+    {
+      "epoch": 0.8715077315849024,
+      "grad_norm": 0.09770379960536957,
+      "learning_rate": 2.6692367424692272e-05,
+      "loss": 2.787,
+      "step": 15020
+    },
+    {
+      "epoch": 0.8720879630972758,
+      "grad_norm": 0.09834130108356476,
+      "learning_rate": 2.645549543358304e-05,
+      "loss": 2.7731,
+      "step": 15030
+    },
+    {
+      "epoch": 0.8726681946096493,
+      "grad_norm": 0.1047162264585495,
+      "learning_rate": 2.6219630661995528e-05,
+      "loss": 2.7832,
+      "step": 15040
+    },
+    {
+      "epoch": 0.8732484261220227,
+      "grad_norm": 0.10111907124519348,
+      "learning_rate": 2.5984773978408257e-05,
+      "loss": 2.779,
+      "step": 15050
+    },
+    {
+      "epoch": 0.8738286576343961,
+      "grad_norm": 0.10093654692173004,
+      "learning_rate": 2.5750926247588322e-05,
+      "loss": 2.768,
+      "step": 15060
+    },
+    {
+      "epoch": 0.8744088891467695,
+      "grad_norm": 0.10071719437837601,
+      "learning_rate": 2.551808833058755e-05,
+      "loss": 2.7867,
+      "step": 15070
+    },
+    {
+      "epoch": 0.874989120659143,
+      "grad_norm": 0.10237322747707367,
+      "learning_rate": 2.5286261084739445e-05,
+      "loss": 2.7838,
+      "step": 15080
+    },
+    {
+      "epoch": 0.8755693521715164,
+      "grad_norm": 0.09815766662359238,
+      "learning_rate": 2.5055445363656358e-05,
+      "loss": 2.7839,
+      "step": 15090
+    },
+    {
+      "epoch": 0.8761495836838898,
+      "grad_norm": 0.10203532874584198,
+      "learning_rate": 2.482564201722581e-05,
+      "loss": 2.7878,
+      "step": 15100
+    },
+    {
+      "epoch": 0.8767298151962634,
+      "grad_norm": 0.10766585171222687,
+      "learning_rate": 2.4596851891607884e-05,
+      "loss": 2.7823,
+      "step": 15110
+    },
+    {
+      "epoch": 0.8773100467086368,
+      "grad_norm": 0.09876078367233276,
+      "learning_rate": 2.4369075829231766e-05,
+      "loss": 2.7762,
+      "step": 15120
+    },
+    {
+      "epoch": 0.8778902782210102,
+      "grad_norm": 0.10014016181230545,
+      "learning_rate": 2.414231466879274e-05,
+      "loss": 2.7733,
+      "step": 15130
+    },
+    {
+      "epoch": 0.8784705097333836,
+      "grad_norm": 0.10114018619060516,
+      "learning_rate": 2.3916569245249306e-05,
+      "loss": 2.7861,
+      "step": 15140
+    },
+    {
+      "epoch": 0.8790507412457571,
+      "grad_norm": 0.10012462735176086,
+      "learning_rate": 2.3691840389819526e-05,
+      "loss": 2.7635,
+      "step": 15150
+    },
+    {
+      "epoch": 0.8796309727581305,
+      "grad_norm": 0.10367590934038162,
+      "learning_rate": 2.3468128929978757e-05,
+      "loss": 2.7727,
+      "step": 15160
+    },
+    {
+      "epoch": 0.8802112042705039,
+      "grad_norm": 0.10224179178476334,
+      "learning_rate": 2.3245435689456015e-05,
+      "loss": 2.7712,
+      "step": 15170
+    },
+    {
+      "epoch": 0.8807914357828773,
+      "grad_norm": 0.0989450216293335,
+      "learning_rate": 2.302376148823102e-05,
+      "loss": 2.7761,
+      "step": 15180
+    },
+    {
+      "epoch": 0.8813716672952508,
+      "grad_norm": 0.10036759078502655,
+      "learning_rate": 2.2803107142531617e-05,
+      "loss": 2.7815,
+      "step": 15190
+    },
+    {
+      "epoch": 0.8819518988076243,
+      "grad_norm": 0.10400567203760147,
+      "learning_rate": 2.2583473464830005e-05,
+      "loss": 2.7826,
+      "step": 15200
+    },
+    {
+      "epoch": 0.8825321303199977,
+      "grad_norm": 0.09990741312503815,
+      "learning_rate": 2.2364861263840507e-05,
+      "loss": 2.7869,
+      "step": 15210
+    },
+    {
+      "epoch": 0.8831123618323711,
+      "grad_norm": 0.10067487508058548,
+      "learning_rate": 2.2147271344516128e-05,
+      "loss": 2.7771,
+      "step": 15220
+    },
+    {
+      "epoch": 0.8836925933447446,
+      "grad_norm": 0.10068360716104507,
+      "learning_rate": 2.1930704508045714e-05,
+      "loss": 2.781,
+      "step": 15230
+    },
+    {
+      "epoch": 0.884272824857118,
+      "grad_norm": 0.10076344013214111,
+      "learning_rate": 2.171516155185117e-05,
+      "loss": 2.7793,
+      "step": 15240
+    },
+    {
+      "epoch": 0.8848530563694914,
+      "grad_norm": 0.0988764762878418,
+      "learning_rate": 2.1500643269584027e-05,
+      "loss": 2.772,
+      "step": 15250
+    },
+    {
+      "epoch": 0.8854332878818648,
+      "grad_norm": 0.09937159717082977,
+      "learning_rate": 2.1287150451123224e-05,
+      "loss": 2.7786,
+      "step": 15260
+    },
+    {
+      "epoch": 0.8860135193942383,
+      "grad_norm": 0.10244645178318024,
+      "learning_rate": 2.1074683882571675e-05,
+      "loss": 2.7752,
+      "step": 15270
+    },
+    {
+      "epoch": 0.8865937509066117,
+      "grad_norm": 0.09691537171602249,
+      "learning_rate": 2.0863244346253517e-05,
+      "loss": 2.7735,
+      "step": 15280
+    },
+    {
+      "epoch": 0.8871739824189852,
+      "grad_norm": 0.09877140074968338,
+      "learning_rate": 2.065283262071128e-05,
+      "loss": 2.777,
+      "step": 15290
+    },
+    {
+      "epoch": 0.8877542139313586,
+      "grad_norm": 0.09832227975130081,
+      "learning_rate": 2.044344948070289e-05,
+      "loss": 2.7718,
+      "step": 15300
+    },
+    {
+      "epoch": 0.8883344454437321,
+      "grad_norm": 0.09934905916452408,
+      "learning_rate": 2.02350956971992e-05,
+      "loss": 2.7725,
+      "step": 15310
+    },
+    {
+      "epoch": 0.8889146769561055,
+      "grad_norm": 0.09960002452135086,
+      "learning_rate": 2.0027772037380463e-05,
+      "loss": 2.77,
+      "step": 15320
+    },
+    {
+      "epoch": 0.8894949084684789,
+      "grad_norm": 0.10142461210489273,
+      "learning_rate": 1.9821479264634234e-05,
+      "loss": 2.7781,
+      "step": 15330
+    },
+    {
+      "epoch": 0.8900751399808524,
+      "grad_norm": 0.09648580849170685,
+      "learning_rate": 1.96162181385521e-05,
+      "loss": 2.7774,
+      "step": 15340
+    },
+    {
+      "epoch": 0.8906553714932258,
+      "grad_norm": 0.09822871536016464,
+      "learning_rate": 1.9411989414926953e-05,
+      "loss": 2.7718,
+      "step": 15350
+    },
+    {
+      "epoch": 0.8912356030055992,
+      "grad_norm": 0.1000954881310463,
+      "learning_rate": 1.9208793845750504e-05,
+      "loss": 2.7763,
+      "step": 15360
+    },
+    {
+      "epoch": 0.8918158345179726,
+      "grad_norm": 0.10170748084783554,
+      "learning_rate": 1.9006632179209925e-05,
+      "loss": 2.78,
+      "step": 15370
+    },
+    {
+      "epoch": 0.8923960660303462,
+      "grad_norm": 0.10458207130432129,
+      "learning_rate": 1.8805505159685807e-05,
+      "loss": 2.77,
+      "step": 15380
+    },
+    {
+      "epoch": 0.8929762975427196,
+      "grad_norm": 0.09986699372529984,
+      "learning_rate": 1.8605413527748823e-05,
+      "loss": 2.776,
+      "step": 15390
+    },
+    {
+      "epoch": 0.893556529055093,
+      "grad_norm": 0.09813553094863892,
+      "learning_rate": 1.8406358020157364e-05,
+      "loss": 2.7711,
+      "step": 15400
+    },
+    {
+      "epoch": 0.8941367605674664,
+      "grad_norm": 0.09960541874170303,
+      "learning_rate": 1.8208339369854663e-05,
+      "loss": 2.7781,
+      "step": 15410
+    },
+    {
+      "epoch": 0.8947169920798399,
+      "grad_norm": 0.09737250953912735,
+      "learning_rate": 1.801135830596605e-05,
+      "loss": 2.7657,
+      "step": 15420
+    },
+    {
+      "epoch": 0.8952972235922133,
+      "grad_norm": 0.0949782207608223,
+      "learning_rate": 1.7815415553796575e-05,
+      "loss": 2.7705,
+      "step": 15430
+    },
+    {
+      "epoch": 0.8958774551045867,
+      "grad_norm": 0.09773328900337219,
+      "learning_rate": 1.762051183482788e-05,
+      "loss": 2.7684,
+      "step": 15440
+    },
+    {
+      "epoch": 0.8964576866169601,
+      "grad_norm": 0.09638100862503052,
+      "learning_rate": 1.7426647866715925e-05,
+      "loss": 2.7724,
+      "step": 15450
+    },
+    {
+      "epoch": 0.8970379181293336,
+      "grad_norm": 0.09620904177427292,
+      "learning_rate": 1.7233824363288118e-05,
+      "loss": 2.7738,
+      "step": 15460
+    },
+    {
+      "epoch": 0.897618149641707,
+      "grad_norm": 0.09929810464382172,
+      "learning_rate": 1.7042042034540783e-05,
+      "loss": 2.7754,
+      "step": 15470
+    },
+    {
+      "epoch": 0.8981983811540805,
+      "grad_norm": 0.09778960049152374,
+      "learning_rate": 1.6851301586636613e-05,
+      "loss": 2.7766,
+      "step": 15480
+    },
+    {
+      "epoch": 0.8987786126664539,
+      "grad_norm": 0.09684190899133682,
+      "learning_rate": 1.6661603721901873e-05,
+      "loss": 2.7777,
+      "step": 15490
+    },
+    {
+      "epoch": 0.8993588441788274,
+      "grad_norm": 0.09664195775985718,
+      "learning_rate": 1.6472949138823967e-05,
+      "loss": 2.7859,
+      "step": 15500
+    },
+    {
+      "epoch": 0.8999390756912008,
+      "grad_norm": 0.10036718100309372,
+      "learning_rate": 1.628533853204883e-05,
+      "loss": 2.7713,
+      "step": 15510
+    },
+    {
+      "epoch": 0.9005193072035742,
+      "grad_norm": 0.09811628609895706,
+      "learning_rate": 1.6098772592378417e-05,
+      "loss": 2.7733,
+      "step": 15520
+    },
+    {
+      "epoch": 0.9010995387159476,
+      "grad_norm": 0.09862551838159561,
+      "learning_rate": 1.591325200676795e-05,
+      "loss": 2.7701,
+      "step": 15530
+    },
+    {
+      "epoch": 0.9016797702283211,
+      "grad_norm": 0.09947618097066879,
+      "learning_rate": 1.5728777458323803e-05,
+      "loss": 2.7771,
+      "step": 15540
+    },
+    {
+      "epoch": 0.9022600017406945,
+      "grad_norm": 0.09834101796150208,
+      "learning_rate": 1.554534962630053e-05,
+      "loss": 2.7768,
+      "step": 15550
+    },
+    {
+      "epoch": 0.902840233253068,
+      "grad_norm": 0.10113567858934402,
+      "learning_rate": 1.5362969186098594e-05,
+      "loss": 2.7682,
+      "step": 15560
+    },
+    {
+      "epoch": 0.9034204647654415,
+      "grad_norm": 0.0977102592587471,
+      "learning_rate": 1.5181636809261921e-05,
+      "loss": 2.7769,
+      "step": 15570
+    },
+    {
+      "epoch": 0.9040006962778149,
+      "grad_norm": 0.09831026196479797,
+      "learning_rate": 1.5001353163475283e-05,
+      "loss": 2.7681,
+      "step": 15580
+    },
+    {
+      "epoch": 0.9045809277901883,
+      "grad_norm": 0.09537149965763092,
+      "learning_rate": 1.4822118912561943e-05,
+      "loss": 2.7628,
+      "step": 15590
+    },
+    {
+      "epoch": 0.9051611593025617,
+      "grad_norm": 0.09654498845338821,
+      "learning_rate": 1.4643934716481253e-05,
+      "loss": 2.7676,
+      "step": 15600
+    },
+    {
+      "epoch": 0.9057413908149352,
+      "grad_norm": 0.09738855808973312,
+      "learning_rate": 1.446680123132603e-05,
+      "loss": 2.7744,
+      "step": 15610
+    },
+    {
+      "epoch": 0.9063216223273086,
+      "grad_norm": 0.10082467645406723,
+      "learning_rate": 1.4290719109320382e-05,
+      "loss": 2.7706,
+      "step": 15620
+    },
+    {
+      "epoch": 0.906901853839682,
+      "grad_norm": 0.10283984988927841,
+      "learning_rate": 1.4115688998817043e-05,
+      "loss": 2.7742,
+      "step": 15630
+    },
+    {
+      "epoch": 0.9074820853520554,
+      "grad_norm": 0.09994236379861832,
+      "learning_rate": 1.3941711544295287e-05,
+      "loss": 2.7638,
+      "step": 15640
+    },
+    {
+      "epoch": 0.908062316864429,
+      "grad_norm": 0.09737379103899002,
+      "learning_rate": 1.3768787386358282e-05,
+      "loss": 2.7715,
+      "step": 15650
+    },
+    {
+      "epoch": 0.9086425483768024,
+      "grad_norm": 0.09915235638618469,
+      "learning_rate": 1.3596917161730902e-05,
+      "loss": 2.7694,
+      "step": 15660
+    },
+    {
+      "epoch": 0.9092227798891758,
+      "grad_norm": 0.09791626036167145,
+      "learning_rate": 1.3426101503257358e-05,
+      "loss": 2.7628,
+      "step": 15670
+    },
+    {
+      "epoch": 0.9098030114015492,
+      "grad_norm": 0.09681922197341919,
+      "learning_rate": 1.3256341039898766e-05,
+      "loss": 2.7741,
+      "step": 15680
+    },
+    {
+      "epoch": 0.9103832429139227,
+      "grad_norm": 0.09645412862300873,
+      "learning_rate": 1.3087636396730949e-05,
+      "loss": 2.7704,
+      "step": 15690
+    },
+    {
+      "epoch": 0.9109634744262961,
+      "grad_norm": 0.09795381873846054,
+      "learning_rate": 1.2919988194942011e-05,
+      "loss": 2.7666,
+      "step": 15700
+    },
+    {
+      "epoch": 0.9115437059386695,
+      "grad_norm": 0.09636548161506653,
+      "learning_rate": 1.2753397051830294e-05,
+      "loss": 2.7763,
+      "step": 15710
+    },
+    {
+      "epoch": 0.9121239374510429,
+      "grad_norm": 0.0992702841758728,
+      "learning_rate": 1.2587863580801794e-05,
+      "loss": 2.7693,
+      "step": 15720
+    },
+    {
+      "epoch": 0.9127041689634164,
+      "grad_norm": 0.09708980470895767,
+      "learning_rate": 1.2423388391368083e-05,
+      "loss": 2.7696,
+      "step": 15730
+    },
+    {
+      "epoch": 0.9132844004757898,
+      "grad_norm": 0.09657064080238342,
+      "learning_rate": 1.2259972089144054e-05,
+      "loss": 2.7799,
+      "step": 15740
+    },
+    {
+      "epoch": 0.9138646319881633,
+      "grad_norm": 0.09743205457925797,
+      "learning_rate": 1.2097615275845617e-05,
+      "loss": 2.7683,
+      "step": 15750
+    },
+    {
+      "epoch": 0.9144448635005367,
+      "grad_norm": 0.09803003072738647,
+      "learning_rate": 1.1936318549287638e-05,
+      "loss": 2.7731,
+      "step": 15760
+    },
+    {
+      "epoch": 0.9150250950129102,
+      "grad_norm": 0.0977969542145729,
+      "learning_rate": 1.1776082503381468e-05,
+      "loss": 2.778,
+      "step": 15770
+    },
+    {
+      "epoch": 0.9156053265252836,
+      "grad_norm": 0.0986003428697586,
+      "learning_rate": 1.1616907728133084e-05,
+      "loss": 2.7794,
+      "step": 15780
+    },
+    {
+      "epoch": 0.916185558037657,
+      "grad_norm": 0.09887285530567169,
+      "learning_rate": 1.1458794809640693e-05,
+      "loss": 2.7743,
+      "step": 15790
+    },
+    {
+      "epoch": 0.9167657895500304,
+      "grad_norm": 0.10056151449680328,
+      "learning_rate": 1.1301744330092522e-05,
+      "loss": 2.7739,
+      "step": 15800
+    },
+    {
+      "epoch": 0.9173460210624039,
+      "grad_norm": 0.09636414051055908,
+      "learning_rate": 1.1145756867765033e-05,
+      "loss": 2.7772,
+      "step": 15810
+    },
+    {
+      "epoch": 0.9179262525747773,
+      "grad_norm": 0.09793318808078766,
+      "learning_rate": 1.0990832997020282e-05,
+      "loss": 2.7729,
+      "step": 15820
+    },
+    {
+      "epoch": 0.9185064840871507,
+      "grad_norm": 0.09378232061862946,
+      "learning_rate": 1.0836973288304229e-05,
+      "loss": 2.7783,
+      "step": 15830
+    },
+    {
+      "epoch": 0.9190867155995243,
+      "grad_norm": 0.09904693067073822,
+      "learning_rate": 1.0684178308144498e-05,
+      "loss": 2.7697,
+      "step": 15840
+    },
+    {
+      "epoch": 0.9196669471118977,
+      "grad_norm": 0.0982363149523735,
+      "learning_rate": 1.0532448619148115e-05,
+      "loss": 2.7712,
+      "step": 15850
+    },
+    {
+      "epoch": 0.9202471786242711,
+      "grad_norm": 0.0995451807975769,
+      "learning_rate": 1.038178477999978e-05,
+      "loss": 2.7702,
+      "step": 15860
+    },
+    {
+      "epoch": 0.9208274101366445,
+      "grad_norm": 0.09749618917703629,
+      "learning_rate": 1.0232187345459431e-05,
+      "loss": 2.771,
+      "step": 15870
+    },
+    {
+      "epoch": 0.921407641649018,
+      "grad_norm": 0.09808894246816635,
+      "learning_rate": 1.0083656866360646e-05,
+      "loss": 2.7706,
+      "step": 15880
+    },
+    {
+      "epoch": 0.9219878731613914,
+      "grad_norm": 0.09838584810495377,
+      "learning_rate": 9.936193889608012e-06,
+      "loss": 2.7656,
+      "step": 15890
+    },
+    {
+      "epoch": 0.9225681046737648,
+      "grad_norm": 0.10016359388828278,
+      "learning_rate": 9.789798958175832e-06,
+      "loss": 2.7749,
+      "step": 15900
+    },
+    {
+      "epoch": 0.9231483361861382,
+      "grad_norm": 0.09670013934373856,
+      "learning_rate": 9.64447261110548e-06,
+      "loss": 2.7693,
+      "step": 15910
+    },
+    {
+      "epoch": 0.9237285676985117,
+      "grad_norm": 0.09639087319374084,
+      "learning_rate": 9.500215383503784e-06,
+      "loss": 2.7675,
+      "step": 15920
+    },
+    {
+      "epoch": 0.9243087992108852,
+      "grad_norm": 0.09851641952991486,
+      "learning_rate": 9.357027806541084e-06,
+      "loss": 2.7748,
+      "step": 15930
+    },
+    {
+      "epoch": 0.9248890307232586,
+      "grad_norm": 0.10145829617977142,
+      "learning_rate": 9.214910407448871e-06,
+      "loss": 2.7841,
+      "step": 15940
+    },
+    {
+      "epoch": 0.925469262235632,
+      "grad_norm": 0.09769120067358017,
+      "learning_rate": 9.073863709518426e-06,
+      "loss": 2.7703,
+      "step": 15950
+    },
+    {
+      "epoch": 0.9260494937480055,
+      "grad_norm": 0.09475893527269363,
+      "learning_rate": 8.933888232098408e-06,
+      "loss": 2.7703,
+      "step": 15960
+    },
+    {
+      "epoch": 0.9266297252603789,
+      "grad_norm": 0.09624000638723373,
+      "learning_rate": 8.794984490593171e-06,
+      "loss": 2.7753,
+      "step": 15970
+    },
+    {
+      "epoch": 0.9272099567727523,
+      "grad_norm": 0.09569297730922699,
+      "learning_rate": 8.657152996460958e-06,
+      "loss": 2.7635,
+      "step": 15980
+    },
+    {
+      "epoch": 0.9277901882851257,
+      "grad_norm": 0.10107609629631042,
+      "learning_rate": 8.520394257211605e-06,
+      "loss": 2.7714,
+      "step": 15990
+    },
+    {
+      "epoch": 0.9283704197974992,
+      "grad_norm": 0.09753672778606415,
+      "learning_rate": 8.384708776405236e-06,
+      "loss": 2.7706,
+      "step": 16000
+    },
+    {
+      "epoch": 0.9283704197974992,
+      "eval_loss": 2.7369606494903564,
+      "eval_runtime": 3.2559,
+      "eval_samples_per_second": 1329.896,
+      "eval_steps_per_second": 2.764,
+      "step": 16000
+    },
+    {
+      "epoch": 0.9289506513098726,
+      "grad_norm": 0.09548928588628769,
+      "learning_rate": 8.25009705364994e-06,
+      "loss": 2.7754,
+      "step": 16010
+    },
+    {
+      "epoch": 0.929530882822246,
+      "grad_norm": 0.09287203848361969,
+      "learning_rate": 8.116559584600201e-06,
+      "loss": 2.7777,
+      "step": 16020
+    },
+    {
+      "epoch": 0.9301111143346195,
+      "grad_norm": 0.0972280502319336,
+      "learning_rate": 7.984096860955036e-06,
+      "loss": 2.781,
+      "step": 16030
+    },
+    {
+      "epoch": 0.930691345846993,
+      "grad_norm": 0.09617298096418381,
+      "learning_rate": 7.852709370455922e-06,
+      "loss": 2.7692,
+      "step": 16040
+    },
+    {
+      "epoch": 0.9312715773593664,
+      "grad_norm": 0.09682459384202957,
+      "learning_rate": 7.72239759688551e-06,
+      "loss": 2.7742,
+      "step": 16050
+    },
+    {
+      "epoch": 0.9318518088717398,
+      "grad_norm": 0.09648177772760391,
+      "learning_rate": 7.593162020065313e-06,
+      "loss": 2.7783,
+      "step": 16060
+    },
+    {
+      "epoch": 0.9324320403841133,
+      "grad_norm": 0.09511367976665497,
+      "learning_rate": 7.4650031158542845e-06,
+      "loss": 2.7706,
+      "step": 16070
+    },
+    {
+      "epoch": 0.9330122718964867,
+      "grad_norm": 0.09434488415718079,
+      "learning_rate": 7.337921356146981e-06,
+      "loss": 2.7694,
+      "step": 16080
+    },
+    {
+      "epoch": 0.9335925034088601,
+      "grad_norm": 0.09737717360258102,
+      "learning_rate": 7.211917208871665e-06,
+      "loss": 2.7674,
+      "step": 16090
+    },
+    {
+      "epoch": 0.9341727349212335,
+      "grad_norm": 0.09725455194711685,
+      "learning_rate": 7.086991137988906e-06,
+      "loss": 2.7639,
+      "step": 16100
+    },
+    {
+      "epoch": 0.9347529664336071,
+      "grad_norm": 0.10136746615171432,
+      "learning_rate": 6.963143603489518e-06,
+      "loss": 2.7677,
+      "step": 16110
+    },
+    {
+      "epoch": 0.9353331979459805,
+      "grad_norm": 0.09756675362586975,
+      "learning_rate": 6.840375061393122e-06,
+      "loss": 2.765,
+      "step": 16120
+    },
+    {
+      "epoch": 0.9359134294583539,
+      "grad_norm": 0.09939330816268921,
+      "learning_rate": 6.718685963746318e-06,
+      "loss": 2.7751,
+      "step": 16130
+    },
+    {
+      "epoch": 0.9364936609707273,
+      "grad_norm": 0.09836092591285706,
+      "learning_rate": 6.598076758621118e-06,
+      "loss": 2.7828,
+      "step": 16140
+    },
+    {
+      "epoch": 0.9370738924831008,
+      "grad_norm": 0.09677501767873764,
+      "learning_rate": 6.4785478901133506e-06,
+      "loss": 2.769,
+      "step": 16150
+    },
+    {
+      "epoch": 0.9376541239954742,
+      "grad_norm": 0.097322478890419,
+      "learning_rate": 6.360099798340656e-06,
+      "loss": 2.7656,
+      "step": 16160
+    },
+    {
+      "epoch": 0.9382343555078476,
+      "grad_norm": 0.09472298622131348,
+      "learning_rate": 6.242732919441462e-06,
+      "loss": 2.7737,
+      "step": 16170
+    },
+    {
+      "epoch": 0.938814587020221,
+      "grad_norm": 0.09517394751310349,
+      "learning_rate": 6.126447685572844e-06,
+      "loss": 2.7807,
+      "step": 16180
+    },
+    {
+      "epoch": 0.9393948185325945,
+      "grad_norm": 0.09591302275657654,
+      "learning_rate": 6.011244524909198e-06,
+      "loss": 2.7774,
+      "step": 16190
+    },
+    {
+      "epoch": 0.939975050044968,
+      "grad_norm": 0.09797896444797516,
+      "learning_rate": 5.8971238616407405e-06,
+      "loss": 2.7637,
+      "step": 16200
+    },
+    {
+      "epoch": 0.9405552815573414,
+      "grad_norm": 0.09744720160961151,
+      "learning_rate": 5.7840861159715425e-06,
+      "loss": 2.7773,
+      "step": 16210
+    },
+    {
+      "epoch": 0.9411355130697148,
+      "grad_norm": 0.09814444929361343,
+      "learning_rate": 5.672131704118565e-06,
+      "loss": 2.7741,
+      "step": 16220
+    },
+    {
+      "epoch": 0.9417157445820883,
+      "grad_norm": 0.09604529291391373,
+      "learning_rate": 5.561261038309628e-06,
+      "loss": 2.7727,
+      "step": 16230
+    },
+    {
+      "epoch": 0.9422959760944617,
+      "grad_norm": 0.09737398475408554,
+      "learning_rate": 5.4514745267821404e-06,
+      "loss": 2.7737,
+      "step": 16240
+    },
+    {
+      "epoch": 0.9428762076068351,
+      "grad_norm": 0.09697815030813217,
+      "learning_rate": 5.342772573781507e-06,
+      "loss": 2.7638,
+      "step": 16250
+    },
+    {
+      "epoch": 0.9434564391192085,
+      "grad_norm": 0.09917178004980087,
+      "learning_rate": 5.235155579559725e-06,
+      "loss": 2.7709,
+      "step": 16260
+    },
+    {
+      "epoch": 0.944036670631582,
+      "grad_norm": 0.096290223300457,
+      "learning_rate": 5.128623940373888e-06,
+      "loss": 2.7674,
+      "step": 16270
+    },
+    {
+      "epoch": 0.9446169021439554,
+      "grad_norm": 0.09504272043704987,
+      "learning_rate": 5.023178048484589e-06,
+      "loss": 2.7694,
+      "step": 16280
+    },
+    {
+      "epoch": 0.9451971336563288,
+      "grad_norm": 0.09743209183216095,
+      "learning_rate": 4.91881829215468e-06,
+      "loss": 2.781,
+      "step": 16290
+    },
+    {
+      "epoch": 0.9457773651687024,
+      "grad_norm": 0.09843679517507553,
+      "learning_rate": 4.815545055647718e-06,
+      "loss": 2.776,
+      "step": 16300
+    },
+    {
+      "epoch": 0.9463575966810758,
+      "grad_norm": 0.0955999493598938,
+      "learning_rate": 4.713358719226523e-06,
+      "loss": 2.7789,
+      "step": 16310
+    },
+    {
+      "epoch": 0.9469378281934492,
+      "grad_norm": 0.09576351940631866,
+      "learning_rate": 4.612259659151984e-06,
+      "loss": 2.7716,
+      "step": 16320
+    },
+    {
+      "epoch": 0.9475180597058226,
+      "grad_norm": 0.09730935841798782,
+      "learning_rate": 4.512248247681394e-06,
+      "loss": 2.7802,
+      "step": 16330
+    },
+    {
+      "epoch": 0.9480982912181961,
+      "grad_norm": 0.09646177291870117,
+      "learning_rate": 4.413324853067213e-06,
+      "loss": 2.7765,
+      "step": 16340
+    },
+    {
+      "epoch": 0.9486785227305695,
+      "grad_norm": 0.09553349018096924,
+      "learning_rate": 4.3154898395557744e-06,
+      "loss": 2.778,
+      "step": 16350
+    },
+    {
+      "epoch": 0.9492587542429429,
+      "grad_norm": 0.09604230523109436,
+      "learning_rate": 4.218743567385852e-06,
+      "loss": 2.78,
+      "step": 16360
+    },
+    {
+      "epoch": 0.9498389857553163,
+      "grad_norm": 0.09518173336982727,
+      "learning_rate": 4.123086392787289e-06,
+      "loss": 2.7695,
+      "step": 16370
+    },
+    {
+      "epoch": 0.9504192172676899,
+      "grad_norm": 0.09625556319952011,
+      "learning_rate": 4.0285186679799406e-06,
+      "loss": 2.7694,
+      "step": 16380
+    },
+    {
+      "epoch": 0.9509994487800633,
+      "grad_norm": 0.09755248576402664,
+      "learning_rate": 3.935040741171969e-06,
+      "loss": 2.7625,
+      "step": 16390
+    },
+    {
+      "epoch": 0.9515796802924367,
+      "grad_norm": 0.09465952962636948,
+      "learning_rate": 3.842652956558945e-06,
+      "loss": 2.7658,
+      "step": 16400
+    },
+    {
+      "epoch": 0.9521599118048101,
+      "grad_norm": 0.0960998460650444,
+      "learning_rate": 3.7513556543223855e-06,
+      "loss": 2.7846,
+      "step": 16410
+    },
+    {
+      "epoch": 0.9527401433171836,
+      "grad_norm": 0.09892145544290543,
+      "learning_rate": 3.6611491706284856e-06,
+      "loss": 2.7708,
+      "step": 16420
+    },
+    {
+      "epoch": 0.953320374829557,
+      "grad_norm": 0.09714221954345703,
+      "learning_rate": 3.572033837626953e-06,
+      "loss": 2.7874,
+      "step": 16430
+    },
+    {
+      "epoch": 0.9539006063419304,
+      "grad_norm": 0.09727420657873154,
+      "learning_rate": 3.484009983449809e-06,
+      "loss": 2.7834,
+      "step": 16440
+    },
+    {
+      "epoch": 0.9544808378543038,
+      "grad_norm": 0.09665530920028687,
+      "learning_rate": 3.397077932210124e-06,
+      "loss": 2.7726,
+      "step": 16450
+    },
+    {
+      "epoch": 0.9550610693666773,
+      "grad_norm": 0.09558922797441483,
+      "learning_rate": 3.3112380040008156e-06,
+      "loss": 2.7723,
+      "step": 16460
+    },
+    {
+      "epoch": 0.9556413008790507,
+      "grad_norm": 0.0972527414560318,
+      "learning_rate": 3.2264905148934208e-06,
+      "loss": 2.772,
+      "step": 16470
+    },
+    {
+      "epoch": 0.9562215323914242,
+      "grad_norm": 0.09882599860429764,
+      "learning_rate": 3.142835776937158e-06,
+      "loss": 2.7685,
+      "step": 16480
+    },
+    {
+      "epoch": 0.9568017639037976,
+      "grad_norm": 0.09505190700292587,
+      "learning_rate": 3.060274098157467e-06,
+      "loss": 2.7694,
+      "step": 16490
+    },
+    {
+      "epoch": 0.9573819954161711,
+      "grad_norm": 0.09600254893302917,
+      "learning_rate": 2.9788057825551714e-06,
+      "loss": 2.7778,
+      "step": 16500
+    },
+    {
+      "epoch": 0.9579622269285445,
+      "grad_norm": 0.09696151316165924,
+      "learning_rate": 2.8984311301050835e-06,
+      "loss": 2.784,
+      "step": 16510
+    },
+    {
+      "epoch": 0.9585424584409179,
+      "grad_norm": 0.09621264785528183,
+      "learning_rate": 2.819150436755135e-06,
+      "loss": 2.7668,
+      "step": 16520
+    },
+    {
+      "epoch": 0.9591226899532914,
+      "grad_norm": 0.09673577547073364,
+      "learning_rate": 2.7409639944251162e-06,
+      "loss": 2.774,
+      "step": 16530
+    },
+    {
+      "epoch": 0.9597029214656648,
+      "grad_norm": 0.09513070434331894,
+      "learning_rate": 2.6638720910056697e-06,
+      "loss": 2.7783,
+      "step": 16540
+    },
+    {
+      "epoch": 0.9602831529780382,
+      "grad_norm": 0.09311112761497498,
+      "learning_rate": 2.587875010357332e-06,
+      "loss": 2.7665,
+      "step": 16550
+    },
+    {
+      "epoch": 0.9608633844904116,
+      "grad_norm": 0.09406144171953201,
+      "learning_rate": 2.5129730323092622e-06,
+      "loss": 2.7671,
+      "step": 16560
+    },
+    {
+      "epoch": 0.9614436160027852,
+      "grad_norm": 0.09770730882883072,
+      "learning_rate": 2.439166432658446e-06,
+      "loss": 2.7673,
+      "step": 16570
+    },
+    {
+      "epoch": 0.9620238475151586,
+      "grad_norm": 0.09938254207372665,
+      "learning_rate": 2.366455483168428e-06,
+      "loss": 2.7637,
+      "step": 16580
+    },
+    {
+      "epoch": 0.962604079027532,
+      "grad_norm": 0.09504234790802002,
+      "learning_rate": 2.2948404515686136e-06,
+      "loss": 2.7708,
+      "step": 16590
+    },
+    {
+      "epoch": 0.9631843105399054,
+      "grad_norm": 0.09619156271219254,
+      "learning_rate": 2.2243216015530362e-06,
+      "loss": 2.7716,
+      "step": 16600
+    },
+    {
+      "epoch": 0.9637645420522789,
+      "grad_norm": 0.09520803391933441,
+      "learning_rate": 2.1548991927794244e-06,
+      "loss": 2.771,
+      "step": 16610
+    },
+    {
+      "epoch": 0.9643447735646523,
+      "grad_norm": 0.09521950781345367,
+      "learning_rate": 2.0865734808684697e-06,
+      "loss": 2.7679,
+      "step": 16620
+    },
+    {
+      "epoch": 0.9649250050770257,
+      "grad_norm": 0.09744451195001602,
+      "learning_rate": 2.0193447174025268e-06,
+      "loss": 2.7715,
+      "step": 16630
+    },
+    {
+      "epoch": 0.9655052365893991,
+      "grad_norm": 0.09531662613153458,
+      "learning_rate": 1.953213149924948e-06,
+      "loss": 2.7824,
+      "step": 16640
+    },
+    {
+      "epoch": 0.9660854681017726,
+      "grad_norm": 0.09525689482688904,
+      "learning_rate": 1.8881790219391512e-06,
+      "loss": 2.7694,
+      "step": 16650
+    },
+    {
+      "epoch": 0.9666656996141461,
+      "grad_norm": 0.09457177668809891,
+      "learning_rate": 1.8242425729075527e-06,
+      "loss": 2.7588,
+      "step": 16660
+    },
+    {
+      "epoch": 0.9672459311265195,
+      "grad_norm": 0.09685463458299637,
+      "learning_rate": 1.7614040382508687e-06,
+      "loss": 2.7714,
+      "step": 16670
+    },
+    {
+      "epoch": 0.9678261626388929,
+      "grad_norm": 0.09774652868509293,
+      "learning_rate": 1.6996636493471494e-06,
+      "loss": 2.7683,
+      "step": 16680
+    },
+    {
+      "epoch": 0.9684063941512664,
+      "grad_norm": 0.09525836259126663,
+      "learning_rate": 1.6390216335309792e-06,
+      "loss": 2.77,
+      "step": 16690
+    },
+    {
+      "epoch": 0.9689866256636398,
+      "grad_norm": 0.09421420842409134,
+      "learning_rate": 1.5794782140926775e-06,
+      "loss": 2.7723,
+      "step": 16700
+    },
+    {
+      "epoch": 0.9695668571760132,
+      "grad_norm": 0.09693361073732376,
+      "learning_rate": 1.5210336102772668e-06,
+      "loss": 2.772,
+      "step": 16710
+    },
+    {
+      "epoch": 0.9701470886883866,
+      "grad_norm": 0.09740012139081955,
+      "learning_rate": 1.463688037283972e-06,
+      "loss": 2.7673,
+      "step": 16720
+    },
+    {
+      "epoch": 0.9707273202007601,
+      "grad_norm": 0.09596629440784454,
+      "learning_rate": 1.4074417062651221e-06,
+      "loss": 2.7878,
+      "step": 16730
+    },
+    {
+      "epoch": 0.9713075517131335,
+      "grad_norm": 0.09561031311750412,
+      "learning_rate": 1.3522948243256503e-06,
+      "loss": 2.7728,
+      "step": 16740
+    },
+    {
+      "epoch": 0.971887783225507,
+      "grad_norm": 0.09793524444103241,
+      "learning_rate": 1.2982475945221615e-06,
+      "loss": 2.7718,
+      "step": 16750
+    },
+    {
+      "epoch": 0.9724680147378804,
+      "grad_norm": 0.09407012164592743,
+      "learning_rate": 1.245300215862166e-06,
+      "loss": 2.7797,
+      "step": 16760
+    },
+    {
+      "epoch": 0.9730482462502539,
+      "grad_norm": 0.09444325417280197,
+      "learning_rate": 1.1934528833035139e-06,
+      "loss": 2.7725,
+      "step": 16770
+    },
+    {
+      "epoch": 0.9736284777626273,
+      "grad_norm": 0.09787797182798386,
+      "learning_rate": 1.1427057877534951e-06,
+      "loss": 2.7691,
+      "step": 16780
+    },
+    {
+      "epoch": 0.9742087092750007,
+      "grad_norm": 0.09456036239862442,
+      "learning_rate": 1.09305911606824e-06,
+      "loss": 2.7766,
+      "step": 16790
+    },
+    {
+      "epoch": 0.9747889407873742,
+      "grad_norm": 0.095250204205513,
+      "learning_rate": 1.044513051051954e-06,
+      "loss": 2.7701,
+      "step": 16800
+    },
+    {
+      "epoch": 0.9753691722997476,
+      "grad_norm": 0.09521818906068802,
+      "learning_rate": 9.970677714563835e-07,
+      "loss": 2.7734,
+      "step": 16810
+    },
+    {
+      "epoch": 0.975949403812121,
+      "grad_norm": 0.09462135285139084,
+      "learning_rate": 9.507234519800178e-07,
+      "loss": 2.7705,
+      "step": 16820
+    },
+    {
+      "epoch": 0.9765296353244944,
+      "grad_norm": 0.09560775011777878,
+      "learning_rate": 9.054802632674551e-07,
+      "loss": 2.7691,
+      "step": 16830
+    },
+    {
+      "epoch": 0.977109866836868,
+      "grad_norm": 0.09410873800516129,
+      "learning_rate": 8.61338371908904e-07,
+      "loss": 2.7787,
+      "step": 16840
+    },
+    {
+      "epoch": 0.9776900983492414,
+      "grad_norm": 0.09606259316205978,
+      "learning_rate": 8.18297940439383e-07,
+      "loss": 2.7766,
+      "step": 16850
+    },
+    {
+      "epoch": 0.9782703298616148,
+      "grad_norm": 0.09549134224653244,
+      "learning_rate": 7.763591273382885e-07,
+      "loss": 2.7701,
+      "step": 16860
+    },
+    {
+      "epoch": 0.9788505613739882,
+      "grad_norm": 0.09225918352603912,
+      "learning_rate": 7.355220870287615e-07,
+      "loss": 2.7635,
+      "step": 16870
+    },
+    {
+      "epoch": 0.9794307928863617,
+      "grad_norm": 0.09305543452501297,
+      "learning_rate": 6.95786969876988e-07,
+      "loss": 2.7659,
+      "step": 16880
+    },
+    {
+      "epoch": 0.9800110243987351,
+      "grad_norm": 0.09393244236707687,
+      "learning_rate": 6.571539221918997e-07,
+      "loss": 2.7743,
+      "step": 16890
+    },
+    {
+      "epoch": 0.9805912559111085,
+      "grad_norm": 0.09278815984725952,
+      "learning_rate": 6.196230862244078e-07,
+      "loss": 2.78,
+      "step": 16900
+    },
+    {
+      "epoch": 0.9811714874234819,
+      "grad_norm": 0.09347451478242874,
+      "learning_rate": 5.831946001669697e-07,
+      "loss": 2.7747,
+      "step": 16910
+    },
+    {
+      "epoch": 0.9817517189358554,
+      "grad_norm": 0.09540887176990509,
+      "learning_rate": 5.478685981530894e-07,
+      "loss": 2.7758,
+      "step": 16920
+    },
+    {
+      "epoch": 0.9823319504482289,
+      "grad_norm": 0.09621070325374603,
+      "learning_rate": 5.136452102567856e-07,
+      "loss": 2.7713,
+      "step": 16930
+    },
+    {
+      "epoch": 0.9829121819606023,
+      "grad_norm": 0.09409264475107193,
+      "learning_rate": 4.805245624922238e-07,
+      "loss": 2.7778,
+      "step": 16940
+    },
+    {
+      "epoch": 0.9834924134729757,
+      "grad_norm": 0.09619985520839691,
+      "learning_rate": 4.4850677681301795e-07,
+      "loss": 2.7701,
+      "step": 16950
+    },
+    {
+      "epoch": 0.9840726449853492,
+      "grad_norm": 0.09401355683803558,
+      "learning_rate": 4.1759197111206344e-07,
+      "loss": 2.7689,
+      "step": 16960
+    },
+    {
+      "epoch": 0.9846528764977226,
+      "grad_norm": 0.09698129445314407,
+      "learning_rate": 3.877802592209045e-07,
+      "loss": 2.7703,
+      "step": 16970
+    },
+    {
+      "epoch": 0.985233108010096,
+      "grad_norm": 0.09333529323339462,
+      "learning_rate": 3.590717509093677e-07,
+      "loss": 2.7784,
+      "step": 16980
+    },
+    {
+      "epoch": 0.9858133395224694,
+      "grad_norm": 0.09353555738925934,
+      "learning_rate": 3.3146655188519557e-07,
+      "loss": 2.7687,
+      "step": 16990
+    },
+    {
+      "epoch": 0.9863935710348429,
+      "grad_norm": 0.09438835084438324,
+      "learning_rate": 3.0496476379364697e-07,
+      "loss": 2.7665,
+      "step": 17000
+    },
+    {
+      "epoch": 0.9863935710348429,
+      "eval_loss": 2.735684633255005,
+      "eval_runtime": 3.2561,
+      "eval_samples_per_second": 1329.798,
+      "eval_steps_per_second": 2.764,
+      "step": 17000
+    },
+    {
+      "epoch": 0.9869738025472163,
+      "grad_norm": 0.09504197537899017,
+      "learning_rate": 2.7956648421703087e-07,
+      "loss": 2.7762,
+      "step": 17010
+    },
+    {
+      "epoch": 0.9875540340595897,
+      "grad_norm": 0.09602217376232147,
+      "learning_rate": 2.5527180667453963e-07,
+      "loss": 2.7673,
+      "step": 17020
+    },
+    {
+      "epoch": 0.9881342655719633,
+      "grad_norm": 0.09483738243579865,
+      "learning_rate": 2.3208082062168288e-07,
+      "loss": 2.7705,
+      "step": 17030
+    },
+    {
+      "epoch": 0.9887144970843367,
+      "grad_norm": 0.09395676851272583,
+      "learning_rate": 2.0999361145008775e-07,
+      "loss": 2.7692,
+      "step": 17040
+    },
+    {
+      "epoch": 0.9892947285967101,
+      "grad_norm": 0.09432484954595566,
+      "learning_rate": 1.8901026048719902e-07,
+      "loss": 2.7707,
+      "step": 17050
+    },
+    {
+      "epoch": 0.9898749601090835,
+      "grad_norm": 0.09382540732622147,
+      "learning_rate": 1.6913084499587948e-07,
+      "loss": 2.7788,
+      "step": 17060
+    },
+    {
+      "epoch": 0.990455191621457,
+      "grad_norm": 0.09619873762130737,
+      "learning_rate": 1.5035543817427663e-07,
+      "loss": 2.7604,
+      "step": 17070
+    },
+    {
+      "epoch": 0.9910354231338304,
+      "grad_norm": 0.09365525841712952,
+      "learning_rate": 1.3268410915532323e-07,
+      "loss": 2.7785,
+      "step": 17080
+    },
+    {
+      "epoch": 0.9916156546462038,
+      "grad_norm": 0.09718578308820724,
+      "learning_rate": 1.1611692300680376e-07,
+      "loss": 2.7745,
+      "step": 17090
+    },
+    {
+      "epoch": 0.9921958861585772,
+      "grad_norm": 0.0956762507557869,
+      "learning_rate": 1.0065394073075494e-07,
+      "loss": 2.7813,
+      "step": 17100
+    },
+    {
+      "epoch": 0.9927761176709508,
+      "grad_norm": 0.09347262978553772,
+      "learning_rate": 8.629521926353244e-08,
+      "loss": 2.7714,
+      "step": 17110
+    },
+    {
+      "epoch": 0.9933563491833242,
+      "grad_norm": 0.09415694326162338,
+      "learning_rate": 7.304081147544439e-08,
+      "loss": 2.7837,
+      "step": 17120
+    },
+    {
+      "epoch": 0.9939365806956976,
+      "grad_norm": 0.09390881657600403,
+      "learning_rate": 6.089076617058486e-08,
+      "loss": 2.7725,
+      "step": 17130
+    },
+    {
+      "epoch": 0.994516812208071,
+      "grad_norm": 0.09363935142755508,
+      "learning_rate": 4.984512808673402e-08,
+      "loss": 2.776,
+      "step": 17140
+    },
+    {
+      "epoch": 0.9950970437204445,
+      "grad_norm": 0.0957217812538147,
+      "learning_rate": 3.9903937895091606e-08,
+      "loss": 2.7731,
+      "step": 17150
+    },
+    {
+      "epoch": 0.9956772752328179,
+      "grad_norm": 0.09717927128076553,
+      "learning_rate": 3.1067232200110426e-08,
+      "loss": 2.7703,
+      "step": 17160
+    },
+    {
+      "epoch": 0.9962575067451913,
+      "grad_norm": 0.09413953870534897,
+      "learning_rate": 2.333504353952964e-08,
+      "loss": 2.7733,
+      "step": 17170
+    },
+    {
+      "epoch": 0.9968377382575647,
+      "grad_norm": 0.09774868190288544,
+      "learning_rate": 1.670740038400842e-08,
+      "loss": 2.7658,
+      "step": 17180
+    },
+    {
+      "epoch": 0.9974179697699382,
+      "grad_norm": 0.09658750146627426,
+      "learning_rate": 1.1184327137292448e-08,
+      "loss": 2.7734,
+      "step": 17190
+    },
+    {
+      "epoch": 0.9979982012823116,
+      "grad_norm": 0.0932522714138031,
+      "learning_rate": 6.765844135847576e-09,
+      "loss": 2.7708,
+      "step": 17200
+    },
+    {
+      "epoch": 0.9985784327946851,
+      "grad_norm": 0.09543392807245255,
+      "learning_rate": 3.4519676490596393e-09,
+      "loss": 2.7746,
+      "step": 17210
+    },
+    {
+      "epoch": 0.9991586643070585,
+      "grad_norm": 0.09391433745622635,
+      "learning_rate": 1.2427098789347111e-09,
+      "loss": 2.7707,
+      "step": 17220
+    },
+    {
+      "epoch": 0.999738895819432,
+      "grad_norm": 0.0975637212395668,
+      "learning_rate": 1.3807896016571064e-10,
+      "loss": 2.77,
+      "step": 17230
+    },
+    {
+      "epoch": 0.9999709884243814,
+      "step": 17234,
+      "total_flos": 4.402536853133695e+19,
+      "train_loss": 3.082940493684724,
+      "train_runtime": 20985.9807,
+      "train_samples_per_second": 420.462,
+      "train_steps_per_second": 0.821
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 17234,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.402536853133695e+19,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}