diff --git "a/contextlm_gpt2_base/trainer_state.json" "b/contextlm_gpt2_base/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/contextlm_gpt2_base/trainer_state.json"
@@ -0,0 +1,12240 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9999709884243814,
+  "eval_steps": 1000,
+  "global_step": 17234,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.000580231512373437,
+      "grad_norm": 4.24833869934082,
+      "learning_rate": 1.0440835266821346e-05,
+      "loss": 10.74,
+      "step": 10
+    },
+    {
+      "epoch": 0.001160463024746874,
+      "grad_norm": 2.10276198387146,
+      "learning_rate": 2.2041763341067284e-05,
+      "loss": 9.9422,
+      "step": 20
+    },
+    {
+      "epoch": 0.001740694537120311,
+      "grad_norm": 2.0392377376556396,
+      "learning_rate": 3.364269141531322e-05,
+      "loss": 9.4932,
+      "step": 30
+    },
+    {
+      "epoch": 0.002320926049493748,
+      "grad_norm": 1.8358850479125977,
+      "learning_rate": 4.5243619489559165e-05,
+      "loss": 9.1248,
+      "step": 40
+    },
+    {
+      "epoch": 0.002901157561867185,
+      "grad_norm": 1.6119049787521362,
+      "learning_rate": 5.68445475638051e-05,
+      "loss": 8.6786,
+      "step": 50
+    },
+    {
+      "epoch": 0.003481389074240622,
+      "grad_norm": 1.438369870185852,
+      "learning_rate": 6.844547563805105e-05,
+      "loss": 8.2109,
+      "step": 60
+    },
+    {
+      "epoch": 0.004061620586614059,
+      "grad_norm": 1.2206534147262573,
+      "learning_rate": 8.004640371229699e-05,
+      "loss": 7.7847,
+      "step": 70
+    },
+    {
+      "epoch": 0.004641852098987496,
+      "grad_norm": 1.797905445098877,
+      "learning_rate": 9.164733178654293e-05,
+      "loss": 7.4453,
+      "step": 80
+    },
+    {
+      "epoch": 0.005222083611360933,
+      "grad_norm": 0.9732237458229065,
+      "learning_rate": 0.00010324825986078886,
+      "loss": 7.2239,
+      "step": 90
+    },
+    {
+      "epoch": 0.00580231512373437,
+      "grad_norm": 0.704902172088623,
+      "learning_rate": 0.0001148491879350348,
+      "loss": 7.0716,
+      "step": 100
+    },
+    {
+      "epoch": 0.006382546636107807,
+      "grad_norm": 0.7608431577682495,
+      "learning_rate": 0.00012645011600928075,
+      "loss": 6.9281,
+      "step": 110
+    },
+    {
+      "epoch": 0.006962778148481244,
+      "grad_norm": 0.5090209245681763,
+      "learning_rate": 0.00013805104408352666,
+      "loss": 6.7721,
+      "step": 120
+    },
+    {
+      "epoch": 0.007543009660854681,
+      "grad_norm": 0.4300777018070221,
+      "learning_rate": 0.00014965197215777263,
+      "loss": 6.6322,
+      "step": 130
+    },
+    {
+      "epoch": 0.008123241173228117,
+      "grad_norm": 0.8929088115692139,
+      "learning_rate": 0.00016125290023201856,
+      "loss": 6.5415,
+      "step": 140
+    },
+    {
+      "epoch": 0.008703472685601555,
+      "grad_norm": 0.3701019287109375,
+      "learning_rate": 0.0001728538283062645,
+      "loss": 6.4223,
+      "step": 150
+    },
+    {
+      "epoch": 0.009283704197974993,
+      "grad_norm": 0.5044598579406738,
+      "learning_rate": 0.00018445475638051046,
+      "loss": 6.3371,
+      "step": 160
+    },
+    {
+      "epoch": 0.009863935710348428,
+      "grad_norm": 0.42209455370903015,
+      "learning_rate": 0.00019605568445475637,
+      "loss": 6.2547,
+      "step": 170
+    },
+    {
+      "epoch": 0.010444167222721866,
+      "grad_norm": 0.3204844295978546,
+      "learning_rate": 0.00020765661252900234,
+      "loss": 6.1801,
+      "step": 180
+    },
+    {
+      "epoch": 0.011024398735095304,
+      "grad_norm": 0.7174881100654602,
+      "learning_rate": 0.00021925754060324827,
+      "loss": 6.1174,
+      "step": 190
+    },
+    {
+      "epoch": 0.01160463024746874,
+      "grad_norm": 0.37421539425849915,
+      "learning_rate": 0.0002308584686774942,
+      "loss": 6.0759,
+      "step": 200
+    },
+    {
+      "epoch": 0.012184861759842177,
+      "grad_norm": 0.5762574672698975,
+      "learning_rate": 0.00024245939675174015,
+      "loss": 5.9975,
+      "step": 210
+    },
+    {
+      "epoch": 0.012765093272215615,
+      "grad_norm": 0.29894348978996277,
+      "learning_rate": 0.00025406032482598606,
+      "loss": 5.9512,
+      "step": 220
+    },
+    {
+      "epoch": 0.01334532478458905,
+      "grad_norm": 0.37387117743492126,
+      "learning_rate": 0.000265661252900232,
+      "loss": 5.9045,
+      "step": 230
+    },
+    {
+      "epoch": 0.013925556296962488,
+      "grad_norm": 0.4004443883895874,
+      "learning_rate": 0.000277262180974478,
+      "loss": 5.872,
+      "step": 240
+    },
+    {
+      "epoch": 0.014505787809335926,
+      "grad_norm": 0.43937498331069946,
+      "learning_rate": 0.0002888631090487239,
+      "loss": 5.8183,
+      "step": 250
+    },
+    {
+      "epoch": 0.015086019321709361,
+      "grad_norm": 0.3658200204372406,
+      "learning_rate": 0.0003004640371229698,
+      "loss": 5.7888,
+      "step": 260
+    },
+    {
+      "epoch": 0.0156662508340828,
+      "grad_norm": 0.3341761827468872,
+      "learning_rate": 0.0003120649651972158,
+      "loss": 5.7416,
+      "step": 270
+    },
+    {
+      "epoch": 0.016246482346456235,
+      "grad_norm": 0.46115365624427795,
+      "learning_rate": 0.00032366589327146174,
+      "loss": 5.7067,
+      "step": 280
+    },
+    {
+      "epoch": 0.016826713858829674,
+      "grad_norm": 0.3009159564971924,
+      "learning_rate": 0.00033526682134570767,
+      "loss": 5.665,
+      "step": 290
+    },
+    {
+      "epoch": 0.01740694537120311,
+      "grad_norm": 0.4389355778694153,
+      "learning_rate": 0.0003468677494199536,
+      "loss": 5.6231,
+      "step": 300
+    },
+    {
+      "epoch": 0.017987176883576546,
+      "grad_norm": 0.4746924042701721,
+      "learning_rate": 0.00035846867749419955,
+      "loss": 5.5983,
+      "step": 310
+    },
+    {
+      "epoch": 0.018567408395949985,
+      "grad_norm": 0.33507513999938965,
+      "learning_rate": 0.0003700696055684455,
+      "loss": 5.5685,
+      "step": 320
+    },
+    {
+      "epoch": 0.01914763990832342,
+      "grad_norm": 0.6776261329650879,
+      "learning_rate": 0.0003816705336426914,
+      "loss": 5.5087,
+      "step": 330
+    },
+    {
+      "epoch": 0.019727871420696857,
+      "grad_norm": 0.4664747416973114,
+      "learning_rate": 0.00039327146171693736,
+      "loss": 5.4964,
+      "step": 340
+    },
+    {
+      "epoch": 0.020308102933070296,
+      "grad_norm": 0.4787660539150238,
+      "learning_rate": 0.0004048723897911833,
+      "loss": 5.4473,
+      "step": 350
+    },
+    {
+      "epoch": 0.020888334445443732,
+      "grad_norm": 0.41453346610069275,
+      "learning_rate": 0.00041647331786542923,
+      "loss": 5.4019,
+      "step": 360
+    },
+    {
+      "epoch": 0.021468565957817168,
+      "grad_norm": 0.5684117078781128,
+      "learning_rate": 0.0004280742459396752,
+      "loss": 5.3821,
+      "step": 370
+    },
+    {
+      "epoch": 0.022048797470190607,
+      "grad_norm": 0.34503793716430664,
+      "learning_rate": 0.0004396751740139211,
+      "loss": 5.3503,
+      "step": 380
+    },
+    {
+      "epoch": 0.022629028982564043,
+      "grad_norm": 0.465599924325943,
+      "learning_rate": 0.00045127610208816704,
+      "loss": 5.3183,
+      "step": 390
+    },
+    {
+      "epoch": 0.02320926049493748,
+      "grad_norm": 0.3912290036678314,
+      "learning_rate": 0.000462877030162413,
+      "loss": 5.2937,
+      "step": 400
+    },
+    {
+      "epoch": 0.023789492007310918,
+      "grad_norm": 0.42982372641563416,
+      "learning_rate": 0.00047447795823665897,
+      "loss": 5.2526,
+      "step": 410
+    },
+    {
+      "epoch": 0.024369723519684354,
+      "grad_norm": 0.6874813437461853,
+      "learning_rate": 0.00048607888631090485,
+      "loss": 5.2364,
+      "step": 420
+    },
+    {
+      "epoch": 0.02494995503205779,
+      "grad_norm": 0.3257123529911041,
+      "learning_rate": 0.0004976798143851508,
+      "loss": 5.2107,
+      "step": 430
+    },
+    {
+      "epoch": 0.02553018654443123,
+      "grad_norm": 0.43527376651763916,
+      "learning_rate": 0.0005092807424593968,
+      "loss": 5.1765,
+      "step": 440
+    },
+    {
+      "epoch": 0.026110418056804665,
+      "grad_norm": 0.36336761713027954,
+      "learning_rate": 0.0005208816705336427,
+      "loss": 5.1535,
+      "step": 450
+    },
+    {
+      "epoch": 0.0266906495691781,
+      "grad_norm": 0.5609498023986816,
+      "learning_rate": 0.0005324825986078887,
+      "loss": 5.1106,
+      "step": 460
+    },
+    {
+      "epoch": 0.02727088108155154,
+      "grad_norm": 0.3533839285373688,
+      "learning_rate": 0.0005440835266821345,
+      "loss": 5.0758,
+      "step": 470
+    },
+    {
+      "epoch": 0.027851112593924976,
+      "grad_norm": 0.3420592248439789,
+      "learning_rate": 0.0005556844547563805,
+      "loss": 5.0461,
+      "step": 480
+    },
+    {
+      "epoch": 0.028431344106298412,
+      "grad_norm": 0.34275180101394653,
+      "learning_rate": 0.0005672853828306265,
+      "loss": 5.0096,
+      "step": 490
+    },
+    {
+      "epoch": 0.02901157561867185,
+      "grad_norm": 0.36812710762023926,
+      "learning_rate": 0.0005788863109048724,
+      "loss": 4.989,
+      "step": 500
+    },
+    {
+      "epoch": 0.029591807131045287,
+      "grad_norm": 0.30121806263923645,
+      "learning_rate": 0.0005904872389791184,
+      "loss": 4.947,
+      "step": 510
+    },
+    {
+      "epoch": 0.030172038643418723,
+      "grad_norm": 0.30486181378364563,
+      "learning_rate": 0.0006020881670533644,
+      "loss": 4.9224,
+      "step": 520
+    },
+    {
+      "epoch": 0.030752270155792162,
+      "grad_norm": 0.36893951892852783,
+      "learning_rate": 0.0006136890951276102,
+      "loss": 4.8849,
+      "step": 530
+    },
+    {
+      "epoch": 0.0313325016681656,
+      "grad_norm": 0.3596450090408325,
+      "learning_rate": 0.0006252900232018562,
+      "loss": 4.8539,
+      "step": 540
+    },
+    {
+      "epoch": 0.031912733180539034,
+      "grad_norm": 0.3460347354412079,
+      "learning_rate": 0.000636890951276102,
+      "loss": 4.8462,
+      "step": 550
+    },
+    {
+      "epoch": 0.03249296469291247,
+      "grad_norm": 0.353222519159317,
+      "learning_rate": 0.000648491879350348,
+      "loss": 4.8129,
+      "step": 560
+    },
+    {
+      "epoch": 0.03307319620528591,
+      "grad_norm": 0.23676836490631104,
+      "learning_rate": 0.000660092807424594,
+      "loss": 4.7744,
+      "step": 570
+    },
+    {
+      "epoch": 0.03365342771765935,
+      "grad_norm": 0.3343792259693146,
+      "learning_rate": 0.0006716937354988399,
+      "loss": 4.7622,
+      "step": 580
+    },
+    {
+      "epoch": 0.034233659230032784,
+      "grad_norm": 0.24879467487335205,
+      "learning_rate": 0.0006832946635730859,
+      "loss": 4.7259,
+      "step": 590
+    },
+    {
+      "epoch": 0.03481389074240622,
+      "grad_norm": 0.3369862139225006,
+      "learning_rate": 0.0006948955916473319,
+      "loss": 4.6865,
+      "step": 600
+    },
+    {
+      "epoch": 0.035394122254779656,
+      "grad_norm": 0.3400874435901642,
+      "learning_rate": 0.0007064965197215777,
+      "loss": 4.6676,
+      "step": 610
+    },
+    {
+      "epoch": 0.03597435376715309,
+      "grad_norm": 0.30212903022766113,
+      "learning_rate": 0.0007180974477958236,
+      "loss": 4.6644,
+      "step": 620
+    },
+    {
+      "epoch": 0.036554585279526534,
+      "grad_norm": 0.27961453795433044,
+      "learning_rate": 0.0007296983758700696,
+      "loss": 4.6348,
+      "step": 630
+    },
+    {
+      "epoch": 0.03713481679189997,
+      "grad_norm": 0.33748018741607666,
+      "learning_rate": 0.0007412993039443155,
+      "loss": 4.5963,
+      "step": 640
+    },
+    {
+      "epoch": 0.037715048304273406,
+      "grad_norm": 0.31729650497436523,
+      "learning_rate": 0.0007529002320185615,
+      "loss": 4.5758,
+      "step": 650
+    },
+    {
+      "epoch": 0.03829527981664684,
+      "grad_norm": 0.22230634093284607,
+      "learning_rate": 0.0007645011600928075,
+      "loss": 4.5496,
+      "step": 660
+    },
+    {
+      "epoch": 0.03887551132902028,
+      "grad_norm": 0.2796823978424072,
+      "learning_rate": 0.0007761020881670534,
+      "loss": 4.5275,
+      "step": 670
+    },
+    {
+      "epoch": 0.039455742841393714,
+      "grad_norm": 0.30923864245414734,
+      "learning_rate": 0.0007877030162412994,
+      "loss": 4.5012,
+      "step": 680
+    },
+    {
+      "epoch": 0.040035974353767156,
+      "grad_norm": 0.2574792504310608,
+      "learning_rate": 0.0007993039443155452,
+      "loss": 4.4765,
+      "step": 690
+    },
+    {
+      "epoch": 0.04061620586614059,
+      "grad_norm": 0.26686057448387146,
+      "learning_rate": 0.0008109048723897911,
+      "loss": 4.4458,
+      "step": 700
+    },
+    {
+      "epoch": 0.04119643737851403,
+      "grad_norm": 0.31044116616249084,
+      "learning_rate": 0.0008225058004640371,
+      "loss": 4.4281,
+      "step": 710
+    },
+    {
+      "epoch": 0.041776668890887464,
+      "grad_norm": 0.27075859904289246,
+      "learning_rate": 0.000834106728538283,
+      "loss": 4.4045,
+      "step": 720
+    },
+    {
+      "epoch": 0.0423569004032609,
+      "grad_norm": 0.25896894931793213,
+      "learning_rate": 0.000845707656612529,
+      "loss": 4.3825,
+      "step": 730
+    },
+    {
+      "epoch": 0.042937131915634336,
+      "grad_norm": 0.20856112241744995,
+      "learning_rate": 0.000857308584686775,
+      "loss": 4.3483,
+      "step": 740
+    },
+    {
+      "epoch": 0.04351736342800778,
+      "grad_norm": 0.3068506121635437,
+      "learning_rate": 0.0008689095127610209,
+      "loss": 4.3236,
+      "step": 750
+    },
+    {
+      "epoch": 0.044097594940381214,
+      "grad_norm": 0.2396797239780426,
+      "learning_rate": 0.0008805104408352669,
+      "loss": 4.3045,
+      "step": 760
+    },
+    {
+      "epoch": 0.04467782645275465,
+      "grad_norm": 0.32250258326530457,
+      "learning_rate": 0.0008921113689095129,
+      "loss": 4.2823,
+      "step": 770
+    },
+    {
+      "epoch": 0.045258057965128086,
+      "grad_norm": 0.28663381934165955,
+      "learning_rate": 0.0009037122969837586,
+      "loss": 4.2603,
+      "step": 780
+    },
+    {
+      "epoch": 0.04583828947750152,
+      "grad_norm": 0.26493045687675476,
+      "learning_rate": 0.0009153132250580046,
+      "loss": 4.2477,
+      "step": 790
+    },
+    {
+      "epoch": 0.04641852098987496,
+      "grad_norm": 0.2331818789243698,
+      "learning_rate": 0.0009269141531322506,
+      "loss": 4.2247,
+      "step": 800
+    },
+    {
+      "epoch": 0.0469987525022484,
+      "grad_norm": 0.31670576333999634,
+      "learning_rate": 0.0009385150812064965,
+      "loss": 4.1969,
+      "step": 810
+    },
+    {
+      "epoch": 0.047578984014621836,
+      "grad_norm": 0.2539173662662506,
+      "learning_rate": 0.0009501160092807425,
+      "loss": 4.1818,
+      "step": 820
+    },
+    {
+      "epoch": 0.04815921552699527,
+      "grad_norm": 0.24318990111351013,
+      "learning_rate": 0.0009617169373549885,
+      "loss": 4.1624,
+      "step": 830
+    },
+    {
+      "epoch": 0.04873944703936871,
+      "grad_norm": 0.2574463486671448,
+      "learning_rate": 0.0009733178654292344,
+      "loss": 4.1577,
+      "step": 840
+    },
+    {
+      "epoch": 0.049319678551742144,
+      "grad_norm": 0.28978243470191956,
+      "learning_rate": 0.0009849187935034804,
+      "loss": 4.1412,
+      "step": 850
+    },
+    {
+      "epoch": 0.04989991006411558,
+      "grad_norm": 0.20802126824855804,
+      "learning_rate": 0.0009965197215777261,
+      "loss": 4.1262,
+      "step": 860
+    },
+    {
+      "epoch": 0.05048014157648902,
+      "grad_norm": 0.1996484100818634,
+      "learning_rate": 0.0009999995489420968,
+      "loss": 4.1093,
+      "step": 870
+    },
+    {
+      "epoch": 0.05106037308886246,
+      "grad_norm": 0.2888406813144684,
+      "learning_rate": 0.0009999973396808558,
+      "loss": 4.0934,
+      "step": 880
+    },
+    {
+      "epoch": 0.051640604601235894,
+      "grad_norm": 0.23817220330238342,
+      "learning_rate": 0.0009999932893770317,
+      "loss": 4.0812,
+      "step": 890
+    },
+    {
+      "epoch": 0.05222083611360933,
+      "grad_norm": 0.21273092925548553,
+      "learning_rate": 0.0009999873980455383,
+      "loss": 4.0675,
+      "step": 900
+    },
+    {
+      "epoch": 0.052801067625982766,
+      "grad_norm": 0.2241075336933136,
+      "learning_rate": 0.000999979665708068,
+      "loss": 4.0543,
+      "step": 910
+    },
+    {
+      "epoch": 0.0533812991383562,
+      "grad_norm": 0.2210296392440796,
+      "learning_rate": 0.000999970092393092,
+      "loss": 4.0374,
+      "step": 920
+    },
+    {
+      "epoch": 0.053961530650729644,
+      "grad_norm": 0.21746701002120972,
+      "learning_rate": 0.0009999586781358604,
+      "loss": 4.0324,
+      "step": 930
+    },
+    {
+      "epoch": 0.05454176216310308,
+      "grad_norm": 0.20182138681411743,
+      "learning_rate": 0.0009999454229784018,
+      "loss": 4.0211,
+      "step": 940
+    },
+    {
+      "epoch": 0.055121993675476516,
+      "grad_norm": 0.20171615481376648,
+      "learning_rate": 0.0009999303269695226,
+      "loss": 4.0011,
+      "step": 950
+    },
+    {
+      "epoch": 0.05570222518784995,
+      "grad_norm": 0.27165260910987854,
+      "learning_rate": 0.0009999133901648083,
+      "loss": 3.9968,
+      "step": 960
+    },
+    {
+      "epoch": 0.05628245670022339,
+      "grad_norm": 0.2524718940258026,
+      "learning_rate": 0.0009998946126266218,
+      "loss": 3.9876,
+      "step": 970
+    },
+    {
+      "epoch": 0.056862688212596824,
+      "grad_norm": 0.18922555446624756,
+      "learning_rate": 0.0009998739944241041,
+      "loss": 3.9756,
+      "step": 980
+    },
+    {
+      "epoch": 0.057442919724970266,
+      "grad_norm": 0.2163887917995453,
+      "learning_rate": 0.0009998515356331734,
+      "loss": 3.953,
+      "step": 990
+    },
+    {
+      "epoch": 0.0580231512373437,
+      "grad_norm": 0.24239082634449005,
+      "learning_rate": 0.0009998272363365254,
+      "loss": 3.9616,
+      "step": 1000
+    },
+    {
+      "epoch": 0.0580231512373437,
+      "eval_loss": 3.8911185264587402,
+      "eval_runtime": 3.2761,
+      "eval_samples_per_second": 1321.706,
+      "eval_steps_per_second": 10.378,
+      "step": 1000
+    },
+    {
+      "epoch": 0.05860338274971714,
+      "grad_norm": 0.19143226742744446,
+      "learning_rate": 0.000999801096623633,
+      "loss": 3.9342,
+      "step": 1010
+    },
+    {
+      "epoch": 0.059183614262090574,
+      "grad_norm": 0.23061209917068481,
+      "learning_rate": 0.000999773116590745,
+      "loss": 3.932,
+      "step": 1020
+    },
+    {
+      "epoch": 0.05976384577446401,
+      "grad_norm": 0.20795617997646332,
+      "learning_rate": 0.0009997432963408865,
+      "loss": 3.9247,
+      "step": 1030
+    },
+    {
+      "epoch": 0.060344077286837446,
+      "grad_norm": 0.22474446892738342,
+      "learning_rate": 0.0009997116359838595,
+      "loss": 3.9318,
+      "step": 1040
+    },
+    {
+      "epoch": 0.06092430879921089,
+      "grad_norm": 0.1934744417667389,
+      "learning_rate": 0.00099967813563624,
+      "loss": 3.9166,
+      "step": 1050
+    },
+    {
+      "epoch": 0.061504540311584324,
+      "grad_norm": 0.22670230269432068,
+      "learning_rate": 0.0009996427954213807,
+      "loss": 3.8964,
+      "step": 1060
+    },
+    {
+      "epoch": 0.06208477182395776,
+      "grad_norm": 0.193163201212883,
+      "learning_rate": 0.0009996056154694072,
+      "loss": 3.9087,
+      "step": 1070
+    },
+    {
+      "epoch": 0.0626650033363312,
+      "grad_norm": 0.26965293288230896,
+      "learning_rate": 0.0009995665959172202,
+      "loss": 3.8943,
+      "step": 1080
+    },
+    {
+      "epoch": 0.06324523484870463,
+      "grad_norm": 0.16498436033725739,
+      "learning_rate": 0.0009995257369084939,
+      "loss": 3.8751,
+      "step": 1090
+    },
+    {
+      "epoch": 0.06382546636107807,
+      "grad_norm": 0.18623340129852295,
+      "learning_rate": 0.0009994830385936754,
+      "loss": 3.8759,
+      "step": 1100
+    },
+    {
+      "epoch": 0.0644056978734515,
+      "grad_norm": 0.18320336937904358,
+      "learning_rate": 0.000999438501129984,
+      "loss": 3.8592,
+      "step": 1110
+    },
+    {
+      "epoch": 0.06498592938582494,
+      "grad_norm": 0.16004326939582825,
+      "learning_rate": 0.0009993921246814119,
+      "loss": 3.8453,
+      "step": 1120
+    },
+    {
+      "epoch": 0.06556616089819838,
+      "grad_norm": 0.20819014310836792,
+      "learning_rate": 0.0009993439094187217,
+      "loss": 3.8493,
+      "step": 1130
+    },
+    {
+      "epoch": 0.06614639241057182,
+      "grad_norm": 0.20609912276268005,
+      "learning_rate": 0.0009992938555194472,
+      "loss": 3.8399,
+      "step": 1140
+    },
+    {
+      "epoch": 0.06672662392294526,
+      "grad_norm": 0.16247253119945526,
+      "learning_rate": 0.0009992419631678921,
+      "loss": 3.8425,
+      "step": 1150
+    },
+    {
+      "epoch": 0.0673068554353187,
+      "grad_norm": 0.2205752432346344,
+      "learning_rate": 0.0009991882325551295,
+      "loss": 3.823,
+      "step": 1160
+    },
+    {
+      "epoch": 0.06788708694769213,
+      "grad_norm": 0.18031305074691772,
+      "learning_rate": 0.0009991326638790008,
+      "loss": 3.8129,
+      "step": 1170
+    },
+    {
+      "epoch": 0.06846731846006557,
+      "grad_norm": 0.21720442175865173,
+      "learning_rate": 0.0009990752573441162,
+      "loss": 3.8177,
+      "step": 1180
+    },
+    {
+      "epoch": 0.069047549972439,
+      "grad_norm": 0.17516951262950897,
+      "learning_rate": 0.000999016013161852,
+      "loss": 3.8144,
+      "step": 1190
+    },
+    {
+      "epoch": 0.06962778148481244,
+      "grad_norm": 0.1756078600883484,
+      "learning_rate": 0.000998954931550352,
+      "loss": 3.8041,
+      "step": 1200
+    },
+    {
+      "epoch": 0.07020801299718588,
+      "grad_norm": 0.19864057004451752,
+      "learning_rate": 0.000998892012734525,
+      "loss": 3.8031,
+      "step": 1210
+    },
+    {
+      "epoch": 0.07078824450955931,
+      "grad_norm": 0.20211303234100342,
+      "learning_rate": 0.0009988272569460442,
+      "loss": 3.8009,
+      "step": 1220
+    },
+    {
+      "epoch": 0.07136847602193275,
+      "grad_norm": 0.16449439525604248,
+      "learning_rate": 0.0009987606644233477,
+      "loss": 3.7916,
+      "step": 1230
+    },
+    {
+      "epoch": 0.07194870753430618,
+      "grad_norm": 0.24472416937351227,
+      "learning_rate": 0.0009986922354116362,
+      "loss": 3.7902,
+      "step": 1240
+    },
+    {
+      "epoch": 0.07252893904667962,
+      "grad_norm": 0.15388889610767365,
+      "learning_rate": 0.000998621970162872,
+      "loss": 3.7747,
+      "step": 1250
+    },
+    {
+      "epoch": 0.07310917055905307,
+      "grad_norm": 0.18148912489414215,
+      "learning_rate": 0.0009985498689357797,
+      "loss": 3.771,
+      "step": 1260
+    },
+    {
+      "epoch": 0.0736894020714265,
+      "grad_norm": 0.20958511531352997,
+      "learning_rate": 0.000998475931995843,
+      "loss": 3.7728,
+      "step": 1270
+    },
+    {
+      "epoch": 0.07426963358379994,
+      "grad_norm": 0.20867913961410522,
+      "learning_rate": 0.000998400159615306,
+      "loss": 3.7641,
+      "step": 1280
+    },
+    {
+      "epoch": 0.07484986509617338,
+      "grad_norm": 0.2014162242412567,
+      "learning_rate": 0.00099832255207317,
+      "loss": 3.7605,
+      "step": 1290
+    },
+    {
+      "epoch": 0.07543009660854681,
+      "grad_norm": 0.19448301196098328,
+      "learning_rate": 0.0009982431096551947,
+      "loss": 3.7562,
+      "step": 1300
+    },
+    {
+      "epoch": 0.07601032812092025,
+      "grad_norm": 0.17145995795726776,
+      "learning_rate": 0.0009981618326538948,
+      "loss": 3.7583,
+      "step": 1310
+    },
+    {
+      "epoch": 0.07659055963329368,
+      "grad_norm": 0.16092616319656372,
+      "learning_rate": 0.000998078721368541,
+      "loss": 3.7439,
+      "step": 1320
+    },
+    {
+      "epoch": 0.07717079114566712,
+      "grad_norm": 0.22316782176494598,
+      "learning_rate": 0.000997993776105158,
+      "loss": 3.7373,
+      "step": 1330
+    },
+    {
+      "epoch": 0.07775102265804056,
+      "grad_norm": 0.17662325501441956,
+      "learning_rate": 0.0009979069971765226,
+      "loss": 3.7337,
+      "step": 1340
+    },
+    {
+      "epoch": 0.07833125417041399,
+      "grad_norm": 0.139273002743721,
+      "learning_rate": 0.0009978183849021645,
+      "loss": 3.7296,
+      "step": 1350
+    },
+    {
+      "epoch": 0.07891148568278743,
+      "grad_norm": 0.2038998305797577,
+      "learning_rate": 0.000997727939608363,
+      "loss": 3.7353,
+      "step": 1360
+    },
+    {
+      "epoch": 0.07949171719516086,
+      "grad_norm": 0.18561561405658722,
+      "learning_rate": 0.0009976356616281474,
+      "loss": 3.7318,
+      "step": 1370
+    },
+    {
+      "epoch": 0.08007194870753431,
+      "grad_norm": 0.20797798037528992,
+      "learning_rate": 0.0009975415513012946,
+      "loss": 3.7242,
+      "step": 1380
+    },
+    {
+      "epoch": 0.08065218021990775,
+      "grad_norm": 0.1697220355272293,
+      "learning_rate": 0.0009974456089743289,
+      "loss": 3.7066,
+      "step": 1390
+    },
+    {
+      "epoch": 0.08123241173228118,
+      "grad_norm": 0.18265104293823242,
+      "learning_rate": 0.0009973478350005199,
+      "loss": 3.7159,
+      "step": 1400
+    },
+    {
+      "epoch": 0.08181264324465462,
+      "grad_norm": 0.18804903328418732,
+      "learning_rate": 0.0009972482297398817,
+      "loss": 3.7042,
+      "step": 1410
+    },
+    {
+      "epoch": 0.08239287475702806,
+      "grad_norm": 0.1842174530029297,
+      "learning_rate": 0.0009971467935591713,
+      "loss": 3.7103,
+      "step": 1420
+    },
+    {
+      "epoch": 0.08297310626940149,
+      "grad_norm": 0.18509556353092194,
+      "learning_rate": 0.000997043526831887,
+      "loss": 3.7066,
+      "step": 1430
+    },
+    {
+      "epoch": 0.08355333778177493,
+      "grad_norm": 0.19408974051475525,
+      "learning_rate": 0.0009969384299382683,
+      "loss": 3.6867,
+      "step": 1440
+    },
+    {
+      "epoch": 0.08413356929414836,
+      "grad_norm": 0.18689195811748505,
+      "learning_rate": 0.0009968315032652924,
+      "loss": 3.6787,
+      "step": 1450
+    },
+    {
+      "epoch": 0.0847138008065218,
+      "grad_norm": 0.17087410390377045,
+      "learning_rate": 0.0009967227472066748,
+      "loss": 3.6792,
+      "step": 1460
+    },
+    {
+      "epoch": 0.08529403231889524,
+      "grad_norm": 0.1809985190629959,
+      "learning_rate": 0.000996612162162867,
+      "loss": 3.6975,
+      "step": 1470
+    },
+    {
+      "epoch": 0.08587426383126867,
+      "grad_norm": 0.2063867449760437,
+      "learning_rate": 0.000996499748541054,
+      "loss": 3.6831,
+      "step": 1480
+    },
+    {
+      "epoch": 0.08645449534364211,
+      "grad_norm": 0.191127747297287,
+      "learning_rate": 0.0009963855067551552,
+      "loss": 3.6779,
+      "step": 1490
+    },
+    {
+      "epoch": 0.08703472685601556,
+      "grad_norm": 0.17394115030765533,
+      "learning_rate": 0.0009962694372258206,
+      "loss": 3.665,
+      "step": 1500
+    },
+    {
+      "epoch": 0.08761495836838899,
+      "grad_norm": 0.17784751951694489,
+      "learning_rate": 0.0009961515403804303,
+      "loss": 3.6587,
+      "step": 1510
+    },
+    {
+      "epoch": 0.08819518988076243,
+      "grad_norm": 0.2151278853416443,
+      "learning_rate": 0.0009960318166530927,
+      "loss": 3.6526,
+      "step": 1520
+    },
+    {
+      "epoch": 0.08877542139313586,
+      "grad_norm": 0.20224444568157196,
+      "learning_rate": 0.0009959102664846432,
+      "loss": 3.664,
+      "step": 1530
+    },
+    {
+      "epoch": 0.0893556529055093,
+      "grad_norm": 0.18003828823566437,
+      "learning_rate": 0.0009957868903226425,
+      "loss": 3.6487,
+      "step": 1540
+    },
+    {
+      "epoch": 0.08993588441788274,
+      "grad_norm": 0.17379723489284515,
+      "learning_rate": 0.0009956616886213742,
+      "loss": 3.6553,
+      "step": 1550
+    },
+    {
+      "epoch": 0.09051611593025617,
+      "grad_norm": 0.19469398260116577,
+      "learning_rate": 0.0009955346618418443,
+      "loss": 3.6518,
+      "step": 1560
+    },
+    {
+      "epoch": 0.09109634744262961,
+      "grad_norm": 0.15254022181034088,
+      "learning_rate": 0.0009954058104517788,
+      "loss": 3.6517,
+      "step": 1570
+    },
+    {
+      "epoch": 0.09167657895500304,
+      "grad_norm": 0.19052338600158691,
+      "learning_rate": 0.0009952751349256218,
+      "loss": 3.6479,
+      "step": 1580
+    },
+    {
+      "epoch": 0.09225681046737648,
+      "grad_norm": 0.15514792501926422,
+      "learning_rate": 0.0009951426357445343,
+      "loss": 3.6307,
+      "step": 1590
+    },
+    {
+      "epoch": 0.09283704197974992,
+      "grad_norm": 0.1835625171661377,
+      "learning_rate": 0.0009950083133963923,
+      "loss": 3.6349,
+      "step": 1600
+    },
+    {
+      "epoch": 0.09341727349212335,
+      "grad_norm": 0.1494477540254593,
+      "learning_rate": 0.0009948721683757846,
+      "loss": 3.6373,
+      "step": 1610
+    },
+    {
+      "epoch": 0.0939975050044968,
+      "grad_norm": 0.23332346975803375,
+      "learning_rate": 0.0009947342011840114,
+      "loss": 3.63,
+      "step": 1620
+    },
+    {
+      "epoch": 0.09457773651687024,
+      "grad_norm": 0.1672951877117157,
+      "learning_rate": 0.0009945944123290827,
+      "loss": 3.6305,
+      "step": 1630
+    },
+    {
+      "epoch": 0.09515796802924367,
+      "grad_norm": 0.1747497320175171,
+      "learning_rate": 0.0009944528023257153,
+      "loss": 3.6295,
+      "step": 1640
+    },
+    {
+      "epoch": 0.09573819954161711,
+      "grad_norm": 0.15887069702148438,
+      "learning_rate": 0.0009943093716953321,
+      "loss": 3.6315,
+      "step": 1650
+    },
+    {
+      "epoch": 0.09631843105399054,
+      "grad_norm": 0.1994808316230774,
+      "learning_rate": 0.00099416412096606,
+      "loss": 3.6236,
+      "step": 1660
+    },
+    {
+      "epoch": 0.09689866256636398,
+      "grad_norm": 0.16880349814891815,
+      "learning_rate": 0.0009940170506727273,
+      "loss": 3.6111,
+      "step": 1670
+    },
+    {
+      "epoch": 0.09747889407873742,
+      "grad_norm": 0.2141963392496109,
+      "learning_rate": 0.000993868161356862,
+      "loss": 3.6147,
+      "step": 1680
+    },
+    {
+      "epoch": 0.09805912559111085,
+      "grad_norm": 0.19859230518341064,
+      "learning_rate": 0.0009937174535666904,
+      "loss": 3.6225,
+      "step": 1690
+    },
+    {
+      "epoch": 0.09863935710348429,
+      "grad_norm": 0.21606995165348053,
+      "learning_rate": 0.0009935649278571344,
+      "loss": 3.6035,
+      "step": 1700
+    },
+    {
+      "epoch": 0.09921958861585772,
+      "grad_norm": 0.1618034541606903,
+      "learning_rate": 0.0009934105847898094,
+      "loss": 3.6114,
+      "step": 1710
+    },
+    {
+      "epoch": 0.09979982012823116,
+      "grad_norm": 0.19869261980056763,
+      "learning_rate": 0.0009932544249330229,
+      "loss": 3.6085,
+      "step": 1720
+    },
+    {
+      "epoch": 0.1003800516406046,
+      "grad_norm": 0.20715004205703735,
+      "learning_rate": 0.0009930964488617717,
+      "loss": 3.6056,
+      "step": 1730
+    },
+    {
+      "epoch": 0.10096028315297804,
+      "grad_norm": 0.14460305869579315,
+      "learning_rate": 0.0009929366571577406,
+      "loss": 3.6041,
+      "step": 1740
+    },
+    {
+      "epoch": 0.10154051466535148,
+      "grad_norm": 0.1818549782037735,
+      "learning_rate": 0.000992775050409299,
+      "loss": 3.6026,
+      "step": 1750
+    },
+    {
+      "epoch": 0.10212074617772492,
+      "grad_norm": 0.16325876116752625,
+      "learning_rate": 0.0009926116292115,
+      "loss": 3.5907,
+      "step": 1760
+    },
+    {
+      "epoch": 0.10270097769009835,
+      "grad_norm": 0.1783674657344818,
+      "learning_rate": 0.0009924463941660777,
+      "loss": 3.5932,
+      "step": 1770
+    },
+    {
+      "epoch": 0.10328120920247179,
+      "grad_norm": 0.23217235505580902,
+      "learning_rate": 0.0009922793458814448,
+      "loss": 3.5905,
+      "step": 1780
+    },
+    {
+      "epoch": 0.10386144071484522,
+      "grad_norm": 0.18973788619041443,
+      "learning_rate": 0.0009921104849726903,
+      "loss": 3.6022,
+      "step": 1790
+    },
+    {
+      "epoch": 0.10444167222721866,
+      "grad_norm": 0.17927169799804688,
+      "learning_rate": 0.0009919398120615784,
+      "loss": 3.5857,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1050219037395921,
+      "grad_norm": 0.1780613660812378,
+      "learning_rate": 0.000991767327776544,
+      "loss": 3.5834,
+      "step": 1810
+    },
+    {
+      "epoch": 0.10560213525196553,
+      "grad_norm": 0.16915231943130493,
+      "learning_rate": 0.0009915930327526925,
+      "loss": 3.5848,
+      "step": 1820
+    },
+    {
+      "epoch": 0.10618236676433897,
+      "grad_norm": 0.18921369314193726,
+      "learning_rate": 0.0009914169276317966,
+      "loss": 3.5855,
+      "step": 1830
+    },
+    {
+      "epoch": 0.1067625982767124,
+      "grad_norm": 0.17203205823898315,
+      "learning_rate": 0.0009912390130622935,
+      "loss": 3.577,
+      "step": 1840
+    },
+    {
+      "epoch": 0.10734282978908584,
+      "grad_norm": 0.23704229295253754,
+      "learning_rate": 0.0009910592896992835,
+      "loss": 3.5721,
+      "step": 1850
+    },
+    {
+      "epoch": 0.10792306130145929,
+      "grad_norm": 0.15052714943885803,
+      "learning_rate": 0.000990877758204527,
+      "loss": 3.5778,
+      "step": 1860
+    },
+    {
+      "epoch": 0.10850329281383272,
+      "grad_norm": 0.1933393031358719,
+      "learning_rate": 0.0009906944192464417,
+      "loss": 3.5789,
+      "step": 1870
+    },
+    {
+      "epoch": 0.10908352432620616,
+      "grad_norm": 0.17307838797569275,
+      "learning_rate": 0.000990509273500101,
+      "loss": 3.5701,
+      "step": 1880
+    },
+    {
+      "epoch": 0.1096637558385796,
+      "grad_norm": 0.20114563405513763,
+      "learning_rate": 0.0009903223216472306,
+      "loss": 3.5716,
+      "step": 1890
+    },
+    {
+      "epoch": 0.11024398735095303,
+      "grad_norm": 0.20986546576023102,
+      "learning_rate": 0.0009901335643762075,
+      "loss": 3.5644,
+      "step": 1900
+    },
+    {
+      "epoch": 0.11082421886332647,
+      "grad_norm": 0.1716219186782837,
+      "learning_rate": 0.0009899430023820551,
+      "loss": 3.5617,
+      "step": 1910
+    },
+    {
+      "epoch": 0.1114044503756999,
+      "grad_norm": 0.1951904147863388,
+      "learning_rate": 0.0009897506363664428,
+      "loss": 3.5535,
+      "step": 1920
+    },
+    {
+      "epoch": 0.11198468188807334,
+      "grad_norm": 0.1623149961233139,
+      "learning_rate": 0.0009895564670376823,
+      "loss": 3.5586,
+      "step": 1930
+    },
+    {
+      "epoch": 0.11256491340044678,
+      "grad_norm": 0.1657789647579193,
+      "learning_rate": 0.000989360495110726,
+      "loss": 3.5623,
+      "step": 1940
+    },
+    {
+      "epoch": 0.11314514491282021,
+      "grad_norm": 0.2033306509256363,
+      "learning_rate": 0.0009891627213071625,
+      "loss": 3.5404,
+      "step": 1950
+    },
+    {
+      "epoch": 0.11372537642519365,
+      "grad_norm": 0.17595010995864868,
+      "learning_rate": 0.0009889631463552157,
+      "loss": 3.5493,
+      "step": 1960
+    },
+    {
+      "epoch": 0.11430560793756708,
+      "grad_norm": 0.1695556640625,
+      "learning_rate": 0.0009887617709897416,
+      "loss": 3.5537,
+      "step": 1970
+    },
+    {
+      "epoch": 0.11488583944994053,
+      "grad_norm": 0.14913249015808105,
+      "learning_rate": 0.0009885585959522256,
+      "loss": 3.5531,
+      "step": 1980
+    },
+    {
+      "epoch": 0.11546607096231397,
+      "grad_norm": 0.1780378520488739,
+      "learning_rate": 0.000988353621990779,
+      "loss": 3.5458,
+      "step": 1990
+    },
+    {
+      "epoch": 0.1160463024746874,
+      "grad_norm": 0.1777425855398178,
+      "learning_rate": 0.0009881468498601379,
+      "loss": 3.5512,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1160463024746874,
+      "eval_loss": 3.486086845397949,
+      "eval_runtime": 3.248,
+      "eval_samples_per_second": 1333.109,
+      "eval_steps_per_second": 10.468,
+      "step": 2000
+    },
+    {
+      "epoch": 0.11662653398706084,
+      "grad_norm": 0.1840885430574417,
+      "learning_rate": 0.0009879382803216585,
+      "loss": 3.5499,
+      "step": 2010
+    },
+    {
+      "epoch": 0.11720676549943428,
+      "grad_norm": 0.15481173992156982,
+      "learning_rate": 0.000987727914143316,
+      "loss": 3.5435,
+      "step": 2020
+    },
+    {
+      "epoch": 0.11778699701180771,
+      "grad_norm": 0.17583511769771576,
+      "learning_rate": 0.0009875157520997005,
+      "loss": 3.5421,
+      "step": 2030
+    },
+    {
+      "epoch": 0.11836722852418115,
+      "grad_norm": 0.17113088071346283,
+      "learning_rate": 0.000987301794972015,
+      "loss": 3.5256,
+      "step": 2040
+    },
+    {
+      "epoch": 0.11894746003655458,
+      "grad_norm": 0.19127151370048523,
+      "learning_rate": 0.000987086043548072,
+      "loss": 3.5307,
+      "step": 2050
+    },
+    {
+      "epoch": 0.11952769154892802,
+      "grad_norm": 0.1801798790693283,
+      "learning_rate": 0.000986868498622291,
+      "loss": 3.5428,
+      "step": 2060
+    },
+    {
+      "epoch": 0.12010792306130146,
+      "grad_norm": 0.17117474973201752,
+      "learning_rate": 0.0009866491609956949,
+      "loss": 3.5429,
+      "step": 2070
+    },
+    {
+      "epoch": 0.12068815457367489,
+      "grad_norm": 0.18343955278396606,
+      "learning_rate": 0.000986428031475908,
+      "loss": 3.5305,
+      "step": 2080
+    },
+    {
+      "epoch": 0.12126838608604833,
+      "grad_norm": 0.18340405821800232,
+      "learning_rate": 0.0009862051108771523,
+      "loss": 3.5239,
+      "step": 2090
+    },
+    {
+      "epoch": 0.12184861759842178,
+      "grad_norm": 0.17750664055347443,
+      "learning_rate": 0.000985980400020245,
+      "loss": 3.5233,
+      "step": 2100
+    },
+    {
+      "epoch": 0.12242884911079521,
+      "grad_norm": 0.18838383257389069,
+      "learning_rate": 0.000985753899732595,
+      "loss": 3.5299,
+      "step": 2110
+    },
+    {
+      "epoch": 0.12300908062316865,
+      "grad_norm": 0.18249675631523132,
+      "learning_rate": 0.0009855256108481996,
+      "loss": 3.5363,
+      "step": 2120
+    },
+    {
+      "epoch": 0.12358931213554208,
+      "grad_norm": 0.17180895805358887,
+      "learning_rate": 0.0009852955342076431,
+      "loss": 3.5211,
+      "step": 2130
+    },
+    {
+      "epoch": 0.12416954364791552,
+      "grad_norm": 0.2116585075855255,
+      "learning_rate": 0.0009850636706580911,
+      "loss": 3.5278,
+      "step": 2140
+    },
+    {
+      "epoch": 0.12474977516028896,
+      "grad_norm": 0.1602133959531784,
+      "learning_rate": 0.0009848300210532899,
+      "loss": 3.5184,
+      "step": 2150
+    },
+    {
+      "epoch": 0.1253300066726624,
+      "grad_norm": 0.18009105324745178,
+      "learning_rate": 0.0009845945862535618,
+      "loss": 3.5151,
+      "step": 2160
+    },
+    {
+      "epoch": 0.12591023818503583,
+      "grad_norm": 0.19407133758068085,
+      "learning_rate": 0.0009843573671258024,
+      "loss": 3.5237,
+      "step": 2170
+    },
+    {
+      "epoch": 0.12649046969740926,
+      "grad_norm": 0.1537596881389618,
+      "learning_rate": 0.000984118364543477,
+      "loss": 3.5151,
+      "step": 2180
+    },
+    {
+      "epoch": 0.1270707012097827,
+      "grad_norm": 0.20341388881206512,
+      "learning_rate": 0.0009838775793866187,
+      "loss": 3.5146,
+      "step": 2190
+    },
+    {
+      "epoch": 0.12765093272215614,
+      "grad_norm": 0.19128254055976868,
+      "learning_rate": 0.0009836350125418233,
+      "loss": 3.5243,
+      "step": 2200
+    },
+    {
+      "epoch": 0.12823116423452957,
+      "grad_norm": 0.18839573860168457,
+      "learning_rate": 0.0009833906649022476,
+      "loss": 3.5006,
+      "step": 2210
+    },
+    {
+      "epoch": 0.128811395746903,
+      "grad_norm": 0.1921667903661728,
+      "learning_rate": 0.0009831445373676049,
+      "loss": 3.5139,
+      "step": 2220
+    },
+    {
+      "epoch": 0.12939162725927644,
+      "grad_norm": 0.19188368320465088,
+      "learning_rate": 0.000982896630844163,
+      "loss": 3.5169,
+      "step": 2230
+    },
+    {
+      "epoch": 0.12997185877164988,
+      "grad_norm": 0.17427149415016174,
+      "learning_rate": 0.000982646946244739,
+      "loss": 3.5046,
+      "step": 2240
+    },
+    {
+      "epoch": 0.13055209028402331,
+      "grad_norm": 0.16430804133415222,
+      "learning_rate": 0.0009823954844886983,
+      "loss": 3.5112,
+      "step": 2250
+    },
+    {
+      "epoch": 0.13113232179639675,
+      "grad_norm": 0.15311101078987122,
+      "learning_rate": 0.0009821422465019496,
+      "loss": 3.5017,
+      "step": 2260
+    },
+    {
+      "epoch": 0.13171255330877019,
+      "grad_norm": 0.17969970405101776,
+      "learning_rate": 0.000981887233216941,
+      "loss": 3.4945,
+      "step": 2270
+    },
+    {
+      "epoch": 0.13229278482114365,
+      "grad_norm": 0.16781874001026154,
+      "learning_rate": 0.000981630445572659,
+      "loss": 3.5051,
+      "step": 2280
+    },
+    {
+      "epoch": 0.13287301633351709,
+      "grad_norm": 0.1790471076965332,
+      "learning_rate": 0.0009813718845146215,
+      "loss": 3.4946,
+      "step": 2290
+    },
+    {
+      "epoch": 0.13345324784589052,
+      "grad_norm": 0.14774377644062042,
+      "learning_rate": 0.0009811115509948784,
+      "loss": 3.495,
+      "step": 2300
+    },
+    {
+      "epoch": 0.13403347935826396,
+      "grad_norm": 0.18693110346794128,
+      "learning_rate": 0.0009808494459720046,
+      "loss": 3.5018,
+      "step": 2310
+    },
+    {
+      "epoch": 0.1346137108706374,
+      "grad_norm": 0.163302943110466,
+      "learning_rate": 0.000980585570411098,
+      "loss": 3.4864,
+      "step": 2320
+    },
+    {
+      "epoch": 0.13519394238301083,
+      "grad_norm": 0.17489096522331238,
+      "learning_rate": 0.0009803199252837766,
+      "loss": 3.4932,
+      "step": 2330
+    },
+    {
+      "epoch": 0.13577417389538426,
+      "grad_norm": 0.1895146518945694,
+      "learning_rate": 0.0009800525115681734,
+      "loss": 3.4937,
+      "step": 2340
+    },
+    {
+      "epoch": 0.1363544054077577,
+      "grad_norm": 0.19778315722942352,
+      "learning_rate": 0.0009797833302489334,
+      "loss": 3.4819,
+      "step": 2350
+    },
+    {
+      "epoch": 0.13693463692013114,
+      "grad_norm": 0.17575684189796448,
+      "learning_rate": 0.0009795123823172107,
+      "loss": 3.4853,
+      "step": 2360
+    },
+    {
+      "epoch": 0.13751486843250457,
+      "grad_norm": 0.1611810177564621,
+      "learning_rate": 0.000979239668770664,
+      "loss": 3.4912,
+      "step": 2370
+    },
+    {
+      "epoch": 0.138095099944878,
+      "grad_norm": 0.19706352055072784,
+      "learning_rate": 0.0009789651906134532,
+      "loss": 3.4814,
+      "step": 2380
+    },
+    {
+      "epoch": 0.13867533145725144,
+      "grad_norm": 0.15343667566776276,
+      "learning_rate": 0.0009786889488562352,
+      "loss": 3.4757,
+      "step": 2390
+    },
+    {
+      "epoch": 0.13925556296962488,
+      "grad_norm": 0.1835697740316391,
+      "learning_rate": 0.0009784109445161616,
+      "loss": 3.48,
+      "step": 2400
+    },
+    {
+      "epoch": 0.13983579448199832,
+      "grad_norm": 0.19989457726478577,
+      "learning_rate": 0.0009781311786168732,
+      "loss": 3.471,
+      "step": 2410
+    },
+    {
+      "epoch": 0.14041602599437175,
+      "grad_norm": 0.1824110597372055,
+      "learning_rate": 0.0009778496521884973,
+      "loss": 3.4795,
+      "step": 2420
+    },
+    {
+      "epoch": 0.1409962575067452,
+      "grad_norm": 0.17803703248500824,
+      "learning_rate": 0.0009775663662676438,
+      "loss": 3.4895,
+      "step": 2430
+    },
+    {
+      "epoch": 0.14157648901911862,
+      "grad_norm": 0.17566360533237457,
+      "learning_rate": 0.0009772813218974013,
+      "loss": 3.4771,
+      "step": 2440
+    },
+    {
+      "epoch": 0.14215672053149206,
+      "grad_norm": 0.17459633946418762,
+      "learning_rate": 0.0009769945201273328,
+      "loss": 3.4748,
+      "step": 2450
+    },
+    {
+      "epoch": 0.1427369520438655,
+      "grad_norm": 0.17547911405563354,
+      "learning_rate": 0.0009767059620134728,
+      "loss": 3.4851,
+      "step": 2460
+    },
+    {
+      "epoch": 0.14331718355623893,
+      "grad_norm": 0.1611047089099884,
+      "learning_rate": 0.0009764156486183223,
+      "loss": 3.4859,
+      "step": 2470
+    },
+    {
+      "epoch": 0.14389741506861237,
+      "grad_norm": 0.22488833963871002,
+      "learning_rate": 0.0009761235810108453,
+      "loss": 3.4704,
+      "step": 2480
+    },
+    {
+      "epoch": 0.1444776465809858,
+      "grad_norm": 0.1857168972492218,
+      "learning_rate": 0.0009758297602664658,
+      "loss": 3.4636,
+      "step": 2490
+    },
+    {
+      "epoch": 0.14505787809335924,
+      "grad_norm": 0.18012335896492004,
+      "learning_rate": 0.0009755341874670624,
+      "loss": 3.4675,
+      "step": 2500
+    },
+    {
+      "epoch": 0.14563810960573267,
+      "grad_norm": 0.14618253707885742,
+      "learning_rate": 0.000975236863700965,
+      "loss": 3.472,
+      "step": 2510
+    },
+    {
+      "epoch": 0.14621834111810614,
+      "grad_norm": 0.20513653755187988,
+      "learning_rate": 0.000974937790062951,
+      "loss": 3.4772,
+      "step": 2520
+    },
+    {
+      "epoch": 0.14679857263047957,
+      "grad_norm": 0.17752069234848022,
+      "learning_rate": 0.0009746369676542408,
+      "loss": 3.4674,
+      "step": 2530
+    },
+    {
+      "epoch": 0.147378804142853,
+      "grad_norm": 0.1703299880027771,
+      "learning_rate": 0.000974334397582494,
+      "loss": 3.4631,
+      "step": 2540
+    },
+    {
+      "epoch": 0.14795903565522645,
+      "grad_norm": 0.18766288459300995,
+      "learning_rate": 0.0009740300809618055,
+      "loss": 3.4696,
+      "step": 2550
+    },
+    {
+      "epoch": 0.14853926716759988,
+      "grad_norm": 0.19645459949970245,
+      "learning_rate": 0.0009737240189127005,
+      "loss": 3.4686,
+      "step": 2560
+    },
+    {
+      "epoch": 0.14911949867997332,
+      "grad_norm": 0.19703318178653717,
+      "learning_rate": 0.0009734162125621322,
+      "loss": 3.4645,
+      "step": 2570
+    },
+    {
+      "epoch": 0.14969973019234675,
+      "grad_norm": 0.1729518324136734,
+      "learning_rate": 0.0009731066630434753,
+      "loss": 3.4623,
+      "step": 2580
+    },
+    {
+      "epoch": 0.1502799617047202,
+      "grad_norm": 0.1629866361618042,
+      "learning_rate": 0.0009727953714965238,
+      "loss": 3.4587,
+      "step": 2590
+    },
+    {
+      "epoch": 0.15086019321709362,
+      "grad_norm": 0.1935587227344513,
+      "learning_rate": 0.0009724823390674857,
+      "loss": 3.452,
+      "step": 2600
+    },
+    {
+      "epoch": 0.15144042472946706,
+      "grad_norm": 0.1905779391527176,
+      "learning_rate": 0.0009721675669089791,
+      "loss": 3.4492,
+      "step": 2610
+    },
+    {
+      "epoch": 0.1520206562418405,
+      "grad_norm": 0.1649785190820694,
+      "learning_rate": 0.0009718510561800282,
+      "loss": 3.4553,
+      "step": 2620
+    },
+    {
+      "epoch": 0.15260088775421393,
+      "grad_norm": 0.18956676125526428,
+      "learning_rate": 0.0009715328080460587,
+      "loss": 3.4565,
+      "step": 2630
+    },
+    {
+      "epoch": 0.15318111926658737,
+      "grad_norm": 0.1951354593038559,
+      "learning_rate": 0.0009712128236788935,
+      "loss": 3.4588,
+      "step": 2640
+    },
+    {
+      "epoch": 0.1537613507789608,
+      "grad_norm": 0.19937202334403992,
+      "learning_rate": 0.0009708911042567485,
+      "loss": 3.4464,
+      "step": 2650
+    },
+    {
+      "epoch": 0.15434158229133424,
+      "grad_norm": 0.1778160184621811,
+      "learning_rate": 0.0009705676509642285,
+      "loss": 3.4619,
+      "step": 2660
+    },
+    {
+      "epoch": 0.15492181380370768,
+      "grad_norm": 0.19197410345077515,
+      "learning_rate": 0.0009702424649923221,
+      "loss": 3.4545,
+      "step": 2670
+    },
+    {
+      "epoch": 0.1555020453160811,
+      "grad_norm": 0.20156262814998627,
+      "learning_rate": 0.0009699155475383984,
+      "loss": 3.4407,
+      "step": 2680
+    },
+    {
+      "epoch": 0.15608227682845455,
+      "grad_norm": 0.1888488084077835,
+      "learning_rate": 0.0009695868998062016,
+      "loss": 3.4522,
+      "step": 2690
+    },
+    {
+      "epoch": 0.15666250834082798,
+      "grad_norm": 0.15710744261741638,
+      "learning_rate": 0.0009692565230058471,
+      "loss": 3.4385,
+      "step": 2700
+    },
+    {
+      "epoch": 0.15724273985320142,
+      "grad_norm": 0.17102789878845215,
+      "learning_rate": 0.0009689244183538169,
+      "loss": 3.4495,
+      "step": 2710
+    },
+    {
+      "epoch": 0.15782297136557485,
+      "grad_norm": 0.19345730543136597,
+      "learning_rate": 0.000968590587072955,
+      "loss": 3.4449,
+      "step": 2720
+    },
+    {
+      "epoch": 0.1584032028779483,
+      "grad_norm": 0.17778056859970093,
+      "learning_rate": 0.0009682550303924633,
+      "loss": 3.4424,
+      "step": 2730
+    },
+    {
+      "epoch": 0.15898343439032173,
+      "grad_norm": 0.17961904406547546,
+      "learning_rate": 0.0009679177495478966,
+      "loss": 3.4457,
+      "step": 2740
+    },
+    {
+      "epoch": 0.15956366590269516,
+      "grad_norm": 0.16124430298805237,
+      "learning_rate": 0.0009675787457811583,
+      "loss": 3.4388,
+      "step": 2750
+    },
+    {
+      "epoch": 0.16014389741506863,
+      "grad_norm": 0.24593627452850342,
+      "learning_rate": 0.0009672380203404957,
+      "loss": 3.4491,
+      "step": 2760
+    },
+    {
+      "epoch": 0.16072412892744206,
+      "grad_norm": 0.22059789299964905,
+      "learning_rate": 0.0009668955744804957,
+      "loss": 3.4452,
+      "step": 2770
+    },
+    {
+      "epoch": 0.1613043604398155,
+      "grad_norm": 0.17355699837207794,
+      "learning_rate": 0.0009665514094620798,
+      "loss": 3.4334,
+      "step": 2780
+    },
+    {
+      "epoch": 0.16188459195218893,
+      "grad_norm": 0.18848399817943573,
+      "learning_rate": 0.0009662055265524996,
+      "loss": 3.4445,
+      "step": 2790
+    },
+    {
+      "epoch": 0.16246482346456237,
+      "grad_norm": 0.16156227886676788,
+      "learning_rate": 0.0009658579270253321,
+      "loss": 3.432,
+      "step": 2800
+    },
+    {
+      "epoch": 0.1630450549769358,
+      "grad_norm": 0.17677579820156097,
+      "learning_rate": 0.0009655086121604754,
+      "loss": 3.4387,
+      "step": 2810
+    },
+    {
+      "epoch": 0.16362528648930924,
+      "grad_norm": 0.1860220730304718,
+      "learning_rate": 0.0009651575832441435,
+      "loss": 3.4352,
+      "step": 2820
+    },
+    {
+      "epoch": 0.16420551800168268,
+      "grad_norm": 0.1875036209821701,
+      "learning_rate": 0.0009648048415688612,
+      "loss": 3.4361,
+      "step": 2830
+    },
+    {
+      "epoch": 0.1647857495140561,
+      "grad_norm": 0.18773604929447174,
+      "learning_rate": 0.0009644503884334608,
+      "loss": 3.4293,
+      "step": 2840
+    },
+    {
+      "epoch": 0.16536598102642955,
+      "grad_norm": 0.17164553701877594,
+      "learning_rate": 0.0009640942251430755,
+      "loss": 3.4329,
+      "step": 2850
+    },
+    {
+      "epoch": 0.16594621253880298,
+      "grad_norm": 0.22501535713672638,
+      "learning_rate": 0.0009637363530091361,
+      "loss": 3.4354,
+      "step": 2860
+    },
+    {
+      "epoch": 0.16652644405117642,
+      "grad_norm": 0.16013330221176147,
+      "learning_rate": 0.0009633767733493651,
+      "loss": 3.4266,
+      "step": 2870
+    },
+    {
+      "epoch": 0.16710667556354986,
+      "grad_norm": 0.17605000734329224,
+      "learning_rate": 0.0009630154874877726,
+      "loss": 3.4202,
+      "step": 2880
+    },
+    {
+      "epoch": 0.1676869070759233,
+      "grad_norm": 0.19912280142307281,
+      "learning_rate": 0.0009626524967546508,
+      "loss": 3.4251,
+      "step": 2890
+    },
+    {
+      "epoch": 0.16826713858829673,
+      "grad_norm": 0.19407010078430176,
+      "learning_rate": 0.00096228780248657,
+      "loss": 3.4242,
+      "step": 2900
+    },
+    {
+      "epoch": 0.16884737010067016,
+      "grad_norm": 0.19874045252799988,
+      "learning_rate": 0.0009619214060263723,
+      "loss": 3.4326,
+      "step": 2910
+    },
+    {
+      "epoch": 0.1694276016130436,
+      "grad_norm": 0.1755009889602661,
+      "learning_rate": 0.000961553308723168,
+      "loss": 3.4201,
+      "step": 2920
+    },
+    {
+      "epoch": 0.17000783312541703,
+      "grad_norm": 0.16836123168468475,
+      "learning_rate": 0.00096118351193233,
+      "loss": 3.4244,
+      "step": 2930
+    },
+    {
+      "epoch": 0.17058806463779047,
+      "grad_norm": 0.17263484001159668,
+      "learning_rate": 0.0009608120170154886,
+      "loss": 3.4245,
+      "step": 2940
+    },
+    {
+      "epoch": 0.1711682961501639,
+      "grad_norm": 0.16616012156009674,
+      "learning_rate": 0.0009604388253405272,
+      "loss": 3.4149,
+      "step": 2950
+    },
+    {
+      "epoch": 0.17174852766253734,
+      "grad_norm": 0.18633881211280823,
+      "learning_rate": 0.0009600639382815768,
+      "loss": 3.4247,
+      "step": 2960
+    },
+    {
+      "epoch": 0.17232875917491078,
+      "grad_norm": 0.1888113021850586,
+      "learning_rate": 0.0009596873572190104,
+      "loss": 3.4185,
+      "step": 2970
+    },
+    {
+      "epoch": 0.17290899068728421,
+      "grad_norm": 0.1729818731546402,
+      "learning_rate": 0.0009593090835394392,
+      "loss": 3.4188,
+      "step": 2980
+    },
+    {
+      "epoch": 0.17348922219965765,
+      "grad_norm": 0.19416862726211548,
+      "learning_rate": 0.0009589291186357066,
+      "loss": 3.417,
+      "step": 2990
+    },
+    {
+      "epoch": 0.1740694537120311,
+      "grad_norm": 0.16473758220672607,
+      "learning_rate": 0.0009585474639068829,
+      "loss": 3.4279,
+      "step": 3000
+    },
+    {
+      "epoch": 0.1740694537120311,
+      "eval_loss": 3.3559625148773193,
+      "eval_runtime": 3.2502,
+      "eval_samples_per_second": 1332.237,
+      "eval_steps_per_second": 10.461,
+      "step": 3000
+    },
+    {
+      "epoch": 0.17464968522440455,
+      "grad_norm": 0.148405060172081,
+      "learning_rate": 0.0009581641207582609,
+      "loss": 3.4132,
+      "step": 3010
+    },
+    {
+      "epoch": 0.17522991673677799,
+      "grad_norm": 0.17240671813488007,
+      "learning_rate": 0.0009577790906013503,
+      "loss": 3.4145,
+      "step": 3020
+    },
+    {
+      "epoch": 0.17581014824915142,
+      "grad_norm": 0.1781778484582901,
+      "learning_rate": 0.0009573923748538724,
+      "loss": 3.4146,
+      "step": 3030
+    },
+    {
+      "epoch": 0.17639037976152486,
+      "grad_norm": 0.1692868173122406,
+      "learning_rate": 0.0009570039749397552,
+      "loss": 3.4154,
+      "step": 3040
+    },
+    {
+      "epoch": 0.1769706112738983,
+      "grad_norm": 0.18871796131134033,
+      "learning_rate": 0.0009566138922891277,
+      "loss": 3.4233,
+      "step": 3050
+    },
+    {
+      "epoch": 0.17755084278627173,
+      "grad_norm": 0.16771915555000305,
+      "learning_rate": 0.0009562221283383152,
+      "loss": 3.4144,
+      "step": 3060
+    },
+    {
+      "epoch": 0.17813107429864516,
+      "grad_norm": 0.17178234457969666,
+      "learning_rate": 0.0009558286845298337,
+      "loss": 3.4066,
+      "step": 3070
+    },
+    {
+      "epoch": 0.1787113058110186,
+      "grad_norm": 0.17993003129959106,
+      "learning_rate": 0.0009554335623123845,
+      "loss": 3.4125,
+      "step": 3080
+    },
+    {
+      "epoch": 0.17929153732339204,
+      "grad_norm": 0.1944742351770401,
+      "learning_rate": 0.0009550367631408485,
+      "loss": 3.4095,
+      "step": 3090
+    },
+    {
+      "epoch": 0.17987176883576547,
+      "grad_norm": 0.21978144347667694,
+      "learning_rate": 0.0009546382884762825,
+      "loss": 3.4204,
+      "step": 3100
+    },
+    {
+      "epoch": 0.1804520003481389,
+      "grad_norm": 0.19678272306919098,
+      "learning_rate": 0.0009542381397859116,
+      "loss": 3.3991,
+      "step": 3110
+    },
+    {
+      "epoch": 0.18103223186051234,
+      "grad_norm": 0.16551128029823303,
+      "learning_rate": 0.0009538363185431254,
+      "loss": 3.4055,
+      "step": 3120
+    },
+    {
+      "epoch": 0.18161246337288578,
+      "grad_norm": 0.18304401636123657,
+      "learning_rate": 0.0009534328262274717,
+      "loss": 3.4038,
+      "step": 3130
+    },
+    {
+      "epoch": 0.18219269488525922,
+      "grad_norm": 0.1903512328863144,
+      "learning_rate": 0.0009530276643246512,
+      "loss": 3.4081,
+      "step": 3140
+    },
+    {
+      "epoch": 0.18277292639763265,
+      "grad_norm": 0.19788897037506104,
+      "learning_rate": 0.0009526208343265129,
+      "loss": 3.3991,
+      "step": 3150
+    },
+    {
+      "epoch": 0.1833531579100061,
+      "grad_norm": 0.17483383417129517,
+      "learning_rate": 0.0009522123377310474,
+      "loss": 3.4105,
+      "step": 3160
+    },
+    {
+      "epoch": 0.18393338942237952,
+      "grad_norm": 0.18417778611183167,
+      "learning_rate": 0.0009518021760423816,
+      "loss": 3.3973,
+      "step": 3170
+    },
+    {
+      "epoch": 0.18451362093475296,
+      "grad_norm": 0.17036600410938263,
+      "learning_rate": 0.0009513903507707743,
+      "loss": 3.403,
+      "step": 3180
+    },
+    {
+      "epoch": 0.1850938524471264,
+      "grad_norm": 0.17953407764434814,
+      "learning_rate": 0.0009509768634326089,
+      "loss": 3.401,
+      "step": 3190
+    },
+    {
+      "epoch": 0.18567408395949983,
+      "grad_norm": 0.18770861625671387,
+      "learning_rate": 0.0009505617155503894,
+      "loss": 3.4006,
+      "step": 3200
+    },
+    {
+      "epoch": 0.18625431547187327,
+      "grad_norm": 0.20032437145709991,
+      "learning_rate": 0.0009501449086527336,
+      "loss": 3.4012,
+      "step": 3210
+    },
+    {
+      "epoch": 0.1868345469842467,
+      "grad_norm": 0.19611109793186188,
+      "learning_rate": 0.0009497264442743681,
+      "loss": 3.3974,
+      "step": 3220
+    },
+    {
+      "epoch": 0.18741477849662014,
+      "grad_norm": 0.17835482954978943,
+      "learning_rate": 0.0009493063239561227,
+      "loss": 3.3966,
+      "step": 3230
+    },
+    {
+      "epoch": 0.1879950100089936,
+      "grad_norm": 0.17207197844982147,
+      "learning_rate": 0.0009488845492449245,
+      "loss": 3.3957,
+      "step": 3240
+    },
+    {
+      "epoch": 0.18857524152136704,
+      "grad_norm": 0.15979701280593872,
+      "learning_rate": 0.0009484611216937919,
+      "loss": 3.3969,
+      "step": 3250
+    },
+    {
+      "epoch": 0.18915547303374047,
+      "grad_norm": 0.19770529866218567,
+      "learning_rate": 0.0009480360428618298,
+      "loss": 3.3972,
+      "step": 3260
+    },
+    {
+      "epoch": 0.1897357045461139,
+      "grad_norm": 0.161921888589859,
+      "learning_rate": 0.0009476093143142231,
+      "loss": 3.3782,
+      "step": 3270
+    },
+    {
+      "epoch": 0.19031593605848734,
+      "grad_norm": 0.17377763986587524,
+      "learning_rate": 0.0009471809376222304,
+      "loss": 3.3959,
+      "step": 3280
+    },
+    {
+      "epoch": 0.19089616757086078,
+      "grad_norm": 0.17865316569805145,
+      "learning_rate": 0.00094675091436318,
+      "loss": 3.3945,
+      "step": 3290
+    },
+    {
+      "epoch": 0.19147639908323422,
+      "grad_norm": 0.17098024487495422,
+      "learning_rate": 0.0009463192461204626,
+      "loss": 3.3915,
+      "step": 3300
+    },
+    {
+      "epoch": 0.19205663059560765,
+      "grad_norm": 0.18630030751228333,
+      "learning_rate": 0.0009458859344835259,
+      "loss": 3.3891,
+      "step": 3310
+    },
+    {
+      "epoch": 0.1926368621079811,
+      "grad_norm": 0.17725063860416412,
+      "learning_rate": 0.0009454509810478685,
+      "loss": 3.3856,
+      "step": 3320
+    },
+    {
+      "epoch": 0.19321709362035452,
+      "grad_norm": 0.15566089749336243,
+      "learning_rate": 0.0009450143874150347,
+      "loss": 3.3964,
+      "step": 3330
+    },
+    {
+      "epoch": 0.19379732513272796,
+      "grad_norm": 0.16617019474506378,
+      "learning_rate": 0.0009445761551926079,
+      "loss": 3.3854,
+      "step": 3340
+    },
+    {
+      "epoch": 0.1943775566451014,
+      "grad_norm": 0.1689499467611313,
+      "learning_rate": 0.0009441362859942054,
+      "loss": 3.3933,
+      "step": 3350
+    },
+    {
+      "epoch": 0.19495778815747483,
+      "grad_norm": 0.16878637671470642,
+      "learning_rate": 0.0009436947814394712,
+      "loss": 3.3819,
+      "step": 3360
+    },
+    {
+      "epoch": 0.19553801966984827,
+      "grad_norm": 0.20233598351478577,
+      "learning_rate": 0.0009432516431540714,
+      "loss": 3.3932,
+      "step": 3370
+    },
+    {
+      "epoch": 0.1961182511822217,
+      "grad_norm": 0.14608658850193024,
+      "learning_rate": 0.0009428068727696878,
+      "loss": 3.3878,
+      "step": 3380
+    },
+    {
+      "epoch": 0.19669848269459514,
+      "grad_norm": 0.16677936911582947,
+      "learning_rate": 0.0009423604719240114,
+      "loss": 3.3898,
+      "step": 3390
+    },
+    {
+      "epoch": 0.19727871420696858,
+      "grad_norm": 0.17749983072280884,
+      "learning_rate": 0.0009419124422607369,
+      "loss": 3.3835,
+      "step": 3400
+    },
+    {
+      "epoch": 0.197858945719342,
+      "grad_norm": 0.17364837229251862,
+      "learning_rate": 0.0009414627854295566,
+      "loss": 3.3873,
+      "step": 3410
+    },
+    {
+      "epoch": 0.19843917723171545,
+      "grad_norm": 0.1968606561422348,
+      "learning_rate": 0.0009410115030861536,
+      "loss": 3.3834,
+      "step": 3420
+    },
+    {
+      "epoch": 0.19901940874408888,
+      "grad_norm": 0.17494814097881317,
+      "learning_rate": 0.0009405585968921974,
+      "loss": 3.3768,
+      "step": 3430
+    },
+    {
+      "epoch": 0.19959964025646232,
+      "grad_norm": 0.17461541295051575,
+      "learning_rate": 0.0009401040685153357,
+      "loss": 3.3673,
+      "step": 3440
+    },
+    {
+      "epoch": 0.20017987176883575,
+      "grad_norm": 0.16065211594104767,
+      "learning_rate": 0.0009396479196291896,
+      "loss": 3.3831,
+      "step": 3450
+    },
+    {
+      "epoch": 0.2007601032812092,
+      "grad_norm": 0.18772707879543304,
+      "learning_rate": 0.000939190151913347,
+      "loss": 3.381,
+      "step": 3460
+    },
+    {
+      "epoch": 0.20134033479358263,
+      "grad_norm": 0.17115509510040283,
+      "learning_rate": 0.000938730767053357,
+      "loss": 3.3786,
+      "step": 3470
+    },
+    {
+      "epoch": 0.2019205663059561,
+      "grad_norm": 0.16643331944942474,
+      "learning_rate": 0.0009382697667407222,
+      "loss": 3.381,
+      "step": 3480
+    },
+    {
+      "epoch": 0.20250079781832953,
+      "grad_norm": 0.16961540281772614,
+      "learning_rate": 0.0009378071526728944,
+      "loss": 3.3798,
+      "step": 3490
+    },
+    {
+      "epoch": 0.20308102933070296,
+      "grad_norm": 0.18497344851493835,
+      "learning_rate": 0.000937342926553267,
+      "loss": 3.3796,
+      "step": 3500
+    },
+    {
+      "epoch": 0.2036612608430764,
+      "grad_norm": 0.17968598008155823,
+      "learning_rate": 0.0009368770900911691,
+      "loss": 3.3699,
+      "step": 3510
+    },
+    {
+      "epoch": 0.20424149235544983,
+      "grad_norm": 0.19794964790344238,
+      "learning_rate": 0.0009364096450018598,
+      "loss": 3.3711,
+      "step": 3520
+    },
+    {
+      "epoch": 0.20482172386782327,
+      "grad_norm": 0.16060756146907806,
+      "learning_rate": 0.0009359405930065202,
+      "loss": 3.3831,
+      "step": 3530
+    },
+    {
+      "epoch": 0.2054019553801967,
+      "grad_norm": 0.16221892833709717,
+      "learning_rate": 0.0009354699358322493,
+      "loss": 3.3673,
+      "step": 3540
+    },
+    {
+      "epoch": 0.20598218689257014,
+      "grad_norm": 0.1834096759557724,
+      "learning_rate": 0.0009349976752120561,
+      "loss": 3.3696,
+      "step": 3550
+    },
+    {
+      "epoch": 0.20656241840494358,
+      "grad_norm": 0.1541026383638382,
+      "learning_rate": 0.0009345238128848535,
+      "loss": 3.3659,
+      "step": 3560
+    },
+    {
+      "epoch": 0.207142649917317,
+      "grad_norm": 0.16199982166290283,
+      "learning_rate": 0.0009340483505954524,
+      "loss": 3.3728,
+      "step": 3570
+    },
+    {
+      "epoch": 0.20772288142969045,
+      "grad_norm": 0.16066418588161469,
+      "learning_rate": 0.0009335712900945547,
+      "loss": 3.3695,
+      "step": 3580
+    },
+    {
+      "epoch": 0.20830311294206388,
+      "grad_norm": 0.19340233504772186,
+      "learning_rate": 0.0009330926331387472,
+      "loss": 3.3751,
+      "step": 3590
+    },
+    {
+      "epoch": 0.20888334445443732,
+      "grad_norm": 0.1752013862133026,
+      "learning_rate": 0.0009326123814904949,
+      "loss": 3.3665,
+      "step": 3600
+    },
+    {
+      "epoch": 0.20946357596681076,
+      "grad_norm": 0.18730874359607697,
+      "learning_rate": 0.0009321305369181345,
+      "loss": 3.3656,
+      "step": 3610
+    },
+    {
+      "epoch": 0.2100438074791842,
+      "grad_norm": 0.18153837323188782,
+      "learning_rate": 0.0009316471011958685,
+      "loss": 3.3761,
+      "step": 3620
+    },
+    {
+      "epoch": 0.21062403899155763,
+      "grad_norm": 0.15640245378017426,
+      "learning_rate": 0.0009311620761037578,
+      "loss": 3.366,
+      "step": 3630
+    },
+    {
+      "epoch": 0.21120427050393106,
+      "grad_norm": 0.14558587968349457,
+      "learning_rate": 0.0009306754634277154,
+      "loss": 3.3667,
+      "step": 3640
+    },
+    {
+      "epoch": 0.2117845020163045,
+      "grad_norm": 0.17782875895500183,
+      "learning_rate": 0.0009301872649595005,
+      "loss": 3.3683,
+      "step": 3650
+    },
+    {
+      "epoch": 0.21236473352867793,
+      "grad_norm": 0.19611169397830963,
+      "learning_rate": 0.0009296974824967106,
+      "loss": 3.3705,
+      "step": 3660
+    },
+    {
+      "epoch": 0.21294496504105137,
+      "grad_norm": 0.17394675314426422,
+      "learning_rate": 0.0009292061178427762,
+      "loss": 3.3649,
+      "step": 3670
+    },
+    {
+      "epoch": 0.2135251965534248,
+      "grad_norm": 0.1756591647863388,
+      "learning_rate": 0.0009287131728069536,
+      "loss": 3.3661,
+      "step": 3680
+    },
+    {
+      "epoch": 0.21410542806579824,
+      "grad_norm": 0.19795431196689606,
+      "learning_rate": 0.0009282186492043178,
+      "loss": 3.3648,
+      "step": 3690
+    },
+    {
+      "epoch": 0.21468565957817168,
+      "grad_norm": 0.19772683084011078,
+      "learning_rate": 0.0009277225488557566,
+      "loss": 3.3584,
+      "step": 3700
+    },
+    {
+      "epoch": 0.21526589109054511,
+      "grad_norm": 0.18163970112800598,
+      "learning_rate": 0.0009272248735879636,
+      "loss": 3.3643,
+      "step": 3710
+    },
+    {
+      "epoch": 0.21584612260291858,
+      "grad_norm": 0.1849670708179474,
+      "learning_rate": 0.0009267256252334311,
+      "loss": 3.3672,
+      "step": 3720
+    },
+    {
+      "epoch": 0.216426354115292,
+      "grad_norm": 0.1637067347764969,
+      "learning_rate": 0.0009262248056304439,
+      "loss": 3.3708,
+      "step": 3730
+    },
+    {
+      "epoch": 0.21700658562766545,
+      "grad_norm": 0.1776672899723053,
+      "learning_rate": 0.0009257224166230722,
+      "loss": 3.3561,
+      "step": 3740
+    },
+    {
+      "epoch": 0.21758681714003889,
+      "grad_norm": 0.15775948762893677,
+      "learning_rate": 0.0009252184600611651,
+      "loss": 3.3573,
+      "step": 3750
+    },
+    {
+      "epoch": 0.21816704865241232,
+      "grad_norm": 0.1565089225769043,
+      "learning_rate": 0.0009247129378003432,
+      "loss": 3.3654,
+      "step": 3760
+    },
+    {
+      "epoch": 0.21874728016478576,
+      "grad_norm": 0.16912080347537994,
+      "learning_rate": 0.0009242058517019926,
+      "loss": 3.3494,
+      "step": 3770
+    },
+    {
+      "epoch": 0.2193275116771592,
+      "grad_norm": 0.1838664710521698,
+      "learning_rate": 0.0009236972036332574,
+      "loss": 3.3694,
+      "step": 3780
+    },
+    {
+      "epoch": 0.21990774318953263,
+      "grad_norm": 0.17254334688186646,
+      "learning_rate": 0.0009231869954670331,
+      "loss": 3.3601,
+      "step": 3790
+    },
+    {
+      "epoch": 0.22048797470190606,
+      "grad_norm": 0.17754711210727692,
+      "learning_rate": 0.0009226752290819595,
+      "loss": 3.3586,
+      "step": 3800
+    },
+    {
+      "epoch": 0.2210682062142795,
+      "grad_norm": 0.19803158938884735,
+      "learning_rate": 0.0009221619063624143,
+      "loss": 3.3603,
+      "step": 3810
+    },
+    {
+      "epoch": 0.22164843772665294,
+      "grad_norm": 0.16345560550689697,
+      "learning_rate": 0.0009216470291985053,
+      "loss": 3.3511,
+      "step": 3820
+    },
+    {
+      "epoch": 0.22222866923902637,
+      "grad_norm": 0.1706439107656479,
+      "learning_rate": 0.0009211305994860641,
+      "loss": 3.3578,
+      "step": 3830
+    },
+    {
+      "epoch": 0.2228089007513998,
+      "grad_norm": 0.18484774231910706,
+      "learning_rate": 0.0009206126191266393,
+      "loss": 3.3567,
+      "step": 3840
+    },
+    {
+      "epoch": 0.22338913226377324,
+      "grad_norm": 0.18500268459320068,
+      "learning_rate": 0.0009200930900274884,
+      "loss": 3.359,
+      "step": 3850
+    },
+    {
+      "epoch": 0.22396936377614668,
+      "grad_norm": 0.18986065685749054,
+      "learning_rate": 0.0009195720141015725,
+      "loss": 3.3497,
+      "step": 3860
+    },
+    {
+      "epoch": 0.22454959528852012,
+      "grad_norm": 0.17551766335964203,
+      "learning_rate": 0.0009190493932675473,
+      "loss": 3.3474,
+      "step": 3870
+    },
+    {
+      "epoch": 0.22512982680089355,
+      "grad_norm": 0.19979430735111237,
+      "learning_rate": 0.0009185252294497577,
+      "loss": 3.3474,
+      "step": 3880
+    },
+    {
+      "epoch": 0.225710058313267,
+      "grad_norm": 0.1754840612411499,
+      "learning_rate": 0.0009179995245782297,
+      "loss": 3.3426,
+      "step": 3890
+    },
+    {
+      "epoch": 0.22629028982564042,
+      "grad_norm": 0.16811451315879822,
+      "learning_rate": 0.0009174722805886638,
+      "loss": 3.3523,
+      "step": 3900
+    },
+    {
+      "epoch": 0.22687052133801386,
+      "grad_norm": 0.208675354719162,
+      "learning_rate": 0.0009169434994224274,
+      "loss": 3.3479,
+      "step": 3910
+    },
+    {
+      "epoch": 0.2274507528503873,
+      "grad_norm": 0.1587597280740738,
+      "learning_rate": 0.0009164131830265483,
+      "loss": 3.3451,
+      "step": 3920
+    },
+    {
+      "epoch": 0.22803098436276073,
+      "grad_norm": 0.1999424546957016,
+      "learning_rate": 0.0009158813333537071,
+      "loss": 3.3447,
+      "step": 3930
+    },
+    {
+      "epoch": 0.22861121587513417,
+      "grad_norm": 0.1611049622297287,
+      "learning_rate": 0.0009153479523622298,
+      "loss": 3.3534,
+      "step": 3940
+    },
+    {
+      "epoch": 0.2291914473875076,
+      "grad_norm": 0.18397875130176544,
+      "learning_rate": 0.0009148130420160813,
+      "loss": 3.346,
+      "step": 3950
+    },
+    {
+      "epoch": 0.22977167889988107,
+      "grad_norm": 0.17322111129760742,
+      "learning_rate": 0.0009142766042848574,
+      "loss": 3.3534,
+      "step": 3960
+    },
+    {
+      "epoch": 0.2303519104122545,
+      "grad_norm": 0.1563626080751419,
+      "learning_rate": 0.000913738641143778,
+      "loss": 3.3498,
+      "step": 3970
+    },
+    {
+      "epoch": 0.23093214192462794,
+      "grad_norm": 0.18060965836048126,
+      "learning_rate": 0.0009131991545736798,
+      "loss": 3.3402,
+      "step": 3980
+    },
+    {
+      "epoch": 0.23151237343700137,
+      "grad_norm": 0.18193970620632172,
+      "learning_rate": 0.0009126581465610089,
+      "loss": 3.3477,
+      "step": 3990
+    },
+    {
+      "epoch": 0.2320926049493748,
+      "grad_norm": 0.17089489102363586,
+      "learning_rate": 0.0009121156190978134,
+      "loss": 3.3471,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2320926049493748,
+      "eval_loss": 3.2811107635498047,
+      "eval_runtime": 3.2679,
+      "eval_samples_per_second": 1325.027,
+      "eval_steps_per_second": 10.404,
+      "step": 4000
+    },
+    {
+      "epoch": 0.23267283646174824,
+      "grad_norm": 0.1482742726802826,
+      "learning_rate": 0.0009115715741817364,
+      "loss": 3.3448,
+      "step": 4010
+    },
+    {
+      "epoch": 0.23325306797412168,
+      "grad_norm": 0.1840105950832367,
+      "learning_rate": 0.000911026013816008,
+      "loss": 3.3528,
+      "step": 4020
+    },
+    {
+      "epoch": 0.23383329948649512,
+      "grad_norm": 0.159522145986557,
+      "learning_rate": 0.0009104789400094387,
+      "loss": 3.3452,
+      "step": 4030
+    },
+    {
+      "epoch": 0.23441353099886855,
+      "grad_norm": 0.16177432239055634,
+      "learning_rate": 0.0009099303547764118,
+      "loss": 3.3407,
+      "step": 4040
+    },
+    {
+      "epoch": 0.234993762511242,
+      "grad_norm": 0.1564028412103653,
+      "learning_rate": 0.0009093802601368755,
+      "loss": 3.3393,
+      "step": 4050
+    },
+    {
+      "epoch": 0.23557399402361542,
+      "grad_norm": 0.18961066007614136,
+      "learning_rate": 0.0009088286581163357,
+      "loss": 3.3461,
+      "step": 4060
+    },
+    {
+      "epoch": 0.23615422553598886,
+      "grad_norm": 0.15274052321910858,
+      "learning_rate": 0.0009082755507458492,
+      "loss": 3.339,
+      "step": 4070
+    },
+    {
+      "epoch": 0.2367344570483623,
+      "grad_norm": 0.18588609993457794,
+      "learning_rate": 0.0009077209400620148,
+      "loss": 3.3366,
+      "step": 4080
+    },
+    {
+      "epoch": 0.23731468856073573,
+      "grad_norm": 0.17476515471935272,
+      "learning_rate": 0.0009071648281069673,
+      "loss": 3.3353,
+      "step": 4090
+    },
+    {
+      "epoch": 0.23789492007310917,
+      "grad_norm": 0.17199723422527313,
+      "learning_rate": 0.0009066072169283695,
+      "loss": 3.3329,
+      "step": 4100
+    },
+    {
+      "epoch": 0.2384751515854826,
+      "grad_norm": 0.19376391172409058,
+      "learning_rate": 0.0009060481085794037,
+      "loss": 3.3347,
+      "step": 4110
+    },
+    {
+      "epoch": 0.23905538309785604,
+      "grad_norm": 0.15385298430919647,
+      "learning_rate": 0.0009054875051187657,
+      "loss": 3.3387,
+      "step": 4120
+    },
+    {
+      "epoch": 0.23963561461022947,
+      "grad_norm": 0.15670862793922424,
+      "learning_rate": 0.000904925408610656,
+      "loss": 3.3386,
+      "step": 4130
+    },
+    {
+      "epoch": 0.2402158461226029,
+      "grad_norm": 0.16597050428390503,
+      "learning_rate": 0.0009043618211247731,
+      "loss": 3.3409,
+      "step": 4140
+    },
+    {
+      "epoch": 0.24079607763497635,
+      "grad_norm": 0.15981680154800415,
+      "learning_rate": 0.0009037967447363049,
+      "loss": 3.338,
+      "step": 4150
+    },
+    {
+      "epoch": 0.24137630914734978,
+      "grad_norm": 0.18593856692314148,
+      "learning_rate": 0.0009032301815259221,
+      "loss": 3.3384,
+      "step": 4160
+    },
+    {
+      "epoch": 0.24195654065972322,
+      "grad_norm": 0.16867688298225403,
+      "learning_rate": 0.0009026621335797696,
+      "loss": 3.3342,
+      "step": 4170
+    },
+    {
+      "epoch": 0.24253677217209665,
+      "grad_norm": 0.16456478834152222,
+      "learning_rate": 0.0009020926029894594,
+      "loss": 3.3346,
+      "step": 4180
+    },
+    {
+      "epoch": 0.2431170036844701,
+      "grad_norm": 0.1605817675590515,
+      "learning_rate": 0.0009015215918520629,
+      "loss": 3.3316,
+      "step": 4190
+    },
+    {
+      "epoch": 0.24369723519684355,
+      "grad_norm": 0.18208837509155273,
+      "learning_rate": 0.0009009491022701028,
+      "loss": 3.3329,
+      "step": 4200
+    },
+    {
+      "epoch": 0.244277466709217,
+      "grad_norm": 0.1611657440662384,
+      "learning_rate": 0.000900375136351546,
+      "loss": 3.3293,
+      "step": 4210
+    },
+    {
+      "epoch": 0.24485769822159043,
+      "grad_norm": 0.17636191844940186,
+      "learning_rate": 0.0008997996962097947,
+      "loss": 3.3449,
+      "step": 4220
+    },
+    {
+      "epoch": 0.24543792973396386,
+      "grad_norm": 0.19152578711509705,
+      "learning_rate": 0.0008992227839636804,
+      "loss": 3.3272,
+      "step": 4230
+    },
+    {
+      "epoch": 0.2460181612463373,
+      "grad_norm": 0.16381146013736725,
+      "learning_rate": 0.0008986444017374538,
+      "loss": 3.3223,
+      "step": 4240
+    },
+    {
+      "epoch": 0.24659839275871073,
+      "grad_norm": 0.16677068173885345,
+      "learning_rate": 0.0008980645516607793,
+      "loss": 3.3294,
+      "step": 4250
+    },
+    {
+      "epoch": 0.24717862427108417,
+      "grad_norm": 0.16276830434799194,
+      "learning_rate": 0.0008974832358687253,
+      "loss": 3.3337,
+      "step": 4260
+    },
+    {
+      "epoch": 0.2477588557834576,
+      "grad_norm": 0.17291460931301117,
+      "learning_rate": 0.0008969004565017577,
+      "loss": 3.3255,
+      "step": 4270
+    },
+    {
+      "epoch": 0.24833908729583104,
+      "grad_norm": 0.17407308518886566,
+      "learning_rate": 0.0008963162157057309,
+      "loss": 3.3329,
+      "step": 4280
+    },
+    {
+      "epoch": 0.24891931880820448,
+      "grad_norm": 0.15487220883369446,
+      "learning_rate": 0.0008957305156318811,
+      "loss": 3.3245,
+      "step": 4290
+    },
+    {
+      "epoch": 0.2494995503205779,
+      "grad_norm": 0.14908728003501892,
+      "learning_rate": 0.000895143358436817,
+      "loss": 3.3281,
+      "step": 4300
+    },
+    {
+      "epoch": 0.2500797818329513,
+      "grad_norm": 0.20198597013950348,
+      "learning_rate": 0.000894554746282513,
+      "loss": 3.325,
+      "step": 4310
+    },
+    {
+      "epoch": 0.2506600133453248,
+      "grad_norm": 0.19431117177009583,
+      "learning_rate": 0.0008939646813363007,
+      "loss": 3.322,
+      "step": 4320
+    },
+    {
+      "epoch": 0.25124024485769825,
+      "grad_norm": 0.1598384529352188,
+      "learning_rate": 0.000893373165770861,
+      "loss": 3.3353,
+      "step": 4330
+    },
+    {
+      "epoch": 0.25182047637007166,
+      "grad_norm": 0.1565127670764923,
+      "learning_rate": 0.0008927802017642164,
+      "loss": 3.3201,
+      "step": 4340
+    },
+    {
+      "epoch": 0.2524007078824451,
+      "grad_norm": 0.1654927283525467,
+      "learning_rate": 0.0008921857914997222,
+      "loss": 3.3326,
+      "step": 4350
+    },
+    {
+      "epoch": 0.2529809393948185,
+      "grad_norm": 0.15852831304073334,
+      "learning_rate": 0.0008915899371660595,
+      "loss": 3.328,
+      "step": 4360
+    },
+    {
+      "epoch": 0.253561170907192,
+      "grad_norm": 0.16426099836826324,
+      "learning_rate": 0.0008909926409572263,
+      "loss": 3.3326,
+      "step": 4370
+    },
+    {
+      "epoch": 0.2541414024195654,
+      "grad_norm": 0.16855676472187042,
+      "learning_rate": 0.0008903939050725297,
+      "loss": 3.3289,
+      "step": 4380
+    },
+    {
+      "epoch": 0.25472163393193886,
+      "grad_norm": 0.1595139056444168,
+      "learning_rate": 0.0008897937317165781,
+      "loss": 3.3324,
+      "step": 4390
+    },
+    {
+      "epoch": 0.25530186544431227,
+      "grad_norm": 0.15312696993350983,
+      "learning_rate": 0.0008891921230992725,
+      "loss": 3.3294,
+      "step": 4400
+    },
+    {
+      "epoch": 0.25588209695668573,
+      "grad_norm": 0.16672903299331665,
+      "learning_rate": 0.000888589081435799,
+      "loss": 3.3217,
+      "step": 4410
+    },
+    {
+      "epoch": 0.25646232846905914,
+      "grad_norm": 0.1601061373949051,
+      "learning_rate": 0.0008879846089466202,
+      "loss": 3.3153,
+      "step": 4420
+    },
+    {
+      "epoch": 0.2570425599814326,
+      "grad_norm": 0.16103419661521912,
+      "learning_rate": 0.0008873787078574671,
+      "loss": 3.3176,
+      "step": 4430
+    },
+    {
+      "epoch": 0.257622791493806,
+      "grad_norm": 0.15978160500526428,
+      "learning_rate": 0.0008867713803993309,
+      "loss": 3.3316,
+      "step": 4440
+    },
+    {
+      "epoch": 0.2582030230061795,
+      "grad_norm": 0.15198193490505219,
+      "learning_rate": 0.0008861626288084549,
+      "loss": 3.3205,
+      "step": 4450
+    },
+    {
+      "epoch": 0.2587832545185529,
+      "grad_norm": 0.17815245687961578,
+      "learning_rate": 0.0008855524553263263,
+      "loss": 3.3159,
+      "step": 4460
+    },
+    {
+      "epoch": 0.25936348603092635,
+      "grad_norm": 0.17297674715518951,
+      "learning_rate": 0.0008849408621996679,
+      "loss": 3.3131,
+      "step": 4470
+    },
+    {
+      "epoch": 0.25994371754329976,
+      "grad_norm": 0.17088572680950165,
+      "learning_rate": 0.0008843278516804294,
+      "loss": 3.3178,
+      "step": 4480
+    },
+    {
+      "epoch": 0.2605239490556732,
+      "grad_norm": 0.14876043796539307,
+      "learning_rate": 0.00088371342602578,
+      "loss": 3.3113,
+      "step": 4490
+    },
+    {
+      "epoch": 0.26110418056804663,
+      "grad_norm": 0.16213169693946838,
+      "learning_rate": 0.0008830975874980991,
+      "loss": 3.3168,
+      "step": 4500
+    },
+    {
+      "epoch": 0.2616844120804201,
+      "grad_norm": 0.1761493980884552,
+      "learning_rate": 0.0008824803383649688,
+      "loss": 3.319,
+      "step": 4510
+    },
+    {
+      "epoch": 0.2622646435927935,
+      "grad_norm": 0.15832360088825226,
+      "learning_rate": 0.0008818616808991651,
+      "loss": 3.3202,
+      "step": 4520
+    },
+    {
+      "epoch": 0.26284487510516696,
+      "grad_norm": 0.1700046807527542,
+      "learning_rate": 0.0008812416173786495,
+      "loss": 3.3119,
+      "step": 4530
+    },
+    {
+      "epoch": 0.26342510661754037,
+      "grad_norm": 0.17616261541843414,
+      "learning_rate": 0.0008806201500865609,
+      "loss": 3.3133,
+      "step": 4540
+    },
+    {
+      "epoch": 0.26400533812991384,
+      "grad_norm": 0.17251409590244293,
+      "learning_rate": 0.0008799972813112072,
+      "loss": 3.3148,
+      "step": 4550
+    },
+    {
+      "epoch": 0.2645855696422873,
+      "grad_norm": 0.16835258901119232,
+      "learning_rate": 0.0008793730133460561,
+      "loss": 3.3188,
+      "step": 4560
+    },
+    {
+      "epoch": 0.2651658011546607,
+      "grad_norm": 0.16114716231822968,
+      "learning_rate": 0.0008787473484897276,
+      "loss": 3.3227,
+      "step": 4570
+    },
+    {
+      "epoch": 0.26574603266703417,
+      "grad_norm": 0.16803395748138428,
+      "learning_rate": 0.0008781202890459856,
+      "loss": 3.322,
+      "step": 4580
+    },
+    {
+      "epoch": 0.2663262641794076,
+      "grad_norm": 0.1682870239019394,
+      "learning_rate": 0.0008774918373237284,
+      "loss": 3.3142,
+      "step": 4590
+    },
+    {
+      "epoch": 0.26690649569178104,
+      "grad_norm": 0.15376168489456177,
+      "learning_rate": 0.0008768619956369813,
+      "loss": 3.3131,
+      "step": 4600
+    },
+    {
+      "epoch": 0.26748672720415445,
+      "grad_norm": 0.14724121987819672,
+      "learning_rate": 0.0008762307663048871,
+      "loss": 3.3105,
+      "step": 4610
+    },
+    {
+      "epoch": 0.2680669587165279,
+      "grad_norm": 0.1947721391916275,
+      "learning_rate": 0.0008755981516516987,
+      "loss": 3.3177,
+      "step": 4620
+    },
+    {
+      "epoch": 0.2686471902289013,
+      "grad_norm": 0.15725037455558777,
+      "learning_rate": 0.0008749641540067691,
+      "loss": 3.308,
+      "step": 4630
+    },
+    {
+      "epoch": 0.2692274217412748,
+      "grad_norm": 0.1616797149181366,
+      "learning_rate": 0.0008743287757045443,
+      "loss": 3.3158,
+      "step": 4640
+    },
+    {
+      "epoch": 0.2698076532536482,
+      "grad_norm": 0.16861191391944885,
+      "learning_rate": 0.0008736920190845536,
+      "loss": 3.3113,
+      "step": 4650
+    },
+    {
+      "epoch": 0.27038788476602166,
+      "grad_norm": 0.16433420777320862,
+      "learning_rate": 0.0008730538864914019,
+      "loss": 3.3168,
+      "step": 4660
+    },
+    {
+      "epoch": 0.27096811627839507,
+      "grad_norm": 0.1651991754770279,
+      "learning_rate": 0.00087241438027476,
+      "loss": 3.3016,
+      "step": 4670
+    },
+    {
+      "epoch": 0.27154834779076853,
+      "grad_norm": 0.18741054832935333,
+      "learning_rate": 0.0008717735027893568,
+      "loss": 3.3121,
+      "step": 4680
+    },
+    {
+      "epoch": 0.27212857930314194,
+      "grad_norm": 0.15020763874053955,
+      "learning_rate": 0.0008711312563949703,
+      "loss": 3.309,
+      "step": 4690
+    },
+    {
+      "epoch": 0.2727088108155154,
+      "grad_norm": 0.17015768587589264,
+      "learning_rate": 0.000870487643456419,
+      "loss": 3.3225,
+      "step": 4700
+    },
+    {
+      "epoch": 0.2732890423278888,
+      "grad_norm": 0.19843073189258575,
+      "learning_rate": 0.0008698426663435533,
+      "loss": 3.3058,
+      "step": 4710
+    },
+    {
+      "epoch": 0.2738692738402623,
+      "grad_norm": 0.17581596970558167,
+      "learning_rate": 0.0008691963274312464,
+      "loss": 3.3086,
+      "step": 4720
+    },
+    {
+      "epoch": 0.2744495053526357,
+      "grad_norm": 0.17392052710056305,
+      "learning_rate": 0.000868548629099386,
+      "loss": 3.311,
+      "step": 4730
+    },
+    {
+      "epoch": 0.27502973686500914,
+      "grad_norm": 0.16661454737186432,
+      "learning_rate": 0.0008678995737328651,
+      "loss": 3.3108,
+      "step": 4740
+    },
+    {
+      "epoch": 0.27560996837738255,
+      "grad_norm": 0.1836322695016861,
+      "learning_rate": 0.0008672491637215735,
+      "loss": 3.3042,
+      "step": 4750
+    },
+    {
+      "epoch": 0.276190199889756,
+      "grad_norm": 0.14587625861167908,
+      "learning_rate": 0.0008665974014603891,
+      "loss": 3.3202,
+      "step": 4760
+    },
+    {
+      "epoch": 0.2767704314021294,
+      "grad_norm": 0.1731816828250885,
+      "learning_rate": 0.0008659442893491689,
+      "loss": 3.295,
+      "step": 4770
+    },
+    {
+      "epoch": 0.2773506629145029,
+      "grad_norm": 0.1422199159860611,
+      "learning_rate": 0.0008652898297927398,
+      "loss": 3.3102,
+      "step": 4780
+    },
+    {
+      "epoch": 0.2779308944268763,
+      "grad_norm": 0.1520106941461563,
+      "learning_rate": 0.0008646340252008908,
+      "loss": 3.3186,
+      "step": 4790
+    },
+    {
+      "epoch": 0.27851112593924976,
+      "grad_norm": 0.16036173701286316,
+      "learning_rate": 0.000863976877988363,
+      "loss": 3.3145,
+      "step": 4800
+    },
+    {
+      "epoch": 0.2790913574516232,
+      "grad_norm": 0.15433992445468903,
+      "learning_rate": 0.0008633183905748411,
+      "loss": 3.3073,
+      "step": 4810
+    },
+    {
+      "epoch": 0.27967158896399663,
+      "grad_norm": 0.15274393558502197,
+      "learning_rate": 0.0008626585653849449,
+      "loss": 3.3005,
+      "step": 4820
+    },
+    {
+      "epoch": 0.2802518204763701,
+      "grad_norm": 0.1466848999261856,
+      "learning_rate": 0.0008619974048482198,
+      "loss": 3.3096,
+      "step": 4830
+    },
+    {
+      "epoch": 0.2808320519887435,
+      "grad_norm": 0.17398701608181,
+      "learning_rate": 0.0008613349113991283,
+      "loss": 3.2977,
+      "step": 4840
+    },
+    {
+      "epoch": 0.28141228350111697,
+      "grad_norm": 0.1633315235376358,
+      "learning_rate": 0.0008606710874770405,
+      "loss": 3.3048,
+      "step": 4850
+    },
+    {
+      "epoch": 0.2819925150134904,
+      "grad_norm": 0.15874172747135162,
+      "learning_rate": 0.0008600059355262259,
+      "loss": 3.3,
+      "step": 4860
+    },
+    {
+      "epoch": 0.28257274652586384,
+      "grad_norm": 0.16768650710582733,
+      "learning_rate": 0.0008593394579958433,
+      "loss": 3.2971,
+      "step": 4870
+    },
+    {
+      "epoch": 0.28315297803823725,
+      "grad_norm": 0.17810046672821045,
+      "learning_rate": 0.0008586716573399329,
+      "loss": 3.3043,
+      "step": 4880
+    },
+    {
+      "epoch": 0.2837332095506107,
+      "grad_norm": 0.16520732641220093,
+      "learning_rate": 0.0008580025360174069,
+      "loss": 3.3097,
+      "step": 4890
+    },
+    {
+      "epoch": 0.2843134410629841,
+      "grad_norm": 0.15736092627048492,
+      "learning_rate": 0.0008573320964920397,
+      "loss": 3.2936,
+      "step": 4900
+    },
+    {
+      "epoch": 0.2848936725753576,
+      "grad_norm": 0.16211272776126862,
+      "learning_rate": 0.0008566603412324602,
+      "loss": 3.3037,
+      "step": 4910
+    },
+    {
+      "epoch": 0.285473904087731,
+      "grad_norm": 0.1499512791633606,
+      "learning_rate": 0.0008559872727121416,
+      "loss": 3.2995,
+      "step": 4920
+    },
+    {
+      "epoch": 0.28605413560010445,
+      "grad_norm": 0.17990955710411072,
+      "learning_rate": 0.0008553128934093926,
+      "loss": 3.3008,
+      "step": 4930
+    },
+    {
+      "epoch": 0.28663436711247786,
+      "grad_norm": 0.16948296129703522,
+      "learning_rate": 0.0008546372058073484,
+      "loss": 3.2988,
+      "step": 4940
+    },
+    {
+      "epoch": 0.2872145986248513,
+      "grad_norm": 0.16478213667869568,
+      "learning_rate": 0.0008539602123939616,
+      "loss": 3.2981,
+      "step": 4950
+    },
+    {
+      "epoch": 0.28779483013722473,
+      "grad_norm": 0.17590472102165222,
+      "learning_rate": 0.0008532819156619928,
+      "loss": 3.2979,
+      "step": 4960
+    },
+    {
+      "epoch": 0.2883750616495982,
+      "grad_norm": 0.16984987258911133,
+      "learning_rate": 0.0008526023181090019,
+      "loss": 3.3093,
+      "step": 4970
+    },
+    {
+      "epoch": 0.2889552931619716,
+      "grad_norm": 0.18633520603179932,
+      "learning_rate": 0.0008519214222373379,
+      "loss": 3.3027,
+      "step": 4980
+    },
+    {
+      "epoch": 0.28953552467434507,
+      "grad_norm": 0.1713830530643463,
+      "learning_rate": 0.000851239230554131,
+      "loss": 3.3019,
+      "step": 4990
+    },
+    {
+      "epoch": 0.2901157561867185,
+      "grad_norm": 0.16138288378715515,
+      "learning_rate": 0.0008505557455712825,
+      "loss": 3.2957,
+      "step": 5000
+    },
+    {
+      "epoch": 0.2901157561867185,
+      "eval_loss": 3.232105016708374,
+      "eval_runtime": 3.247,
+      "eval_samples_per_second": 1333.559,
+      "eval_steps_per_second": 10.471,
+      "step": 5000
+    },
+    {
+      "epoch": 0.29069598769909194,
+      "grad_norm": 0.14849427342414856,
+      "learning_rate": 0.0008498709698054553,
+      "loss": 3.297,
+      "step": 5010
+    },
+    {
+      "epoch": 0.29127621921146535,
+      "grad_norm": 0.15944212675094604,
+      "learning_rate": 0.0008491849057780658,
+      "loss": 3.2875,
+      "step": 5020
+    },
+    {
+      "epoch": 0.2918564507238388,
+      "grad_norm": 0.14453737437725067,
+      "learning_rate": 0.0008484975560152737,
+      "loss": 3.2919,
+      "step": 5030
+    },
+    {
+      "epoch": 0.2924366822362123,
+      "grad_norm": 0.18005253374576569,
+      "learning_rate": 0.0008478089230479726,
+      "loss": 3.2981,
+      "step": 5040
+    },
+    {
+      "epoch": 0.2930169137485857,
+      "grad_norm": 0.16119055449962616,
+      "learning_rate": 0.0008471190094117814,
+      "loss": 3.2942,
+      "step": 5050
+    },
+    {
+      "epoch": 0.29359714526095915,
+      "grad_norm": 0.15280525386333466,
+      "learning_rate": 0.0008464278176470342,
+      "loss": 3.2958,
+      "step": 5060
+    },
+    {
+      "epoch": 0.29417737677333256,
+      "grad_norm": 0.15936006605625153,
+      "learning_rate": 0.0008457353502987718,
+      "loss": 3.294,
+      "step": 5070
+    },
+    {
+      "epoch": 0.294757608285706,
+      "grad_norm": 0.15230213105678558,
+      "learning_rate": 0.0008450416099167313,
+      "loss": 3.3008,
+      "step": 5080
+    },
+    {
+      "epoch": 0.2953378397980794,
+      "grad_norm": 0.15988774597644806,
+      "learning_rate": 0.0008443465990553374,
+      "loss": 3.2902,
+      "step": 5090
+    },
+    {
+      "epoch": 0.2959180713104529,
+      "grad_norm": 0.17253676056861877,
+      "learning_rate": 0.0008436503202736928,
+      "loss": 3.2986,
+      "step": 5100
+    },
+    {
+      "epoch": 0.2964983028228263,
+      "grad_norm": 0.15170855820178986,
+      "learning_rate": 0.0008429527761355693,
+      "loss": 3.2877,
+      "step": 5110
+    },
+    {
+      "epoch": 0.29707853433519976,
+      "grad_norm": 0.16487392783164978,
+      "learning_rate": 0.0008422539692093974,
+      "loss": 3.2846,
+      "step": 5120
+    },
+    {
+      "epoch": 0.29765876584757317,
+      "grad_norm": 0.14367428421974182,
+      "learning_rate": 0.000841553902068257,
+      "loss": 3.2906,
+      "step": 5130
+    },
+    {
+      "epoch": 0.29823899735994663,
+      "grad_norm": 0.17368683218955994,
+      "learning_rate": 0.0008408525772898692,
+      "loss": 3.3027,
+      "step": 5140
+    },
+    {
+      "epoch": 0.29881922887232004,
+      "grad_norm": 0.1599467247724533,
+      "learning_rate": 0.000840149997456585,
+      "loss": 3.2852,
+      "step": 5150
+    },
+    {
+      "epoch": 0.2993994603846935,
+      "grad_norm": 0.15667842328548431,
+      "learning_rate": 0.0008394461651553768,
+      "loss": 3.2898,
+      "step": 5160
+    },
+    {
+      "epoch": 0.2999796918970669,
+      "grad_norm": 0.15665532648563385,
+      "learning_rate": 0.000838741082977829,
+      "loss": 3.2994,
+      "step": 5170
+    },
+    {
+      "epoch": 0.3005599234094404,
+      "grad_norm": 0.17890366911888123,
+      "learning_rate": 0.0008380347535201283,
+      "loss": 3.2879,
+      "step": 5180
+    },
+    {
+      "epoch": 0.3011401549218138,
+      "grad_norm": 0.15844030678272247,
+      "learning_rate": 0.0008373271793830536,
+      "loss": 3.2948,
+      "step": 5190
+    },
+    {
+      "epoch": 0.30172038643418725,
+      "grad_norm": 0.14670881628990173,
+      "learning_rate": 0.0008366183631719668,
+      "loss": 3.2901,
+      "step": 5200
+    },
+    {
+      "epoch": 0.30230061794656066,
+      "grad_norm": 0.15351887047290802,
+      "learning_rate": 0.0008359083074968039,
+      "loss": 3.2899,
+      "step": 5210
+    },
+    {
+      "epoch": 0.3028808494589341,
+      "grad_norm": 0.14965134859085083,
+      "learning_rate": 0.0008351970149720636,
+      "loss": 3.2885,
+      "step": 5220
+    },
+    {
+      "epoch": 0.30346108097130753,
+      "grad_norm": 0.16267924010753632,
+      "learning_rate": 0.0008344844882167999,
+      "loss": 3.2937,
+      "step": 5230
+    },
+    {
+      "epoch": 0.304041312483681,
+      "grad_norm": 0.14958548545837402,
+      "learning_rate": 0.0008337707298546112,
+      "loss": 3.2887,
+      "step": 5240
+    },
+    {
+      "epoch": 0.3046215439960544,
+      "grad_norm": 0.1530071198940277,
+      "learning_rate": 0.0008330557425136299,
+      "loss": 3.2865,
+      "step": 5250
+    },
+    {
+      "epoch": 0.30520177550842786,
+      "grad_norm": 0.17143143713474274,
+      "learning_rate": 0.0008323395288265149,
+      "loss": 3.2861,
+      "step": 5260
+    },
+    {
+      "epoch": 0.3057820070208013,
+      "grad_norm": 0.15875962376594543,
+      "learning_rate": 0.0008316220914304398,
+      "loss": 3.2919,
+      "step": 5270
+    },
+    {
+      "epoch": 0.30636223853317474,
+      "grad_norm": 0.14534059166908264,
+      "learning_rate": 0.0008309034329670841,
+      "loss": 3.2813,
+      "step": 5280
+    },
+    {
+      "epoch": 0.3069424700455482,
+      "grad_norm": 0.1608089655637741,
+      "learning_rate": 0.0008301835560826236,
+      "loss": 3.2866,
+      "step": 5290
+    },
+    {
+      "epoch": 0.3075227015579216,
+      "grad_norm": 0.15711037814617157,
+      "learning_rate": 0.0008294624634277208,
+      "loss": 3.2924,
+      "step": 5300
+    },
+    {
+      "epoch": 0.30810293307029507,
+      "grad_norm": 0.1359197199344635,
+      "learning_rate": 0.0008287401576575139,
+      "loss": 3.2906,
+      "step": 5310
+    },
+    {
+      "epoch": 0.3086831645826685,
+      "grad_norm": 0.1708402931690216,
+      "learning_rate": 0.0008280166414316086,
+      "loss": 3.2919,
+      "step": 5320
+    },
+    {
+      "epoch": 0.30926339609504194,
+      "grad_norm": 0.16216853260993958,
+      "learning_rate": 0.0008272919174140674,
+      "loss": 3.278,
+      "step": 5330
+    },
+    {
+      "epoch": 0.30984362760741535,
+      "grad_norm": 0.16680875420570374,
+      "learning_rate": 0.0008265659882734002,
+      "loss": 3.2745,
+      "step": 5340
+    },
+    {
+      "epoch": 0.3104238591197888,
+      "grad_norm": 0.1761893630027771,
+      "learning_rate": 0.0008258388566825539,
+      "loss": 3.2768,
+      "step": 5350
+    },
+    {
+      "epoch": 0.3110040906321622,
+      "grad_norm": 0.17635513842105865,
+      "learning_rate": 0.0008251105253189034,
+      "loss": 3.2908,
+      "step": 5360
+    },
+    {
+      "epoch": 0.3115843221445357,
+      "grad_norm": 0.1403694897890091,
+      "learning_rate": 0.0008243809968642411,
+      "loss": 3.2896,
+      "step": 5370
+    },
+    {
+      "epoch": 0.3121645536569091,
+      "grad_norm": 0.15853242576122284,
+      "learning_rate": 0.0008236502740047669,
+      "loss": 3.2876,
+      "step": 5380
+    },
+    {
+      "epoch": 0.31274478516928256,
+      "grad_norm": 0.14447931945323944,
+      "learning_rate": 0.0008229183594310791,
+      "loss": 3.2749,
+      "step": 5390
+    },
+    {
+      "epoch": 0.31332501668165597,
+      "grad_norm": 0.14274190366268158,
+      "learning_rate": 0.0008221852558381639,
+      "loss": 3.2826,
+      "step": 5400
+    },
+    {
+      "epoch": 0.31390524819402943,
+      "grad_norm": 0.15020518004894257,
+      "learning_rate": 0.0008214509659253855,
+      "loss": 3.2768,
+      "step": 5410
+    },
+    {
+      "epoch": 0.31448547970640284,
+      "grad_norm": 0.16364452242851257,
+      "learning_rate": 0.0008207154923964761,
+      "loss": 3.2796,
+      "step": 5420
+    },
+    {
+      "epoch": 0.3150657112187763,
+      "grad_norm": 0.15643912553787231,
+      "learning_rate": 0.0008199788379595266,
+      "loss": 3.2897,
+      "step": 5430
+    },
+    {
+      "epoch": 0.3156459427311497,
+      "grad_norm": 0.14374196529388428,
+      "learning_rate": 0.0008192410053269757,
+      "loss": 3.2829,
+      "step": 5440
+    },
+    {
+      "epoch": 0.3162261742435232,
+      "grad_norm": 0.1532783806324005,
+      "learning_rate": 0.0008185019972156003,
+      "loss": 3.2775,
+      "step": 5450
+    },
+    {
+      "epoch": 0.3168064057558966,
+      "grad_norm": 0.15971648693084717,
+      "learning_rate": 0.0008177618163465054,
+      "loss": 3.2815,
+      "step": 5460
+    },
+    {
+      "epoch": 0.31738663726827004,
+      "grad_norm": 0.18425996601581573,
+      "learning_rate": 0.0008170204654451154,
+      "loss": 3.2777,
+      "step": 5470
+    },
+    {
+      "epoch": 0.31796686878064345,
+      "grad_norm": 0.13549089431762695,
+      "learning_rate": 0.0008162779472411612,
+      "loss": 3.2782,
+      "step": 5480
+    },
+    {
+      "epoch": 0.3185471002930169,
+      "grad_norm": 0.14235620200634003,
+      "learning_rate": 0.0008155342644686729,
+      "loss": 3.2755,
+      "step": 5490
+    },
+    {
+      "epoch": 0.3191273318053903,
+      "grad_norm": 0.1648331880569458,
+      "learning_rate": 0.0008147894198659683,
+      "loss": 3.2767,
+      "step": 5500
+    },
+    {
+      "epoch": 0.3197075633177638,
+      "grad_norm": 0.15751086175441742,
+      "learning_rate": 0.0008140434161756433,
+      "loss": 3.2789,
+      "step": 5510
+    },
+    {
+      "epoch": 0.32028779483013725,
+      "grad_norm": 0.16342034935951233,
+      "learning_rate": 0.0008132962561445616,
+      "loss": 3.2693,
+      "step": 5520
+    },
+    {
+      "epoch": 0.32086802634251066,
+      "grad_norm": 0.1530640870332718,
+      "learning_rate": 0.0008125479425238447,
+      "loss": 3.2773,
+      "step": 5530
+    },
+    {
+      "epoch": 0.3214482578548841,
+      "grad_norm": 0.1614234745502472,
+      "learning_rate": 0.0008117984780688619,
+      "loss": 3.276,
+      "step": 5540
+    },
+    {
+      "epoch": 0.32202848936725753,
+      "grad_norm": 0.16489213705062866,
+      "learning_rate": 0.0008110478655392195,
+      "loss": 3.2802,
+      "step": 5550
+    },
+    {
+      "epoch": 0.322608720879631,
+      "grad_norm": 0.16258342564105988,
+      "learning_rate": 0.0008102961076987519,
+      "loss": 3.2755,
+      "step": 5560
+    },
+    {
+      "epoch": 0.3231889523920044,
+      "grad_norm": 0.14555875957012177,
+      "learning_rate": 0.0008095432073155098,
+      "loss": 3.2775,
+      "step": 5570
+    },
+    {
+      "epoch": 0.32376918390437787,
+      "grad_norm": 0.1406964659690857,
+      "learning_rate": 0.0008087891671617515,
+      "loss": 3.2611,
+      "step": 5580
+    },
+    {
+      "epoch": 0.3243494154167513,
+      "grad_norm": 0.162723109126091,
+      "learning_rate": 0.0008080339900139317,
+      "loss": 3.2648,
+      "step": 5590
+    },
+    {
+      "epoch": 0.32492964692912474,
+      "grad_norm": 0.1660017967224121,
+      "learning_rate": 0.0008072776786526921,
+      "loss": 3.2704,
+      "step": 5600
+    },
+    {
+      "epoch": 0.32550987844149815,
+      "grad_norm": 0.13984504342079163,
+      "learning_rate": 0.0008065202358628501,
+      "loss": 3.2757,
+      "step": 5610
+    },
+    {
+      "epoch": 0.3260901099538716,
+      "grad_norm": 0.17101338505744934,
+      "learning_rate": 0.0008057616644333894,
+      "loss": 3.2742,
+      "step": 5620
+    },
+    {
+      "epoch": 0.326670341466245,
+      "grad_norm": 0.15518838167190552,
+      "learning_rate": 0.0008050019671574496,
+      "loss": 3.2676,
+      "step": 5630
+    },
+    {
+      "epoch": 0.3272505729786185,
+      "grad_norm": 0.17470310628414154,
+      "learning_rate": 0.0008042411468323154,
+      "loss": 3.2731,
+      "step": 5640
+    },
+    {
+      "epoch": 0.3278308044909919,
+      "grad_norm": 0.14603078365325928,
+      "learning_rate": 0.0008034792062594072,
+      "loss": 3.2727,
+      "step": 5650
+    },
+    {
+      "epoch": 0.32841103600336535,
+      "grad_norm": 0.14128392934799194,
+      "learning_rate": 0.00080271614824427,
+      "loss": 3.2689,
+      "step": 5660
+    },
+    {
+      "epoch": 0.32899126751573876,
+      "grad_norm": 0.16043803095817566,
+      "learning_rate": 0.0008019519755965629,
+      "loss": 3.2574,
+      "step": 5670
+    },
+    {
+      "epoch": 0.3295714990281122,
+      "grad_norm": 0.1634809821844101,
+      "learning_rate": 0.0008011866911300504,
+      "loss": 3.2706,
+      "step": 5680
+    },
+    {
+      "epoch": 0.33015173054048563,
+      "grad_norm": 0.1542406529188156,
+      "learning_rate": 0.0008004202976625895,
+      "loss": 3.2894,
+      "step": 5690
+    },
+    {
+      "epoch": 0.3307319620528591,
+      "grad_norm": 0.15284700691699982,
+      "learning_rate": 0.0007996527980161214,
+      "loss": 3.2814,
+      "step": 5700
+    },
+    {
+      "epoch": 0.3313121935652325,
+      "grad_norm": 0.1456363946199417,
+      "learning_rate": 0.0007988841950166602,
+      "loss": 3.2727,
+      "step": 5710
+    },
+    {
+      "epoch": 0.33189242507760597,
+      "grad_norm": 0.173149973154068,
+      "learning_rate": 0.0007981144914942827,
+      "loss": 3.2607,
+      "step": 5720
+    },
+    {
+      "epoch": 0.3324726565899794,
+      "grad_norm": 0.1568511724472046,
+      "learning_rate": 0.0007973436902831179,
+      "loss": 3.2638,
+      "step": 5730
+    },
+    {
+      "epoch": 0.33305288810235284,
+      "grad_norm": 0.16106663644313812,
+      "learning_rate": 0.0007965717942213365,
+      "loss": 3.2652,
+      "step": 5740
+    },
+    {
+      "epoch": 0.3336331196147263,
+      "grad_norm": 0.14553207159042358,
+      "learning_rate": 0.0007957988061511408,
+      "loss": 3.2771,
+      "step": 5750
+    },
+    {
+      "epoch": 0.3342133511270997,
+      "grad_norm": 0.15205144882202148,
+      "learning_rate": 0.0007950247289187538,
+      "loss": 3.2729,
+      "step": 5760
+    },
+    {
+      "epoch": 0.3347935826394732,
+      "grad_norm": 0.15027263760566711,
+      "learning_rate": 0.0007942495653744089,
+      "loss": 3.2727,
+      "step": 5770
+    },
+    {
+      "epoch": 0.3353738141518466,
+      "grad_norm": 0.1563851535320282,
+      "learning_rate": 0.0007934733183723395,
+      "loss": 3.2653,
+      "step": 5780
+    },
+    {
+      "epoch": 0.33595404566422005,
+      "grad_norm": 0.14442172646522522,
+      "learning_rate": 0.0007926959907707683,
+      "loss": 3.2754,
+      "step": 5790
+    },
+    {
+      "epoch": 0.33653427717659345,
+      "grad_norm": 0.13226284086704254,
+      "learning_rate": 0.0007919175854318971,
+      "loss": 3.2605,
+      "step": 5800
+    },
+    {
+      "epoch": 0.3371145086889669,
+      "grad_norm": 0.1466459184885025,
+      "learning_rate": 0.0007911381052218961,
+      "loss": 3.2638,
+      "step": 5810
+    },
+    {
+      "epoch": 0.3376947402013403,
+      "grad_norm": 0.15801192820072174,
+      "learning_rate": 0.0007903575530108926,
+      "loss": 3.2604,
+      "step": 5820
+    },
+    {
+      "epoch": 0.3382749717137138,
+      "grad_norm": 0.14419734477996826,
+      "learning_rate": 0.000789575931672962,
+      "loss": 3.2674,
+      "step": 5830
+    },
+    {
+      "epoch": 0.3388552032260872,
+      "grad_norm": 0.16942447423934937,
+      "learning_rate": 0.0007887932440861158,
+      "loss": 3.2634,
+      "step": 5840
+    },
+    {
+      "epoch": 0.33943543473846066,
+      "grad_norm": 0.15958154201507568,
+      "learning_rate": 0.0007880094931322916,
+      "loss": 3.2687,
+      "step": 5850
+    },
+    {
+      "epoch": 0.34001566625083407,
+      "grad_norm": 0.14885276556015015,
+      "learning_rate": 0.0007872246816973428,
+      "loss": 3.2665,
+      "step": 5860
+    },
+    {
+      "epoch": 0.34059589776320753,
+      "grad_norm": 0.15011096000671387,
+      "learning_rate": 0.0007864388126710268,
+      "loss": 3.2697,
+      "step": 5870
+    },
+    {
+      "epoch": 0.34117612927558094,
+      "grad_norm": 0.1571209579706192,
+      "learning_rate": 0.0007856518889469961,
+      "loss": 3.2688,
+      "step": 5880
+    },
+    {
+      "epoch": 0.3417563607879544,
+      "grad_norm": 0.1479196548461914,
+      "learning_rate": 0.0007848639134227864,
+      "loss": 3.2688,
+      "step": 5890
+    },
+    {
+      "epoch": 0.3423365923003278,
+      "grad_norm": 0.1649380475282669,
+      "learning_rate": 0.0007840748889998057,
+      "loss": 3.2629,
+      "step": 5900
+    },
+    {
+      "epoch": 0.3429168238127013,
+      "grad_norm": 0.1554240733385086,
+      "learning_rate": 0.000783284818583325,
+      "loss": 3.2653,
+      "step": 5910
+    },
+    {
+      "epoch": 0.3434970553250747,
+      "grad_norm": 0.14077123999595642,
+      "learning_rate": 0.000782493705082466,
+      "loss": 3.2631,
+      "step": 5920
+    },
+    {
+      "epoch": 0.34407728683744815,
+      "grad_norm": 0.16318172216415405,
+      "learning_rate": 0.0007817015514101917,
+      "loss": 3.2486,
+      "step": 5930
+    },
+    {
+      "epoch": 0.34465751834982156,
+      "grad_norm": 0.16839627921581268,
+      "learning_rate": 0.0007809083604832948,
+      "loss": 3.265,
+      "step": 5940
+    },
+    {
+      "epoch": 0.345237749862195,
+      "grad_norm": 0.14004649221897125,
+      "learning_rate": 0.0007801141352223873,
+      "loss": 3.261,
+      "step": 5950
+    },
+    {
+      "epoch": 0.34581798137456843,
+      "grad_norm": 0.1588735729455948,
+      "learning_rate": 0.0007793188785518901,
+      "loss": 3.2614,
+      "step": 5960
+    },
+    {
+      "epoch": 0.3463982128869419,
+      "grad_norm": 0.15165585279464722,
+      "learning_rate": 0.0007785225934000213,
+      "loss": 3.2654,
+      "step": 5970
+    },
+    {
+      "epoch": 0.3469784443993153,
+      "grad_norm": 0.14551375806331635,
+      "learning_rate": 0.0007777252826987864,
+      "loss": 3.2593,
+      "step": 5980
+    },
+    {
+      "epoch": 0.34755867591168876,
+      "grad_norm": 0.13176442682743073,
+      "learning_rate": 0.0007769269493839669,
+      "loss": 3.2519,
+      "step": 5990
+    },
+    {
+      "epoch": 0.3481389074240622,
+      "grad_norm": 0.1538584679365158,
+      "learning_rate": 0.0007761275963951096,
+      "loss": 3.2677,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3481389074240622,
+      "eval_loss": 3.1944527626037598,
+      "eval_runtime": 3.2607,
+      "eval_samples_per_second": 1327.935,
+      "eval_steps_per_second": 10.427,
+      "step": 6000
+    },
+    {
+      "epoch": 0.34871913893643564,
+      "grad_norm": 0.1548028141260147,
+      "learning_rate": 0.0007753272266755161,
+      "loss": 3.2613,
+      "step": 6010
+    },
+    {
+      "epoch": 0.3492993704488091,
+      "grad_norm": 0.15565787255764008,
+      "learning_rate": 0.0007745258431722313,
+      "loss": 3.2622,
+      "step": 6020
+    },
+    {
+      "epoch": 0.3498796019611825,
+      "grad_norm": 0.15149720013141632,
+      "learning_rate": 0.0007737234488360334,
+      "loss": 3.2608,
+      "step": 6030
+    },
+    {
+      "epoch": 0.35045983347355597,
+      "grad_norm": 0.14522841572761536,
+      "learning_rate": 0.0007729200466214225,
+      "loss": 3.2509,
+      "step": 6040
+    },
+    {
+      "epoch": 0.3510400649859294,
+      "grad_norm": 0.1358969360589981,
+      "learning_rate": 0.0007721156394866096,
+      "loss": 3.2631,
+      "step": 6050
+    },
+    {
+      "epoch": 0.35162029649830284,
+      "grad_norm": 0.13670052587985992,
+      "learning_rate": 0.0007713102303935058,
+      "loss": 3.2643,
+      "step": 6060
+    },
+    {
+      "epoch": 0.35220052801067625,
+      "grad_norm": 0.16404469311237335,
+      "learning_rate": 0.0007705038223077121,
+      "loss": 3.2435,
+      "step": 6070
+    },
+    {
+      "epoch": 0.3527807595230497,
+      "grad_norm": 0.14754830300807953,
+      "learning_rate": 0.0007696964181985076,
+      "loss": 3.264,
+      "step": 6080
+    },
+    {
+      "epoch": 0.3533609910354231,
+      "grad_norm": 0.16356825828552246,
+      "learning_rate": 0.0007688880210388384,
+      "loss": 3.2629,
+      "step": 6090
+    },
+    {
+      "epoch": 0.3539412225477966,
+      "grad_norm": 0.1487089991569519,
+      "learning_rate": 0.0007680786338053079,
+      "loss": 3.255,
+      "step": 6100
+    },
+    {
+      "epoch": 0.35452145406017,
+      "grad_norm": 0.16473692655563354,
+      "learning_rate": 0.0007672682594781645,
+      "loss": 3.2539,
+      "step": 6110
+    },
+    {
+      "epoch": 0.35510168557254346,
+      "grad_norm": 0.14944276213645935,
+      "learning_rate": 0.0007664569010412914,
+      "loss": 3.2526,
+      "step": 6120
+    },
+    {
+      "epoch": 0.35568191708491687,
+      "grad_norm": 0.15919911861419678,
+      "learning_rate": 0.0007656445614821954,
+      "loss": 3.2613,
+      "step": 6130
+    },
+    {
+      "epoch": 0.35626214859729033,
+      "grad_norm": 0.15756233036518097,
+      "learning_rate": 0.000764831243791996,
+      "loss": 3.2484,
+      "step": 6140
+    },
+    {
+      "epoch": 0.35684238010966374,
+      "grad_norm": 0.14643166959285736,
+      "learning_rate": 0.0007640169509654136,
+      "loss": 3.2552,
+      "step": 6150
+    },
+    {
+      "epoch": 0.3574226116220372,
+      "grad_norm": 0.15033231675624847,
+      "learning_rate": 0.0007632016860007603,
+      "loss": 3.2531,
+      "step": 6160
+    },
+    {
+      "epoch": 0.3580028431344106,
+      "grad_norm": 0.14909091591835022,
+      "learning_rate": 0.000762385451899927,
+      "loss": 3.2651,
+      "step": 6170
+    },
+    {
+      "epoch": 0.3585830746467841,
+      "grad_norm": 0.14181995391845703,
+      "learning_rate": 0.0007615682516683728,
+      "loss": 3.2596,
+      "step": 6180
+    },
+    {
+      "epoch": 0.3591633061591575,
+      "grad_norm": 0.1447875052690506,
+      "learning_rate": 0.0007607500883151148,
+      "loss": 3.2588,
+      "step": 6190
+    },
+    {
+      "epoch": 0.35974353767153094,
+      "grad_norm": 0.15402406454086304,
+      "learning_rate": 0.0007599309648527162,
+      "loss": 3.2478,
+      "step": 6200
+    },
+    {
+      "epoch": 0.36032376918390435,
+      "grad_norm": 0.16491296887397766,
+      "learning_rate": 0.0007591108842972754,
+      "loss": 3.2442,
+      "step": 6210
+    },
+    {
+      "epoch": 0.3609040006962778,
+      "grad_norm": 0.14670206606388092,
+      "learning_rate": 0.0007582898496684148,
+      "loss": 3.2601,
+      "step": 6220
+    },
+    {
+      "epoch": 0.3614842322086513,
+      "grad_norm": 0.12047087401151657,
+      "learning_rate": 0.0007574678639892702,
+      "loss": 3.2531,
+      "step": 6230
+    },
+    {
+      "epoch": 0.3620644637210247,
+      "grad_norm": 0.1422395259141922,
+      "learning_rate": 0.0007566449302864784,
+      "loss": 3.2565,
+      "step": 6240
+    },
+    {
+      "epoch": 0.36264469523339815,
+      "grad_norm": 0.16634182631969452,
+      "learning_rate": 0.0007558210515901683,
+      "loss": 3.2521,
+      "step": 6250
+    },
+    {
+      "epoch": 0.36322492674577156,
+      "grad_norm": 0.14773225784301758,
+      "learning_rate": 0.0007549962309339467,
+      "loss": 3.2571,
+      "step": 6260
+    },
+    {
+      "epoch": 0.363805158258145,
+      "grad_norm": 0.1608228087425232,
+      "learning_rate": 0.0007541704713548905,
+      "loss": 3.2466,
+      "step": 6270
+    },
+    {
+      "epoch": 0.36438538977051843,
+      "grad_norm": 0.1616470217704773,
+      "learning_rate": 0.0007533437758935324,
+      "loss": 3.2559,
+      "step": 6280
+    },
+    {
+      "epoch": 0.3649656212828919,
+      "grad_norm": 0.14817708730697632,
+      "learning_rate": 0.0007525161475938518,
+      "loss": 3.2579,
+      "step": 6290
+    },
+    {
+      "epoch": 0.3655458527952653,
+      "grad_norm": 0.1556018441915512,
+      "learning_rate": 0.0007516875895032628,
+      "loss": 3.2521,
+      "step": 6300
+    },
+    {
+      "epoch": 0.36612608430763877,
+      "grad_norm": 0.13821960985660553,
+      "learning_rate": 0.0007508581046726032,
+      "loss": 3.256,
+      "step": 6310
+    },
+    {
+      "epoch": 0.3667063158200122,
+      "grad_norm": 0.13531796634197235,
+      "learning_rate": 0.0007500276961561232,
+      "loss": 3.2476,
+      "step": 6320
+    },
+    {
+      "epoch": 0.36728654733238564,
+      "grad_norm": 0.13882015645503998,
+      "learning_rate": 0.0007491963670114737,
+      "loss": 3.2507,
+      "step": 6330
+    },
+    {
+      "epoch": 0.36786677884475905,
+      "grad_norm": 0.13630333542823792,
+      "learning_rate": 0.0007483641202996957,
+      "loss": 3.2536,
+      "step": 6340
+    },
+    {
+      "epoch": 0.3684470103571325,
+      "grad_norm": 0.12747836112976074,
+      "learning_rate": 0.0007475309590852089,
+      "loss": 3.2559,
+      "step": 6350
+    },
+    {
+      "epoch": 0.3690272418695059,
+      "grad_norm": 0.15810616314411163,
+      "learning_rate": 0.0007466968864357998,
+      "loss": 3.2431,
+      "step": 6360
+    },
+    {
+      "epoch": 0.3696074733818794,
+      "grad_norm": 0.1615232676267624,
+      "learning_rate": 0.0007458619054226117,
+      "loss": 3.2513,
+      "step": 6370
+    },
+    {
+      "epoch": 0.3701877048942528,
+      "grad_norm": 0.12830163538455963,
+      "learning_rate": 0.000745026019120132,
+      "loss": 3.2539,
+      "step": 6380
+    },
+    {
+      "epoch": 0.37076793640662625,
+      "grad_norm": 0.16822132468223572,
+      "learning_rate": 0.0007441892306061817,
+      "loss": 3.2442,
+      "step": 6390
+    },
+    {
+      "epoch": 0.37134816791899966,
+      "grad_norm": 0.14407211542129517,
+      "learning_rate": 0.0007433515429619038,
+      "loss": 3.2533,
+      "step": 6400
+    },
+    {
+      "epoch": 0.3719283994313731,
+      "grad_norm": 0.13332654535770416,
+      "learning_rate": 0.0007425129592717516,
+      "loss": 3.247,
+      "step": 6410
+    },
+    {
+      "epoch": 0.37250863094374653,
+      "grad_norm": 0.15194551646709442,
+      "learning_rate": 0.0007416734826234786,
+      "loss": 3.2469,
+      "step": 6420
+    },
+    {
+      "epoch": 0.37308886245612,
+      "grad_norm": 0.13437363505363464,
+      "learning_rate": 0.0007408331161081255,
+      "loss": 3.246,
+      "step": 6430
+    },
+    {
+      "epoch": 0.3736690939684934,
+      "grad_norm": 0.1475239098072052,
+      "learning_rate": 0.00073999186282001,
+      "loss": 3.2452,
+      "step": 6440
+    },
+    {
+      "epoch": 0.37424932548086687,
+      "grad_norm": 0.1388455033302307,
+      "learning_rate": 0.0007391497258567146,
+      "loss": 3.2484,
+      "step": 6450
+    },
+    {
+      "epoch": 0.3748295569932403,
+      "grad_norm": 0.14330509305000305,
+      "learning_rate": 0.000738306708319076,
+      "loss": 3.2499,
+      "step": 6460
+    },
+    {
+      "epoch": 0.37540978850561374,
+      "grad_norm": 0.13358131051063538,
+      "learning_rate": 0.0007374628133111728,
+      "loss": 3.2416,
+      "step": 6470
+    },
+    {
+      "epoch": 0.3759900200179872,
+      "grad_norm": 0.15574291348457336,
+      "learning_rate": 0.0007366180439403152,
+      "loss": 3.2499,
+      "step": 6480
+    },
+    {
+      "epoch": 0.3765702515303606,
+      "grad_norm": 0.15618012845516205,
+      "learning_rate": 0.0007357724033170323,
+      "loss": 3.2408,
+      "step": 6490
+    },
+    {
+      "epoch": 0.3771504830427341,
+      "grad_norm": 0.12743791937828064,
+      "learning_rate": 0.0007349258945550615,
+      "loss": 3.2478,
+      "step": 6500
+    },
+    {
+      "epoch": 0.3777307145551075,
+      "grad_norm": 0.1619246006011963,
+      "learning_rate": 0.000734078520771337,
+      "loss": 3.2358,
+      "step": 6510
+    },
+    {
+      "epoch": 0.37831094606748095,
+      "grad_norm": 0.1590278297662735,
+      "learning_rate": 0.0007332302850859773,
+      "loss": 3.2425,
+      "step": 6520
+    },
+    {
+      "epoch": 0.37889117757985435,
+      "grad_norm": 0.16503369808197021,
+      "learning_rate": 0.0007323811906222755,
+      "loss": 3.2411,
+      "step": 6530
+    },
+    {
+      "epoch": 0.3794714090922278,
+      "grad_norm": 0.1441235989332199,
+      "learning_rate": 0.0007315312405066861,
+      "loss": 3.245,
+      "step": 6540
+    },
+    {
+      "epoch": 0.3800516406046012,
+      "grad_norm": 0.16268372535705566,
+      "learning_rate": 0.0007306804378688147,
+      "loss": 3.2475,
+      "step": 6550
+    },
+    {
+      "epoch": 0.3806318721169747,
+      "grad_norm": 0.17126062512397766,
+      "learning_rate": 0.0007298287858414057,
+      "loss": 3.2395,
+      "step": 6560
+    },
+    {
+      "epoch": 0.3812121036293481,
+      "grad_norm": 0.14614002406597137,
+      "learning_rate": 0.0007289762875603308,
+      "loss": 3.2465,
+      "step": 6570
+    },
+    {
+      "epoch": 0.38179233514172156,
+      "grad_norm": 0.1300090104341507,
+      "learning_rate": 0.0007281229461645782,
+      "loss": 3.2534,
+      "step": 6580
+    },
+    {
+      "epoch": 0.38237256665409497,
+      "grad_norm": 0.16573797166347504,
+      "learning_rate": 0.0007272687647962403,
+      "loss": 3.2395,
+      "step": 6590
+    },
+    {
+      "epoch": 0.38295279816646843,
+      "grad_norm": 0.17565912008285522,
+      "learning_rate": 0.0007264137466005025,
+      "loss": 3.2412,
+      "step": 6600
+    },
+    {
+      "epoch": 0.38353302967884184,
+      "grad_norm": 0.14961925148963928,
+      "learning_rate": 0.0007255578947256312,
+      "loss": 3.2339,
+      "step": 6610
+    },
+    {
+      "epoch": 0.3841132611912153,
+      "grad_norm": 0.1480415016412735,
+      "learning_rate": 0.0007247012123229627,
+      "loss": 3.2358,
+      "step": 6620
+    },
+    {
+      "epoch": 0.3846934927035887,
+      "grad_norm": 0.14414618909358978,
+      "learning_rate": 0.0007238437025468913,
+      "loss": 3.2367,
+      "step": 6630
+    },
+    {
+      "epoch": 0.3852737242159622,
+      "grad_norm": 0.14013369381427765,
+      "learning_rate": 0.0007229853685548578,
+      "loss": 3.2453,
+      "step": 6640
+    },
+    {
+      "epoch": 0.3858539557283356,
+      "grad_norm": 0.13546213507652283,
+      "learning_rate": 0.0007221262135073381,
+      "loss": 3.2346,
+      "step": 6650
+    },
+    {
+      "epoch": 0.38643418724070905,
+      "grad_norm": 0.16352064907550812,
+      "learning_rate": 0.0007212662405678309,
+      "loss": 3.24,
+      "step": 6660
+    },
+    {
+      "epoch": 0.38701441875308246,
+      "grad_norm": 0.14588786661624908,
+      "learning_rate": 0.0007204054529028467,
+      "loss": 3.2478,
+      "step": 6670
+    },
+    {
+      "epoch": 0.3875946502654559,
+      "grad_norm": 0.151209756731987,
+      "learning_rate": 0.0007195438536818957,
+      "loss": 3.2306,
+      "step": 6680
+    },
+    {
+      "epoch": 0.38817488177782933,
+      "grad_norm": 0.14419269561767578,
+      "learning_rate": 0.0007186814460774769,
+      "loss": 3.2372,
+      "step": 6690
+    },
+    {
+      "epoch": 0.3887551132902028,
+      "grad_norm": 0.14094632863998413,
+      "learning_rate": 0.0007178182332650649,
+      "loss": 3.2323,
+      "step": 6700
+    },
+    {
+      "epoch": 0.38933534480257626,
+      "grad_norm": 0.1500055193901062,
+      "learning_rate": 0.0007169542184231001,
+      "loss": 3.2421,
+      "step": 6710
+    },
+    {
+      "epoch": 0.38991557631494966,
+      "grad_norm": 0.14962860941886902,
+      "learning_rate": 0.0007160894047329756,
+      "loss": 3.2392,
+      "step": 6720
+    },
+    {
+      "epoch": 0.3904958078273231,
+      "grad_norm": 0.14648567140102386,
+      "learning_rate": 0.0007152237953790258,
+      "loss": 3.2358,
+      "step": 6730
+    },
+    {
+      "epoch": 0.39107603933969654,
+      "grad_norm": 0.14237669110298157,
+      "learning_rate": 0.0007143573935485153,
+      "loss": 3.2479,
+      "step": 6740
+    },
+    {
+      "epoch": 0.39165627085207,
+      "grad_norm": 0.12649035453796387,
+      "learning_rate": 0.0007134902024316264,
+      "loss": 3.2412,
+      "step": 6750
+    },
+    {
+      "epoch": 0.3922365023644434,
+      "grad_norm": 0.13935695588588715,
+      "learning_rate": 0.0007126222252214473,
+      "loss": 3.2341,
+      "step": 6760
+    },
+    {
+      "epoch": 0.39281673387681687,
+      "grad_norm": 0.15621213614940643,
+      "learning_rate": 0.0007117534651139612,
+      "loss": 3.2332,
+      "step": 6770
+    },
+    {
+      "epoch": 0.3933969653891903,
+      "grad_norm": 0.1531130075454712,
+      "learning_rate": 0.0007108839253080338,
+      "loss": 3.2251,
+      "step": 6780
+    },
+    {
+      "epoch": 0.39397719690156374,
+      "grad_norm": 0.14018535614013672,
+      "learning_rate": 0.0007100136090054019,
+      "loss": 3.2377,
+      "step": 6790
+    },
+    {
+      "epoch": 0.39455742841393715,
+      "grad_norm": 0.1587972193956375,
+      "learning_rate": 0.0007091425194106611,
+      "loss": 3.2356,
+      "step": 6800
+    },
+    {
+      "epoch": 0.3951376599263106,
+      "grad_norm": 0.13827280700206757,
+      "learning_rate": 0.0007082706597312549,
+      "loss": 3.2345,
+      "step": 6810
+    },
+    {
+      "epoch": 0.395717891438684,
+      "grad_norm": 0.13535282015800476,
+      "learning_rate": 0.0007073980331774615,
+      "loss": 3.2347,
+      "step": 6820
+    },
+    {
+      "epoch": 0.3962981229510575,
+      "grad_norm": 0.15061281621456146,
+      "learning_rate": 0.0007065246429623835,
+      "loss": 3.2345,
+      "step": 6830
+    },
+    {
+      "epoch": 0.3968783544634309,
+      "grad_norm": 0.1398342251777649,
+      "learning_rate": 0.0007056504923019352,
+      "loss": 3.231,
+      "step": 6840
+    },
+    {
+      "epoch": 0.39745858597580436,
+      "grad_norm": 0.14031299948692322,
+      "learning_rate": 0.0007047755844148307,
+      "loss": 3.2212,
+      "step": 6850
+    },
+    {
+      "epoch": 0.39803881748817777,
+      "grad_norm": 0.1403796672821045,
+      "learning_rate": 0.0007038999225225729,
+      "loss": 3.2346,
+      "step": 6860
+    },
+    {
+      "epoch": 0.39861904900055123,
+      "grad_norm": 0.13849115371704102,
+      "learning_rate": 0.0007030235098494403,
+      "loss": 3.2424,
+      "step": 6870
+    },
+    {
+      "epoch": 0.39919928051292464,
+      "grad_norm": 0.14095619320869446,
+      "learning_rate": 0.0007021463496224762,
+      "loss": 3.2299,
+      "step": 6880
+    },
+    {
+      "epoch": 0.3997795120252981,
+      "grad_norm": 0.14068861305713654,
+      "learning_rate": 0.0007012684450714765,
+      "loss": 3.2338,
+      "step": 6890
+    },
+    {
+      "epoch": 0.4003597435376715,
+      "grad_norm": 0.14077772200107574,
+      "learning_rate": 0.0007003897994289777,
+      "loss": 3.2323,
+      "step": 6900
+    },
+    {
+      "epoch": 0.40093997505004497,
+      "grad_norm": 0.14695732295513153,
+      "learning_rate": 0.0006995104159302452,
+      "loss": 3.2343,
+      "step": 6910
+    },
+    {
+      "epoch": 0.4015202065624184,
+      "grad_norm": 0.14510050415992737,
+      "learning_rate": 0.0006986302978132611,
+      "loss": 3.2269,
+      "step": 6920
+    },
+    {
+      "epoch": 0.40210043807479184,
+      "grad_norm": 0.14484266936779022,
+      "learning_rate": 0.0006977494483187126,
+      "loss": 3.2255,
+      "step": 6930
+    },
+    {
+      "epoch": 0.40268066958716525,
+      "grad_norm": 0.14667174220085144,
+      "learning_rate": 0.0006968678706899795,
+      "loss": 3.2269,
+      "step": 6940
+    },
+    {
+      "epoch": 0.4032609010995387,
+      "grad_norm": 0.15151144564151764,
+      "learning_rate": 0.0006959855681731233,
+      "loss": 3.2294,
+      "step": 6950
+    },
+    {
+      "epoch": 0.4038411326119122,
+      "grad_norm": 0.1448170691728592,
+      "learning_rate": 0.000695102544016874,
+      "loss": 3.2299,
+      "step": 6960
+    },
+    {
+      "epoch": 0.4044213641242856,
+      "grad_norm": 0.12772366404533386,
+      "learning_rate": 0.0006942188014726194,
+      "loss": 3.2285,
+      "step": 6970
+    },
+    {
+      "epoch": 0.40500159563665905,
+      "grad_norm": 0.15471121668815613,
+      "learning_rate": 0.000693334343794392,
+      "loss": 3.227,
+      "step": 6980
+    },
+    {
+      "epoch": 0.40558182714903246,
+      "grad_norm": 0.15615463256835938,
+      "learning_rate": 0.0006924491742388573,
+      "loss": 3.228,
+      "step": 6990
+    },
+    {
+      "epoch": 0.4061620586614059,
+      "grad_norm": 0.12857802212238312,
+      "learning_rate": 0.0006915632960653029,
+      "loss": 3.225,
+      "step": 7000
+    },
+    {
+      "epoch": 0.4061620586614059,
+      "eval_loss": 3.165278911590576,
+      "eval_runtime": 3.2586,
+      "eval_samples_per_second": 1328.797,
+      "eval_steps_per_second": 10.434,
+      "step": 7000
+    },
+    {
+      "epoch": 0.40674229017377933,
+      "grad_norm": 0.14582431316375732,
+      "learning_rate": 0.0006906767125356246,
+      "loss": 3.235,
+      "step": 7010
+    },
+    {
+      "epoch": 0.4073225216861528,
+      "grad_norm": 0.1576128900051117,
+      "learning_rate": 0.000689789426914316,
+      "loss": 3.2256,
+      "step": 7020
+    },
+    {
+      "epoch": 0.4079027531985262,
+      "grad_norm": 0.16213494539260864,
+      "learning_rate": 0.0006889014424684557,
+      "loss": 3.2409,
+      "step": 7030
+    },
+    {
+      "epoch": 0.40848298471089967,
+      "grad_norm": 0.13359089195728302,
+      "learning_rate": 0.0006880127624676955,
+      "loss": 3.2328,
+      "step": 7040
+    },
+    {
+      "epoch": 0.4090632162232731,
+      "grad_norm": 0.1388418972492218,
+      "learning_rate": 0.0006871233901842481,
+      "loss": 3.2191,
+      "step": 7050
+    },
+    {
+      "epoch": 0.40964344773564654,
+      "grad_norm": 0.1342374086380005,
+      "learning_rate": 0.0006862333288928755,
+      "loss": 3.2348,
+      "step": 7060
+    },
+    {
+      "epoch": 0.41022367924801995,
+      "grad_norm": 0.15014256536960602,
+      "learning_rate": 0.0006853425818708767,
+      "loss": 3.2239,
+      "step": 7070
+    },
+    {
+      "epoch": 0.4108039107603934,
+      "grad_norm": 0.1368698626756668,
+      "learning_rate": 0.0006844511523980755,
+      "loss": 3.2385,
+      "step": 7080
+    },
+    {
+      "epoch": 0.4113841422727668,
+      "grad_norm": 0.15549789369106293,
+      "learning_rate": 0.0006835590437568084,
+      "loss": 3.2344,
+      "step": 7090
+    },
+    {
+      "epoch": 0.4119643737851403,
+      "grad_norm": 0.13888388872146606,
+      "learning_rate": 0.0006826662592319131,
+      "loss": 3.2258,
+      "step": 7100
+    },
+    {
+      "epoch": 0.4125446052975137,
+      "grad_norm": 0.12590526044368744,
+      "learning_rate": 0.0006817728021107159,
+      "loss": 3.221,
+      "step": 7110
+    },
+    {
+      "epoch": 0.41312483680988715,
+      "grad_norm": 0.12910686433315277,
+      "learning_rate": 0.0006808786756830192,
+      "loss": 3.2283,
+      "step": 7120
+    },
+    {
+      "epoch": 0.41370506832226056,
+      "grad_norm": 0.13956746459007263,
+      "learning_rate": 0.0006799838832410903,
+      "loss": 3.2201,
+      "step": 7130
+    },
+    {
+      "epoch": 0.414285299834634,
+      "grad_norm": 0.1646030694246292,
+      "learning_rate": 0.0006790884280796486,
+      "loss": 3.2191,
+      "step": 7140
+    },
+    {
+      "epoch": 0.41486553134700743,
+      "grad_norm": 0.13890932500362396,
+      "learning_rate": 0.0006781923134958539,
+      "loss": 3.2257,
+      "step": 7150
+    },
+    {
+      "epoch": 0.4154457628593809,
+      "grad_norm": 0.13294340670108795,
+      "learning_rate": 0.0006772955427892939,
+      "loss": 3.2317,
+      "step": 7160
+    },
+    {
+      "epoch": 0.4160259943717543,
+      "grad_norm": 0.12860235571861267,
+      "learning_rate": 0.0006763981192619726,
+      "loss": 3.2154,
+      "step": 7170
+    },
+    {
+      "epoch": 0.41660622588412777,
+      "grad_norm": 0.14738686382770538,
+      "learning_rate": 0.0006755000462182972,
+      "loss": 3.2332,
+      "step": 7180
+    },
+    {
+      "epoch": 0.41718645739650123,
+      "grad_norm": 0.13093027472496033,
+      "learning_rate": 0.0006746013269650666,
+      "loss": 3.2351,
+      "step": 7190
+    },
+    {
+      "epoch": 0.41776668890887464,
+      "grad_norm": 0.13175268471240997,
+      "learning_rate": 0.0006737019648114593,
+      "loss": 3.2294,
+      "step": 7200
+    },
+    {
+      "epoch": 0.4183469204212481,
+      "grad_norm": 0.13433928787708282,
+      "learning_rate": 0.000672801963069021,
+      "loss": 3.2273,
+      "step": 7210
+    },
+    {
+      "epoch": 0.4189271519336215,
+      "grad_norm": 0.14208847284317017,
+      "learning_rate": 0.0006719013250516526,
+      "loss": 3.2272,
+      "step": 7220
+    },
+    {
+      "epoch": 0.419507383445995,
+      "grad_norm": 0.13174398243427277,
+      "learning_rate": 0.0006710000540755973,
+      "loss": 3.2153,
+      "step": 7230
+    },
+    {
+      "epoch": 0.4200876149583684,
+      "grad_norm": 0.14360399544239044,
+      "learning_rate": 0.0006700981534594296,
+      "loss": 3.223,
+      "step": 7240
+    },
+    {
+      "epoch": 0.42066784647074185,
+      "grad_norm": 0.1482868790626526,
+      "learning_rate": 0.0006691956265240417,
+      "loss": 3.218,
+      "step": 7250
+    },
+    {
+      "epoch": 0.42124807798311525,
+      "grad_norm": 0.13119544088840485,
+      "learning_rate": 0.0006682924765926323,
+      "loss": 3.2294,
+      "step": 7260
+    },
+    {
+      "epoch": 0.4218283094954887,
+      "grad_norm": 0.13039755821228027,
+      "learning_rate": 0.0006673887069906945,
+      "loss": 3.227,
+      "step": 7270
+    },
+    {
+      "epoch": 0.4224085410078621,
+      "grad_norm": 0.12415551394224167,
+      "learning_rate": 0.0006664843210460025,
+      "loss": 3.2142,
+      "step": 7280
+    },
+    {
+      "epoch": 0.4229887725202356,
+      "grad_norm": 0.13810203969478607,
+      "learning_rate": 0.0006655793220885997,
+      "loss": 3.2275,
+      "step": 7290
+    },
+    {
+      "epoch": 0.423569004032609,
+      "grad_norm": 0.13545836508274078,
+      "learning_rate": 0.0006646737134507874,
+      "loss": 3.2113,
+      "step": 7300
+    },
+    {
+      "epoch": 0.42414923554498246,
+      "grad_norm": 0.12676437199115753,
+      "learning_rate": 0.0006637674984671113,
+      "loss": 3.2183,
+      "step": 7310
+    },
+    {
+      "epoch": 0.42472946705735587,
+      "grad_norm": 0.12899167835712433,
+      "learning_rate": 0.0006628606804743502,
+      "loss": 3.2237,
+      "step": 7320
+    },
+    {
+      "epoch": 0.42530969856972933,
+      "grad_norm": 0.13533097505569458,
+      "learning_rate": 0.0006619532628115027,
+      "loss": 3.2025,
+      "step": 7330
+    },
+    {
+      "epoch": 0.42588993008210274,
+      "grad_norm": 0.12174040824174881,
+      "learning_rate": 0.0006610452488197758,
+      "loss": 3.2141,
+      "step": 7340
+    },
+    {
+      "epoch": 0.4264701615944762,
+      "grad_norm": 0.14033706486225128,
+      "learning_rate": 0.000660136641842572,
+      "loss": 3.2309,
+      "step": 7350
+    },
+    {
+      "epoch": 0.4270503931068496,
+      "grad_norm": 0.1348879039287567,
+      "learning_rate": 0.0006592274452254775,
+      "loss": 3.2207,
+      "step": 7360
+    },
+    {
+      "epoch": 0.4276306246192231,
+      "grad_norm": 0.13258253037929535,
+      "learning_rate": 0.0006583176623162494,
+      "loss": 3.2273,
+      "step": 7370
+    },
+    {
+      "epoch": 0.4282108561315965,
+      "grad_norm": 0.14150184392929077,
+      "learning_rate": 0.0006574072964648038,
+      "loss": 3.2205,
+      "step": 7380
+    },
+    {
+      "epoch": 0.42879108764396995,
+      "grad_norm": 0.1396942138671875,
+      "learning_rate": 0.0006564963510232031,
+      "loss": 3.2179,
+      "step": 7390
+    },
+    {
+      "epoch": 0.42937131915634336,
+      "grad_norm": 0.13543544709682465,
+      "learning_rate": 0.0006555848293456438,
+      "loss": 3.217,
+      "step": 7400
+    },
+    {
+      "epoch": 0.4299515506687168,
+      "grad_norm": 0.1295756995677948,
+      "learning_rate": 0.0006546727347884441,
+      "loss": 3.2206,
+      "step": 7410
+    },
+    {
+      "epoch": 0.43053178218109023,
+      "grad_norm": 0.15362213551998138,
+      "learning_rate": 0.000653760070710032,
+      "loss": 3.2212,
+      "step": 7420
+    },
+    {
+      "epoch": 0.4311120136934637,
+      "grad_norm": 0.1392851173877716,
+      "learning_rate": 0.0006528468404709319,
+      "loss": 3.2175,
+      "step": 7430
+    },
+    {
+      "epoch": 0.43169224520583716,
+      "grad_norm": 0.12892089784145355,
+      "learning_rate": 0.0006519330474337534,
+      "loss": 3.218,
+      "step": 7440
+    },
+    {
+      "epoch": 0.43227247671821056,
+      "grad_norm": 0.1390940397977829,
+      "learning_rate": 0.0006510186949631782,
+      "loss": 3.2171,
+      "step": 7450
+    },
+    {
+      "epoch": 0.432852708230584,
+      "grad_norm": 0.13330115377902985,
+      "learning_rate": 0.0006501037864259478,
+      "loss": 3.2293,
+      "step": 7460
+    },
+    {
+      "epoch": 0.43343293974295743,
+      "grad_norm": 0.1433860808610916,
+      "learning_rate": 0.0006491883251908513,
+      "loss": 3.2099,
+      "step": 7470
+    },
+    {
+      "epoch": 0.4340131712553309,
+      "grad_norm": 0.13837961852550507,
+      "learning_rate": 0.000648272314628713,
+      "loss": 3.2139,
+      "step": 7480
+    },
+    {
+      "epoch": 0.4345934027677043,
+      "grad_norm": 0.13204647600650787,
+      "learning_rate": 0.0006473557581123797,
+      "loss": 3.2267,
+      "step": 7490
+    },
+    {
+      "epoch": 0.43517363428007777,
+      "grad_norm": 0.13703928887844086,
+      "learning_rate": 0.0006464386590167082,
+      "loss": 3.2131,
+      "step": 7500
+    },
+    {
+      "epoch": 0.4357538657924512,
+      "grad_norm": 0.13702446222305298,
+      "learning_rate": 0.0006455210207185539,
+      "loss": 3.2238,
+      "step": 7510
+    },
+    {
+      "epoch": 0.43633409730482464,
+      "grad_norm": 0.13354191184043884,
+      "learning_rate": 0.0006446028465967568,
+      "loss": 3.2131,
+      "step": 7520
+    },
+    {
+      "epoch": 0.43691432881719805,
+      "grad_norm": 0.14659354090690613,
+      "learning_rate": 0.0006436841400321304,
+      "loss": 3.2243,
+      "step": 7530
+    },
+    {
+      "epoch": 0.4374945603295715,
+      "grad_norm": 0.13794207572937012,
+      "learning_rate": 0.0006427649044074484,
+      "loss": 3.2229,
+      "step": 7540
+    },
+    {
+      "epoch": 0.4380747918419449,
+      "grad_norm": 0.146932452917099,
+      "learning_rate": 0.0006418451431074329,
+      "loss": 3.2117,
+      "step": 7550
+    },
+    {
+      "epoch": 0.4386550233543184,
+      "grad_norm": 0.13247311115264893,
+      "learning_rate": 0.0006409248595187409,
+      "loss": 3.223,
+      "step": 7560
+    },
+    {
+      "epoch": 0.4392352548666918,
+      "grad_norm": 0.13509587943553925,
+      "learning_rate": 0.0006400040570299535,
+      "loss": 3.2165,
+      "step": 7570
+    },
+    {
+      "epoch": 0.43981548637906526,
+      "grad_norm": 0.1419142633676529,
+      "learning_rate": 0.0006390827390315614,
+      "loss": 3.2125,
+      "step": 7580
+    },
+    {
+      "epoch": 0.44039571789143866,
+      "grad_norm": 0.14198483526706696,
+      "learning_rate": 0.0006381609089159545,
+      "loss": 3.2188,
+      "step": 7590
+    },
+    {
+      "epoch": 0.44097594940381213,
+      "grad_norm": 0.12889783084392548,
+      "learning_rate": 0.0006372385700774075,
+      "loss": 3.2026,
+      "step": 7600
+    },
+    {
+      "epoch": 0.44155618091618554,
+      "grad_norm": 0.13903778791427612,
+      "learning_rate": 0.0006363157259120689,
+      "loss": 3.2068,
+      "step": 7610
+    },
+    {
+      "epoch": 0.442136412428559,
+      "grad_norm": 0.13285143673419952,
+      "learning_rate": 0.0006353923798179472,
+      "loss": 3.2067,
+      "step": 7620
+    },
+    {
+      "epoch": 0.4427166439409324,
+      "grad_norm": 0.13808636367321014,
+      "learning_rate": 0.0006344685351948998,
+      "loss": 3.2131,
+      "step": 7630
+    },
+    {
+      "epoch": 0.44329687545330587,
+      "grad_norm": 0.12950055301189423,
+      "learning_rate": 0.0006335441954446191,
+      "loss": 3.2128,
+      "step": 7640
+    },
+    {
+      "epoch": 0.4438771069656793,
+      "grad_norm": 0.1326564997434616,
+      "learning_rate": 0.0006326193639706214,
+      "loss": 3.2228,
+      "step": 7650
+    },
+    {
+      "epoch": 0.44445733847805274,
+      "grad_norm": 0.12820059061050415,
+      "learning_rate": 0.0006316940441782325,
+      "loss": 3.2105,
+      "step": 7660
+    },
+    {
+      "epoch": 0.4450375699904262,
+      "grad_norm": 0.14834731817245483,
+      "learning_rate": 0.000630768239474577,
+      "loss": 3.2162,
+      "step": 7670
+    },
+    {
+      "epoch": 0.4456178015027996,
+      "grad_norm": 0.1407567858695984,
+      "learning_rate": 0.0006298419532685649,
+      "loss": 3.2075,
+      "step": 7680
+    },
+    {
+      "epoch": 0.4461980330151731,
+      "grad_norm": 0.18233934044837952,
+      "learning_rate": 0.0006289151889708788,
+      "loss": 3.2209,
+      "step": 7690
+    },
+    {
+      "epoch": 0.4467782645275465,
+      "grad_norm": 0.13413317501544952,
+      "learning_rate": 0.0006279879499939625,
+      "loss": 3.2062,
+      "step": 7700
+    },
+    {
+      "epoch": 0.44735849603991995,
+      "grad_norm": 0.14402654767036438,
+      "learning_rate": 0.0006270602397520065,
+      "loss": 3.2056,
+      "step": 7710
+    },
+    {
+      "epoch": 0.44793872755229336,
+      "grad_norm": 0.14101460576057434,
+      "learning_rate": 0.0006261320616609372,
+      "loss": 3.2135,
+      "step": 7720
+    },
+    {
+      "epoch": 0.4485189590646668,
+      "grad_norm": 0.15453755855560303,
+      "learning_rate": 0.0006252034191384035,
+      "loss": 3.2165,
+      "step": 7730
+    },
+    {
+      "epoch": 0.44909919057704023,
+      "grad_norm": 0.13480693101882935,
+      "learning_rate": 0.0006242743156037646,
+      "loss": 3.2139,
+      "step": 7740
+    },
+    {
+      "epoch": 0.4496794220894137,
+      "grad_norm": 0.13115455210208893,
+      "learning_rate": 0.0006233447544780772,
+      "loss": 3.2135,
+      "step": 7750
+    },
+    {
+      "epoch": 0.4502596536017871,
+      "grad_norm": 0.14997157454490662,
+      "learning_rate": 0.0006224147391840824,
+      "loss": 3.1969,
+      "step": 7760
+    },
+    {
+      "epoch": 0.45083988511416057,
+      "grad_norm": 0.13748539984226227,
+      "learning_rate": 0.0006214842731461942,
+      "loss": 3.2268,
+      "step": 7770
+    },
+    {
+      "epoch": 0.451420116626534,
+      "grad_norm": 0.12151113897562027,
+      "learning_rate": 0.0006205533597904857,
+      "loss": 3.2063,
+      "step": 7780
+    },
+    {
+      "epoch": 0.45200034813890744,
+      "grad_norm": 0.13322634994983673,
+      "learning_rate": 0.0006196220025446778,
+      "loss": 3.2066,
+      "step": 7790
+    },
+    {
+      "epoch": 0.45258057965128085,
+      "grad_norm": 0.1378646045923233,
+      "learning_rate": 0.0006186902048381252,
+      "loss": 3.1976,
+      "step": 7800
+    },
+    {
+      "epoch": 0.4531608111636543,
+      "grad_norm": 0.14197058975696564,
+      "learning_rate": 0.0006177579701018048,
+      "loss": 3.2056,
+      "step": 7810
+    },
+    {
+      "epoch": 0.4537410426760277,
+      "grad_norm": 0.13627830147743225,
+      "learning_rate": 0.0006168253017683025,
+      "loss": 3.2052,
+      "step": 7820
+    },
+    {
+      "epoch": 0.4543212741884012,
+      "grad_norm": 0.15169207751750946,
+      "learning_rate": 0.0006158922032718006,
+      "loss": 3.211,
+      "step": 7830
+    },
+    {
+      "epoch": 0.4549015057007746,
+      "grad_norm": 0.1337585300207138,
+      "learning_rate": 0.0006149586780480659,
+      "loss": 3.2157,
+      "step": 7840
+    },
+    {
+      "epoch": 0.45548173721314805,
+      "grad_norm": 0.1394774168729782,
+      "learning_rate": 0.0006140247295344359,
+      "loss": 3.2174,
+      "step": 7850
+    },
+    {
+      "epoch": 0.45606196872552146,
+      "grad_norm": 0.12764208018779755,
+      "learning_rate": 0.0006130903611698067,
+      "loss": 3.2102,
+      "step": 7860
+    },
+    {
+      "epoch": 0.4566422002378949,
+      "grad_norm": 0.13290008902549744,
+      "learning_rate": 0.0006121555763946207,
+      "loss": 3.2041,
+      "step": 7870
+    },
+    {
+      "epoch": 0.45722243175026833,
+      "grad_norm": 0.14185406267642975,
+      "learning_rate": 0.0006112203786508533,
+      "loss": 3.2152,
+      "step": 7880
+    },
+    {
+      "epoch": 0.4578026632626418,
+      "grad_norm": 0.12418293952941895,
+      "learning_rate": 0.0006102847713820006,
+      "loss": 3.2028,
+      "step": 7890
+    },
+    {
+      "epoch": 0.4583828947750152,
+      "grad_norm": 0.138755664229393,
+      "learning_rate": 0.0006093487580330666,
+      "loss": 3.2043,
+      "step": 7900
+    },
+    {
+      "epoch": 0.45896312628738867,
+      "grad_norm": 0.13823552429676056,
+      "learning_rate": 0.0006084123420505503,
+      "loss": 3.2043,
+      "step": 7910
+    },
+    {
+      "epoch": 0.45954335779976213,
+      "grad_norm": 0.1277630627155304,
+      "learning_rate": 0.0006074755268824335,
+      "loss": 3.2068,
+      "step": 7920
+    },
+    {
+      "epoch": 0.46012358931213554,
+      "grad_norm": 0.14666809141635895,
+      "learning_rate": 0.0006065383159781682,
+      "loss": 3.2156,
+      "step": 7930
+    },
+    {
+      "epoch": 0.460703820824509,
+      "grad_norm": 0.12684592604637146,
+      "learning_rate": 0.0006056007127886626,
+      "loss": 3.2059,
+      "step": 7940
+    },
+    {
+      "epoch": 0.4612840523368824,
+      "grad_norm": 0.12497347593307495,
+      "learning_rate": 0.0006046627207662702,
+      "loss": 3.2043,
+      "step": 7950
+    },
+    {
+      "epoch": 0.4618642838492559,
+      "grad_norm": 0.12166955322027206,
+      "learning_rate": 0.0006037243433647757,
+      "loss": 3.2039,
+      "step": 7960
+    },
+    {
+      "epoch": 0.4624445153616293,
+      "grad_norm": 0.12836964428424835,
+      "learning_rate": 0.000602785584039383,
+      "loss": 3.1986,
+      "step": 7970
+    },
+    {
+      "epoch": 0.46302474687400275,
+      "grad_norm": 0.1306101679801941,
+      "learning_rate": 0.0006018464462467023,
+      "loss": 3.2028,
+      "step": 7980
+    },
+    {
+      "epoch": 0.46360497838637615,
+      "grad_norm": 0.13166449964046478,
+      "learning_rate": 0.0006009069334447374,
+      "loss": 3.2017,
+      "step": 7990
+    },
+    {
+      "epoch": 0.4641852098987496,
+      "grad_norm": 0.1289730966091156,
+      "learning_rate": 0.0005999670490928729,
+      "loss": 3.2051,
+      "step": 8000
+    },
+    {
+      "epoch": 0.4641852098987496,
+      "eval_loss": 3.13897442817688,
+      "eval_runtime": 3.2533,
+      "eval_samples_per_second": 1330.958,
+      "eval_steps_per_second": 10.451,
+      "step": 8000
+    },
+    {
+      "epoch": 0.464765441411123,
+      "grad_norm": 0.12794232368469238,
+      "learning_rate": 0.0005990267966518613,
+      "loss": 3.2052,
+      "step": 8010
+    },
+    {
+      "epoch": 0.4653456729234965,
+      "grad_norm": 0.13217690587043762,
+      "learning_rate": 0.0005980861795838108,
+      "loss": 3.2057,
+      "step": 8020
+    },
+    {
+      "epoch": 0.4659259044358699,
+      "grad_norm": 0.14063167572021484,
+      "learning_rate": 0.0005971452013521717,
+      "loss": 3.202,
+      "step": 8030
+    },
+    {
+      "epoch": 0.46650613594824336,
+      "grad_norm": 0.1315622478723526,
+      "learning_rate": 0.0005962038654217244,
+      "loss": 3.202,
+      "step": 8040
+    },
+    {
+      "epoch": 0.46708636746061677,
+      "grad_norm": 0.14890199899673462,
+      "learning_rate": 0.0005952621752585667,
+      "loss": 3.2069,
+      "step": 8050
+    },
+    {
+      "epoch": 0.46766659897299023,
+      "grad_norm": 0.13835932314395905,
+      "learning_rate": 0.0005943201343301005,
+      "loss": 3.2079,
+      "step": 8060
+    },
+    {
+      "epoch": 0.46824683048536364,
+      "grad_norm": 0.13147889077663422,
+      "learning_rate": 0.0005933777461050187,
+      "loss": 3.2082,
+      "step": 8070
+    },
+    {
+      "epoch": 0.4688270619977371,
+      "grad_norm": 0.12799794971942902,
+      "learning_rate": 0.0005924350140532939,
+      "loss": 3.1974,
+      "step": 8080
+    },
+    {
+      "epoch": 0.4694072935101105,
+      "grad_norm": 0.12934145331382751,
+      "learning_rate": 0.000591491941646164,
+      "loss": 3.2048,
+      "step": 8090
+    },
+    {
+      "epoch": 0.469987525022484,
+      "grad_norm": 0.13451933860778809,
+      "learning_rate": 0.0005905485323561207,
+      "loss": 3.1955,
+      "step": 8100
+    },
+    {
+      "epoch": 0.4705677565348574,
+      "grad_norm": 0.15659664571285248,
+      "learning_rate": 0.0005896047896568955,
+      "loss": 3.1993,
+      "step": 8110
+    },
+    {
+      "epoch": 0.47114798804723085,
+      "grad_norm": 0.14385788142681122,
+      "learning_rate": 0.0005886607170234482,
+      "loss": 3.2043,
+      "step": 8120
+    },
+    {
+      "epoch": 0.47172821955960426,
+      "grad_norm": 0.13023056089878082,
+      "learning_rate": 0.0005877163179319527,
+      "loss": 3.2048,
+      "step": 8130
+    },
+    {
+      "epoch": 0.4723084510719777,
+      "grad_norm": 0.1275002360343933,
+      "learning_rate": 0.0005867715958597859,
+      "loss": 3.2101,
+      "step": 8140
+    },
+    {
+      "epoch": 0.4728886825843512,
+      "grad_norm": 0.13934627175331116,
+      "learning_rate": 0.000585826554285513,
+      "loss": 3.204,
+      "step": 8150
+    },
+    {
+      "epoch": 0.4734689140967246,
+      "grad_norm": 0.1253582239151001,
+      "learning_rate": 0.0005848811966888763,
+      "loss": 3.2038,
+      "step": 8160
+    },
+    {
+      "epoch": 0.47404914560909805,
+      "grad_norm": 0.13219626247882843,
+      "learning_rate": 0.0005839355265507817,
+      "loss": 3.2011,
+      "step": 8170
+    },
+    {
+      "epoch": 0.47462937712147146,
+      "grad_norm": 0.13276910781860352,
+      "learning_rate": 0.0005829895473532852,
+      "loss": 3.2011,
+      "step": 8180
+    },
+    {
+      "epoch": 0.4752096086338449,
+      "grad_norm": 0.146236851811409,
+      "learning_rate": 0.0005820432625795819,
+      "loss": 3.1997,
+      "step": 8190
+    },
+    {
+      "epoch": 0.47578984014621833,
+      "grad_norm": 0.13150210678577423,
+      "learning_rate": 0.0005810966757139909,
+      "loss": 3.1945,
+      "step": 8200
+    },
+    {
+      "epoch": 0.4763700716585918,
+      "grad_norm": 0.14235766232013702,
+      "learning_rate": 0.0005801497902419444,
+      "loss": 3.2039,
+      "step": 8210
+    },
+    {
+      "epoch": 0.4769503031709652,
+      "grad_norm": 0.13625676929950714,
+      "learning_rate": 0.0005792026096499741,
+      "loss": 3.1921,
+      "step": 8220
+    },
+    {
+      "epoch": 0.47753053468333867,
+      "grad_norm": 0.12872271239757538,
+      "learning_rate": 0.0005782551374256981,
+      "loss": 3.1912,
+      "step": 8230
+    },
+    {
+      "epoch": 0.4781107661957121,
+      "grad_norm": 0.14330317080020905,
+      "learning_rate": 0.0005773073770578081,
+      "loss": 3.1958,
+      "step": 8240
+    },
+    {
+      "epoch": 0.47869099770808554,
+      "grad_norm": 0.128121480345726,
+      "learning_rate": 0.0005763593320360575,
+      "loss": 3.1934,
+      "step": 8250
+    },
+    {
+      "epoch": 0.47927122922045895,
+      "grad_norm": 0.13301797211170197,
+      "learning_rate": 0.000575411005851247,
+      "loss": 3.1976,
+      "step": 8260
+    },
+    {
+      "epoch": 0.4798514607328324,
+      "grad_norm": 0.12738023698329926,
+      "learning_rate": 0.0005744624019952131,
+      "loss": 3.1995,
+      "step": 8270
+    },
+    {
+      "epoch": 0.4804316922452058,
+      "grad_norm": 0.13468343019485474,
+      "learning_rate": 0.0005735135239608146,
+      "loss": 3.2016,
+      "step": 8280
+    },
+    {
+      "epoch": 0.4810119237575793,
+      "grad_norm": 0.14049942791461945,
+      "learning_rate": 0.0005725643752419198,
+      "loss": 3.2005,
+      "step": 8290
+    },
+    {
+      "epoch": 0.4815921552699527,
+      "grad_norm": 0.12929829955101013,
+      "learning_rate": 0.0005716149593333938,
+      "loss": 3.2072,
+      "step": 8300
+    },
+    {
+      "epoch": 0.48217238678232616,
+      "grad_norm": 0.1476507931947708,
+      "learning_rate": 0.0005706652797310851,
+      "loss": 3.2013,
+      "step": 8310
+    },
+    {
+      "epoch": 0.48275261829469956,
+      "grad_norm": 0.15121173858642578,
+      "learning_rate": 0.000569715339931814,
+      "loss": 3.1976,
+      "step": 8320
+    },
+    {
+      "epoch": 0.48333284980707303,
+      "grad_norm": 0.12322133034467697,
+      "learning_rate": 0.000568765143433358,
+      "loss": 3.1905,
+      "step": 8330
+    },
+    {
+      "epoch": 0.48391308131944644,
+      "grad_norm": 0.1308000385761261,
+      "learning_rate": 0.0005678146937344402,
+      "loss": 3.1951,
+      "step": 8340
+    },
+    {
+      "epoch": 0.4844933128318199,
+      "grad_norm": 0.14018505811691284,
+      "learning_rate": 0.000566863994334716,
+      "loss": 3.1983,
+      "step": 8350
+    },
+    {
+      "epoch": 0.4850735443441933,
+      "grad_norm": 0.12124442309141159,
+      "learning_rate": 0.0005659130487347602,
+      "loss": 3.1969,
+      "step": 8360
+    },
+    {
+      "epoch": 0.48565377585656677,
+      "grad_norm": 0.13091051578521729,
+      "learning_rate": 0.000564961860436054,
+      "loss": 3.1932,
+      "step": 8370
+    },
+    {
+      "epoch": 0.4862340073689402,
+      "grad_norm": 0.12319710105657578,
+      "learning_rate": 0.0005640104329409727,
+      "loss": 3.1944,
+      "step": 8380
+    },
+    {
+      "epoch": 0.48681423888131364,
+      "grad_norm": 0.12845876812934875,
+      "learning_rate": 0.0005630587697527716,
+      "loss": 3.1929,
+      "step": 8390
+    },
+    {
+      "epoch": 0.4873944703936871,
+      "grad_norm": 0.14527294039726257,
+      "learning_rate": 0.0005621068743755743,
+      "loss": 3.1932,
+      "step": 8400
+    },
+    {
+      "epoch": 0.4879747019060605,
+      "grad_norm": 0.1430954933166504,
+      "learning_rate": 0.0005611547503143595,
+      "loss": 3.1963,
+      "step": 8410
+    },
+    {
+      "epoch": 0.488554933418434,
+      "grad_norm": 0.12142278254032135,
+      "learning_rate": 0.0005602024010749475,
+      "loss": 3.1912,
+      "step": 8420
+    },
+    {
+      "epoch": 0.4891351649308074,
+      "grad_norm": 0.12531523406505585,
+      "learning_rate": 0.0005592498301639884,
+      "loss": 3.1936,
+      "step": 8430
+    },
+    {
+      "epoch": 0.48971539644318085,
+      "grad_norm": 0.11823923885822296,
+      "learning_rate": 0.0005582970410889476,
+      "loss": 3.2031,
+      "step": 8440
+    },
+    {
+      "epoch": 0.49029562795555426,
+      "grad_norm": 0.1265026479959488,
+      "learning_rate": 0.0005573440373580946,
+      "loss": 3.1863,
+      "step": 8450
+    },
+    {
+      "epoch": 0.4908758594679277,
+      "grad_norm": 0.1225002259016037,
+      "learning_rate": 0.0005563908224804887,
+      "loss": 3.1978,
+      "step": 8460
+    },
+    {
+      "epoch": 0.49145609098030113,
+      "grad_norm": 0.13115541636943817,
+      "learning_rate": 0.000555437399965967,
+      "loss": 3.1945,
+      "step": 8470
+    },
+    {
+      "epoch": 0.4920363224926746,
+      "grad_norm": 0.11067093908786774,
+      "learning_rate": 0.0005544837733251313,
+      "loss": 3.195,
+      "step": 8480
+    },
+    {
+      "epoch": 0.492616554005048,
+      "grad_norm": 0.13542482256889343,
+      "learning_rate": 0.0005535299460693346,
+      "loss": 3.1976,
+      "step": 8490
+    },
+    {
+      "epoch": 0.49319678551742147,
+      "grad_norm": 0.12914744019508362,
+      "learning_rate": 0.000552575921710669,
+      "loss": 3.1817,
+      "step": 8500
+    },
+    {
+      "epoch": 0.4937770170297949,
+      "grad_norm": 0.14814750850200653,
+      "learning_rate": 0.0005516217037619517,
+      "loss": 3.1952,
+      "step": 8510
+    },
+    {
+      "epoch": 0.49435724854216834,
+      "grad_norm": 0.13739536702632904,
+      "learning_rate": 0.0005506672957367135,
+      "loss": 3.1946,
+      "step": 8520
+    },
+    {
+      "epoch": 0.49493748005454175,
+      "grad_norm": 0.12713497877120972,
+      "learning_rate": 0.0005497127011491846,
+      "loss": 3.193,
+      "step": 8530
+    },
+    {
+      "epoch": 0.4955177115669152,
+      "grad_norm": 0.13317294418811798,
+      "learning_rate": 0.0005487579235142823,
+      "loss": 3.1951,
+      "step": 8540
+    },
+    {
+      "epoch": 0.4960979430792886,
+      "grad_norm": 0.13274219632148743,
+      "learning_rate": 0.000547802966347598,
+      "loss": 3.1799,
+      "step": 8550
+    },
+    {
+      "epoch": 0.4966781745916621,
+      "grad_norm": 0.137456014752388,
+      "learning_rate": 0.0005468478331653838,
+      "loss": 3.1907,
+      "step": 8560
+    },
+    {
+      "epoch": 0.4972584061040355,
+      "grad_norm": 0.12658333778381348,
+      "learning_rate": 0.0005458925274845402,
+      "loss": 3.1906,
+      "step": 8570
+    },
+    {
+      "epoch": 0.49783863761640895,
+      "grad_norm": 0.15250617265701294,
+      "learning_rate": 0.000544937052822603,
+      "loss": 3.1905,
+      "step": 8580
+    },
+    {
+      "epoch": 0.49841886912878236,
+      "grad_norm": 0.12137165665626526,
+      "learning_rate": 0.0005439814126977296,
+      "loss": 3.195,
+      "step": 8590
+    },
+    {
+      "epoch": 0.4989991006411558,
+      "grad_norm": 0.12580706179141998,
+      "learning_rate": 0.0005430256106286874,
+      "loss": 3.1851,
+      "step": 8600
+    },
+    {
+      "epoch": 0.49957933215352923,
+      "grad_norm": 0.1316945105791092,
+      "learning_rate": 0.0005420696501348397,
+      "loss": 3.1827,
+      "step": 8610
+    },
+    {
+      "epoch": 0.5001595636659026,
+      "grad_norm": 0.12646295130252838,
+      "learning_rate": 0.0005411135347361329,
+      "loss": 3.1911,
+      "step": 8620
+    },
+    {
+      "epoch": 0.5007397951782762,
+      "grad_norm": 0.1217370554804802,
+      "learning_rate": 0.0005401572679530844,
+      "loss": 3.1963,
+      "step": 8630
+    },
+    {
+      "epoch": 0.5013200266906496,
+      "grad_norm": 0.13218845427036285,
+      "learning_rate": 0.0005392008533067684,
+      "loss": 3.1959,
+      "step": 8640
+    },
+    {
+      "epoch": 0.501900258203023,
+      "grad_norm": 0.13100461661815643,
+      "learning_rate": 0.000538244294318804,
+      "loss": 3.1984,
+      "step": 8650
+    },
+    {
+      "epoch": 0.5024804897153965,
+      "grad_norm": 0.12846529483795166,
+      "learning_rate": 0.0005372875945113417,
+      "loss": 3.1873,
+      "step": 8660
+    },
+    {
+      "epoch": 0.5030607212277699,
+      "grad_norm": 0.14111390709877014,
+      "learning_rate": 0.0005363307574070503,
+      "loss": 3.1974,
+      "step": 8670
+    },
+    {
+      "epoch": 0.5036409527401433,
+      "grad_norm": 0.1287715584039688,
+      "learning_rate": 0.0005353737865291039,
+      "loss": 3.1913,
+      "step": 8680
+    },
+    {
+      "epoch": 0.5042211842525167,
+      "grad_norm": 0.1581788808107376,
+      "learning_rate": 0.0005344166854011702,
+      "loss": 3.1833,
+      "step": 8690
+    },
+    {
+      "epoch": 0.5048014157648902,
+      "grad_norm": 0.1236489862203598,
+      "learning_rate": 0.0005334594575473952,
+      "loss": 3.1933,
+      "step": 8700
+    },
+    {
+      "epoch": 0.5053816472772636,
+      "grad_norm": 0.12864017486572266,
+      "learning_rate": 0.0005325021064923924,
+      "loss": 3.1913,
+      "step": 8710
+    },
+    {
+      "epoch": 0.505961878789637,
+      "grad_norm": 0.12411046773195267,
+      "learning_rate": 0.0005315446357612288,
+      "loss": 3.1871,
+      "step": 8720
+    },
+    {
+      "epoch": 0.5065421103020105,
+      "grad_norm": 0.12000168114900589,
+      "learning_rate": 0.0005305870488794117,
+      "loss": 3.1815,
+      "step": 8730
+    },
+    {
+      "epoch": 0.507122341814384,
+      "grad_norm": 0.1342875063419342,
+      "learning_rate": 0.0005296293493728764,
+      "loss": 3.187,
+      "step": 8740
+    },
+    {
+      "epoch": 0.5077025733267574,
+      "grad_norm": 0.12101846933364868,
+      "learning_rate": 0.0005286715407679729,
+      "loss": 3.1871,
+      "step": 8750
+    },
+    {
+      "epoch": 0.5082828048391308,
+      "grad_norm": 0.13273586332798004,
+      "learning_rate": 0.0005277136265914528,
+      "loss": 3.193,
+      "step": 8760
+    },
+    {
+      "epoch": 0.5088630363515042,
+      "grad_norm": 0.12270906567573547,
+      "learning_rate": 0.0005267556103704562,
+      "loss": 3.178,
+      "step": 8770
+    },
+    {
+      "epoch": 0.5094432678638777,
+      "grad_norm": 0.1259816586971283,
+      "learning_rate": 0.0005257974956324994,
+      "loss": 3.187,
+      "step": 8780
+    },
+    {
+      "epoch": 0.5100234993762511,
+      "grad_norm": 0.12720678746700287,
+      "learning_rate": 0.0005248392859054612,
+      "loss": 3.1837,
+      "step": 8790
+    },
+    {
+      "epoch": 0.5106037308886245,
+      "grad_norm": 0.13317066431045532,
+      "learning_rate": 0.0005238809847175704,
+      "loss": 3.1873,
+      "step": 8800
+    },
+    {
+      "epoch": 0.511183962400998,
+      "grad_norm": 0.12688656151294708,
+      "learning_rate": 0.000522922595597392,
+      "loss": 3.1829,
+      "step": 8810
+    },
+    {
+      "epoch": 0.5117641939133715,
+      "grad_norm": 0.1185089647769928,
+      "learning_rate": 0.0005219641220738154,
+      "loss": 3.1864,
+      "step": 8820
+    },
+    {
+      "epoch": 0.5123444254257449,
+      "grad_norm": 0.12334717810153961,
+      "learning_rate": 0.0005210055676760403,
+      "loss": 3.1924,
+      "step": 8830
+    },
+    {
+      "epoch": 0.5129246569381183,
+      "grad_norm": 0.130776509642601,
+      "learning_rate": 0.0005200469359335645,
+      "loss": 3.1864,
+      "step": 8840
+    },
+    {
+      "epoch": 0.5135048884504917,
+      "grad_norm": 0.1395205855369568,
+      "learning_rate": 0.0005190882303761707,
+      "loss": 3.1894,
+      "step": 8850
+    },
+    {
+      "epoch": 0.5140851199628652,
+      "grad_norm": 0.13597504794597626,
+      "learning_rate": 0.000518129454533913,
+      "loss": 3.197,
+      "step": 8860
+    },
+    {
+      "epoch": 0.5146653514752386,
+      "grad_norm": 0.1248147115111351,
+      "learning_rate": 0.0005171706119371045,
+      "loss": 3.1865,
+      "step": 8870
+    },
+    {
+      "epoch": 0.515245582987612,
+      "grad_norm": 0.12766875326633453,
+      "learning_rate": 0.0005162117061163039,
+      "loss": 3.1843,
+      "step": 8880
+    },
+    {
+      "epoch": 0.5158258144999855,
+      "grad_norm": 0.125563845038414,
+      "learning_rate": 0.0005152527406023033,
+      "loss": 3.1827,
+      "step": 8890
+    },
+    {
+      "epoch": 0.516406046012359,
+      "grad_norm": 0.12935802340507507,
+      "learning_rate": 0.0005142937189261138,
+      "loss": 3.1825,
+      "step": 8900
+    },
+    {
+      "epoch": 0.5169862775247324,
+      "grad_norm": 0.12293805927038193,
+      "learning_rate": 0.0005133346446189541,
+      "loss": 3.1909,
+      "step": 8910
+    },
+    {
+      "epoch": 0.5175665090371058,
+      "grad_norm": 0.12657864391803741,
+      "learning_rate": 0.0005123755212122359,
+      "loss": 3.172,
+      "step": 8920
+    },
+    {
+      "epoch": 0.5181467405494793,
+      "grad_norm": 0.12287136912345886,
+      "learning_rate": 0.0005114163522375522,
+      "loss": 3.1968,
+      "step": 8930
+    },
+    {
+      "epoch": 0.5187269720618527,
+      "grad_norm": 0.13364015519618988,
+      "learning_rate": 0.0005104571412266636,
+      "loss": 3.1799,
+      "step": 8940
+    },
+    {
+      "epoch": 0.5193072035742261,
+      "grad_norm": 0.14035052061080933,
+      "learning_rate": 0.0005094978917114853,
+      "loss": 3.1776,
+      "step": 8950
+    },
+    {
+      "epoch": 0.5198874350865995,
+      "grad_norm": 0.12295843660831451,
+      "learning_rate": 0.000508538607224075,
+      "loss": 3.1805,
+      "step": 8960
+    },
+    {
+      "epoch": 0.520467666598973,
+      "grad_norm": 0.11810554563999176,
+      "learning_rate": 0.0005075792912966184,
+      "loss": 3.1785,
+      "step": 8970
+    },
+    {
+      "epoch": 0.5210478981113464,
+      "grad_norm": 0.14744389057159424,
+      "learning_rate": 0.0005066199474614173,
+      "loss": 3.1906,
+      "step": 8980
+    },
+    {
+      "epoch": 0.5216281296237198,
+      "grad_norm": 0.13044193387031555,
+      "learning_rate": 0.000505660579250876,
+      "loss": 3.1766,
+      "step": 8990
+    },
+    {
+      "epoch": 0.5222083611360933,
+      "grad_norm": 0.1182679831981659,
+      "learning_rate": 0.000504701190197489,
+      "loss": 3.1816,
+      "step": 9000
+    },
+    {
+      "epoch": 0.5222083611360933,
+      "eval_loss": 3.1160783767700195,
+      "eval_runtime": 3.2455,
+      "eval_samples_per_second": 1334.151,
+      "eval_steps_per_second": 10.476,
+      "step": 9000
+    },
+    {
+      "epoch": 0.5227885926484668,
+      "grad_norm": 0.13336721062660217,
+      "learning_rate": 0.0005037417838338272,
+      "loss": 3.1825,
+      "step": 9010
+    },
+    {
+      "epoch": 0.5233688241608402,
+      "grad_norm": 0.11967909336090088,
+      "learning_rate": 0.0005027823636925254,
+      "loss": 3.1839,
+      "step": 9020
+    },
+    {
+      "epoch": 0.5239490556732136,
+      "grad_norm": 0.1373438537120819,
+      "learning_rate": 0.0005018229333062689,
+      "loss": 3.1859,
+      "step": 9030
+    },
+    {
+      "epoch": 0.524529287185587,
+      "grad_norm": 0.12351592630147934,
+      "learning_rate": 0.0005008634962077811,
+      "loss": 3.1889,
+      "step": 9040
+    },
+    {
+      "epoch": 0.5251095186979605,
+      "grad_norm": 0.13033995032310486,
+      "learning_rate": 0.0004999040559298097,
+      "loss": 3.1879,
+      "step": 9050
+    },
+    {
+      "epoch": 0.5256897502103339,
+      "grad_norm": 0.11891571432352066,
+      "learning_rate": 0.0004989446160051145,
+      "loss": 3.1905,
+      "step": 9060
+    },
+    {
+      "epoch": 0.5262699817227073,
+      "grad_norm": 0.12579171359539032,
+      "learning_rate": 0.0004979851799664539,
+      "loss": 3.1708,
+      "step": 9070
+    },
+    {
+      "epoch": 0.5268502132350807,
+      "grad_norm": 0.12359123677015305,
+      "learning_rate": 0.0004970257513465714,
+      "loss": 3.1824,
+      "step": 9080
+    },
+    {
+      "epoch": 0.5274304447474543,
+      "grad_norm": 0.1159052848815918,
+      "learning_rate": 0.0004960663336781842,
+      "loss": 3.18,
+      "step": 9090
+    },
+    {
+      "epoch": 0.5280106762598277,
+      "grad_norm": 0.11939360946416855,
+      "learning_rate": 0.0004951069304939684,
+      "loss": 3.1806,
+      "step": 9100
+    },
+    {
+      "epoch": 0.5285909077722011,
+      "grad_norm": 0.12974359095096588,
+      "learning_rate": 0.0004941475453265471,
+      "loss": 3.1774,
+      "step": 9110
+    },
+    {
+      "epoch": 0.5291711392845746,
+      "grad_norm": 0.13163182139396667,
+      "learning_rate": 0.0004931881817084771,
+      "loss": 3.1888,
+      "step": 9120
+    },
+    {
+      "epoch": 0.529751370796948,
+      "grad_norm": 0.1382340043783188,
+      "learning_rate": 0.0004922288431722355,
+      "loss": 3.1814,
+      "step": 9130
+    },
+    {
+      "epoch": 0.5303316023093214,
+      "grad_norm": 0.12294802814722061,
+      "learning_rate": 0.0004912695332502076,
+      "loss": 3.1793,
+      "step": 9140
+    },
+    {
+      "epoch": 0.5309118338216948,
+      "grad_norm": 0.13962669670581818,
+      "learning_rate": 0.0004903102554746727,
+      "loss": 3.1819,
+      "step": 9150
+    },
+    {
+      "epoch": 0.5314920653340683,
+      "grad_norm": 0.12373016029596329,
+      "learning_rate": 0.0004893510133777922,
+      "loss": 3.1747,
+      "step": 9160
+    },
+    {
+      "epoch": 0.5320722968464418,
+      "grad_norm": 0.12787871062755585,
+      "learning_rate": 0.0004883918104915962,
+      "loss": 3.1756,
+      "step": 9170
+    },
+    {
+      "epoch": 0.5326525283588152,
+      "grad_norm": 0.12098101526498795,
+      "learning_rate": 0.0004874326503479698,
+      "loss": 3.1826,
+      "step": 9180
+    },
+    {
+      "epoch": 0.5332327598711886,
+      "grad_norm": 0.13320770859718323,
+      "learning_rate": 0.0004864735364786415,
+      "loss": 3.1798,
+      "step": 9190
+    },
+    {
+      "epoch": 0.5338129913835621,
+      "grad_norm": 0.13723470270633698,
+      "learning_rate": 0.00048551447241516866,
+      "loss": 3.1811,
+      "step": 9200
+    },
+    {
+      "epoch": 0.5343932228959355,
+      "grad_norm": 0.12632976472377777,
+      "learning_rate": 0.00048455546168892614,
+      "loss": 3.1935,
+      "step": 9210
+    },
+    {
+      "epoch": 0.5349734544083089,
+      "grad_norm": 0.12484107166528702,
+      "learning_rate": 0.00048359650783109145,
+      "loss": 3.1719,
+      "step": 9220
+    },
+    {
+      "epoch": 0.5355536859206823,
+      "grad_norm": 0.13436180353164673,
+      "learning_rate": 0.0004826376143726332,
+      "loss": 3.1862,
+      "step": 9230
+    },
+    {
+      "epoch": 0.5361339174330558,
+      "grad_norm": 0.13016556203365326,
+      "learning_rate": 0.00048167878484429793,
+      "loss": 3.1812,
+      "step": 9240
+    },
+    {
+      "epoch": 0.5367141489454292,
+      "grad_norm": 0.12285098433494568,
+      "learning_rate": 0.00048072002277659595,
+      "loss": 3.1799,
+      "step": 9250
+    },
+    {
+      "epoch": 0.5372943804578026,
+      "grad_norm": 0.12092869728803635,
+      "learning_rate": 0.0004797613316997899,
+      "loss": 3.178,
+      "step": 9260
+    },
+    {
+      "epoch": 0.537874611970176,
+      "grad_norm": 0.11524718254804611,
+      "learning_rate": 0.0004788027151438806,
+      "loss": 3.1737,
+      "step": 9270
+    },
+    {
+      "epoch": 0.5384548434825496,
+      "grad_norm": 0.12745259702205658,
+      "learning_rate": 0.0004778441766385947,
+      "loss": 3.1746,
+      "step": 9280
+    },
+    {
+      "epoch": 0.539035074994923,
+      "grad_norm": 0.12326768040657043,
+      "learning_rate": 0.00047688571971337155,
+      "loss": 3.1752,
+      "step": 9290
+    },
+    {
+      "epoch": 0.5396153065072964,
+      "grad_norm": 0.11677366495132446,
+      "learning_rate": 0.00047592734789734967,
+      "loss": 3.1702,
+      "step": 9300
+    },
+    {
+      "epoch": 0.5401955380196698,
+      "grad_norm": 0.1259879618883133,
+      "learning_rate": 0.0004749690647193547,
+      "loss": 3.174,
+      "step": 9310
+    },
+    {
+      "epoch": 0.5407757695320433,
+      "grad_norm": 0.1180575042963028,
+      "learning_rate": 0.00047401087370788547,
+      "loss": 3.1738,
+      "step": 9320
+    },
+    {
+      "epoch": 0.5413560010444167,
+      "grad_norm": 0.12572011351585388,
+      "learning_rate": 0.00047305277839110207,
+      "loss": 3.1795,
+      "step": 9330
+    },
+    {
+      "epoch": 0.5419362325567901,
+      "grad_norm": 0.12240619957447052,
+      "learning_rate": 0.0004720947822968113,
+      "loss": 3.1814,
+      "step": 9340
+    },
+    {
+      "epoch": 0.5425164640691637,
+      "grad_norm": 0.12223079055547714,
+      "learning_rate": 0.00047113688895245536,
+      "loss": 3.1693,
+      "step": 9350
+    },
+    {
+      "epoch": 0.5430966955815371,
+      "grad_norm": 0.11417195945978165,
+      "learning_rate": 0.00047017910188509805,
+      "loss": 3.1765,
+      "step": 9360
+    },
+    {
+      "epoch": 0.5436769270939105,
+      "grad_norm": 0.1286236047744751,
+      "learning_rate": 0.00046922142462141146,
+      "loss": 3.1799,
+      "step": 9370
+    },
+    {
+      "epoch": 0.5442571586062839,
+      "grad_norm": 0.12266254425048828,
+      "learning_rate": 0.0004682638606876639,
+      "loss": 3.1679,
+      "step": 9380
+    },
+    {
+      "epoch": 0.5448373901186574,
+      "grad_norm": 0.12598057091236115,
+      "learning_rate": 0.00046730641360970564,
+      "loss": 3.1589,
+      "step": 9390
+    },
+    {
+      "epoch": 0.5454176216310308,
+      "grad_norm": 0.11109986901283264,
+      "learning_rate": 0.0004663490869129574,
+      "loss": 3.1772,
+      "step": 9400
+    },
+    {
+      "epoch": 0.5459978531434042,
+      "grad_norm": 0.12376119196414948,
+      "learning_rate": 0.0004653918841223964,
+      "loss": 3.1748,
+      "step": 9410
+    },
+    {
+      "epoch": 0.5465780846557776,
+      "grad_norm": 0.14834947884082794,
+      "learning_rate": 0.0004644348087625434,
+      "loss": 3.1799,
+      "step": 9420
+    },
+    {
+      "epoch": 0.5471583161681511,
+      "grad_norm": 0.12348010390996933,
+      "learning_rate": 0.00046347786435745053,
+      "loss": 3.1679,
+      "step": 9430
+    },
+    {
+      "epoch": 0.5477385476805245,
+      "grad_norm": 0.12390238046646118,
+      "learning_rate": 0.00046252105443068676,
+      "loss": 3.1809,
+      "step": 9440
+    },
+    {
+      "epoch": 0.548318779192898,
+      "grad_norm": 0.12994909286499023,
+      "learning_rate": 0.0004615643825053269,
+      "loss": 3.1774,
+      "step": 9450
+    },
+    {
+      "epoch": 0.5488990107052714,
+      "grad_norm": 0.13186348974704742,
+      "learning_rate": 0.000460607852103937,
+      "loss": 3.1627,
+      "step": 9460
+    },
+    {
+      "epoch": 0.5494792422176449,
+      "grad_norm": 0.11778120696544647,
+      "learning_rate": 0.00045965146674856216,
+      "loss": 3.1642,
+      "step": 9470
+    },
+    {
+      "epoch": 0.5500594737300183,
+      "grad_norm": 0.13027390837669373,
+      "learning_rate": 0.0004586952299607139,
+      "loss": 3.1745,
+      "step": 9480
+    },
+    {
+      "epoch": 0.5506397052423917,
+      "grad_norm": 0.13938818871974945,
+      "learning_rate": 0.00045773914526135555,
+      "loss": 3.177,
+      "step": 9490
+    },
+    {
+      "epoch": 0.5512199367547651,
+      "grad_norm": 0.13590595126152039,
+      "learning_rate": 0.0004567832161708918,
+      "loss": 3.1794,
+      "step": 9500
+    },
+    {
+      "epoch": 0.5518001682671386,
+      "grad_norm": 0.13180691003799438,
+      "learning_rate": 0.00045582744620915313,
+      "loss": 3.1752,
+      "step": 9510
+    },
+    {
+      "epoch": 0.552380399779512,
+      "grad_norm": 0.13000676035881042,
+      "learning_rate": 0.0004548718388953849,
+      "loss": 3.1737,
+      "step": 9520
+    },
+    {
+      "epoch": 0.5529606312918854,
+      "grad_norm": 0.1294243186712265,
+      "learning_rate": 0.00045391639774823345,
+      "loss": 3.1729,
+      "step": 9530
+    },
+    {
+      "epoch": 0.5535408628042588,
+      "grad_norm": 0.12174411118030548,
+      "learning_rate": 0.000452961126285733,
+      "loss": 3.173,
+      "step": 9540
+    },
+    {
+      "epoch": 0.5541210943166324,
+      "grad_norm": 0.11989938467741013,
+      "learning_rate": 0.0004520060280252934,
+      "loss": 3.172,
+      "step": 9550
+    },
+    {
+      "epoch": 0.5547013258290058,
+      "grad_norm": 0.12082493305206299,
+      "learning_rate": 0.0004510511064836862,
+      "loss": 3.1676,
+      "step": 9560
+    },
+    {
+      "epoch": 0.5552815573413792,
+      "grad_norm": 0.12731657922267914,
+      "learning_rate": 0.00045009636517703275,
+      "loss": 3.1816,
+      "step": 9570
+    },
+    {
+      "epoch": 0.5558617888537526,
+      "grad_norm": 0.11837522685527802,
+      "learning_rate": 0.0004491418076207903,
+      "loss": 3.1749,
+      "step": 9580
+    },
+    {
+      "epoch": 0.5564420203661261,
+      "grad_norm": 0.11386506259441376,
+      "learning_rate": 0.00044818743732974003,
+      "loss": 3.1577,
+      "step": 9590
+    },
+    {
+      "epoch": 0.5570222518784995,
+      "grad_norm": 0.12620550394058228,
+      "learning_rate": 0.00044723325781797346,
+      "loss": 3.1755,
+      "step": 9600
+    },
+    {
+      "epoch": 0.5576024833908729,
+      "grad_norm": 0.11217296868562698,
+      "learning_rate": 0.0004462792725988791,
+      "loss": 3.1599,
+      "step": 9610
+    },
+    {
+      "epoch": 0.5581827149032464,
+      "grad_norm": 0.12458086013793945,
+      "learning_rate": 0.0004453254851851308,
+      "loss": 3.1749,
+      "step": 9620
+    },
+    {
+      "epoch": 0.5587629464156199,
+      "grad_norm": 0.1312059760093689,
+      "learning_rate": 0.0004443718990886734,
+      "loss": 3.1693,
+      "step": 9630
+    },
+    {
+      "epoch": 0.5593431779279933,
+      "grad_norm": 0.128337562084198,
+      "learning_rate": 0.00044341851782071106,
+      "loss": 3.1755,
+      "step": 9640
+    },
+    {
+      "epoch": 0.5599234094403667,
+      "grad_norm": 0.11443614959716797,
+      "learning_rate": 0.00044246534489169367,
+      "loss": 3.1716,
+      "step": 9650
+    },
+    {
+      "epoch": 0.5605036409527402,
+      "grad_norm": 0.12734359502792358,
+      "learning_rate": 0.00044151238381130324,
+      "loss": 3.1717,
+      "step": 9660
+    },
+    {
+      "epoch": 0.5610838724651136,
+      "grad_norm": 0.1212453842163086,
+      "learning_rate": 0.0004405596380884428,
+      "loss": 3.1642,
+      "step": 9670
+    },
+    {
+      "epoch": 0.561664103977487,
+      "grad_norm": 0.1245492622256279,
+      "learning_rate": 0.0004396071112312216,
+      "loss": 3.175,
+      "step": 9680
+    },
+    {
+      "epoch": 0.5622443354898604,
+      "grad_norm": 0.11448545008897781,
+      "learning_rate": 0.0004386548067469437,
+      "loss": 3.1716,
+      "step": 9690
+    },
+    {
+      "epoch": 0.5628245670022339,
+      "grad_norm": 0.12811312079429626,
+      "learning_rate": 0.00043770272814209343,
+      "loss": 3.1614,
+      "step": 9700
+    },
+    {
+      "epoch": 0.5634047985146073,
+      "grad_norm": 0.11437219381332397,
+      "learning_rate": 0.0004367508789223243,
+      "loss": 3.1724,
+      "step": 9710
+    },
+    {
+      "epoch": 0.5639850300269807,
+      "grad_norm": 0.11365852504968643,
+      "learning_rate": 0.00043579926259244487,
+      "loss": 3.1707,
+      "step": 9720
+    },
+    {
+      "epoch": 0.5645652615393542,
+      "grad_norm": 0.12131789326667786,
+      "learning_rate": 0.0004348478826564059,
+      "loss": 3.1694,
+      "step": 9730
+    },
+    {
+      "epoch": 0.5651454930517277,
+      "grad_norm": 0.11996260285377502,
+      "learning_rate": 0.0004338967426172884,
+      "loss": 3.1579,
+      "step": 9740
+    },
+    {
+      "epoch": 0.5657257245641011,
+      "grad_norm": 0.12249016016721725,
+      "learning_rate": 0.00043294584597728915,
+      "loss": 3.1685,
+      "step": 9750
+    },
+    {
+      "epoch": 0.5663059560764745,
+      "grad_norm": 0.12243705987930298,
+      "learning_rate": 0.0004319951962377094,
+      "loss": 3.1719,
+      "step": 9760
+    },
+    {
+      "epoch": 0.5668861875888479,
+      "grad_norm": 0.12344249337911606,
+      "learning_rate": 0.00043104479689894137,
+      "loss": 3.1779,
+      "step": 9770
+    },
+    {
+      "epoch": 0.5674664191012214,
+      "grad_norm": 0.11184128373861313,
+      "learning_rate": 0.00043009465146045444,
+      "loss": 3.1705,
+      "step": 9780
+    },
+    {
+      "epoch": 0.5680466506135948,
+      "grad_norm": 0.12422725558280945,
+      "learning_rate": 0.0004291447634207841,
+      "loss": 3.1702,
+      "step": 9790
+    },
+    {
+      "epoch": 0.5686268821259682,
+      "grad_norm": 0.13139618933200836,
+      "learning_rate": 0.0004281951362775173,
+      "loss": 3.1658,
+      "step": 9800
+    },
+    {
+      "epoch": 0.5692071136383416,
+      "grad_norm": 0.14261464774608612,
+      "learning_rate": 0.000427245773527281,
+      "loss": 3.165,
+      "step": 9810
+    },
+    {
+      "epoch": 0.5697873451507152,
+      "grad_norm": 0.11359596252441406,
+      "learning_rate": 0.0004262966786657279,
+      "loss": 3.1698,
+      "step": 9820
+    },
+    {
+      "epoch": 0.5703675766630886,
+      "grad_norm": 0.13556736707687378,
+      "learning_rate": 0.0004253478551875249,
+      "loss": 3.168,
+      "step": 9830
+    },
+    {
+      "epoch": 0.570947808175462,
+      "grad_norm": 0.12147964537143707,
+      "learning_rate": 0.00042439930658633965,
+      "loss": 3.1672,
+      "step": 9840
+    },
+    {
+      "epoch": 0.5715280396878355,
+      "grad_norm": 0.12102476507425308,
+      "learning_rate": 0.00042345103635482706,
+      "loss": 3.1628,
+      "step": 9850
+    },
+    {
+      "epoch": 0.5721082712002089,
+      "grad_norm": 0.11904580146074295,
+      "learning_rate": 0.0004225030479846179,
+      "loss": 3.1644,
+      "step": 9860
+    },
+    {
+      "epoch": 0.5726885027125823,
+      "grad_norm": 0.1157933697104454,
+      "learning_rate": 0.00042155534496630427,
+      "loss": 3.1663,
+      "step": 9870
+    },
+    {
+      "epoch": 0.5732687342249557,
+      "grad_norm": 0.12185543030500412,
+      "learning_rate": 0.00042060793078942804,
+      "loss": 3.1785,
+      "step": 9880
+    },
+    {
+      "epoch": 0.5738489657373292,
+      "grad_norm": 0.1252318024635315,
+      "learning_rate": 0.00041966080894246773,
+      "loss": 3.159,
+      "step": 9890
+    },
+    {
+      "epoch": 0.5744291972497026,
+      "grad_norm": 0.12326642870903015,
+      "learning_rate": 0.00041871398291282484,
+      "loss": 3.1576,
+      "step": 9900
+    },
+    {
+      "epoch": 0.5750094287620761,
+      "grad_norm": 0.1266362965106964,
+      "learning_rate": 0.0004177674561868123,
+      "loss": 3.16,
+      "step": 9910
+    },
+    {
+      "epoch": 0.5755896602744495,
+      "grad_norm": 0.1305086612701416,
+      "learning_rate": 0.00041682123224964047,
+      "loss": 3.1697,
+      "step": 9920
+    },
+    {
+      "epoch": 0.576169891786823,
+      "grad_norm": 0.12299249321222305,
+      "learning_rate": 0.0004158753145854051,
+      "loss": 3.1663,
+      "step": 9930
+    },
+    {
+      "epoch": 0.5767501232991964,
+      "grad_norm": 0.11096496134996414,
+      "learning_rate": 0.00041492970667707403,
+      "loss": 3.1663,
+      "step": 9940
+    },
+    {
+      "epoch": 0.5773303548115698,
+      "grad_norm": 0.10970742255449295,
+      "learning_rate": 0.00041398441200647467,
+      "loss": 3.1617,
+      "step": 9950
+    },
+    {
+      "epoch": 0.5779105863239432,
+      "grad_norm": 0.12066974490880966,
+      "learning_rate": 0.0004130394340542813,
+      "loss": 3.1656,
+      "step": 9960
+    },
+    {
+      "epoch": 0.5784908178363167,
+      "grad_norm": 0.10806959867477417,
+      "learning_rate": 0.0004120947763000012,
+      "loss": 3.1649,
+      "step": 9970
+    },
+    {
+      "epoch": 0.5790710493486901,
+      "grad_norm": 0.11969128251075745,
+      "learning_rate": 0.0004111504422219637,
+      "loss": 3.1675,
+      "step": 9980
+    },
+    {
+      "epoch": 0.5796512808610635,
+      "grad_norm": 0.11461341381072998,
+      "learning_rate": 0.0004102064352973054,
+      "loss": 3.1631,
+      "step": 9990
+    },
+    {
+      "epoch": 0.580231512373437,
+      "grad_norm": 0.1177605539560318,
+      "learning_rate": 0.00040926275900195886,
+      "loss": 3.1583,
+      "step": 10000
+    },
+    {
+      "epoch": 0.580231512373437,
+      "eval_loss": 3.0971479415893555,
+      "eval_runtime": 3.2713,
+      "eval_samples_per_second": 1323.637,
+      "eval_steps_per_second": 10.393,
+      "step": 10000
+    },
+    {
+      "epoch": 0.5808117438858105,
+      "grad_norm": 0.13472320139408112,
+      "learning_rate": 0.00040831941681063926,
+      "loss": 3.1596,
+      "step": 10010
+    },
+    {
+      "epoch": 0.5813919753981839,
+      "grad_norm": 0.12773457169532776,
+      "learning_rate": 0.000407376412196831,
+      "loss": 3.1751,
+      "step": 10020
+    },
+    {
+      "epoch": 0.5819722069105573,
+      "grad_norm": 0.11364042013883591,
+      "learning_rate": 0.0004064337486327761,
+      "loss": 3.1541,
+      "step": 10030
+    },
+    {
+      "epoch": 0.5825524384229307,
+      "grad_norm": 0.1128978356719017,
+      "learning_rate": 0.00040549142958946037,
+      "loss": 3.1594,
+      "step": 10040
+    },
+    {
+      "epoch": 0.5831326699353042,
+      "grad_norm": 0.11539763957262039,
+      "learning_rate": 0.00040454945853660157,
+      "loss": 3.1708,
+      "step": 10050
+    },
+    {
+      "epoch": 0.5837129014476776,
+      "grad_norm": 0.13058942556381226,
+      "learning_rate": 0.00040360783894263536,
+      "loss": 3.1611,
+      "step": 10060
+    },
+    {
+      "epoch": 0.584293132960051,
+      "grad_norm": 0.13269387185573578,
+      "learning_rate": 0.00040266657427470395,
+      "loss": 3.1631,
+      "step": 10070
+    },
+    {
+      "epoch": 0.5848733644724246,
+      "grad_norm": 0.11398264765739441,
+      "learning_rate": 0.00040172566799864264,
+      "loss": 3.1593,
+      "step": 10080
+    },
+    {
+      "epoch": 0.585453595984798,
+      "grad_norm": 0.12349914014339447,
+      "learning_rate": 0.00040078512357896647,
+      "loss": 3.1585,
+      "step": 10090
+    },
+    {
+      "epoch": 0.5860338274971714,
+      "grad_norm": 0.12374427914619446,
+      "learning_rate": 0.0003998449444788589,
+      "loss": 3.1654,
+      "step": 10100
+    },
+    {
+      "epoch": 0.5866140590095448,
+      "grad_norm": 0.11344794183969498,
+      "learning_rate": 0.0003989051341601576,
+      "loss": 3.1564,
+      "step": 10110
+    },
+    {
+      "epoch": 0.5871942905219183,
+      "grad_norm": 0.11296453326940536,
+      "learning_rate": 0.0003979656960833428,
+      "loss": 3.1632,
+      "step": 10120
+    },
+    {
+      "epoch": 0.5877745220342917,
+      "grad_norm": 0.11938530951738358,
+      "learning_rate": 0.00039702663370752393,
+      "loss": 3.1687,
+      "step": 10130
+    },
+    {
+      "epoch": 0.5883547535466651,
+      "grad_norm": 0.12476367503404617,
+      "learning_rate": 0.00039608795049042686,
+      "loss": 3.1605,
+      "step": 10140
+    },
+    {
+      "epoch": 0.5889349850590385,
+      "grad_norm": 0.1283896565437317,
+      "learning_rate": 0.0003951496498883817,
+      "loss": 3.154,
+      "step": 10150
+    },
+    {
+      "epoch": 0.589515216571412,
+      "grad_norm": 0.11707280576229095,
+      "learning_rate": 0.00039421173535630937,
+      "loss": 3.1675,
+      "step": 10160
+    },
+    {
+      "epoch": 0.5900954480837854,
+      "grad_norm": 0.11196309328079224,
+      "learning_rate": 0.0003932742103477098,
+      "loss": 3.1597,
+      "step": 10170
+    },
+    {
+      "epoch": 0.5906756795961589,
+      "grad_norm": 0.13069289922714233,
+      "learning_rate": 0.0003923370783146477,
+      "loss": 3.162,
+      "step": 10180
+    },
+    {
+      "epoch": 0.5912559111085323,
+      "grad_norm": 0.11600931733846664,
+      "learning_rate": 0.0003914003427077418,
+      "loss": 3.1611,
+      "step": 10190
+    },
+    {
+      "epoch": 0.5918361426209058,
+      "grad_norm": 0.11921602487564087,
+      "learning_rate": 0.00039046400697615076,
+      "loss": 3.1603,
+      "step": 10200
+    },
+    {
+      "epoch": 0.5924163741332792,
+      "grad_norm": 0.10909148305654526,
+      "learning_rate": 0.0003895280745675606,
+      "loss": 3.1651,
+      "step": 10210
+    },
+    {
+      "epoch": 0.5929966056456526,
+      "grad_norm": 0.1261613517999649,
+      "learning_rate": 0.0003885925489281729,
+      "loss": 3.164,
+      "step": 10220
+    },
+    {
+      "epoch": 0.593576837158026,
+      "grad_norm": 0.1152707114815712,
+      "learning_rate": 0.00038765743350269047,
+      "loss": 3.1569,
+      "step": 10230
+    },
+    {
+      "epoch": 0.5941570686703995,
+      "grad_norm": 0.13062123954296112,
+      "learning_rate": 0.0003867227317343066,
+      "loss": 3.1526,
+      "step": 10240
+    },
+    {
+      "epoch": 0.5947373001827729,
+      "grad_norm": 0.13169212639331818,
+      "learning_rate": 0.0003857884470646912,
+      "loss": 3.1584,
+      "step": 10250
+    },
+    {
+      "epoch": 0.5953175316951463,
+      "grad_norm": 0.1235685646533966,
+      "learning_rate": 0.0003848545829339781,
+      "loss": 3.1635,
+      "step": 10260
+    },
+    {
+      "epoch": 0.5958977632075197,
+      "grad_norm": 0.11871648579835892,
+      "learning_rate": 0.00038392114278075316,
+      "loss": 3.1547,
+      "step": 10270
+    },
+    {
+      "epoch": 0.5964779947198933,
+      "grad_norm": 0.11664935946464539,
+      "learning_rate": 0.0003829881300420404,
+      "loss": 3.1553,
+      "step": 10280
+    },
+    {
+      "epoch": 0.5970582262322667,
+      "grad_norm": 0.10464397817850113,
+      "learning_rate": 0.0003820555481532908,
+      "loss": 3.1465,
+      "step": 10290
+    },
+    {
+      "epoch": 0.5976384577446401,
+      "grad_norm": 0.11757074296474457,
+      "learning_rate": 0.0003811234005483683,
+      "loss": 3.1576,
+      "step": 10300
+    },
+    {
+      "epoch": 0.5982186892570136,
+      "grad_norm": 0.12942548096179962,
+      "learning_rate": 0.0003801916906595382,
+      "loss": 3.1582,
+      "step": 10310
+    },
+    {
+      "epoch": 0.598798920769387,
+      "grad_norm": 0.13089211285114288,
+      "learning_rate": 0.000379260421917454,
+      "loss": 3.149,
+      "step": 10320
+    },
+    {
+      "epoch": 0.5993791522817604,
+      "grad_norm": 0.123594731092453,
+      "learning_rate": 0.0003783295977511445,
+      "loss": 3.1622,
+      "step": 10330
+    },
+    {
+      "epoch": 0.5999593837941338,
+      "grad_norm": 0.12618903815746307,
+      "learning_rate": 0.0003773992215880022,
+      "loss": 3.1599,
+      "step": 10340
+    },
+    {
+      "epoch": 0.6005396153065073,
+      "grad_norm": 0.11297423392534256,
+      "learning_rate": 0.00037646929685376904,
+      "loss": 3.1575,
+      "step": 10350
+    },
+    {
+      "epoch": 0.6011198468188808,
+      "grad_norm": 0.12514062225818634,
+      "learning_rate": 0.0003755398269725256,
+      "loss": 3.1549,
+      "step": 10360
+    },
+    {
+      "epoch": 0.6017000783312542,
+      "grad_norm": 0.11910570412874222,
+      "learning_rate": 0.00037461081536667743,
+      "loss": 3.1615,
+      "step": 10370
+    },
+    {
+      "epoch": 0.6022803098436276,
+      "grad_norm": 0.11765125393867493,
+      "learning_rate": 0.0003736822654569425,
+      "loss": 3.1613,
+      "step": 10380
+    },
+    {
+      "epoch": 0.6028605413560011,
+      "grad_norm": 0.10604594647884369,
+      "learning_rate": 0.00037275418066233903,
+      "loss": 3.1475,
+      "step": 10390
+    },
+    {
+      "epoch": 0.6034407728683745,
+      "grad_norm": 0.1241423636674881,
+      "learning_rate": 0.00037182656440017207,
+      "loss": 3.1537,
+      "step": 10400
+    },
+    {
+      "epoch": 0.6040210043807479,
+      "grad_norm": 0.13135185837745667,
+      "learning_rate": 0.0003708994200860221,
+      "loss": 3.1423,
+      "step": 10410
+    },
+    {
+      "epoch": 0.6046012358931213,
+      "grad_norm": 0.11381290853023529,
+      "learning_rate": 0.0003699727511337316,
+      "loss": 3.157,
+      "step": 10420
+    },
+    {
+      "epoch": 0.6051814674054948,
+      "grad_norm": 0.11703768372535706,
+      "learning_rate": 0.0003690465609553927,
+      "loss": 3.15,
+      "step": 10430
+    },
+    {
+      "epoch": 0.6057616989178682,
+      "grad_norm": 0.11526386439800262,
+      "learning_rate": 0.0003681208529613348,
+      "loss": 3.1625,
+      "step": 10440
+    },
+    {
+      "epoch": 0.6063419304302416,
+      "grad_norm": 0.1294795721769333,
+      "learning_rate": 0.00036719563056011146,
+      "loss": 3.1577,
+      "step": 10450
+    },
+    {
+      "epoch": 0.6069221619426151,
+      "grad_norm": 0.12788033485412598,
+      "learning_rate": 0.0003662708971584887,
+      "loss": 3.1549,
+      "step": 10460
+    },
+    {
+      "epoch": 0.6075023934549886,
+      "grad_norm": 0.11444190889596939,
+      "learning_rate": 0.00036534665616143157,
+      "loss": 3.158,
+      "step": 10470
+    },
+    {
+      "epoch": 0.608082624967362,
+      "grad_norm": 0.12848497927188873,
+      "learning_rate": 0.00036442291097209245,
+      "loss": 3.1534,
+      "step": 10480
+    },
+    {
+      "epoch": 0.6086628564797354,
+      "grad_norm": 0.13192491233348846,
+      "learning_rate": 0.000363499664991798,
+      "loss": 3.1647,
+      "step": 10490
+    },
+    {
+      "epoch": 0.6092430879921088,
+      "grad_norm": 0.1181025505065918,
+      "learning_rate": 0.0003625769216200362,
+      "loss": 3.1556,
+      "step": 10500
+    },
+    {
+      "epoch": 0.6098233195044823,
+      "grad_norm": 0.11332180351018906,
+      "learning_rate": 0.00036165468425444514,
+      "loss": 3.1531,
+      "step": 10510
+    },
+    {
+      "epoch": 0.6104035510168557,
+      "grad_norm": 0.11427458375692368,
+      "learning_rate": 0.00036073295629079926,
+      "loss": 3.1441,
+      "step": 10520
+    },
+    {
+      "epoch": 0.6109837825292291,
+      "grad_norm": 0.1351877599954605,
+      "learning_rate": 0.00035981174112299774,
+      "loss": 3.1592,
+      "step": 10530
+    },
+    {
+      "epoch": 0.6115640140416027,
+      "grad_norm": 0.11437386274337769,
+      "learning_rate": 0.000358891042143051,
+      "loss": 3.1508,
+      "step": 10540
+    },
+    {
+      "epoch": 0.6121442455539761,
+      "grad_norm": 0.1317347139120102,
+      "learning_rate": 0.00035797086274106917,
+      "loss": 3.1602,
+      "step": 10550
+    },
+    {
+      "epoch": 0.6127244770663495,
+      "grad_norm": 0.12212193757295609,
+      "learning_rate": 0.00035705120630524946,
+      "loss": 3.1562,
+      "step": 10560
+    },
+    {
+      "epoch": 0.6133047085787229,
+      "grad_norm": 0.10987838357686996,
+      "learning_rate": 0.00035613207622186297,
+      "loss": 3.1498,
+      "step": 10570
+    },
+    {
+      "epoch": 0.6138849400910964,
+      "grad_norm": 0.1109929159283638,
+      "learning_rate": 0.00035521347587524324,
+      "loss": 3.1592,
+      "step": 10580
+    },
+    {
+      "epoch": 0.6144651716034698,
+      "grad_norm": 0.11722821742296219,
+      "learning_rate": 0.00035429540864777254,
+      "loss": 3.1588,
+      "step": 10590
+    },
+    {
+      "epoch": 0.6150454031158432,
+      "grad_norm": 0.11384609341621399,
+      "learning_rate": 0.00035337787791987085,
+      "loss": 3.1563,
+      "step": 10600
+    },
+    {
+      "epoch": 0.6156256346282166,
+      "grad_norm": 0.13255846500396729,
+      "learning_rate": 0.0003524608870699826,
+      "loss": 3.1546,
+      "step": 10610
+    },
+    {
+      "epoch": 0.6162058661405901,
+      "grad_norm": 0.12805138528347015,
+      "learning_rate": 0.00035154443947456364,
+      "loss": 3.1468,
+      "step": 10620
+    },
+    {
+      "epoch": 0.6167860976529635,
+      "grad_norm": 0.11819039285182953,
+      "learning_rate": 0.0003506285385080705,
+      "loss": 3.1436,
+      "step": 10630
+    },
+    {
+      "epoch": 0.617366329165337,
+      "grad_norm": 0.11611706018447876,
+      "learning_rate": 0.0003497131875429462,
+      "loss": 3.153,
+      "step": 10640
+    },
+    {
+      "epoch": 0.6179465606777104,
+      "grad_norm": 0.12574134767055511,
+      "learning_rate": 0.0003487983899496092,
+      "loss": 3.1676,
+      "step": 10650
+    },
+    {
+      "epoch": 0.6185267921900839,
+      "grad_norm": 0.13298243284225464,
+      "learning_rate": 0.00034788414909643975,
+      "loss": 3.1448,
+      "step": 10660
+    },
+    {
+      "epoch": 0.6191070237024573,
+      "grad_norm": 0.11737950146198273,
+      "learning_rate": 0.00034697046834976847,
+      "loss": 3.1603,
+      "step": 10670
+    },
+    {
+      "epoch": 0.6196872552148307,
+      "grad_norm": 0.11029376089572906,
+      "learning_rate": 0.0003460573510738638,
+      "loss": 3.1523,
+      "step": 10680
+    },
+    {
+      "epoch": 0.6202674867272041,
+      "grad_norm": 0.12390248477458954,
+      "learning_rate": 0.000345144800630919,
+      "loss": 3.1591,
+      "step": 10690
+    },
+    {
+      "epoch": 0.6208477182395776,
+      "grad_norm": 0.11781900376081467,
+      "learning_rate": 0.00034423282038104064,
+      "loss": 3.1617,
+      "step": 10700
+    },
+    {
+      "epoch": 0.621427949751951,
+      "grad_norm": 0.12515197694301605,
+      "learning_rate": 0.0003433214136822352,
+      "loss": 3.1418,
+      "step": 10710
+    },
+    {
+      "epoch": 0.6220081812643244,
+      "grad_norm": 0.10986288636922836,
+      "learning_rate": 0.0003424105838903978,
+      "loss": 3.1374,
+      "step": 10720
+    },
+    {
+      "epoch": 0.6225884127766979,
+      "grad_norm": 0.12500767409801483,
+      "learning_rate": 0.00034150033435929926,
+      "loss": 3.1508,
+      "step": 10730
+    },
+    {
+      "epoch": 0.6231686442890714,
+      "grad_norm": 0.11399463564157486,
+      "learning_rate": 0.0003405906684405735,
+      "loss": 3.155,
+      "step": 10740
+    },
+    {
+      "epoch": 0.6237488758014448,
+      "grad_norm": 0.1243964433670044,
+      "learning_rate": 0.000339681589483706,
+      "loss": 3.149,
+      "step": 10750
+    },
+    {
+      "epoch": 0.6243291073138182,
+      "grad_norm": 0.1269841343164444,
+      "learning_rate": 0.0003387731008360203,
+      "loss": 3.157,
+      "step": 10760
+    },
+    {
+      "epoch": 0.6249093388261916,
+      "grad_norm": 0.11488167196512222,
+      "learning_rate": 0.0003378652058426672,
+      "loss": 3.1591,
+      "step": 10770
+    },
+    {
+      "epoch": 0.6254895703385651,
+      "grad_norm": 0.12310460954904556,
+      "learning_rate": 0.00033695790784661085,
+      "loss": 3.1493,
+      "step": 10780
+    },
+    {
+      "epoch": 0.6260698018509385,
+      "grad_norm": 0.11951915174722672,
+      "learning_rate": 0.0003360512101886176,
+      "loss": 3.1519,
+      "step": 10790
+    },
+    {
+      "epoch": 0.6266500333633119,
+      "grad_norm": 0.11739303171634674,
+      "learning_rate": 0.0003351451162072435,
+      "loss": 3.1517,
+      "step": 10800
+    },
+    {
+      "epoch": 0.6272302648756855,
+      "grad_norm": 0.12451887875795364,
+      "learning_rate": 0.000334239629238821,
+      "loss": 3.1437,
+      "step": 10810
+    },
+    {
+      "epoch": 0.6278104963880589,
+      "grad_norm": 0.10753390938043594,
+      "learning_rate": 0.0003333347526174484,
+      "loss": 3.1474,
+      "step": 10820
+    },
+    {
+      "epoch": 0.6283907279004323,
+      "grad_norm": 0.12157886475324631,
+      "learning_rate": 0.00033243048967497596,
+      "loss": 3.1502,
+      "step": 10830
+    },
+    {
+      "epoch": 0.6289709594128057,
+      "grad_norm": 0.13651184737682343,
+      "learning_rate": 0.0003315268437409946,
+      "loss": 3.1553,
+      "step": 10840
+    },
+    {
+      "epoch": 0.6295511909251792,
+      "grad_norm": 0.12725335359573364,
+      "learning_rate": 0.00033062381814282367,
+      "loss": 3.141,
+      "step": 10850
+    },
+    {
+      "epoch": 0.6301314224375526,
+      "grad_norm": 0.11685140430927277,
+      "learning_rate": 0.00032972141620549747,
+      "loss": 3.1451,
+      "step": 10860
+    },
+    {
+      "epoch": 0.630711653949926,
+      "grad_norm": 0.1115005612373352,
+      "learning_rate": 0.00032881964125175487,
+      "loss": 3.1482,
+      "step": 10870
+    },
+    {
+      "epoch": 0.6312918854622994,
+      "grad_norm": 0.11986386775970459,
+      "learning_rate": 0.00032791849660202547,
+      "loss": 3.1434,
+      "step": 10880
+    },
+    {
+      "epoch": 0.6318721169746729,
+      "grad_norm": 0.11233355104923248,
+      "learning_rate": 0.00032701798557441833,
+      "loss": 3.1418,
+      "step": 10890
+    },
+    {
+      "epoch": 0.6324523484870463,
+      "grad_norm": 0.11507276445627213,
+      "learning_rate": 0.0003261181114847094,
+      "loss": 3.1415,
+      "step": 10900
+    },
+    {
+      "epoch": 0.6330325799994198,
+      "grad_norm": 0.1157032698392868,
+      "learning_rate": 0.00032521887764632937,
+      "loss": 3.149,
+      "step": 10910
+    },
+    {
+      "epoch": 0.6336128115117932,
+      "grad_norm": 0.12391894310712814,
+      "learning_rate": 0.0003243202873703516,
+      "loss": 3.1476,
+      "step": 10920
+    },
+    {
+      "epoch": 0.6341930430241667,
+      "grad_norm": 0.11616963148117065,
+      "learning_rate": 0.00032342234396547933,
+      "loss": 3.1522,
+      "step": 10930
+    },
+    {
+      "epoch": 0.6347732745365401,
+      "grad_norm": 0.113109290599823,
+      "learning_rate": 0.00032252505073803437,
+      "loss": 3.1398,
+      "step": 10940
+    },
+    {
+      "epoch": 0.6353535060489135,
+      "grad_norm": 0.1344575732946396,
+      "learning_rate": 0.00032162841099194427,
+      "loss": 3.1388,
+      "step": 10950
+    },
+    {
+      "epoch": 0.6359337375612869,
+      "grad_norm": 0.1219155341386795,
+      "learning_rate": 0.0003207324280287307,
+      "loss": 3.1499,
+      "step": 10960
+    },
+    {
+      "epoch": 0.6365139690736604,
+      "grad_norm": 0.11315035074949265,
+      "learning_rate": 0.0003198371051474969,
+      "loss": 3.152,
+      "step": 10970
+    },
+    {
+      "epoch": 0.6370942005860338,
+      "grad_norm": 0.1105627492070198,
+      "learning_rate": 0.000318942445644915,
+      "loss": 3.1512,
+      "step": 10980
+    },
+    {
+      "epoch": 0.6376744320984072,
+      "grad_norm": 0.11000196635723114,
+      "learning_rate": 0.00031804845281521553,
+      "loss": 3.1464,
+      "step": 10990
+    },
+    {
+      "epoch": 0.6382546636107806,
+      "grad_norm": 0.11353638023138046,
+      "learning_rate": 0.0003171551299501734,
+      "loss": 3.1464,
+      "step": 11000
+    },
+    {
+      "epoch": 0.6382546636107806,
+      "eval_loss": 3.079380750656128,
+      "eval_runtime": 3.2712,
+      "eval_samples_per_second": 1323.663,
+      "eval_steps_per_second": 10.394,
+      "step": 11000
+    },
+    {
+      "epoch": 0.6388348951231542,
+      "grad_norm": 0.11272257566452026,
+      "learning_rate": 0.0003162624803390973,
+      "loss": 3.1544,
+      "step": 11010
+    },
+    {
+      "epoch": 0.6394151266355276,
+      "grad_norm": 0.11919167637825012,
+      "learning_rate": 0.00031537050726881635,
+      "loss": 3.1495,
+      "step": 11020
+    },
+    {
+      "epoch": 0.639995358147901,
+      "grad_norm": 0.11367520689964294,
+      "learning_rate": 0.00031447921402366874,
+      "loss": 3.1422,
+      "step": 11030
+    },
+    {
+      "epoch": 0.6405755896602745,
+      "grad_norm": 0.11759908497333527,
+      "learning_rate": 0.0003135886038854899,
+      "loss": 3.1414,
+      "step": 11040
+    },
+    {
+      "epoch": 0.6411558211726479,
+      "grad_norm": 0.11302473396062851,
+      "learning_rate": 0.0003126986801335995,
+      "loss": 3.1471,
+      "step": 11050
+    },
+    {
+      "epoch": 0.6417360526850213,
+      "grad_norm": 0.12279005348682404,
+      "learning_rate": 0.0003118094460447901,
+      "loss": 3.1427,
+      "step": 11060
+    },
+    {
+      "epoch": 0.6423162841973947,
+      "grad_norm": 0.11281714588403702,
+      "learning_rate": 0.0003109209048933145,
+      "loss": 3.1327,
+      "step": 11070
+    },
+    {
+      "epoch": 0.6428965157097682,
+      "grad_norm": 0.10893500596284866,
+      "learning_rate": 0.0003100330599508745,
+      "loss": 3.1472,
+      "step": 11080
+    },
+    {
+      "epoch": 0.6434767472221417,
+      "grad_norm": 0.12594285607337952,
+      "learning_rate": 0.0003091459144866083,
+      "loss": 3.146,
+      "step": 11090
+    },
+    {
+      "epoch": 0.6440569787345151,
+      "grad_norm": 0.1286400705575943,
+      "learning_rate": 0.0003082594717670781,
+      "loss": 3.1457,
+      "step": 11100
+    },
+    {
+      "epoch": 0.6446372102468885,
+      "grad_norm": 0.12681199610233307,
+      "learning_rate": 0.0003073737350562594,
+      "loss": 3.1349,
+      "step": 11110
+    },
+    {
+      "epoch": 0.645217441759262,
+      "grad_norm": 0.12026621401309967,
+      "learning_rate": 0.00030648870761552693,
+      "loss": 3.1425,
+      "step": 11120
+    },
+    {
+      "epoch": 0.6457976732716354,
+      "grad_norm": 0.11217381060123444,
+      "learning_rate": 0.00030560439270364495,
+      "loss": 3.1424,
+      "step": 11130
+    },
+    {
+      "epoch": 0.6463779047840088,
+      "grad_norm": 0.11966854333877563,
+      "learning_rate": 0.00030472079357675316,
+      "loss": 3.1477,
+      "step": 11140
+    },
+    {
+      "epoch": 0.6469581362963822,
+      "grad_norm": 0.10743203014135361,
+      "learning_rate": 0.0003038379134883563,
+      "loss": 3.1472,
+      "step": 11150
+    },
+    {
+      "epoch": 0.6475383678087557,
+      "grad_norm": 0.11516842246055603,
+      "learning_rate": 0.0003029557556893117,
+      "loss": 3.1363,
+      "step": 11160
+    },
+    {
+      "epoch": 0.6481185993211291,
+      "grad_norm": 0.11448100209236145,
+      "learning_rate": 0.00030207432342781615,
+      "loss": 3.1397,
+      "step": 11170
+    },
+    {
+      "epoch": 0.6486988308335025,
+      "grad_norm": 0.12240401655435562,
+      "learning_rate": 0.0003011936199493962,
+      "loss": 3.1451,
+      "step": 11180
+    },
+    {
+      "epoch": 0.649279062345876,
+      "grad_norm": 0.1107584685087204,
+      "learning_rate": 0.0003003136484968937,
+      "loss": 3.1516,
+      "step": 11190
+    },
+    {
+      "epoch": 0.6498592938582495,
+      "grad_norm": 0.11135096102952957,
+      "learning_rate": 0.0002994344123104561,
+      "loss": 3.1423,
+      "step": 11200
+    },
+    {
+      "epoch": 0.6504395253706229,
+      "grad_norm": 0.11470366269350052,
+      "learning_rate": 0.0002985559146275231,
+      "loss": 3.1441,
+      "step": 11210
+    },
+    {
+      "epoch": 0.6510197568829963,
+      "grad_norm": 0.11569292098283768,
+      "learning_rate": 0.0002976781586828151,
+      "loss": 3.149,
+      "step": 11220
+    },
+    {
+      "epoch": 0.6515999883953697,
+      "grad_norm": 0.12388614565134048,
+      "learning_rate": 0.0002968011477083217,
+      "loss": 3.1319,
+      "step": 11230
+    },
+    {
+      "epoch": 0.6521802199077432,
+      "grad_norm": 0.12496737390756607,
+      "learning_rate": 0.00029592488493328885,
+      "loss": 3.1391,
+      "step": 11240
+    },
+    {
+      "epoch": 0.6527604514201166,
+      "grad_norm": 0.1108599305152893,
+      "learning_rate": 0.00029504937358420803,
+      "loss": 3.1453,
+      "step": 11250
+    },
+    {
+      "epoch": 0.65334068293249,
+      "grad_norm": 0.11209242045879364,
+      "learning_rate": 0.0002941746168848037,
+      "loss": 3.1468,
+      "step": 11260
+    },
+    {
+      "epoch": 0.6539209144448636,
+      "grad_norm": 0.10576393455266953,
+      "learning_rate": 0.0002933006180560217,
+      "loss": 3.1327,
+      "step": 11270
+    },
+    {
+      "epoch": 0.654501145957237,
+      "grad_norm": 0.11058243364095688,
+      "learning_rate": 0.00029242738031601745,
+      "loss": 3.1378,
+      "step": 11280
+    },
+    {
+      "epoch": 0.6550813774696104,
+      "grad_norm": 0.10569418221712112,
+      "learning_rate": 0.00029155490688014343,
+      "loss": 3.1402,
+      "step": 11290
+    },
+    {
+      "epoch": 0.6556616089819838,
+      "grad_norm": 0.11297528445720673,
+      "learning_rate": 0.0002906832009609384,
+      "loss": 3.1453,
+      "step": 11300
+    },
+    {
+      "epoch": 0.6562418404943573,
+      "grad_norm": 0.11635693162679672,
+      "learning_rate": 0.00028981226576811506,
+      "loss": 3.1323,
+      "step": 11310
+    },
+    {
+      "epoch": 0.6568220720067307,
+      "grad_norm": 0.11293961852788925,
+      "learning_rate": 0.0002889421045085475,
+      "loss": 3.151,
+      "step": 11320
+    },
+    {
+      "epoch": 0.6574023035191041,
+      "grad_norm": 0.11303776502609253,
+      "learning_rate": 0.0002880727203862612,
+      "loss": 3.1461,
+      "step": 11330
+    },
+    {
+      "epoch": 0.6579825350314775,
+      "grad_norm": 0.10966860502958298,
+      "learning_rate": 0.0002872041166024194,
+      "loss": 3.1441,
+      "step": 11340
+    },
+    {
+      "epoch": 0.658562766543851,
+      "grad_norm": 0.11160997301340103,
+      "learning_rate": 0.00028633629635531224,
+      "loss": 3.1488,
+      "step": 11350
+    },
+    {
+      "epoch": 0.6591429980562244,
+      "grad_norm": 0.1070476621389389,
+      "learning_rate": 0.0002854692628403446,
+      "loss": 3.1413,
+      "step": 11360
+    },
+    {
+      "epoch": 0.6597232295685979,
+      "grad_norm": 0.11823021620512009,
+      "learning_rate": 0.0002846030192500249,
+      "loss": 3.145,
+      "step": 11370
+    },
+    {
+      "epoch": 0.6603034610809713,
+      "grad_norm": 0.11843527853488922,
+      "learning_rate": 0.0002837375687739525,
+      "loss": 3.1374,
+      "step": 11380
+    },
+    {
+      "epoch": 0.6608836925933448,
+      "grad_norm": 0.118824802339077,
+      "learning_rate": 0.00028287291459880716,
+      "loss": 3.157,
+      "step": 11390
+    },
+    {
+      "epoch": 0.6614639241057182,
+      "grad_norm": 0.11628689616918564,
+      "learning_rate": 0.0002820090599083358,
+      "loss": 3.1352,
+      "step": 11400
+    },
+    {
+      "epoch": 0.6620441556180916,
+      "grad_norm": 0.11970434337854385,
+      "learning_rate": 0.0002811460078833421,
+      "loss": 3.1468,
+      "step": 11410
+    },
+    {
+      "epoch": 0.662624387130465,
+      "grad_norm": 0.10809943079948425,
+      "learning_rate": 0.00028028376170167383,
+      "loss": 3.1405,
+      "step": 11420
+    },
+    {
+      "epoch": 0.6632046186428385,
+      "grad_norm": 0.10611239075660706,
+      "learning_rate": 0.00027942232453821193,
+      "loss": 3.1449,
+      "step": 11430
+    },
+    {
+      "epoch": 0.6637848501552119,
+      "grad_norm": 0.11383804678916931,
+      "learning_rate": 0.0002785616995648579,
+      "loss": 3.1525,
+      "step": 11440
+    },
+    {
+      "epoch": 0.6643650816675853,
+      "grad_norm": 0.11580588668584824,
+      "learning_rate": 0.0002777018899505236,
+      "loss": 3.1335,
+      "step": 11450
+    },
+    {
+      "epoch": 0.6649453131799588,
+      "grad_norm": 0.1111442893743515,
+      "learning_rate": 0.0002768428988611178,
+      "loss": 3.1467,
+      "step": 11460
+    },
+    {
+      "epoch": 0.6655255446923323,
+      "grad_norm": 0.11236603558063507,
+      "learning_rate": 0.0002759847294595357,
+      "loss": 3.1369,
+      "step": 11470
+    },
+    {
+      "epoch": 0.6661057762047057,
+      "grad_norm": 0.12457659840583801,
+      "learning_rate": 0.00027512738490564697,
+      "loss": 3.1346,
+      "step": 11480
+    },
+    {
+      "epoch": 0.6666860077170791,
+      "grad_norm": 0.11812961846590042,
+      "learning_rate": 0.0002742708683562841,
+      "loss": 3.1479,
+      "step": 11490
+    },
+    {
+      "epoch": 0.6672662392294526,
+      "grad_norm": 0.1041463240981102,
+      "learning_rate": 0.0002734151829652304,
+      "loss": 3.1363,
+      "step": 11500
+    },
+    {
+      "epoch": 0.667846470741826,
+      "grad_norm": 0.11501579731702805,
+      "learning_rate": 0.0002725603318832097,
+      "loss": 3.1286,
+      "step": 11510
+    },
+    {
+      "epoch": 0.6684267022541994,
+      "grad_norm": 0.11745285987854004,
+      "learning_rate": 0.00027170631825787294,
+      "loss": 3.1406,
+      "step": 11520
+    },
+    {
+      "epoch": 0.6690069337665728,
+      "grad_norm": 0.1133042722940445,
+      "learning_rate": 0.00027085314523378777,
+      "loss": 3.1506,
+      "step": 11530
+    },
+    {
+      "epoch": 0.6695871652789464,
+      "grad_norm": 0.11351029574871063,
+      "learning_rate": 0.00027000081595242667,
+      "loss": 3.135,
+      "step": 11540
+    },
+    {
+      "epoch": 0.6701673967913198,
+      "grad_norm": 0.11267419159412384,
+      "learning_rate": 0.0002691493335521551,
+      "loss": 3.131,
+      "step": 11550
+    },
+    {
+      "epoch": 0.6707476283036932,
+      "grad_norm": 0.10249326378107071,
+      "learning_rate": 0.00026829870116822085,
+      "loss": 3.1318,
+      "step": 11560
+    },
+    {
+      "epoch": 0.6713278598160666,
+      "grad_norm": 0.11103315651416779,
+      "learning_rate": 0.0002674489219327413,
+      "loss": 3.1344,
+      "step": 11570
+    },
+    {
+      "epoch": 0.6719080913284401,
+      "grad_norm": 0.11441586166620255,
+      "learning_rate": 0.0002665999989746926,
+      "loss": 3.1352,
+      "step": 11580
+    },
+    {
+      "epoch": 0.6724883228408135,
+      "grad_norm": 0.11026325076818466,
+      "learning_rate": 0.00026575193541989795,
+      "loss": 3.1315,
+      "step": 11590
+    },
+    {
+      "epoch": 0.6730685543531869,
+      "grad_norm": 0.1115291491150856,
+      "learning_rate": 0.00026490473439101615,
+      "loss": 3.1339,
+      "step": 11600
+    },
+    {
+      "epoch": 0.6736487858655603,
+      "grad_norm": 0.10771960020065308,
+      "learning_rate": 0.0002640583990075306,
+      "loss": 3.1238,
+      "step": 11610
+    },
+    {
+      "epoch": 0.6742290173779338,
+      "grad_norm": 0.11078547686338425,
+      "learning_rate": 0.00026321293238573614,
+      "loss": 3.1365,
+      "step": 11620
+    },
+    {
+      "epoch": 0.6748092488903072,
+      "grad_norm": 0.1105736717581749,
+      "learning_rate": 0.00026236833763872993,
+      "loss": 3.1466,
+      "step": 11630
+    },
+    {
+      "epoch": 0.6753894804026807,
+      "grad_norm": 0.10596097260713577,
+      "learning_rate": 0.0002615246178763983,
+      "loss": 3.1442,
+      "step": 11640
+    },
+    {
+      "epoch": 0.6759697119150541,
+      "grad_norm": 0.11247435957193375,
+      "learning_rate": 0.00026068177620540536,
+      "loss": 3.1439,
+      "step": 11650
+    },
+    {
+      "epoch": 0.6765499434274276,
+      "grad_norm": 0.11171044409275055,
+      "learning_rate": 0.00025983981572918314,
+      "loss": 3.1451,
+      "step": 11660
+    },
+    {
+      "epoch": 0.677130174939801,
+      "grad_norm": 0.12344854325056076,
+      "learning_rate": 0.0002589987395479175,
+      "loss": 3.1372,
+      "step": 11670
+    },
+    {
+      "epoch": 0.6777104064521744,
+      "grad_norm": 0.1220024824142456,
+      "learning_rate": 0.00025815855075853977,
+      "loss": 3.1366,
+      "step": 11680
+    },
+    {
+      "epoch": 0.6782906379645478,
+      "grad_norm": 0.10972929000854492,
+      "learning_rate": 0.0002573192524547128,
+      "loss": 3.1299,
+      "step": 11690
+    },
+    {
+      "epoch": 0.6788708694769213,
+      "grad_norm": 0.11175742000341415,
+      "learning_rate": 0.00025648084772682056,
+      "loss": 3.1375,
+      "step": 11700
+    },
+    {
+      "epoch": 0.6794511009892947,
+      "grad_norm": 0.12257856875658035,
+      "learning_rate": 0.00025564333966195785,
+      "loss": 3.1402,
+      "step": 11710
+    },
+    {
+      "epoch": 0.6800313325016681,
+      "grad_norm": 0.11670278757810593,
+      "learning_rate": 0.0002548067313439162,
+      "loss": 3.1357,
+      "step": 11720
+    },
+    {
+      "epoch": 0.6806115640140415,
+      "grad_norm": 0.11864405870437622,
+      "learning_rate": 0.0002539710258531759,
+      "loss": 3.136,
+      "step": 11730
+    },
+    {
+      "epoch": 0.6811917955264151,
+      "grad_norm": 0.10666168481111526,
+      "learning_rate": 0.00025313622626689134,
+      "loss": 3.1374,
+      "step": 11740
+    },
+    {
+      "epoch": 0.6817720270387885,
+      "grad_norm": 0.10843271762132645,
+      "learning_rate": 0.00025230233565888267,
+      "loss": 3.1343,
+      "step": 11750
+    },
+    {
+      "epoch": 0.6823522585511619,
+      "grad_norm": 0.10990433394908905,
+      "learning_rate": 0.00025146935709962216,
+      "loss": 3.1386,
+      "step": 11760
+    },
+    {
+      "epoch": 0.6829324900635354,
+      "grad_norm": 0.10423313081264496,
+      "learning_rate": 0.00025063729365622407,
+      "loss": 3.1382,
+      "step": 11770
+    },
+    {
+      "epoch": 0.6835127215759088,
+      "grad_norm": 0.11088298261165619,
+      "learning_rate": 0.00024980614839243364,
+      "loss": 3.1191,
+      "step": 11780
+    },
+    {
+      "epoch": 0.6840929530882822,
+      "grad_norm": 0.11372388154268265,
+      "learning_rate": 0.00024897592436861406,
+      "loss": 3.1294,
+      "step": 11790
+    },
+    {
+      "epoch": 0.6846731846006556,
+      "grad_norm": 0.10824663192033768,
+      "learning_rate": 0.0002481466246417377,
+      "loss": 3.1291,
+      "step": 11800
+    },
+    {
+      "epoch": 0.6852534161130291,
+      "grad_norm": 0.10850938409566879,
+      "learning_rate": 0.00024731825226537293,
+      "loss": 3.1438,
+      "step": 11810
+    },
+    {
+      "epoch": 0.6858336476254026,
+      "grad_norm": 0.1074269562959671,
+      "learning_rate": 0.00024649081028967334,
+      "loss": 3.1336,
+      "step": 11820
+    },
+    {
+      "epoch": 0.686413879137776,
+      "grad_norm": 0.11285442113876343,
+      "learning_rate": 0.00024566430176136756,
+      "loss": 3.1326,
+      "step": 11830
+    },
+    {
+      "epoch": 0.6869941106501494,
+      "grad_norm": 0.11877676844596863,
+      "learning_rate": 0.0002448387297237459,
+      "loss": 3.1333,
+      "step": 11840
+    },
+    {
+      "epoch": 0.6875743421625229,
+      "grad_norm": 0.1159949079155922,
+      "learning_rate": 0.00024401409721665148,
+      "loss": 3.1271,
+      "step": 11850
+    },
+    {
+      "epoch": 0.6881545736748963,
+      "grad_norm": 0.11141040176153183,
+      "learning_rate": 0.00024319040727646752,
+      "loss": 3.1315,
+      "step": 11860
+    },
+    {
+      "epoch": 0.6887348051872697,
+      "grad_norm": 0.1103438287973404,
+      "learning_rate": 0.0002423676629361064,
+      "loss": 3.1271,
+      "step": 11870
+    },
+    {
+      "epoch": 0.6893150366996431,
+      "grad_norm": 0.12033682316541672,
+      "learning_rate": 0.00024154586722499965,
+      "loss": 3.1317,
+      "step": 11880
+    },
+    {
+      "epoch": 0.6898952682120166,
+      "grad_norm": 0.10661648213863373,
+      "learning_rate": 0.00024072502316908428,
+      "loss": 3.1272,
+      "step": 11890
+    },
+    {
+      "epoch": 0.69047549972439,
+      "grad_norm": 0.1170666292309761,
+      "learning_rate": 0.00023990513379079477,
+      "loss": 3.1398,
+      "step": 11900
+    },
+    {
+      "epoch": 0.6910557312367634,
+      "grad_norm": 0.11095455288887024,
+      "learning_rate": 0.00023908620210904947,
+      "loss": 3.1298,
+      "step": 11910
+    },
+    {
+      "epoch": 0.6916359627491369,
+      "grad_norm": 0.1100478321313858,
+      "learning_rate": 0.00023826823113924035,
+      "loss": 3.1286,
+      "step": 11920
+    },
+    {
+      "epoch": 0.6922161942615104,
+      "grad_norm": 0.11419103294610977,
+      "learning_rate": 0.00023745122389322293,
+      "loss": 3.1343,
+      "step": 11930
+    },
+    {
+      "epoch": 0.6927964257738838,
+      "grad_norm": 0.11160432547330856,
+      "learning_rate": 0.00023663518337930256,
+      "loss": 3.1402,
+      "step": 11940
+    },
+    {
+      "epoch": 0.6933766572862572,
+      "grad_norm": 0.10984364151954651,
+      "learning_rate": 0.00023582011260222664,
+      "loss": 3.1351,
+      "step": 11950
+    },
+    {
+      "epoch": 0.6939568887986306,
+      "grad_norm": 0.11625155061483383,
+      "learning_rate": 0.00023500601456317083,
+      "loss": 3.134,
+      "step": 11960
+    },
+    {
+      "epoch": 0.6945371203110041,
+      "grad_norm": 0.1080445721745491,
+      "learning_rate": 0.00023419289225972946,
+      "loss": 3.1311,
+      "step": 11970
+    },
+    {
+      "epoch": 0.6951173518233775,
+      "grad_norm": 0.10590895265340805,
+      "learning_rate": 0.00023338074868590393,
+      "loss": 3.1371,
+      "step": 11980
+    },
+    {
+      "epoch": 0.6956975833357509,
+      "grad_norm": 0.11543317884206772,
+      "learning_rate": 0.0002325695868320919,
+      "loss": 3.1316,
+      "step": 11990
+    },
+    {
+      "epoch": 0.6962778148481245,
+      "grad_norm": 0.11939459294080734,
+      "learning_rate": 0.0002317594096850768,
+      "loss": 3.1365,
+      "step": 12000
+    },
+    {
+      "epoch": 0.6962778148481245,
+      "eval_loss": 3.064452648162842,
+      "eval_runtime": 3.2623,
+      "eval_samples_per_second": 1327.301,
+      "eval_steps_per_second": 10.422,
+      "step": 12000
+    },
+    {
+      "epoch": 0.6968580463604979,
+      "grad_norm": 0.10952937602996826,
+      "learning_rate": 0.00023095022022801503,
+      "loss": 3.1378,
+      "step": 12010
+    },
+    {
+      "epoch": 0.6974382778728713,
+      "grad_norm": 0.11545655131340027,
+      "learning_rate": 0.00023014202144042744,
+      "loss": 3.1373,
+      "step": 12020
+    },
+    {
+      "epoch": 0.6980185093852447,
+      "grad_norm": 0.10757040232419968,
+      "learning_rate": 0.00022933481629818653,
+      "loss": 3.137,
+      "step": 12030
+    },
+    {
+      "epoch": 0.6985987408976182,
+      "grad_norm": 0.11333664506673813,
+      "learning_rate": 0.00022852860777350593,
+      "loss": 3.1328,
+      "step": 12040
+    },
+    {
+      "epoch": 0.6991789724099916,
+      "grad_norm": 0.10705193877220154,
+      "learning_rate": 0.00022772339883493048,
+      "loss": 3.1283,
+      "step": 12050
+    },
+    {
+      "epoch": 0.699759203922365,
+      "grad_norm": 0.11158863455057144,
+      "learning_rate": 0.00022691919244732307,
+      "loss": 3.1303,
+      "step": 12060
+    },
+    {
+      "epoch": 0.7003394354347384,
+      "grad_norm": 0.11993270367383957,
+      "learning_rate": 0.00022611599157185648,
+      "loss": 3.1262,
+      "step": 12070
+    },
+    {
+      "epoch": 0.7009196669471119,
+      "grad_norm": 0.10471105575561523,
+      "learning_rate": 0.00022531379916600026,
+      "loss": 3.1397,
+      "step": 12080
+    },
+    {
+      "epoch": 0.7014998984594853,
+      "grad_norm": 0.11472434550523758,
+      "learning_rate": 0.00022451261818351082,
+      "loss": 3.1334,
+      "step": 12090
+    },
+    {
+      "epoch": 0.7020801299718588,
+      "grad_norm": 0.12063375115394592,
+      "learning_rate": 0.0002237124515744206,
+      "loss": 3.1311,
+      "step": 12100
+    },
+    {
+      "epoch": 0.7026603614842322,
+      "grad_norm": 0.11272242665290833,
+      "learning_rate": 0.00022291330228502658,
+      "loss": 3.13,
+      "step": 12110
+    },
+    {
+      "epoch": 0.7032405929966057,
+      "grad_norm": 0.10762272030115128,
+      "learning_rate": 0.00022211517325788056,
+      "loss": 3.1255,
+      "step": 12120
+    },
+    {
+      "epoch": 0.7038208245089791,
+      "grad_norm": 0.10291790962219238,
+      "learning_rate": 0.00022131806743177707,
+      "loss": 3.1284,
+      "step": 12130
+    },
+    {
+      "epoch": 0.7044010560213525,
+      "grad_norm": 0.10812744498252869,
+      "learning_rate": 0.00022052198774174327,
+      "loss": 3.1348,
+      "step": 12140
+    },
+    {
+      "epoch": 0.7049812875337259,
+      "grad_norm": 0.10778633505105972,
+      "learning_rate": 0.00021972693711902792,
+      "loss": 3.1342,
+      "step": 12150
+    },
+    {
+      "epoch": 0.7055615190460994,
+      "grad_norm": 0.10863006114959717,
+      "learning_rate": 0.00021893291849109053,
+      "loss": 3.1319,
+      "step": 12160
+    },
+    {
+      "epoch": 0.7061417505584728,
+      "grad_norm": 0.11223878711462021,
+      "learning_rate": 0.00021813993478159128,
+      "loss": 3.1299,
+      "step": 12170
+    },
+    {
+      "epoch": 0.7067219820708462,
+      "grad_norm": 0.11001647263765335,
+      "learning_rate": 0.000217347988910379,
+      "loss": 3.1256,
+      "step": 12180
+    },
+    {
+      "epoch": 0.7073022135832197,
+      "grad_norm": 0.11175502091646194,
+      "learning_rate": 0.00021655708379348144,
+      "loss": 3.1374,
+      "step": 12190
+    },
+    {
+      "epoch": 0.7078824450955932,
+      "grad_norm": 0.10740119218826294,
+      "learning_rate": 0.00021576722234309403,
+      "loss": 3.1284,
+      "step": 12200
+    },
+    {
+      "epoch": 0.7084626766079666,
+      "grad_norm": 0.1120908334851265,
+      "learning_rate": 0.00021497840746756942,
+      "loss": 3.1225,
+      "step": 12210
+    },
+    {
+      "epoch": 0.70904290812034,
+      "grad_norm": 0.110707126557827,
+      "learning_rate": 0.00021419064207140639,
+      "loss": 3.1256,
+      "step": 12220
+    },
+    {
+      "epoch": 0.7096231396327135,
+      "grad_norm": 0.11498415470123291,
+      "learning_rate": 0.00021340392905524002,
+      "loss": 3.1249,
+      "step": 12230
+    },
+    {
+      "epoch": 0.7102033711450869,
+      "grad_norm": 0.11245319992303848,
+      "learning_rate": 0.00021261827131582989,
+      "loss": 3.135,
+      "step": 12240
+    },
+    {
+      "epoch": 0.7107836026574603,
+      "grad_norm": 0.10842925310134888,
+      "learning_rate": 0.00021183367174605006,
+      "loss": 3.121,
+      "step": 12250
+    },
+    {
+      "epoch": 0.7113638341698337,
+      "grad_norm": 0.10173554718494415,
+      "learning_rate": 0.00021105013323487843,
+      "loss": 3.1246,
+      "step": 12260
+    },
+    {
+      "epoch": 0.7119440656822072,
+      "grad_norm": 0.10313431173563004,
+      "learning_rate": 0.00021026765866738578,
+      "loss": 3.1298,
+      "step": 12270
+    },
+    {
+      "epoch": 0.7125242971945807,
+      "grad_norm": 0.1053348183631897,
+      "learning_rate": 0.00020948625092472535,
+      "loss": 3.1264,
+      "step": 12280
+    },
+    {
+      "epoch": 0.7131045287069541,
+      "grad_norm": 0.12006527930498123,
+      "learning_rate": 0.00020870591288412254,
+      "loss": 3.1306,
+      "step": 12290
+    },
+    {
+      "epoch": 0.7136847602193275,
+      "grad_norm": 0.1115618497133255,
+      "learning_rate": 0.00020792664741886368,
+      "loss": 3.1264,
+      "step": 12300
+    },
+    {
+      "epoch": 0.714264991731701,
+      "grad_norm": 0.10678742080926895,
+      "learning_rate": 0.00020714845739828585,
+      "loss": 3.1337,
+      "step": 12310
+    },
+    {
+      "epoch": 0.7148452232440744,
+      "grad_norm": 0.11345722526311874,
+      "learning_rate": 0.00020637134568776615,
+      "loss": 3.1283,
+      "step": 12320
+    },
+    {
+      "epoch": 0.7154254547564478,
+      "grad_norm": 0.11007855087518692,
+      "learning_rate": 0.00020559531514871145,
+      "loss": 3.124,
+      "step": 12330
+    },
+    {
+      "epoch": 0.7160056862688212,
+      "grad_norm": 0.1093551367521286,
+      "learning_rate": 0.00020482036863854708,
+      "loss": 3.1251,
+      "step": 12340
+    },
+    {
+      "epoch": 0.7165859177811947,
+      "grad_norm": 0.10831980407238007,
+      "learning_rate": 0.00020404650901070787,
+      "loss": 3.122,
+      "step": 12350
+    },
+    {
+      "epoch": 0.7171661492935681,
+      "grad_norm": 0.1150059625506401,
+      "learning_rate": 0.00020327373911462572,
+      "loss": 3.1253,
+      "step": 12360
+    },
+    {
+      "epoch": 0.7177463808059416,
+      "grad_norm": 0.10329602658748627,
+      "learning_rate": 0.00020250206179572034,
+      "loss": 3.1315,
+      "step": 12370
+    },
+    {
+      "epoch": 0.718326612318315,
+      "grad_norm": 0.10848727822303772,
+      "learning_rate": 0.00020173147989538853,
+      "loss": 3.1334,
+      "step": 12380
+    },
+    {
+      "epoch": 0.7189068438306885,
+      "grad_norm": 0.10476922988891602,
+      "learning_rate": 0.00020096199625099337,
+      "loss": 3.1208,
+      "step": 12390
+    },
+    {
+      "epoch": 0.7194870753430619,
+      "grad_norm": 0.10537194460630417,
+      "learning_rate": 0.00020019361369585454,
+      "loss": 3.1265,
+      "step": 12400
+    },
+    {
+      "epoch": 0.7200673068554353,
+      "grad_norm": 0.10518410056829453,
+      "learning_rate": 0.00019942633505923703,
+      "loss": 3.124,
+      "step": 12410
+    },
+    {
+      "epoch": 0.7206475383678087,
+      "grad_norm": 0.10767358541488647,
+      "learning_rate": 0.000198660163166341,
+      "loss": 3.1174,
+      "step": 12420
+    },
+    {
+      "epoch": 0.7212277698801822,
+      "grad_norm": 0.10856916010379791,
+      "learning_rate": 0.0001978951008382918,
+      "loss": 3.124,
+      "step": 12430
+    },
+    {
+      "epoch": 0.7218080013925556,
+      "grad_norm": 0.1153908297419548,
+      "learning_rate": 0.0001971311508921288,
+      "loss": 3.1287,
+      "step": 12440
+    },
+    {
+      "epoch": 0.722388232904929,
+      "grad_norm": 0.10942938178777695,
+      "learning_rate": 0.00019636831614079625,
+      "loss": 3.118,
+      "step": 12450
+    },
+    {
+      "epoch": 0.7229684644173026,
+      "grad_norm": 0.1065671443939209,
+      "learning_rate": 0.00019560659939313096,
+      "loss": 3.1286,
+      "step": 12460
+    },
+    {
+      "epoch": 0.723548695929676,
+      "grad_norm": 0.10312582552433014,
+      "learning_rate": 0.0001948460034538543,
+      "loss": 3.1235,
+      "step": 12470
+    },
+    {
+      "epoch": 0.7241289274420494,
+      "grad_norm": 0.1132279559969902,
+      "learning_rate": 0.00019408653112355995,
+      "loss": 3.128,
+      "step": 12480
+    },
+    {
+      "epoch": 0.7247091589544228,
+      "grad_norm": 0.10551323741674423,
+      "learning_rate": 0.00019332818519870453,
+      "loss": 3.1256,
+      "step": 12490
+    },
+    {
+      "epoch": 0.7252893904667963,
+      "grad_norm": 0.11312738060951233,
+      "learning_rate": 0.00019257096847159766,
+      "loss": 3.1083,
+      "step": 12500
+    },
+    {
+      "epoch": 0.7258696219791697,
+      "grad_norm": 0.113512322306633,
+      "learning_rate": 0.00019181488373038992,
+      "loss": 3.1143,
+      "step": 12510
+    },
+    {
+      "epoch": 0.7264498534915431,
+      "grad_norm": 0.10724562406539917,
+      "learning_rate": 0.00019105993375906512,
+      "loss": 3.1284,
+      "step": 12520
+    },
+    {
+      "epoch": 0.7270300850039165,
+      "grad_norm": 0.10460355132818222,
+      "learning_rate": 0.00019030612133742787,
+      "loss": 3.1162,
+      "step": 12530
+    },
+    {
+      "epoch": 0.72761031651629,
+      "grad_norm": 0.10553659498691559,
+      "learning_rate": 0.00018955344924109435,
+      "loss": 3.1269,
+      "step": 12540
+    },
+    {
+      "epoch": 0.7281905480286635,
+      "grad_norm": 0.10612872242927551,
+      "learning_rate": 0.00018880192024148268,
+      "loss": 3.1362,
+      "step": 12550
+    },
+    {
+      "epoch": 0.7287707795410369,
+      "grad_norm": 0.11399485170841217,
+      "learning_rate": 0.00018805153710580054,
+      "loss": 3.135,
+      "step": 12560
+    },
+    {
+      "epoch": 0.7293510110534103,
+      "grad_norm": 0.11703281104564667,
+      "learning_rate": 0.00018730230259703795,
+      "loss": 3.1188,
+      "step": 12570
+    },
+    {
+      "epoch": 0.7299312425657838,
+      "grad_norm": 0.11565428972244263,
+      "learning_rate": 0.00018655421947395425,
+      "loss": 3.1244,
+      "step": 12580
+    },
+    {
+      "epoch": 0.7305114740781572,
+      "grad_norm": 0.10253434628248215,
+      "learning_rate": 0.00018580729049107026,
+      "loss": 3.1183,
+      "step": 12590
+    },
+    {
+      "epoch": 0.7310917055905306,
+      "grad_norm": 0.101934053003788,
+      "learning_rate": 0.0001850615183986567,
+      "loss": 3.1192,
+      "step": 12600
+    },
+    {
+      "epoch": 0.731671937102904,
+      "grad_norm": 0.10347875952720642,
+      "learning_rate": 0.0001843169059427243,
+      "loss": 3.1201,
+      "step": 12610
+    },
+    {
+      "epoch": 0.7322521686152775,
+      "grad_norm": 0.10719276964664459,
+      "learning_rate": 0.00018357345586501468,
+      "loss": 3.1261,
+      "step": 12620
+    },
+    {
+      "epoch": 0.7328324001276509,
+      "grad_norm": 0.10952641069889069,
+      "learning_rate": 0.00018283117090298813,
+      "loss": 3.1286,
+      "step": 12630
+    },
+    {
+      "epoch": 0.7334126316400243,
+      "grad_norm": 0.10987886041402817,
+      "learning_rate": 0.00018209005378981626,
+      "loss": 3.1325,
+      "step": 12640
+    },
+    {
+      "epoch": 0.7339928631523978,
+      "grad_norm": 0.1137159988284111,
+      "learning_rate": 0.00018135010725436968,
+      "loss": 3.1282,
+      "step": 12650
+    },
+    {
+      "epoch": 0.7345730946647713,
+      "grad_norm": 0.10682724416255951,
+      "learning_rate": 0.00018061133402120895,
+      "loss": 3.1168,
+      "step": 12660
+    },
+    {
+      "epoch": 0.7351533261771447,
+      "grad_norm": 0.11520636081695557,
+      "learning_rate": 0.00017987373681057495,
+      "loss": 3.1311,
+      "step": 12670
+    },
+    {
+      "epoch": 0.7357335576895181,
+      "grad_norm": 0.107805997133255,
+      "learning_rate": 0.00017913731833837715,
+      "loss": 3.1157,
+      "step": 12680
+    },
+    {
+      "epoch": 0.7363137892018915,
+      "grad_norm": 0.10552658140659332,
+      "learning_rate": 0.00017840208131618618,
+      "loss": 3.1206,
+      "step": 12690
+    },
+    {
+      "epoch": 0.736894020714265,
+      "grad_norm": 0.10237275809049606,
+      "learning_rate": 0.0001776680284512215,
+      "loss": 3.1185,
+      "step": 12700
+    },
+    {
+      "epoch": 0.7374742522266384,
+      "grad_norm": 0.10909226536750793,
+      "learning_rate": 0.00017693516244634246,
+      "loss": 3.1108,
+      "step": 12710
+    },
+    {
+      "epoch": 0.7380544837390118,
+      "grad_norm": 0.10805969685316086,
+      "learning_rate": 0.00017620348600003898,
+      "loss": 3.1244,
+      "step": 12720
+    },
+    {
+      "epoch": 0.7386347152513854,
+      "grad_norm": 0.1112237498164177,
+      "learning_rate": 0.00017547300180641978,
+      "loss": 3.1242,
+      "step": 12730
+    },
+    {
+      "epoch": 0.7392149467637588,
+      "grad_norm": 0.10815447568893433,
+      "learning_rate": 0.00017474371255520466,
+      "loss": 3.115,
+      "step": 12740
+    },
+    {
+      "epoch": 0.7397951782761322,
+      "grad_norm": 0.10469721257686615,
+      "learning_rate": 0.00017401562093171286,
+      "loss": 3.1276,
+      "step": 12750
+    },
+    {
+      "epoch": 0.7403754097885056,
+      "grad_norm": 0.10945837199687958,
+      "learning_rate": 0.00017328872961685382,
+      "loss": 3.1234,
+      "step": 12760
+    },
+    {
+      "epoch": 0.7409556413008791,
+      "grad_norm": 0.11551317572593689,
+      "learning_rate": 0.00017256304128711807,
+      "loss": 3.1234,
+      "step": 12770
+    },
+    {
+      "epoch": 0.7415358728132525,
+      "grad_norm": 0.10662870854139328,
+      "learning_rate": 0.0001718385586145654,
+      "loss": 3.1193,
+      "step": 12780
+    },
+    {
+      "epoch": 0.7421161043256259,
+      "grad_norm": 0.0992654412984848,
+      "learning_rate": 0.00017111528426681728,
+      "loss": 3.12,
+      "step": 12790
+    },
+    {
+      "epoch": 0.7426963358379993,
+      "grad_norm": 0.10338784754276276,
+      "learning_rate": 0.00017039322090704555,
+      "loss": 3.1162,
+      "step": 12800
+    },
+    {
+      "epoch": 0.7432765673503728,
+      "grad_norm": 0.11413225531578064,
+      "learning_rate": 0.00016967237119396318,
+      "loss": 3.1261,
+      "step": 12810
+    },
+    {
+      "epoch": 0.7438567988627462,
+      "grad_norm": 0.12023269385099411,
+      "learning_rate": 0.00016895273778181426,
+      "loss": 3.1234,
+      "step": 12820
+    },
+    {
+      "epoch": 0.7444370303751197,
+      "grad_norm": 0.10315112769603729,
+      "learning_rate": 0.00016823432332036426,
+      "loss": 3.1175,
+      "step": 12830
+    },
+    {
+      "epoch": 0.7450172618874931,
+      "grad_norm": 0.11159830540418625,
+      "learning_rate": 0.00016751713045489098,
+      "loss": 3.129,
+      "step": 12840
+    },
+    {
+      "epoch": 0.7455974933998666,
+      "grad_norm": 0.10925300419330597,
+      "learning_rate": 0.000166801161826173,
+      "loss": 3.1211,
+      "step": 12850
+    },
+    {
+      "epoch": 0.74617772491224,
+      "grad_norm": 0.10250292718410492,
+      "learning_rate": 0.00016608642007048235,
+      "loss": 3.1262,
+      "step": 12860
+    },
+    {
+      "epoch": 0.7467579564246134,
+      "grad_norm": 0.10880285501480103,
+      "learning_rate": 0.00016537290781957288,
+      "loss": 3.1129,
+      "step": 12870
+    },
+    {
+      "epoch": 0.7473381879369868,
+      "grad_norm": 0.10200098901987076,
+      "learning_rate": 0.00016466062770067124,
+      "loss": 3.1227,
+      "step": 12880
+    },
+    {
+      "epoch": 0.7479184194493603,
+      "grad_norm": 0.10475246608257294,
+      "learning_rate": 0.000163949582336468,
+      "loss": 3.1347,
+      "step": 12890
+    },
+    {
+      "epoch": 0.7484986509617337,
+      "grad_norm": 0.10829997807741165,
+      "learning_rate": 0.00016323977434510594,
+      "loss": 3.1228,
+      "step": 12900
+    },
+    {
+      "epoch": 0.7490788824741071,
+      "grad_norm": 0.10326212644577026,
+      "learning_rate": 0.000162531206340173,
+      "loss": 3.1217,
+      "step": 12910
+    },
+    {
+      "epoch": 0.7496591139864806,
+      "grad_norm": 0.10354658216238022,
+      "learning_rate": 0.0001618238809306906,
+      "loss": 3.1181,
+      "step": 12920
+    },
+    {
+      "epoch": 0.7502393454988541,
+      "grad_norm": 0.10224345326423645,
+      "learning_rate": 0.00016111780072110504,
+      "loss": 3.12,
+      "step": 12930
+    },
+    {
+      "epoch": 0.7508195770112275,
+      "grad_norm": 0.10535353422164917,
+      "learning_rate": 0.00016041296831127756,
+      "loss": 3.1297,
+      "step": 12940
+    },
+    {
+      "epoch": 0.7513998085236009,
+      "grad_norm": 0.11045780032873154,
+      "learning_rate": 0.0001597093862964748,
+      "loss": 3.1183,
+      "step": 12950
+    },
+    {
+      "epoch": 0.7519800400359744,
+      "grad_norm": 0.1009131371974945,
+      "learning_rate": 0.00015900705726735976,
+      "loss": 3.1174,
+      "step": 12960
+    },
+    {
+      "epoch": 0.7525602715483478,
+      "grad_norm": 0.10427884012460709,
+      "learning_rate": 0.00015830598380998134,
+      "loss": 3.1101,
+      "step": 12970
+    },
+    {
+      "epoch": 0.7531405030607212,
+      "grad_norm": 0.11507980525493622,
+      "learning_rate": 0.0001576061685057655,
+      "loss": 3.1247,
+      "step": 12980
+    },
+    {
+      "epoch": 0.7537207345730946,
+      "grad_norm": 0.10372275114059448,
+      "learning_rate": 0.00015690761393150537,
+      "loss": 3.1183,
+      "step": 12990
+    },
+    {
+      "epoch": 0.7543009660854681,
+      "grad_norm": 0.09975118935108185,
+      "learning_rate": 0.00015621032265935203,
+      "loss": 3.1256,
+      "step": 13000
+    },
+    {
+      "epoch": 0.7543009660854681,
+      "eval_loss": 3.050916910171509,
+      "eval_runtime": 3.2621,
+      "eval_samples_per_second": 1327.375,
+      "eval_steps_per_second": 10.423,
+      "step": 13000
+    },
+    {
+      "epoch": 0.7548811975978416,
+      "grad_norm": 0.10384030640125275,
+      "learning_rate": 0.00015551429725680531,
+      "loss": 3.1167,
+      "step": 13010
+    },
+    {
+      "epoch": 0.755461429110215,
+      "grad_norm": 0.10713458061218262,
+      "learning_rate": 0.00015481954028670342,
+      "loss": 3.1206,
+      "step": 13020
+    },
+    {
+      "epoch": 0.7560416606225884,
+      "grad_norm": 0.10227679461240768,
+      "learning_rate": 0.0001541260543072144,
+      "loss": 3.1142,
+      "step": 13030
+    },
+    {
+      "epoch": 0.7566218921349619,
+      "grad_norm": 0.10371891409158707,
+      "learning_rate": 0.00015343384187182612,
+      "loss": 3.12,
+      "step": 13040
+    },
+    {
+      "epoch": 0.7572021236473353,
+      "grad_norm": 0.10193309932947159,
+      "learning_rate": 0.00015274290552933745,
+      "loss": 3.1191,
+      "step": 13050
+    },
+    {
+      "epoch": 0.7577823551597087,
+      "grad_norm": 0.10207639634609222,
+      "learning_rate": 0.00015205324782384817,
+      "loss": 3.1159,
+      "step": 13060
+    },
+    {
+      "epoch": 0.7583625866720821,
+      "grad_norm": 0.1024574562907219,
+      "learning_rate": 0.00015136487129475046,
+      "loss": 3.1155,
+      "step": 13070
+    },
+    {
+      "epoch": 0.7589428181844556,
+      "grad_norm": 0.10583309829235077,
+      "learning_rate": 0.00015067777847671876,
+      "loss": 3.1178,
+      "step": 13080
+    },
+    {
+      "epoch": 0.759523049696829,
+      "grad_norm": 0.10229542851448059,
+      "learning_rate": 0.00014999197189970065,
+      "loss": 3.1168,
+      "step": 13090
+    },
+    {
+      "epoch": 0.7601032812092025,
+      "grad_norm": 0.09870131313800812,
+      "learning_rate": 0.00014930745408890794,
+      "loss": 3.1121,
+      "step": 13100
+    },
+    {
+      "epoch": 0.7606835127215759,
+      "grad_norm": 0.10222458094358444,
+      "learning_rate": 0.00014862422756480687,
+      "loss": 3.128,
+      "step": 13110
+    },
+    {
+      "epoch": 0.7612637442339494,
+      "grad_norm": 0.10863006114959717,
+      "learning_rate": 0.00014794229484310883,
+      "loss": 3.1115,
+      "step": 13120
+    },
+    {
+      "epoch": 0.7618439757463228,
+      "grad_norm": 0.10503353178501129,
+      "learning_rate": 0.00014726165843476202,
+      "loss": 3.1222,
+      "step": 13130
+    },
+    {
+      "epoch": 0.7624242072586962,
+      "grad_norm": 0.09862679988145828,
+      "learning_rate": 0.0001465823208459407,
+      "loss": 3.1138,
+      "step": 13140
+    },
+    {
+      "epoch": 0.7630044387710696,
+      "grad_norm": 0.10148298740386963,
+      "learning_rate": 0.00014590428457803706,
+      "loss": 3.1158,
+      "step": 13150
+    },
+    {
+      "epoch": 0.7635846702834431,
+      "grad_norm": 0.11013112962245941,
+      "learning_rate": 0.00014522755212765176,
+      "loss": 3.1157,
+      "step": 13160
+    },
+    {
+      "epoch": 0.7641649017958165,
+      "grad_norm": 0.10017859935760498,
+      "learning_rate": 0.00014455212598658447,
+      "loss": 3.1264,
+      "step": 13170
+    },
+    {
+      "epoch": 0.7647451333081899,
+      "grad_norm": 0.10435356944799423,
+      "learning_rate": 0.00014387800864182487,
+      "loss": 3.1072,
+      "step": 13180
+    },
+    {
+      "epoch": 0.7653253648205635,
+      "grad_norm": 0.10140141099691391,
+      "learning_rate": 0.00014320520257554397,
+      "loss": 3.1145,
+      "step": 13190
+    },
+    {
+      "epoch": 0.7659055963329369,
+      "grad_norm": 0.09953157603740692,
+      "learning_rate": 0.000142533710265084,
+      "loss": 3.1244,
+      "step": 13200
+    },
+    {
+      "epoch": 0.7664858278453103,
+      "grad_norm": 0.10731340944766998,
+      "learning_rate": 0.00014186353418295006,
+      "loss": 3.1164,
+      "step": 13210
+    },
+    {
+      "epoch": 0.7670660593576837,
+      "grad_norm": 0.097932830452919,
+      "learning_rate": 0.0001411946767968006,
+      "loss": 3.1192,
+      "step": 13220
+    },
+    {
+      "epoch": 0.7676462908700572,
+      "grad_norm": 0.10346731543540955,
+      "learning_rate": 0.00014052714056943849,
+      "loss": 3.1211,
+      "step": 13230
+    },
+    {
+      "epoch": 0.7682265223824306,
+      "grad_norm": 0.10606466978788376,
+      "learning_rate": 0.0001398609279588024,
+      "loss": 3.1154,
+      "step": 13240
+    },
+    {
+      "epoch": 0.768806753894804,
+      "grad_norm": 0.10412232577800751,
+      "learning_rate": 0.00013919604141795667,
+      "loss": 3.1164,
+      "step": 13250
+    },
+    {
+      "epoch": 0.7693869854071774,
+      "grad_norm": 0.09943995624780655,
+      "learning_rate": 0.0001385324833950833,
+      "loss": 3.1195,
+      "step": 13260
+    },
+    {
+      "epoch": 0.769967216919551,
+      "grad_norm": 0.098769411444664,
+      "learning_rate": 0.00013787025633347239,
+      "loss": 3.1183,
+      "step": 13270
+    },
+    {
+      "epoch": 0.7705474484319244,
+      "grad_norm": 0.11205046623945236,
+      "learning_rate": 0.00013720936267151324,
+      "loss": 3.12,
+      "step": 13280
+    },
+    {
+      "epoch": 0.7711276799442978,
+      "grad_norm": 0.10622609406709671,
+      "learning_rate": 0.00013654980484268598,
+      "loss": 3.1139,
+      "step": 13290
+    },
+    {
+      "epoch": 0.7717079114566712,
+      "grad_norm": 0.10065792500972748,
+      "learning_rate": 0.00013589158527555094,
+      "loss": 3.1104,
+      "step": 13300
+    },
+    {
+      "epoch": 0.7722881429690447,
+      "grad_norm": 0.11066281795501709,
+      "learning_rate": 0.0001352347063937422,
+      "loss": 3.1149,
+      "step": 13310
+    },
+    {
+      "epoch": 0.7728683744814181,
+      "grad_norm": 0.10775715857744217,
+      "learning_rate": 0.0001345791706159562,
+      "loss": 3.1172,
+      "step": 13320
+    },
+    {
+      "epoch": 0.7734486059937915,
+      "grad_norm": 0.0999317467212677,
+      "learning_rate": 0.0001339249803559444,
+      "loss": 3.118,
+      "step": 13330
+    },
+    {
+      "epoch": 0.7740288375061649,
+      "grad_norm": 0.10435137152671814,
+      "learning_rate": 0.0001332721380225042,
+      "loss": 3.1238,
+      "step": 13340
+    },
+    {
+      "epoch": 0.7746090690185384,
+      "grad_norm": 0.10001866519451141,
+      "learning_rate": 0.00013262064601946895,
+      "loss": 3.1035,
+      "step": 13350
+    },
+    {
+      "epoch": 0.7751893005309118,
+      "grad_norm": 0.10652778297662735,
+      "learning_rate": 0.00013197050674570077,
+      "loss": 3.1129,
+      "step": 13360
+    },
+    {
+      "epoch": 0.7757695320432852,
+      "grad_norm": 0.09261602908372879,
+      "learning_rate": 0.00013132172259508058,
+      "loss": 3.1256,
+      "step": 13370
+    },
+    {
+      "epoch": 0.7763497635556587,
+      "grad_norm": 0.1106601282954216,
+      "learning_rate": 0.0001306742959564995,
+      "loss": 3.1256,
+      "step": 13380
+    },
+    {
+      "epoch": 0.7769299950680322,
+      "grad_norm": 0.11139431595802307,
+      "learning_rate": 0.0001300282292138502,
+      "loss": 3.1171,
+      "step": 13390
+    },
+    {
+      "epoch": 0.7775102265804056,
+      "grad_norm": 0.09999184310436249,
+      "learning_rate": 0.00012938352474601805,
+      "loss": 3.1173,
+      "step": 13400
+    },
+    {
+      "epoch": 0.778090458092779,
+      "grad_norm": 0.10087510198354721,
+      "learning_rate": 0.0001287401849268728,
+      "loss": 3.1224,
+      "step": 13410
+    },
+    {
+      "epoch": 0.7786706896051525,
+      "grad_norm": 0.10008762031793594,
+      "learning_rate": 0.0001280982121252585,
+      "loss": 3.117,
+      "step": 13420
+    },
+    {
+      "epoch": 0.7792509211175259,
+      "grad_norm": 0.10388967394828796,
+      "learning_rate": 0.0001274576087049868,
+      "loss": 3.105,
+      "step": 13430
+    },
+    {
+      "epoch": 0.7798311526298993,
+      "grad_norm": 0.10136213898658752,
+      "learning_rate": 0.0001268183770248263,
+      "loss": 3.1128,
+      "step": 13440
+    },
+    {
+      "epoch": 0.7804113841422727,
+      "grad_norm": 0.09719151258468628,
+      "learning_rate": 0.0001261805194384949,
+      "loss": 3.1094,
+      "step": 13450
+    },
+    {
+      "epoch": 0.7809916156546463,
+      "grad_norm": 0.10660874843597412,
+      "learning_rate": 0.00012554403829465155,
+      "loss": 3.1207,
+      "step": 13460
+    },
+    {
+      "epoch": 0.7815718471670197,
+      "grad_norm": 0.10400804132223129,
+      "learning_rate": 0.00012490893593688584,
+      "loss": 3.1109,
+      "step": 13470
+    },
+    {
+      "epoch": 0.7821520786793931,
+      "grad_norm": 0.10217728465795517,
+      "learning_rate": 0.00012427521470371173,
+      "loss": 3.1128,
+      "step": 13480
+    },
+    {
+      "epoch": 0.7827323101917665,
+      "grad_norm": 0.10613488405942917,
+      "learning_rate": 0.0001236428769285569,
+      "loss": 3.1091,
+      "step": 13490
+    },
+    {
+      "epoch": 0.78331254170414,
+      "grad_norm": 0.09936373680830002,
+      "learning_rate": 0.00012301192493975526,
+      "loss": 3.1107,
+      "step": 13500
+    },
+    {
+      "epoch": 0.7838927732165134,
+      "grad_norm": 0.1033458337187767,
+      "learning_rate": 0.00012238236106053852,
+      "loss": 3.1209,
+      "step": 13510
+    },
+    {
+      "epoch": 0.7844730047288868,
+      "grad_norm": 0.10691102594137192,
+      "learning_rate": 0.00012175418760902617,
+      "loss": 3.1077,
+      "step": 13520
+    },
+    {
+      "epoch": 0.7850532362412602,
+      "grad_norm": 0.11017080396413803,
+      "learning_rate": 0.00012112740689821921,
+      "loss": 3.119,
+      "step": 13530
+    },
+    {
+      "epoch": 0.7856334677536337,
+      "grad_norm": 0.10583353787660599,
+      "learning_rate": 0.00012050202123598974,
+      "loss": 3.1136,
+      "step": 13540
+    },
+    {
+      "epoch": 0.7862136992660071,
+      "grad_norm": 0.10265874862670898,
+      "learning_rate": 0.00011987803292507305,
+      "loss": 3.1122,
+      "step": 13550
+    },
+    {
+      "epoch": 0.7867939307783806,
+      "grad_norm": 0.09812895208597183,
+      "learning_rate": 0.00011925544426305996,
+      "loss": 3.11,
+      "step": 13560
+    },
+    {
+      "epoch": 0.787374162290754,
+      "grad_norm": 0.10639332979917526,
+      "learning_rate": 0.00011863425754238655,
+      "loss": 3.1162,
+      "step": 13570
+    },
+    {
+      "epoch": 0.7879543938031275,
+      "grad_norm": 0.10056709498167038,
+      "learning_rate": 0.00011801447505032786,
+      "loss": 3.1108,
+      "step": 13580
+    },
+    {
+      "epoch": 0.7885346253155009,
+      "grad_norm": 0.1005856990814209,
+      "learning_rate": 0.00011739609906898774,
+      "loss": 3.1051,
+      "step": 13590
+    },
+    {
+      "epoch": 0.7891148568278743,
+      "grad_norm": 0.10471539199352264,
+      "learning_rate": 0.00011677913187529126,
+      "loss": 3.1174,
+      "step": 13600
+    },
+    {
+      "epoch": 0.7896950883402477,
+      "grad_norm": 0.10265914350748062,
+      "learning_rate": 0.0001161635757409767,
+      "loss": 3.1132,
+      "step": 13610
+    },
+    {
+      "epoch": 0.7902753198526212,
+      "grad_norm": 0.10047253966331482,
+      "learning_rate": 0.00011554943293258557,
+      "loss": 3.1144,
+      "step": 13620
+    },
+    {
+      "epoch": 0.7908555513649946,
+      "grad_norm": 0.10369555652141571,
+      "learning_rate": 0.00011493670571145665,
+      "loss": 3.1165,
+      "step": 13630
+    },
+    {
+      "epoch": 0.791435782877368,
+      "grad_norm": 0.10994482040405273,
+      "learning_rate": 0.0001143253963337152,
+      "loss": 3.1099,
+      "step": 13640
+    },
+    {
+      "epoch": 0.7920160143897415,
+      "grad_norm": 0.10117276012897491,
+      "learning_rate": 0.00011371550705026673,
+      "loss": 3.1207,
+      "step": 13650
+    },
+    {
+      "epoch": 0.792596245902115,
+      "grad_norm": 0.10518882423639297,
+      "learning_rate": 0.00011310704010678747,
+      "loss": 3.0989,
+      "step": 13660
+    },
+    {
+      "epoch": 0.7931764774144884,
+      "grad_norm": 0.10278623551130295,
+      "learning_rate": 0.00011249999774371621,
+      "loss": 3.1032,
+      "step": 13670
+    },
+    {
+      "epoch": 0.7937567089268618,
+      "grad_norm": 0.10157769918441772,
+      "learning_rate": 0.00011189438219624698,
+      "loss": 3.1141,
+      "step": 13680
+    },
+    {
+      "epoch": 0.7943369404392353,
+      "grad_norm": 0.10142907500267029,
+      "learning_rate": 0.00011129019569431908,
+      "loss": 3.1123,
+      "step": 13690
+    },
+    {
+      "epoch": 0.7949171719516087,
+      "grad_norm": 0.10015449672937393,
+      "learning_rate": 0.00011068744046261098,
+      "loss": 3.1125,
+      "step": 13700
+    },
+    {
+      "epoch": 0.7954974034639821,
+      "grad_norm": 0.10350023210048676,
+      "learning_rate": 0.00011008611872053037,
+      "loss": 3.1038,
+      "step": 13710
+    },
+    {
+      "epoch": 0.7960776349763555,
+      "grad_norm": 0.10056246072053909,
+      "learning_rate": 0.00010948623268220676,
+      "loss": 3.1087,
+      "step": 13720
+    },
+    {
+      "epoch": 0.796657866488729,
+      "grad_norm": 0.09896915405988693,
+      "learning_rate": 0.00010888778455648391,
+      "loss": 3.1132,
+      "step": 13730
+    },
+    {
+      "epoch": 0.7972380980011025,
+      "grad_norm": 0.10385739803314209,
+      "learning_rate": 0.00010829077654690983,
+      "loss": 3.1183,
+      "step": 13740
+    },
+    {
+      "epoch": 0.7978183295134759,
+      "grad_norm": 0.09953512251377106,
+      "learning_rate": 0.000107695210851731,
+      "loss": 3.1125,
+      "step": 13750
+    },
+    {
+      "epoch": 0.7983985610258493,
+      "grad_norm": 0.09491749107837677,
+      "learning_rate": 0.00010710108966388266,
+      "loss": 3.1131,
+      "step": 13760
+    },
+    {
+      "epoch": 0.7989787925382228,
+      "grad_norm": 0.09977880120277405,
+      "learning_rate": 0.00010650841517098115,
+      "loss": 3.121,
+      "step": 13770
+    },
+    {
+      "epoch": 0.7995590240505962,
+      "grad_norm": 0.10586149990558624,
+      "learning_rate": 0.00010591718955531605,
+      "loss": 3.1175,
+      "step": 13780
+    },
+    {
+      "epoch": 0.8001392555629696,
+      "grad_norm": 0.10209766030311584,
+      "learning_rate": 0.0001053274149938419,
+      "loss": 3.1164,
+      "step": 13790
+    },
+    {
+      "epoch": 0.800719487075343,
+      "grad_norm": 0.10039076209068298,
+      "learning_rate": 0.0001047390936581707,
+      "loss": 3.1094,
+      "step": 13800
+    },
+    {
+      "epoch": 0.8012997185877165,
+      "grad_norm": 0.10035811364650726,
+      "learning_rate": 0.00010415222771456307,
+      "loss": 3.1173,
+      "step": 13810
+    },
+    {
+      "epoch": 0.8018799501000899,
+      "grad_norm": 0.09645077586174011,
+      "learning_rate": 0.00010356681932392093,
+      "loss": 3.1097,
+      "step": 13820
+    },
+    {
+      "epoch": 0.8024601816124634,
+      "grad_norm": 0.10422459989786148,
+      "learning_rate": 0.0001029828706417793,
+      "loss": 3.1142,
+      "step": 13830
+    },
+    {
+      "epoch": 0.8030404131248368,
+      "grad_norm": 0.10029245167970657,
+      "learning_rate": 0.0001024003838182982,
+      "loss": 3.1054,
+      "step": 13840
+    },
+    {
+      "epoch": 0.8036206446372103,
+      "grad_norm": 0.09637358039617538,
+      "learning_rate": 0.00010181936099825551,
+      "loss": 3.1093,
+      "step": 13850
+    },
+    {
+      "epoch": 0.8042008761495837,
+      "grad_norm": 0.10224739462137222,
+      "learning_rate": 0.00010123980432103791,
+      "loss": 3.1085,
+      "step": 13860
+    },
+    {
+      "epoch": 0.8047811076619571,
+      "grad_norm": 0.09893719106912613,
+      "learning_rate": 0.00010066171592063377,
+      "loss": 3.1045,
+      "step": 13870
+    },
+    {
+      "epoch": 0.8053613391743305,
+      "grad_norm": 0.09696366637945175,
+      "learning_rate": 0.00010008509792562525,
+      "loss": 3.1068,
+      "step": 13880
+    },
+    {
+      "epoch": 0.805941570686704,
+      "grad_norm": 0.09792386740446091,
+      "learning_rate": 9.950995245918016e-05,
+      "loss": 3.1193,
+      "step": 13890
+    },
+    {
+      "epoch": 0.8065218021990774,
+      "grad_norm": 0.09721978008747101,
+      "learning_rate": 9.893628163904417e-05,
+      "loss": 3.1135,
+      "step": 13900
+    },
+    {
+      "epoch": 0.8071020337114508,
+      "grad_norm": 0.102999746799469,
+      "learning_rate": 9.836408757753363e-05,
+      "loss": 3.1162,
+      "step": 13910
+    },
+    {
+      "epoch": 0.8076822652238244,
+      "grad_norm": 0.10018763691186905,
+      "learning_rate": 9.779337238152697e-05,
+      "loss": 3.1185,
+      "step": 13920
+    },
+    {
+      "epoch": 0.8082624967361978,
+      "grad_norm": 0.09860005974769592,
+      "learning_rate": 9.722413815245717e-05,
+      "loss": 3.1131,
+      "step": 13930
+    },
+    {
+      "epoch": 0.8088427282485712,
+      "grad_norm": 0.09546367824077606,
+      "learning_rate": 9.665638698630442e-05,
+      "loss": 3.1123,
+      "step": 13940
+    },
+    {
+      "epoch": 0.8094229597609446,
+      "grad_norm": 0.09807273745536804,
+      "learning_rate": 9.6090120973588e-05,
+      "loss": 3.1063,
+      "step": 13950
+    },
+    {
+      "epoch": 0.8100031912733181,
+      "grad_norm": 0.10019023716449738,
+      "learning_rate": 9.552534219935844e-05,
+      "loss": 3.1155,
+      "step": 13960
+    },
+    {
+      "epoch": 0.8105834227856915,
+      "grad_norm": 0.1006435975432396,
+      "learning_rate": 9.496205274319069e-05,
+      "loss": 3.106,
+      "step": 13970
+    },
+    {
+      "epoch": 0.8111636542980649,
+      "grad_norm": 0.10411754250526428,
+      "learning_rate": 9.44002546791754e-05,
+      "loss": 3.1155,
+      "step": 13980
+    },
+    {
+      "epoch": 0.8117438858104383,
+      "grad_norm": 0.1065743938088417,
+      "learning_rate": 9.38399500759119e-05,
+      "loss": 3.1075,
+      "step": 13990
+    },
+    {
+      "epoch": 0.8123241173228118,
+      "grad_norm": 0.09786444157361984,
+      "learning_rate": 9.328114099650042e-05,
+      "loss": 3.1073,
+      "step": 14000
+    },
+    {
+      "epoch": 0.8123241173228118,
+      "eval_loss": 3.041724681854248,
+      "eval_runtime": 3.2609,
+      "eval_samples_per_second": 1327.855,
+      "eval_steps_per_second": 10.427,
+      "step": 14000
+    },
+    {
+      "epoch": 0.8129043488351853,
+      "grad_norm": 0.09952949732542038,
+      "learning_rate": 9.272382949853453e-05,
+      "loss": 3.1102,
+      "step": 14010
+    },
+    {
+      "epoch": 0.8134845803475587,
+      "grad_norm": 0.0931655615568161,
+      "learning_rate": 9.216801763409343e-05,
+      "loss": 3.1085,
+      "step": 14020
+    },
+    {
+      "epoch": 0.8140648118599321,
+      "grad_norm": 0.09814907610416412,
+      "learning_rate": 9.161370744973491e-05,
+      "loss": 3.1011,
+      "step": 14030
+    },
+    {
+      "epoch": 0.8146450433723056,
+      "grad_norm": 0.09730423241853714,
+      "learning_rate": 9.106090098648696e-05,
+      "loss": 3.1048,
+      "step": 14040
+    },
+    {
+      "epoch": 0.815225274884679,
+      "grad_norm": 0.0960068479180336,
+      "learning_rate": 9.05096002798409e-05,
+      "loss": 3.1144,
+      "step": 14050
+    },
+    {
+      "epoch": 0.8158055063970524,
+      "grad_norm": 0.0986780971288681,
+      "learning_rate": 8.995980735974369e-05,
+      "loss": 3.1092,
+      "step": 14060
+    },
+    {
+      "epoch": 0.8163857379094258,
+      "grad_norm": 0.09585044533014297,
+      "learning_rate": 8.941152425059034e-05,
+      "loss": 3.1125,
+      "step": 14070
+    },
+    {
+      "epoch": 0.8169659694217993,
+      "grad_norm": 0.10179416090250015,
+      "learning_rate": 8.886475297121693e-05,
+      "loss": 3.1041,
+      "step": 14080
+    },
+    {
+      "epoch": 0.8175462009341727,
+      "grad_norm": 0.0965443029999733,
+      "learning_rate": 8.831949553489249e-05,
+      "loss": 3.1132,
+      "step": 14090
+    },
+    {
+      "epoch": 0.8181264324465461,
+      "grad_norm": 0.0964798629283905,
+      "learning_rate": 8.777575394931198e-05,
+      "loss": 3.1103,
+      "step": 14100
+    },
+    {
+      "epoch": 0.8187066639589196,
+      "grad_norm": 0.11066204309463501,
+      "learning_rate": 8.723353021658892e-05,
+      "loss": 3.105,
+      "step": 14110
+    },
+    {
+      "epoch": 0.8192868954712931,
+      "grad_norm": 0.09815254807472229,
+      "learning_rate": 8.669282633324776e-05,
+      "loss": 3.1088,
+      "step": 14120
+    },
+    {
+      "epoch": 0.8198671269836665,
+      "grad_norm": 0.10448320209980011,
+      "learning_rate": 8.615364429021722e-05,
+      "loss": 3.0998,
+      "step": 14130
+    },
+    {
+      "epoch": 0.8204473584960399,
+      "grad_norm": 0.10330579429864883,
+      "learning_rate": 8.56159860728215e-05,
+      "loss": 3.1101,
+      "step": 14140
+    },
+    {
+      "epoch": 0.8210275900084134,
+      "grad_norm": 0.10102424770593643,
+      "learning_rate": 8.507985366077493e-05,
+      "loss": 3.1033,
+      "step": 14150
+    },
+    {
+      "epoch": 0.8216078215207868,
+      "grad_norm": 0.09644920378923416,
+      "learning_rate": 8.454524902817312e-05,
+      "loss": 3.1087,
+      "step": 14160
+    },
+    {
+      "epoch": 0.8221880530331602,
+      "grad_norm": 0.09765134006738663,
+      "learning_rate": 8.401217414348611e-05,
+      "loss": 3.0975,
+      "step": 14170
+    },
+    {
+      "epoch": 0.8227682845455336,
+      "grad_norm": 0.09793014079332352,
+      "learning_rate": 8.348063096955188e-05,
+      "loss": 3.116,
+      "step": 14180
+    },
+    {
+      "epoch": 0.8233485160579072,
+      "grad_norm": 0.09745863080024719,
+      "learning_rate": 8.295062146356763e-05,
+      "loss": 3.1123,
+      "step": 14190
+    },
+    {
+      "epoch": 0.8239287475702806,
+      "grad_norm": 0.10549872368574142,
+      "learning_rate": 8.242214757708416e-05,
+      "loss": 3.1137,
+      "step": 14200
+    },
+    {
+      "epoch": 0.824508979082654,
+      "grad_norm": 0.10081037133932114,
+      "learning_rate": 8.18952112559977e-05,
+      "loss": 3.1119,
+      "step": 14210
+    },
+    {
+      "epoch": 0.8250892105950274,
+      "grad_norm": 0.10452466458082199,
+      "learning_rate": 8.136981444054281e-05,
+      "loss": 3.108,
+      "step": 14220
+    },
+    {
+      "epoch": 0.8256694421074009,
+      "grad_norm": 0.10242857784032822,
+      "learning_rate": 8.084595906528574e-05,
+      "loss": 3.1052,
+      "step": 14230
+    },
+    {
+      "epoch": 0.8262496736197743,
+      "grad_norm": 0.10497426986694336,
+      "learning_rate": 8.032364705911665e-05,
+      "loss": 3.1,
+      "step": 14240
+    },
+    {
+      "epoch": 0.8268299051321477,
+      "grad_norm": 0.0955236405134201,
+      "learning_rate": 7.980288034524353e-05,
+      "loss": 3.1138,
+      "step": 14250
+    },
+    {
+      "epoch": 0.8274101366445211,
+      "grad_norm": 0.10172264277935028,
+      "learning_rate": 7.928366084118338e-05,
+      "loss": 3.0993,
+      "step": 14260
+    },
+    {
+      "epoch": 0.8279903681568946,
+      "grad_norm": 0.09582705050706863,
+      "learning_rate": 7.87659904587572e-05,
+      "loss": 3.1224,
+      "step": 14270
+    },
+    {
+      "epoch": 0.828570599669268,
+      "grad_norm": 0.10169988870620728,
+      "learning_rate": 7.824987110408149e-05,
+      "loss": 3.1154,
+      "step": 14280
+    },
+    {
+      "epoch": 0.8291508311816415,
+      "grad_norm": 0.09703046083450317,
+      "learning_rate": 7.773530467756168e-05,
+      "loss": 3.1,
+      "step": 14290
+    },
+    {
+      "epoch": 0.8297310626940149,
+      "grad_norm": 0.09659979492425919,
+      "learning_rate": 7.722229307388551e-05,
+      "loss": 3.1027,
+      "step": 14300
+    },
+    {
+      "epoch": 0.8303112942063884,
+      "grad_norm": 0.10455524176359177,
+      "learning_rate": 7.671083818201502e-05,
+      "loss": 3.1086,
+      "step": 14310
+    },
+    {
+      "epoch": 0.8308915257187618,
+      "grad_norm": 0.09594230353832245,
+      "learning_rate": 7.620094188518112e-05,
+      "loss": 3.098,
+      "step": 14320
+    },
+    {
+      "epoch": 0.8314717572311352,
+      "grad_norm": 0.0956626608967781,
+      "learning_rate": 7.569260606087518e-05,
+      "loss": 3.0967,
+      "step": 14330
+    },
+    {
+      "epoch": 0.8320519887435086,
+      "grad_norm": 0.09703920781612396,
+      "learning_rate": 7.518583258084288e-05,
+      "loss": 3.1088,
+      "step": 14340
+    },
+    {
+      "epoch": 0.8326322202558821,
+      "grad_norm": 0.09883217513561249,
+      "learning_rate": 7.468062331107761e-05,
+      "loss": 3.1125,
+      "step": 14350
+    },
+    {
+      "epoch": 0.8332124517682555,
+      "grad_norm": 0.09501095116138458,
+      "learning_rate": 7.417698011181234e-05,
+      "loss": 3.1007,
+      "step": 14360
+    },
+    {
+      "epoch": 0.8337926832806289,
+      "grad_norm": 0.09835023432970047,
+      "learning_rate": 7.367490483751448e-05,
+      "loss": 3.103,
+      "step": 14370
+    },
+    {
+      "epoch": 0.8343729147930025,
+      "grad_norm": 0.10247199982404709,
+      "learning_rate": 7.317439933687764e-05,
+      "loss": 3.1054,
+      "step": 14380
+    },
+    {
+      "epoch": 0.8349531463053759,
+      "grad_norm": 0.10141029953956604,
+      "learning_rate": 7.267546545281544e-05,
+      "loss": 3.1124,
+      "step": 14390
+    },
+    {
+      "epoch": 0.8355333778177493,
+      "grad_norm": 0.09786387532949448,
+      "learning_rate": 7.217810502245498e-05,
+      "loss": 3.1143,
+      "step": 14400
+    },
+    {
+      "epoch": 0.8361136093301227,
+      "grad_norm": 0.10012129694223404,
+      "learning_rate": 7.168231987712903e-05,
+      "loss": 3.1133,
+      "step": 14410
+    },
+    {
+      "epoch": 0.8366938408424962,
+      "grad_norm": 0.10030212253332138,
+      "learning_rate": 7.118811184237078e-05,
+      "loss": 3.1001,
+      "step": 14420
+    },
+    {
+      "epoch": 0.8372740723548696,
+      "grad_norm": 0.09827015548944473,
+      "learning_rate": 7.069548273790588e-05,
+      "loss": 3.1031,
+      "step": 14430
+    },
+    {
+      "epoch": 0.837854303867243,
+      "grad_norm": 0.09829414635896683,
+      "learning_rate": 7.020443437764629e-05,
+      "loss": 3.1095,
+      "step": 14440
+    },
+    {
+      "epoch": 0.8384345353796164,
+      "grad_norm": 0.0953378975391388,
+      "learning_rate": 6.971496856968351e-05,
+      "loss": 3.1009,
+      "step": 14450
+    },
+    {
+      "epoch": 0.83901476689199,
+      "grad_norm": 0.09821732342243195,
+      "learning_rate": 6.922708711628183e-05,
+      "loss": 3.1148,
+      "step": 14460
+    },
+    {
+      "epoch": 0.8395949984043634,
+      "grad_norm": 0.09834583848714828,
+      "learning_rate": 6.874079181387221e-05,
+      "loss": 3.1015,
+      "step": 14470
+    },
+    {
+      "epoch": 0.8401752299167368,
+      "grad_norm": 0.09869256615638733,
+      "learning_rate": 6.825608445304443e-05,
+      "loss": 3.1101,
+      "step": 14480
+    },
+    {
+      "epoch": 0.8407554614291102,
+      "grad_norm": 0.10293745249509811,
+      "learning_rate": 6.777296681854206e-05,
+      "loss": 3.1056,
+      "step": 14490
+    },
+    {
+      "epoch": 0.8413356929414837,
+      "grad_norm": 0.09749376773834229,
+      "learning_rate": 6.72914406892548e-05,
+      "loss": 3.1106,
+      "step": 14500
+    },
+    {
+      "epoch": 0.8419159244538571,
+      "grad_norm": 0.09520737081766129,
+      "learning_rate": 6.681150783821222e-05,
+      "loss": 3.1085,
+      "step": 14510
+    },
+    {
+      "epoch": 0.8424961559662305,
+      "grad_norm": 0.09628592431545258,
+      "learning_rate": 6.633317003257755e-05,
+      "loss": 3.1083,
+      "step": 14520
+    },
+    {
+      "epoch": 0.8430763874786039,
+      "grad_norm": 0.09358352422714233,
+      "learning_rate": 6.585642903364036e-05,
+      "loss": 3.1113,
+      "step": 14530
+    },
+    {
+      "epoch": 0.8436566189909774,
+      "grad_norm": 0.09465614706277847,
+      "learning_rate": 6.538128659681131e-05,
+      "loss": 3.1141,
+      "step": 14540
+    },
+    {
+      "epoch": 0.8442368505033508,
+      "grad_norm": 0.09725864231586456,
+      "learning_rate": 6.490774447161441e-05,
+      "loss": 3.1104,
+      "step": 14550
+    },
+    {
+      "epoch": 0.8448170820157243,
+      "grad_norm": 0.09526196122169495,
+      "learning_rate": 6.443580440168146e-05,
+      "loss": 3.1165,
+      "step": 14560
+    },
+    {
+      "epoch": 0.8453973135280977,
+      "grad_norm": 0.09678266197443008,
+      "learning_rate": 6.396546812474519e-05,
+      "loss": 3.1012,
+      "step": 14570
+    },
+    {
+      "epoch": 0.8459775450404712,
+      "grad_norm": 0.0973704531788826,
+      "learning_rate": 6.349673737263295e-05,
+      "loss": 3.1026,
+      "step": 14580
+    },
+    {
+      "epoch": 0.8465577765528446,
+      "grad_norm": 0.09803210198879242,
+      "learning_rate": 6.302961387126066e-05,
+      "loss": 3.1056,
+      "step": 14590
+    },
+    {
+      "epoch": 0.847138008065218,
+      "grad_norm": 0.09953798353672028,
+      "learning_rate": 6.256409934062595e-05,
+      "loss": 3.1067,
+      "step": 14600
+    },
+    {
+      "epoch": 0.8477182395775914,
+      "grad_norm": 0.10524503886699677,
+      "learning_rate": 6.2100195494802e-05,
+      "loss": 3.1031,
+      "step": 14610
+    },
+    {
+      "epoch": 0.8482984710899649,
+      "grad_norm": 0.09302034974098206,
+      "learning_rate": 6.163790404193148e-05,
+      "loss": 3.1096,
+      "step": 14620
+    },
+    {
+      "epoch": 0.8488787026023383,
+      "grad_norm": 0.09579528868198395,
+      "learning_rate": 6.117722668421971e-05,
+      "loss": 3.1069,
+      "step": 14630
+    },
+    {
+      "epoch": 0.8494589341147117,
+      "grad_norm": 0.09332197159528732,
+      "learning_rate": 6.071816511792932e-05,
+      "loss": 3.1117,
+      "step": 14640
+    },
+    {
+      "epoch": 0.8500391656270853,
+      "grad_norm": 0.09204788506031036,
+      "learning_rate": 6.0260721033372876e-05,
+      "loss": 3.0956,
+      "step": 14650
+    },
+    {
+      "epoch": 0.8506193971394587,
+      "grad_norm": 0.09581390768289566,
+      "learning_rate": 5.980489611490747e-05,
+      "loss": 3.098,
+      "step": 14660
+    },
+    {
+      "epoch": 0.8511996286518321,
+      "grad_norm": 0.09121184796094894,
+      "learning_rate": 5.935069204092819e-05,
+      "loss": 3.112,
+      "step": 14670
+    },
+    {
+      "epoch": 0.8517798601642055,
+      "grad_norm": 0.09455008059740067,
+      "learning_rate": 5.889811048386201e-05,
+      "loss": 3.1009,
+      "step": 14680
+    },
+    {
+      "epoch": 0.852360091676579,
+      "grad_norm": 0.09240598976612091,
+      "learning_rate": 5.8447153110161524e-05,
+      "loss": 3.1075,
+      "step": 14690
+    },
+    {
+      "epoch": 0.8529403231889524,
+      "grad_norm": 0.09437933564186096,
+      "learning_rate": 5.7997821580299256e-05,
+      "loss": 3.0997,
+      "step": 14700
+    },
+    {
+      "epoch": 0.8535205547013258,
+      "grad_norm": 0.0974365696310997,
+      "learning_rate": 5.755011754876088e-05,
+      "loss": 3.0986,
+      "step": 14710
+    },
+    {
+      "epoch": 0.8541007862136992,
+      "grad_norm": 0.0968250036239624,
+      "learning_rate": 5.710404266403951e-05,
+      "loss": 3.1132,
+      "step": 14720
+    },
+    {
+      "epoch": 0.8546810177260727,
+      "grad_norm": 0.09723575413227081,
+      "learning_rate": 5.665959856862962e-05,
+      "loss": 3.1009,
+      "step": 14730
+    },
+    {
+      "epoch": 0.8552612492384462,
+      "grad_norm": 0.09329159557819366,
+      "learning_rate": 5.621678689902077e-05,
+      "loss": 3.1138,
+      "step": 14740
+    },
+    {
+      "epoch": 0.8558414807508196,
+      "grad_norm": 0.09336376190185547,
+      "learning_rate": 5.57756092856922e-05,
+      "loss": 3.0973,
+      "step": 14750
+    },
+    {
+      "epoch": 0.856421712263193,
+      "grad_norm": 0.0969925969839096,
+      "learning_rate": 5.5336067353105976e-05,
+      "loss": 3.0939,
+      "step": 14760
+    },
+    {
+      "epoch": 0.8570019437755665,
+      "grad_norm": 0.09060470759868622,
+      "learning_rate": 5.489816271970149e-05,
+      "loss": 3.1113,
+      "step": 14770
+    },
+    {
+      "epoch": 0.8575821752879399,
+      "grad_norm": 0.09199319034814835,
+      "learning_rate": 5.4461896997889505e-05,
+      "loss": 3.1083,
+      "step": 14780
+    },
+    {
+      "epoch": 0.8581624068003133,
+      "grad_norm": 0.09637030959129333,
+      "learning_rate": 5.402727179404615e-05,
+      "loss": 3.1091,
+      "step": 14790
+    },
+    {
+      "epoch": 0.8587426383126867,
+      "grad_norm": 0.0952460765838623,
+      "learning_rate": 5.359428870850691e-05,
+      "loss": 3.1057,
+      "step": 14800
+    },
+    {
+      "epoch": 0.8593228698250602,
+      "grad_norm": 0.09476283192634583,
+      "learning_rate": 5.316294933556076e-05,
+      "loss": 3.1085,
+      "step": 14810
+    },
+    {
+      "epoch": 0.8599031013374336,
+      "grad_norm": 0.0928397923707962,
+      "learning_rate": 5.273325526344469e-05,
+      "loss": 3.0943,
+      "step": 14820
+    },
+    {
+      "epoch": 0.860483332849807,
+      "grad_norm": 0.09518643468618393,
+      "learning_rate": 5.230520807433714e-05,
+      "loss": 3.0964,
+      "step": 14830
+    },
+    {
+      "epoch": 0.8610635643621805,
+      "grad_norm": 0.09664203971624374,
+      "learning_rate": 5.187880934435274e-05,
+      "loss": 3.1037,
+      "step": 14840
+    },
+    {
+      "epoch": 0.861643795874554,
+      "grad_norm": 0.0946909636259079,
+      "learning_rate": 5.145406064353631e-05,
+      "loss": 3.0976,
+      "step": 14850
+    },
+    {
+      "epoch": 0.8622240273869274,
+      "grad_norm": 0.09507571905851364,
+      "learning_rate": 5.10309635358569e-05,
+      "loss": 3.0998,
+      "step": 14860
+    },
+    {
+      "epoch": 0.8628042588993008,
+      "grad_norm": 0.09289544820785522,
+      "learning_rate": 5.060951957920257e-05,
+      "loss": 3.1094,
+      "step": 14870
+    },
+    {
+      "epoch": 0.8633844904116743,
+      "grad_norm": 0.09258411824703217,
+      "learning_rate": 5.018973032537411e-05,
+      "loss": 3.1052,
+      "step": 14880
+    },
+    {
+      "epoch": 0.8639647219240477,
+      "grad_norm": 0.09440912306308746,
+      "learning_rate": 4.977159732007941e-05,
+      "loss": 3.1092,
+      "step": 14890
+    },
+    {
+      "epoch": 0.8645449534364211,
+      "grad_norm": 0.09835471212863922,
+      "learning_rate": 4.935512210292814e-05,
+      "loss": 3.0988,
+      "step": 14900
+    },
+    {
+      "epoch": 0.8651251849487945,
+      "grad_norm": 0.09666918218135834,
+      "learning_rate": 4.894030620742545e-05,
+      "loss": 3.1009,
+      "step": 14910
+    },
+    {
+      "epoch": 0.865705416461168,
+      "grad_norm": 0.0979539081454277,
+      "learning_rate": 4.8527151160967286e-05,
+      "loss": 3.0995,
+      "step": 14920
+    },
+    {
+      "epoch": 0.8662856479735415,
+      "grad_norm": 0.0943804681301117,
+      "learning_rate": 4.81156584848334e-05,
+      "loss": 3.1054,
+      "step": 14930
+    },
+    {
+      "epoch": 0.8668658794859149,
+      "grad_norm": 0.09513070434331894,
+      "learning_rate": 4.770582969418319e-05,
+      "loss": 3.1108,
+      "step": 14940
+    },
+    {
+      "epoch": 0.8674461109982883,
+      "grad_norm": 0.09076978266239166,
+      "learning_rate": 4.7297666298049156e-05,
+      "loss": 3.1028,
+      "step": 14950
+    },
+    {
+      "epoch": 0.8680263425106618,
+      "grad_norm": 0.09252411872148514,
+      "learning_rate": 4.6891169799331614e-05,
+      "loss": 3.117,
+      "step": 14960
+    },
+    {
+      "epoch": 0.8686065740230352,
+      "grad_norm": 0.09229396283626556,
+      "learning_rate": 4.648634169479343e-05,
+      "loss": 3.1078,
+      "step": 14970
+    },
+    {
+      "epoch": 0.8691868055354086,
+      "grad_norm": 0.09328366816043854,
+      "learning_rate": 4.60831834750538e-05,
+      "loss": 3.1087,
+      "step": 14980
+    },
+    {
+      "epoch": 0.869767037047782,
+      "grad_norm": 0.09177900850772858,
+      "learning_rate": 4.568169662458377e-05,
+      "loss": 3.0944,
+      "step": 14990
+    },
+    {
+      "epoch": 0.8703472685601555,
+      "grad_norm": 0.09426326304674149,
+      "learning_rate": 4.528188262169991e-05,
+      "loss": 3.108,
+      "step": 15000
+    },
+    {
+      "epoch": 0.8703472685601555,
+      "eval_loss": 3.0348994731903076,
+      "eval_runtime": 3.2633,
+      "eval_samples_per_second": 1326.859,
+      "eval_steps_per_second": 10.419,
+      "step": 15000
+    },
+    {
+      "epoch": 0.870927500072529,
+      "grad_norm": 0.0925714373588562,
+      "learning_rate": 4.488374293855918e-05,
+      "loss": 3.104,
+      "step": 15010
+    },
+    {
+      "epoch": 0.8715077315849024,
+      "grad_norm": 0.0921747237443924,
+      "learning_rate": 4.448727904115379e-05,
+      "loss": 3.1142,
+      "step": 15020
+    },
+    {
+      "epoch": 0.8720879630972758,
+      "grad_norm": 0.09229473024606705,
+      "learning_rate": 4.4092492389305074e-05,
+      "loss": 3.0982,
+      "step": 15030
+    },
+    {
+      "epoch": 0.8726681946096493,
+      "grad_norm": 0.09434866905212402,
+      "learning_rate": 4.369938443665922e-05,
+      "loss": 3.1127,
+      "step": 15040
+    },
+    {
+      "epoch": 0.8732484261220227,
+      "grad_norm": 0.09186001121997833,
+      "learning_rate": 4.330795663068044e-05,
+      "loss": 3.1025,
+      "step": 15050
+    },
+    {
+      "epoch": 0.8738286576343961,
+      "grad_norm": 0.09200981259346008,
+      "learning_rate": 4.291821041264721e-05,
+      "loss": 3.0938,
+      "step": 15060
+    },
+    {
+      "epoch": 0.8744088891467695,
+      "grad_norm": 0.09246696531772614,
+      "learning_rate": 4.253014721764592e-05,
+      "loss": 3.1122,
+      "step": 15070
+    },
+    {
+      "epoch": 0.874989120659143,
+      "grad_norm": 0.09366760402917862,
+      "learning_rate": 4.214376847456575e-05,
+      "loss": 3.1114,
+      "step": 15080
+    },
+    {
+      "epoch": 0.8755693521715164,
+      "grad_norm": 0.09217476844787598,
+      "learning_rate": 4.1759075606093934e-05,
+      "loss": 3.1152,
+      "step": 15090
+    },
+    {
+      "epoch": 0.8761495836838898,
+      "grad_norm": 0.09385888278484344,
+      "learning_rate": 4.137607002870969e-05,
+      "loss": 3.1151,
+      "step": 15100
+    },
+    {
+      "epoch": 0.8767298151962634,
+      "grad_norm": 0.09338943660259247,
+      "learning_rate": 4.099475315267981e-05,
+      "loss": 3.1108,
+      "step": 15110
+    },
+    {
+      "epoch": 0.8773100467086368,
+      "grad_norm": 0.09102658182382584,
+      "learning_rate": 4.0615126382052945e-05,
+      "loss": 3.106,
+      "step": 15120
+    },
+    {
+      "epoch": 0.8778902782210102,
+      "grad_norm": 0.09203559905290604,
+      "learning_rate": 4.023719111465457e-05,
+      "loss": 3.1,
+      "step": 15130
+    },
+    {
+      "epoch": 0.8784705097333836,
+      "grad_norm": 0.09662605822086334,
+      "learning_rate": 3.986094874208218e-05,
+      "loss": 3.1095,
+      "step": 15140
+    },
+    {
+      "epoch": 0.8790507412457571,
+      "grad_norm": 0.09100698679685593,
+      "learning_rate": 3.9486400649699216e-05,
+      "loss": 3.0917,
+      "step": 15150
+    },
+    {
+      "epoch": 0.8796309727581305,
+      "grad_norm": 0.09504050016403198,
+      "learning_rate": 3.911354821663127e-05,
+      "loss": 3.1041,
+      "step": 15160
+    },
+    {
+      "epoch": 0.8802112042705039,
+      "grad_norm": 0.09688866138458252,
+      "learning_rate": 3.874239281576003e-05,
+      "loss": 3.0942,
+      "step": 15170
+    },
+    {
+      "epoch": 0.8807914357828773,
+      "grad_norm": 0.0897068902850151,
+      "learning_rate": 3.837293581371837e-05,
+      "loss": 3.1024,
+      "step": 15180
+    },
+    {
+      "epoch": 0.8813716672952508,
+      "grad_norm": 0.0906522199511528,
+      "learning_rate": 3.800517857088604e-05,
+      "loss": 3.103,
+      "step": 15190
+    },
+    {
+      "epoch": 0.8819518988076243,
+      "grad_norm": 0.0898643210530281,
+      "learning_rate": 3.763912244138334e-05,
+      "loss": 3.11,
+      "step": 15200
+    },
+    {
+      "epoch": 0.8825321303199977,
+      "grad_norm": 0.09033368527889252,
+      "learning_rate": 3.727476877306751e-05,
+      "loss": 3.1093,
+      "step": 15210
+    },
+    {
+      "epoch": 0.8831123618323711,
+      "grad_norm": 0.0916651263833046,
+      "learning_rate": 3.691211890752688e-05,
+      "loss": 3.1059,
+      "step": 15220
+    },
+    {
+      "epoch": 0.8836925933447446,
+      "grad_norm": 0.09232784807682037,
+      "learning_rate": 3.6551174180076195e-05,
+      "loss": 3.1066,
+      "step": 15230
+    },
+    {
+      "epoch": 0.884272824857118,
+      "grad_norm": 0.09441141784191132,
+      "learning_rate": 3.619193591975195e-05,
+      "loss": 3.1105,
+      "step": 15240
+    },
+    {
+      "epoch": 0.8848530563694914,
+      "grad_norm": 0.09673616290092468,
+      "learning_rate": 3.583440544930672e-05,
+      "loss": 3.0993,
+      "step": 15250
+    },
+    {
+      "epoch": 0.8854332878818648,
+      "grad_norm": 0.09390676021575928,
+      "learning_rate": 3.547858408520538e-05,
+      "loss": 3.1056,
+      "step": 15260
+    },
+    {
+      "epoch": 0.8860135193942383,
+      "grad_norm": 0.09317633509635925,
+      "learning_rate": 3.512447313761946e-05,
+      "loss": 3.0977,
+      "step": 15270
+    },
+    {
+      "epoch": 0.8865937509066117,
+      "grad_norm": 0.09400813281536102,
+      "learning_rate": 3.477207391042253e-05,
+      "loss": 3.0963,
+      "step": 15280
+    },
+    {
+      "epoch": 0.8871739824189852,
+      "grad_norm": 0.09057381004095078,
+      "learning_rate": 3.442138770118547e-05,
+      "loss": 3.1024,
+      "step": 15290
+    },
+    {
+      "epoch": 0.8877542139313586,
+      "grad_norm": 0.09377612918615341,
+      "learning_rate": 3.4072415801171484e-05,
+      "loss": 3.0959,
+      "step": 15300
+    },
+    {
+      "epoch": 0.8883344454437321,
+      "grad_norm": 0.09093187749385834,
+      "learning_rate": 3.3725159495332e-05,
+      "loss": 3.0976,
+      "step": 15310
+    },
+    {
+      "epoch": 0.8889146769561055,
+      "grad_norm": 0.09208898991346359,
+      "learning_rate": 3.3379620062300774e-05,
+      "loss": 3.1007,
+      "step": 15320
+    },
+    {
+      "epoch": 0.8894949084684789,
+      "grad_norm": 0.09365664422512054,
+      "learning_rate": 3.303579877439039e-05,
+      "loss": 3.1053,
+      "step": 15330
+    },
+    {
+      "epoch": 0.8900751399808524,
+      "grad_norm": 0.09230521321296692,
+      "learning_rate": 3.269369689758683e-05,
+      "loss": 3.1055,
+      "step": 15340
+    },
+    {
+      "epoch": 0.8906553714932258,
+      "grad_norm": 0.0912981778383255,
+      "learning_rate": 3.235331569154493e-05,
+      "loss": 3.0972,
+      "step": 15350
+    },
+    {
+      "epoch": 0.8912356030055992,
+      "grad_norm": 0.08950291574001312,
+      "learning_rate": 3.2014656409584174e-05,
+      "loss": 3.0999,
+      "step": 15360
+    },
+    {
+      "epoch": 0.8918158345179726,
+      "grad_norm": 0.09219230711460114,
+      "learning_rate": 3.167772029868321e-05,
+      "loss": 3.1019,
+      "step": 15370
+    },
+    {
+      "epoch": 0.8923960660303462,
+      "grad_norm": 0.09009566158056259,
+      "learning_rate": 3.134250859947635e-05,
+      "loss": 3.0978,
+      "step": 15380
+    },
+    {
+      "epoch": 0.8929762975427196,
+      "grad_norm": 0.0936865359544754,
+      "learning_rate": 3.1009022546248045e-05,
+      "loss": 3.1021,
+      "step": 15390
+    },
+    {
+      "epoch": 0.893556529055093,
+      "grad_norm": 0.09219173341989517,
+      "learning_rate": 3.0677263366928944e-05,
+      "loss": 3.0984,
+      "step": 15400
+    },
+    {
+      "epoch": 0.8941367605674664,
+      "grad_norm": 0.09084775298833847,
+      "learning_rate": 3.0347232283091107e-05,
+      "loss": 3.1039,
+      "step": 15410
+    },
+    {
+      "epoch": 0.8947169920798399,
+      "grad_norm": 0.0911671370267868,
+      "learning_rate": 3.001893050994342e-05,
+      "loss": 3.0934,
+      "step": 15420
+    },
+    {
+      "epoch": 0.8952972235922133,
+      "grad_norm": 0.08928447961807251,
+      "learning_rate": 2.9692359256327628e-05,
+      "loss": 3.1013,
+      "step": 15430
+    },
+    {
+      "epoch": 0.8958774551045867,
+      "grad_norm": 0.09330154210329056,
+      "learning_rate": 2.936751972471313e-05,
+      "loss": 3.0978,
+      "step": 15440
+    },
+    {
+      "epoch": 0.8964576866169601,
+      "grad_norm": 0.09146152436733246,
+      "learning_rate": 2.904441311119321e-05,
+      "loss": 3.0998,
+      "step": 15450
+    },
+    {
+      "epoch": 0.8970379181293336,
+      "grad_norm": 0.09131080657243729,
+      "learning_rate": 2.87230406054802e-05,
+      "loss": 3.1012,
+      "step": 15460
+    },
+    {
+      "epoch": 0.897618149641707,
+      "grad_norm": 0.09509788453578949,
+      "learning_rate": 2.8403403390901305e-05,
+      "loss": 3.102,
+      "step": 15470
+    },
+    {
+      "epoch": 0.8981983811540805,
+      "grad_norm": 0.09333484619855881,
+      "learning_rate": 2.8085502644394355e-05,
+      "loss": 3.1051,
+      "step": 15480
+    },
+    {
+      "epoch": 0.8987786126664539,
+      "grad_norm": 0.09036233276128769,
+      "learning_rate": 2.7769339536503125e-05,
+      "loss": 3.1117,
+      "step": 15490
+    },
+    {
+      "epoch": 0.8993588441788274,
+      "grad_norm": 0.08942391723394394,
+      "learning_rate": 2.745491523137328e-05,
+      "loss": 3.1117,
+      "step": 15500
+    },
+    {
+      "epoch": 0.8999390756912008,
+      "grad_norm": 0.0910114273428917,
+      "learning_rate": 2.7142230886748053e-05,
+      "loss": 3.0984,
+      "step": 15510
+    },
+    {
+      "epoch": 0.9005193072035742,
+      "grad_norm": 0.09009117633104324,
+      "learning_rate": 2.683128765396403e-05,
+      "loss": 3.0985,
+      "step": 15520
+    },
+    {
+      "epoch": 0.9010995387159476,
+      "grad_norm": 0.0904909297823906,
+      "learning_rate": 2.652208667794659e-05,
+      "loss": 3.0974,
+      "step": 15530
+    },
+    {
+      "epoch": 0.9016797702283211,
+      "grad_norm": 0.09261862933635712,
+      "learning_rate": 2.6214629097206345e-05,
+      "loss": 3.1042,
+      "step": 15540
+    },
+    {
+      "epoch": 0.9022600017406945,
+      "grad_norm": 0.09034094959497452,
+      "learning_rate": 2.5908916043834218e-05,
+      "loss": 3.1026,
+      "step": 15550
+    },
+    {
+      "epoch": 0.902840233253068,
+      "grad_norm": 0.09254541248083115,
+      "learning_rate": 2.560494864349766e-05,
+      "loss": 3.0954,
+      "step": 15560
+    },
+    {
+      "epoch": 0.9034204647654415,
+      "grad_norm": 0.08995792269706726,
+      "learning_rate": 2.530272801543654e-05,
+      "loss": 3.1003,
+      "step": 15570
+    },
+    {
+      "epoch": 0.9040006962778149,
+      "grad_norm": 0.08986230194568634,
+      "learning_rate": 2.5002255272458806e-05,
+      "loss": 3.0967,
+      "step": 15580
+    },
+    {
+      "epoch": 0.9045809277901883,
+      "grad_norm": 0.08791092783212662,
+      "learning_rate": 2.4703531520936572e-05,
+      "loss": 3.0929,
+      "step": 15590
+    },
+    {
+      "epoch": 0.9051611593025617,
+      "grad_norm": 0.09303991496562958,
+      "learning_rate": 2.440655786080209e-05,
+      "loss": 3.0981,
+      "step": 15600
+    },
+    {
+      "epoch": 0.9057413908149352,
+      "grad_norm": 0.09381508827209473,
+      "learning_rate": 2.4111335385543387e-05,
+      "loss": 3.0977,
+      "step": 15610
+    },
+    {
+      "epoch": 0.9063216223273086,
+      "grad_norm": 0.09249861538410187,
+      "learning_rate": 2.3817865182200638e-05,
+      "loss": 3.0969,
+      "step": 15620
+    },
+    {
+      "epoch": 0.906901853839682,
+      "grad_norm": 0.09130273759365082,
+      "learning_rate": 2.352614833136174e-05,
+      "loss": 3.1012,
+      "step": 15630
+    },
+    {
+      "epoch": 0.9074820853520554,
+      "grad_norm": 0.08810003846883774,
+      "learning_rate": 2.3236185907158814e-05,
+      "loss": 3.0956,
+      "step": 15640
+    },
+    {
+      "epoch": 0.908062316864429,
+      "grad_norm": 0.09278014302253723,
+      "learning_rate": 2.2947978977263807e-05,
+      "loss": 3.1024,
+      "step": 15650
+    },
+    {
+      "epoch": 0.9086425483768024,
+      "grad_norm": 0.09021242707967758,
+      "learning_rate": 2.266152860288484e-05,
+      "loss": 3.0915,
+      "step": 15660
+    },
+    {
+      "epoch": 0.9092227798891758,
+      "grad_norm": 0.08989161998033524,
+      "learning_rate": 2.2376835838762265e-05,
+      "loss": 3.0851,
+      "step": 15670
+    },
+    {
+      "epoch": 0.9098030114015492,
+      "grad_norm": 0.09114709496498108,
+      "learning_rate": 2.2093901733164612e-05,
+      "loss": 3.1014,
+      "step": 15680
+    },
+    {
+      "epoch": 0.9103832429139227,
+      "grad_norm": 0.08926232159137726,
+      "learning_rate": 2.1812727327884918e-05,
+      "loss": 3.0965,
+      "step": 15690
+    },
+    {
+      "epoch": 0.9109634744262961,
+      "grad_norm": 0.09116176515817642,
+      "learning_rate": 2.1533313658236688e-05,
+      "loss": 3.1009,
+      "step": 15700
+    },
+    {
+      "epoch": 0.9115437059386695,
+      "grad_norm": 0.08799432218074799,
+      "learning_rate": 2.1255661753050492e-05,
+      "loss": 3.1023,
+      "step": 15710
+    },
+    {
+      "epoch": 0.9121239374510429,
+      "grad_norm": 0.08743447065353394,
+      "learning_rate": 2.097977263466966e-05,
+      "loss": 3.0984,
+      "step": 15720
+    },
+    {
+      "epoch": 0.9127041689634164,
+      "grad_norm": 0.09166787564754486,
+      "learning_rate": 2.0705647318946806e-05,
+      "loss": 3.097,
+      "step": 15730
+    },
+    {
+      "epoch": 0.9132844004757898,
+      "grad_norm": 0.09091733396053314,
+      "learning_rate": 2.0433286815240092e-05,
+      "loss": 3.1049,
+      "step": 15740
+    },
+    {
+      "epoch": 0.9138646319881633,
+      "grad_norm": 0.08930478990077972,
+      "learning_rate": 2.0162692126409365e-05,
+      "loss": 3.0977,
+      "step": 15750
+    },
+    {
+      "epoch": 0.9144448635005367,
+      "grad_norm": 0.08997286111116409,
+      "learning_rate": 1.989386424881273e-05,
+      "loss": 3.1036,
+      "step": 15760
+    },
+    {
+      "epoch": 0.9150250950129102,
+      "grad_norm": 0.08813077956438065,
+      "learning_rate": 1.9626804172302447e-05,
+      "loss": 3.1003,
+      "step": 15770
+    },
+    {
+      "epoch": 0.9156053265252836,
+      "grad_norm": 0.08898695558309555,
+      "learning_rate": 1.936151288022181e-05,
+      "loss": 3.1065,
+      "step": 15780
+    },
+    {
+      "epoch": 0.916185558037657,
+      "grad_norm": 0.0892912819981575,
+      "learning_rate": 1.9097991349401156e-05,
+      "loss": 3.1047,
+      "step": 15790
+    },
+    {
+      "epoch": 0.9167657895500304,
+      "grad_norm": 0.09048785269260406,
+      "learning_rate": 1.8836240550154205e-05,
+      "loss": 3.1035,
+      "step": 15800
+    },
+    {
+      "epoch": 0.9173460210624039,
+      "grad_norm": 0.08921167254447937,
+      "learning_rate": 1.8576261446275057e-05,
+      "loss": 3.1013,
+      "step": 15810
+    },
+    {
+      "epoch": 0.9179262525747773,
+      "grad_norm": 0.08912645280361176,
+      "learning_rate": 1.8318054995033805e-05,
+      "loss": 3.0982,
+      "step": 15820
+    },
+    {
+      "epoch": 0.9185064840871507,
+      "grad_norm": 0.09258027374744415,
+      "learning_rate": 1.8061622147173716e-05,
+      "loss": 3.1059,
+      "step": 15830
+    },
+    {
+      "epoch": 0.9190867155995243,
+      "grad_norm": 0.08841285109519958,
+      "learning_rate": 1.7806963846907498e-05,
+      "loss": 3.095,
+      "step": 15840
+    },
+    {
+      "epoch": 0.9196669471118977,
+      "grad_norm": 0.09142499417066574,
+      "learning_rate": 1.7554081031913528e-05,
+      "loss": 3.1007,
+      "step": 15850
+    },
+    {
+      "epoch": 0.9202471786242711,
+      "grad_norm": 0.08727526664733887,
+      "learning_rate": 1.7302974633332968e-05,
+      "loss": 3.0974,
+      "step": 15860
+    },
+    {
+      "epoch": 0.9208274101366445,
+      "grad_norm": 0.09390713274478912,
+      "learning_rate": 1.7053645575765718e-05,
+      "loss": 3.0998,
+      "step": 15870
+    },
+    {
+      "epoch": 0.921407641649018,
+      "grad_norm": 0.09036395698785782,
+      "learning_rate": 1.6806094777267744e-05,
+      "loss": 3.0948,
+      "step": 15880
+    },
+    {
+      "epoch": 0.9219878731613914,
+      "grad_norm": 0.08717726916074753,
+      "learning_rate": 1.656032314934669e-05,
+      "loss": 3.0995,
+      "step": 15890
+    },
+    {
+      "epoch": 0.9225681046737648,
+      "grad_norm": 0.08829261362552643,
+      "learning_rate": 1.631633159695972e-05,
+      "loss": 3.0997,
+      "step": 15900
+    },
+    {
+      "epoch": 0.9231483361861382,
+      "grad_norm": 0.09095877408981323,
+      "learning_rate": 1.6074121018509137e-05,
+      "loss": 3.099,
+      "step": 15910
+    },
+    {
+      "epoch": 0.9237285676985117,
+      "grad_norm": 0.08995683491230011,
+      "learning_rate": 1.5833692305839642e-05,
+      "loss": 3.0973,
+      "step": 15920
+    },
+    {
+      "epoch": 0.9243087992108852,
+      "grad_norm": 0.08980005234479904,
+      "learning_rate": 1.5595046344235143e-05,
+      "loss": 3.1039,
+      "step": 15930
+    },
+    {
+      "epoch": 0.9248890307232586,
+      "grad_norm": 0.08741045743227005,
+      "learning_rate": 1.535818401241479e-05,
+      "loss": 3.1075,
+      "step": 15940
+    },
+    {
+      "epoch": 0.925469262235632,
+      "grad_norm": 0.0891076922416687,
+      "learning_rate": 1.512310618253071e-05,
+      "loss": 3.0986,
+      "step": 15950
+    },
+    {
+      "epoch": 0.9260494937480055,
+      "grad_norm": 0.0910383015871048,
+      "learning_rate": 1.4889813720164013e-05,
+      "loss": 3.1035,
+      "step": 15960
+    },
+    {
+      "epoch": 0.9266297252603789,
+      "grad_norm": 0.08668383955955505,
+      "learning_rate": 1.4658307484321953e-05,
+      "loss": 3.1023,
+      "step": 15970
+    },
+    {
+      "epoch": 0.9272099567727523,
+      "grad_norm": 0.08511517196893692,
+      "learning_rate": 1.4428588327434933e-05,
+      "loss": 3.0929,
+      "step": 15980
+    },
+    {
+      "epoch": 0.9277901882851257,
+      "grad_norm": 0.08581209927797318,
+      "learning_rate": 1.4200657095352676e-05,
+      "loss": 3.1002,
+      "step": 15990
+    },
+    {
+      "epoch": 0.9283704197974992,
+      "grad_norm": 0.08814380317926407,
+      "learning_rate": 1.397451462734206e-05,
+      "loss": 3.098,
+      "step": 16000
+    },
+    {
+      "epoch": 0.9283704197974992,
+      "eval_loss": 3.031247854232788,
+      "eval_runtime": 3.2581,
+      "eval_samples_per_second": 1328.977,
+      "eval_steps_per_second": 10.435,
+      "step": 16000
+    },
+    {
+      "epoch": 0.9289506513098726,
+      "grad_norm": 0.08873005956411362,
+      "learning_rate": 1.3750161756083234e-05,
+      "loss": 3.1036,
+      "step": 16010
+    },
+    {
+      "epoch": 0.929530882822246,
+      "grad_norm": 0.08824755996465683,
+      "learning_rate": 1.3527599307667005e-05,
+      "loss": 3.0983,
+      "step": 16020
+    },
+    {
+      "epoch": 0.9301111143346195,
+      "grad_norm": 0.08939357846975327,
+      "learning_rate": 1.3306828101591728e-05,
+      "loss": 3.1033,
+      "step": 16030
+    },
+    {
+      "epoch": 0.930691345846993,
+      "grad_norm": 0.08686842769384384,
+      "learning_rate": 1.3087848950759873e-05,
+      "loss": 3.0965,
+      "step": 16040
+    },
+    {
+      "epoch": 0.9312715773593664,
+      "grad_norm": 0.08945832401514053,
+      "learning_rate": 1.2870662661475852e-05,
+      "loss": 3.0951,
+      "step": 16050
+    },
+    {
+      "epoch": 0.9318518088717398,
+      "grad_norm": 0.088344506919384,
+      "learning_rate": 1.2655270033442189e-05,
+      "loss": 3.1039,
+      "step": 16060
+    },
+    {
+      "epoch": 0.9324320403841133,
+      "grad_norm": 0.08870889991521835,
+      "learning_rate": 1.2441671859757143e-05,
+      "loss": 3.0998,
+      "step": 16070
+    },
+    {
+      "epoch": 0.9330122718964867,
+      "grad_norm": 0.08570938557386398,
+      "learning_rate": 1.2229868926911636e-05,
+      "loss": 3.0957,
+      "step": 16080
+    },
+    {
+      "epoch": 0.9335925034088601,
+      "grad_norm": 0.0883183628320694,
+      "learning_rate": 1.201986201478611e-05,
+      "loss": 3.1008,
+      "step": 16090
+    },
+    {
+      "epoch": 0.9341727349212335,
+      "grad_norm": 0.09012133628129959,
+      "learning_rate": 1.1811651896648178e-05,
+      "loss": 3.0915,
+      "step": 16100
+    },
+    {
+      "epoch": 0.9347529664336071,
+      "grad_norm": 0.09092947840690613,
+      "learning_rate": 1.1605239339149199e-05,
+      "loss": 3.0886,
+      "step": 16110
+    },
+    {
+      "epoch": 0.9353331979459805,
+      "grad_norm": 0.08645268529653549,
+      "learning_rate": 1.140062510232187e-05,
+      "loss": 3.09,
+      "step": 16120
+    },
+    {
+      "epoch": 0.9359134294583539,
+      "grad_norm": 0.08784764260053635,
+      "learning_rate": 1.1197809939577197e-05,
+      "loss": 3.0997,
+      "step": 16130
+    },
+    {
+      "epoch": 0.9364936609707273,
+      "grad_norm": 0.0903283953666687,
+      "learning_rate": 1.0996794597701865e-05,
+      "loss": 3.1108,
+      "step": 16140
+    },
+    {
+      "epoch": 0.9370738924831008,
+      "grad_norm": 0.0857265442609787,
+      "learning_rate": 1.0797579816855585e-05,
+      "loss": 3.1028,
+      "step": 16150
+    },
+    {
+      "epoch": 0.9376541239954742,
+      "grad_norm": 0.09120402485132217,
+      "learning_rate": 1.0600166330567761e-05,
+      "loss": 3.0891,
+      "step": 16160
+    },
+    {
+      "epoch": 0.9382343555078476,
+      "grad_norm": 0.09053795039653778,
+      "learning_rate": 1.0404554865735771e-05,
+      "loss": 3.1013,
+      "step": 16170
+    },
+    {
+      "epoch": 0.938814587020221,
+      "grad_norm": 0.08675380051136017,
+      "learning_rate": 1.0210746142621408e-05,
+      "loss": 3.1088,
+      "step": 16180
+    },
+    {
+      "epoch": 0.9393948185325945,
+      "grad_norm": 0.08844220638275146,
+      "learning_rate": 1.0018740874848664e-05,
+      "loss": 3.1075,
+      "step": 16190
+    },
+    {
+      "epoch": 0.939975050044968,
+      "grad_norm": 0.09051468223333359,
+      "learning_rate": 9.828539769401235e-06,
+      "loss": 3.0957,
+      "step": 16200
+    },
+    {
+      "epoch": 0.9405552815573414,
+      "grad_norm": 0.08637753129005432,
+      "learning_rate": 9.640143526619239e-06,
+      "loss": 3.103,
+      "step": 16210
+    },
+    {
+      "epoch": 0.9411355130697148,
+      "grad_norm": 0.09072989225387573,
+      "learning_rate": 9.45355284019761e-06,
+      "loss": 3.1031,
+      "step": 16220
+    },
+    {
+      "epoch": 0.9417157445820883,
+      "grad_norm": 0.08837340027093887,
+      "learning_rate": 9.268768397182715e-06,
+      "loss": 3.0989,
+      "step": 16230
+    },
+    {
+      "epoch": 0.9422959760944617,
+      "grad_norm": 0.08710675686597824,
+      "learning_rate": 9.085790877970234e-06,
+      "loss": 3.0922,
+      "step": 16240
+    },
+    {
+      "epoch": 0.9428762076068351,
+      "grad_norm": 0.08685487508773804,
+      "learning_rate": 8.904620956302512e-06,
+      "loss": 3.0877,
+      "step": 16250
+    },
+    {
+      "epoch": 0.9434564391192085,
+      "grad_norm": 0.08923321217298508,
+      "learning_rate": 8.725259299266209e-06,
+      "loss": 3.0994,
+      "step": 16260
+    },
+    {
+      "epoch": 0.944036670631582,
+      "grad_norm": 0.08733417093753815,
+      "learning_rate": 8.547706567289814e-06,
+      "loss": 3.0953,
+      "step": 16270
+    },
+    {
+      "epoch": 0.9446169021439554,
+      "grad_norm": 0.08618912100791931,
+      "learning_rate": 8.371963414140982e-06,
+      "loss": 3.1021,
+      "step": 16280
+    },
+    {
+      "epoch": 0.9451971336563288,
+      "grad_norm": 0.09028159826993942,
+      "learning_rate": 8.198030486924468e-06,
+      "loss": 3.1022,
+      "step": 16290
+    },
+    {
+      "epoch": 0.9457773651687024,
+      "grad_norm": 0.09073447436094284,
+      "learning_rate": 8.025908426079532e-06,
+      "loss": 3.1016,
+      "step": 16300
+    },
+    {
+      "epoch": 0.9463575966810758,
+      "grad_norm": 0.08651082217693329,
+      "learning_rate": 7.85559786537754e-06,
+      "loss": 3.1053,
+      "step": 16310
+    },
+    {
+      "epoch": 0.9469378281934492,
+      "grad_norm": 0.08602554351091385,
+      "learning_rate": 7.687099431919974e-06,
+      "loss": 3.0999,
+      "step": 16320
+    },
+    {
+      "epoch": 0.9475180597058226,
+      "grad_norm": 0.08610852062702179,
+      "learning_rate": 7.520413746135657e-06,
+      "loss": 3.1059,
+      "step": 16330
+    },
+    {
+      "epoch": 0.9480982912181961,
+      "grad_norm": 0.08780808746814728,
+      "learning_rate": 7.355541421778689e-06,
+      "loss": 3.1046,
+      "step": 16340
+    },
+    {
+      "epoch": 0.9486785227305695,
+      "grad_norm": 0.08994690328836441,
+      "learning_rate": 7.1924830659262916e-06,
+      "loss": 3.1094,
+      "step": 16350
+    },
+    {
+      "epoch": 0.9492587542429429,
+      "grad_norm": 0.08709990233182907,
+      "learning_rate": 7.03123927897642e-06,
+      "loss": 3.105,
+      "step": 16360
+    },
+    {
+      "epoch": 0.9498389857553163,
+      "grad_norm": 0.08651293069124222,
+      "learning_rate": 6.871810654645483e-06,
+      "loss": 3.0934,
+      "step": 16370
+    },
+    {
+      "epoch": 0.9504192172676899,
+      "grad_norm": 0.08726444095373154,
+      "learning_rate": 6.7141977799665685e-06,
+      "loss": 3.0952,
+      "step": 16380
+    },
+    {
+      "epoch": 0.9509994487800633,
+      "grad_norm": 0.08613457530736923,
+      "learning_rate": 6.558401235286615e-06,
+      "loss": 3.088,
+      "step": 16390
+    },
+    {
+      "epoch": 0.9515796802924367,
+      "grad_norm": 0.088069386780262,
+      "learning_rate": 6.404421594264909e-06,
+      "loss": 3.0973,
+      "step": 16400
+    },
+    {
+      "epoch": 0.9521599118048101,
+      "grad_norm": 0.08668968081474304,
+      "learning_rate": 6.252259423870643e-06,
+      "loss": 3.1089,
+      "step": 16410
+    },
+    {
+      "epoch": 0.9527401433171836,
+      "grad_norm": 0.08948860317468643,
+      "learning_rate": 6.10191528438081e-06,
+      "loss": 3.0993,
+      "step": 16420
+    },
+    {
+      "epoch": 0.953320374829557,
+      "grad_norm": 0.08769353479146957,
+      "learning_rate": 5.953389729378256e-06,
+      "loss": 3.1144,
+      "step": 16430
+    },
+    {
+      "epoch": 0.9539006063419304,
+      "grad_norm": 0.08861144632101059,
+      "learning_rate": 5.806683305749682e-06,
+      "loss": 3.1077,
+      "step": 16440
+    },
+    {
+      "epoch": 0.9544808378543038,
+      "grad_norm": 0.09008525311946869,
+      "learning_rate": 5.661796553683541e-06,
+      "loss": 3.101,
+      "step": 16450
+    },
+    {
+      "epoch": 0.9550610693666773,
+      "grad_norm": 0.08720948547124863,
+      "learning_rate": 5.518730006668027e-06,
+      "loss": 3.1043,
+      "step": 16460
+    },
+    {
+      "epoch": 0.9556413008790507,
+      "grad_norm": 0.08759420365095139,
+      "learning_rate": 5.377484191489035e-06,
+      "loss": 3.1016,
+      "step": 16470
+    },
+    {
+      "epoch": 0.9562215323914242,
+      "grad_norm": 0.08649755269289017,
+      "learning_rate": 5.238059628228598e-06,
+      "loss": 3.0915,
+      "step": 16480
+    },
+    {
+      "epoch": 0.9568017639037976,
+      "grad_norm": 0.0874122902750969,
+      "learning_rate": 5.1004568302624456e-06,
+      "loss": 3.1012,
+      "step": 16490
+    },
+    {
+      "epoch": 0.9573819954161711,
+      "grad_norm": 0.08565357327461243,
+      "learning_rate": 4.96467630425862e-06,
+      "loss": 3.1011,
+      "step": 16500
+    },
+    {
+      "epoch": 0.9579622269285445,
+      "grad_norm": 0.08730974048376083,
+      "learning_rate": 4.830718550175139e-06,
+      "loss": 3.1077,
+      "step": 16510
+    },
+    {
+      "epoch": 0.9585424584409179,
+      "grad_norm": 0.08632799237966537,
+      "learning_rate": 4.698584061258559e-06,
+      "loss": 3.0943,
+      "step": 16520
+    },
+    {
+      "epoch": 0.9591226899532914,
+      "grad_norm": 0.08760344982147217,
+      "learning_rate": 4.5682733240418605e-06,
+      "loss": 3.0995,
+      "step": 16530
+    },
+    {
+      "epoch": 0.9597029214656648,
+      "grad_norm": 0.08842134475708008,
+      "learning_rate": 4.439786818342784e-06,
+      "loss": 3.1084,
+      "step": 16540
+    },
+    {
+      "epoch": 0.9602831529780382,
+      "grad_norm": 0.08709140866994858,
+      "learning_rate": 4.313125017262221e-06,
+      "loss": 3.0968,
+      "step": 16550
+    },
+    {
+      "epoch": 0.9608633844904116,
+      "grad_norm": 0.08542519807815552,
+      "learning_rate": 4.188288387182104e-06,
+      "loss": 3.097,
+      "step": 16560
+    },
+    {
+      "epoch": 0.9614436160027852,
+      "grad_norm": 0.08856651186943054,
+      "learning_rate": 4.065277387764077e-06,
+      "loss": 3.0984,
+      "step": 16570
+    },
+    {
+      "epoch": 0.9620238475151586,
+      "grad_norm": 0.09005647897720337,
+      "learning_rate": 3.9440924719473805e-06,
+      "loss": 3.0914,
+      "step": 16580
+    },
+    {
+      "epoch": 0.962604079027532,
+      "grad_norm": 0.08719483762979507,
+      "learning_rate": 3.82473408594769e-06,
+      "loss": 3.0979,
+      "step": 16590
+    },
+    {
+      "epoch": 0.9631843105399054,
+      "grad_norm": 0.08669883012771606,
+      "learning_rate": 3.7072026692550608e-06,
+      "loss": 3.098,
+      "step": 16600
+    },
+    {
+      "epoch": 0.9637645420522789,
+      "grad_norm": 0.08828626573085785,
+      "learning_rate": 3.5914986546323747e-06,
+      "loss": 3.0995,
+      "step": 16610
+    },
+    {
+      "epoch": 0.9643447735646523,
+      "grad_norm": 0.08661678433418274,
+      "learning_rate": 3.4776224681141167e-06,
+      "loss": 3.0966,
+      "step": 16620
+    },
+    {
+      "epoch": 0.9649250050770257,
+      "grad_norm": 0.08606945723295212,
+      "learning_rate": 3.3655745290042117e-06,
+      "loss": 3.1034,
+      "step": 16630
+    },
+    {
+      "epoch": 0.9655052365893991,
+      "grad_norm": 0.08614910393953323,
+      "learning_rate": 3.255355249874914e-06,
+      "loss": 3.1112,
+      "step": 16640
+    },
+    {
+      "epoch": 0.9660854681017726,
+      "grad_norm": 0.08819039165973663,
+      "learning_rate": 3.1469650365652525e-06,
+      "loss": 3.0944,
+      "step": 16650
+    },
+    {
+      "epoch": 0.9666656996141461,
+      "grad_norm": 0.08805646747350693,
+      "learning_rate": 3.0404042881792546e-06,
+      "loss": 3.0881,
+      "step": 16660
+    },
+    {
+      "epoch": 0.9672459311265195,
+      "grad_norm": 0.08734241127967834,
+      "learning_rate": 2.9356733970847817e-06,
+      "loss": 3.0993,
+      "step": 16670
+    },
+    {
+      "epoch": 0.9678261626388929,
+      "grad_norm": 0.08561859279870987,
+      "learning_rate": 2.832772748911916e-06,
+      "loss": 3.0975,
+      "step": 16680
+    },
+    {
+      "epoch": 0.9684063941512664,
+      "grad_norm": 0.08738164603710175,
+      "learning_rate": 2.7317027225516323e-06,
+      "loss": 3.1009,
+      "step": 16690
+    },
+    {
+      "epoch": 0.9689866256636398,
+      "grad_norm": 0.08456070721149445,
+      "learning_rate": 2.632463690154463e-06,
+      "loss": 3.1,
+      "step": 16700
+    },
+    {
+      "epoch": 0.9695668571760132,
+      "grad_norm": 0.08673311769962311,
+      "learning_rate": 2.5350560171287783e-06,
+      "loss": 3.1015,
+      "step": 16710
+    },
+    {
+      "epoch": 0.9701470886883866,
+      "grad_norm": 0.09023085236549377,
+      "learning_rate": 2.439480062139954e-06,
+      "loss": 3.101,
+      "step": 16720
+    },
+    {
+      "epoch": 0.9707273202007601,
+      "grad_norm": 0.08622407913208008,
+      "learning_rate": 2.345736177108537e-06,
+      "loss": 3.1137,
+      "step": 16730
+    },
+    {
+      "epoch": 0.9713075517131335,
+      "grad_norm": 0.08697347342967987,
+      "learning_rate": 2.2538247072094177e-06,
+      "loss": 3.1018,
+      "step": 16740
+    },
+    {
+      "epoch": 0.971887783225507,
+      "grad_norm": 0.0895911455154419,
+      "learning_rate": 2.1637459908702695e-06,
+      "loss": 3.1039,
+      "step": 16750
+    },
+    {
+      "epoch": 0.9724680147378804,
+      "grad_norm": 0.08516402542591095,
+      "learning_rate": 2.075500359770277e-06,
+      "loss": 3.102,
+      "step": 16760
+    },
+    {
+      "epoch": 0.9730482462502539,
+      "grad_norm": 0.08854671567678452,
+      "learning_rate": 1.98908813883919e-06,
+      "loss": 3.0975,
+      "step": 16770
+    },
+    {
+      "epoch": 0.9736284777626273,
+      "grad_norm": 0.08487720042467117,
+      "learning_rate": 1.9045096462558253e-06,
+      "loss": 3.0992,
+      "step": 16780
+    },
+    {
+      "epoch": 0.9742087092750007,
+      "grad_norm": 0.08441456407308578,
+      "learning_rate": 1.8217651934470669e-06,
+      "loss": 3.1019,
+      "step": 16790
+    },
+    {
+      "epoch": 0.9747889407873742,
+      "grad_norm": 0.08551038801670074,
+      "learning_rate": 1.74085508508659e-06,
+      "loss": 3.098,
+      "step": 16800
+    },
+    {
+      "epoch": 0.9753691722997476,
+      "grad_norm": 0.08782043308019638,
+      "learning_rate": 1.6617796190939726e-06,
+      "loss": 3.1039,
+      "step": 16810
+    },
+    {
+      "epoch": 0.975949403812121,
+      "grad_norm": 0.08647370338439941,
+      "learning_rate": 1.5845390866333631e-06,
+      "loss": 3.104,
+      "step": 16820
+    },
+    {
+      "epoch": 0.9765296353244944,
+      "grad_norm": 0.08616235107183456,
+      "learning_rate": 1.5091337721124254e-06,
+      "loss": 3.0958,
+      "step": 16830
+    },
+    {
+      "epoch": 0.977109866836868,
+      "grad_norm": 0.08559519797563553,
+      "learning_rate": 1.4355639531815067e-06,
+      "loss": 3.1067,
+      "step": 16840
+    },
+    {
+      "epoch": 0.9776900983492414,
+      "grad_norm": 0.08562770485877991,
+      "learning_rate": 1.363829900732305e-06,
+      "loss": 3.1058,
+      "step": 16850
+    },
+    {
+      "epoch": 0.9782703298616148,
+      "grad_norm": 0.08564095199108124,
+      "learning_rate": 1.2939318788971477e-06,
+      "loss": 3.0996,
+      "step": 16860
+    },
+    {
+      "epoch": 0.9788505613739882,
+      "grad_norm": 0.08510784804821014,
+      "learning_rate": 1.225870145047936e-06,
+      "loss": 3.092,
+      "step": 16870
+    },
+    {
+      "epoch": 0.9794307928863617,
+      "grad_norm": 0.08742259442806244,
+      "learning_rate": 1.1596449497949802e-06,
+      "loss": 3.0902,
+      "step": 16880
+    },
+    {
+      "epoch": 0.9800110243987351,
+      "grad_norm": 0.08536962419748306,
+      "learning_rate": 1.0952565369864997e-06,
+      "loss": 3.0974,
+      "step": 16890
+    },
+    {
+      "epoch": 0.9805912559111085,
+      "grad_norm": 0.08702561259269714,
+      "learning_rate": 1.0327051437073464e-06,
+      "loss": 3.1094,
+      "step": 16900
+    },
+    {
+      "epoch": 0.9811714874234819,
+      "grad_norm": 0.08652474731206894,
+      "learning_rate": 9.719910002782829e-07,
+      "loss": 3.1017,
+      "step": 16910
+    },
+    {
+      "epoch": 0.9817517189358554,
+      "grad_norm": 0.08817258477210999,
+      "learning_rate": 9.131143302551492e-07,
+      "loss": 3.1042,
+      "step": 16920
+    },
+    {
+      "epoch": 0.9823319504482289,
+      "grad_norm": 0.08725294470787048,
+      "learning_rate": 8.560753504279761e-07,
+      "loss": 3.1031,
+      "step": 16930
+    },
+    {
+      "epoch": 0.9829121819606023,
+      "grad_norm": 0.0844888985157013,
+      "learning_rate": 8.008742708203731e-07,
+      "loss": 3.1036,
+      "step": 16940
+    },
+    {
+      "epoch": 0.9834924134729757,
+      "grad_norm": 0.08515673130750656,
+      "learning_rate": 7.475112946883633e-07,
+      "loss": 3.0962,
+      "step": 16950
+    },
+    {
+      "epoch": 0.9840726449853492,
+      "grad_norm": 0.08552297949790955,
+      "learning_rate": 6.959866185201058e-07,
+      "loss": 3.0973,
+      "step": 16960
+    },
+    {
+      "epoch": 0.9846528764977226,
+      "grad_norm": 0.08687438070774078,
+      "learning_rate": 6.463004320348409e-07,
+      "loss": 3.1033,
+      "step": 16970
+    },
+    {
+      "epoch": 0.985233108010096,
+      "grad_norm": 0.0858420580625534,
+      "learning_rate": 5.984529181822795e-07,
+      "loss": 3.1055,
+      "step": 16980
+    },
+    {
+      "epoch": 0.9858133395224694,
+      "grad_norm": 0.08427808433771133,
+      "learning_rate": 5.524442531419927e-07,
+      "loss": 3.0978,
+      "step": 16990
+    },
+    {
+      "epoch": 0.9863935710348429,
+      "grad_norm": 0.0840638130903244,
+      "learning_rate": 5.08274606322745e-07,
+      "loss": 3.092,
+      "step": 17000
+    },
+    {
+      "epoch": 0.9863935710348429,
+      "eval_loss": 3.0300889015197754,
+      "eval_runtime": 3.2481,
+      "eval_samples_per_second": 1333.079,
+      "eval_steps_per_second": 10.468,
+      "step": 17000
+    },
+    {
+      "epoch": 0.9869738025472163,
+      "grad_norm": 0.08689724653959274,
+      "learning_rate": 4.6594414036171815e-07,
+      "loss": 3.1035,
+      "step": 17010
+    },
+    {
+      "epoch": 0.9875540340595897,
+      "grad_norm": 0.08669095486402512,
+      "learning_rate": 4.2545301112423274e-07,
+      "loss": 3.0939,
+      "step": 17020
+    },
+    {
+      "epoch": 0.9881342655719633,
+      "grad_norm": 0.08550863713026047,
+      "learning_rate": 3.868013677028048e-07,
+      "loss": 3.097,
+      "step": 17030
+    },
+    {
+      "epoch": 0.9887144970843367,
+      "grad_norm": 0.08494267612695694,
+      "learning_rate": 3.4998935241681295e-07,
+      "loss": 3.0995,
+      "step": 17040
+    },
+    {
+      "epoch": 0.9892947285967101,
+      "grad_norm": 0.08708130568265915,
+      "learning_rate": 3.1501710081199843e-07,
+      "loss": 3.1011,
+      "step": 17050
+    },
+    {
+      "epoch": 0.9898749601090835,
+      "grad_norm": 0.08479303121566772,
+      "learning_rate": 2.8188474165979915e-07,
+      "loss": 3.1057,
+      "step": 17060
+    },
+    {
+      "epoch": 0.990455191621457,
+      "grad_norm": 0.08544128388166428,
+      "learning_rate": 2.505923969571278e-07,
+      "loss": 3.084,
+      "step": 17070
+    },
+    {
+      "epoch": 0.9910354231338304,
+      "grad_norm": 0.08571518212556839,
+      "learning_rate": 2.2114018192553874e-07,
+      "loss": 3.1036,
+      "step": 17080
+    },
+    {
+      "epoch": 0.9916156546462038,
+      "grad_norm": 0.08791361004114151,
+      "learning_rate": 1.9352820501133961e-07,
+      "loss": 3.1063,
+      "step": 17090
+    },
+    {
+      "epoch": 0.9921958861585772,
+      "grad_norm": 0.08581080287694931,
+      "learning_rate": 1.6775656788459158e-07,
+      "loss": 3.1072,
+      "step": 17100
+    },
+    {
+      "epoch": 0.9927761176709508,
+      "grad_norm": 0.0846245288848877,
+      "learning_rate": 1.4382536543922076e-07,
+      "loss": 3.0959,
+      "step": 17110
+    },
+    {
+      "epoch": 0.9933563491833242,
+      "grad_norm": 0.08586116135120392,
+      "learning_rate": 1.217346857924073e-07,
+      "loss": 3.114,
+      "step": 17120
+    },
+    {
+      "epoch": 0.9939365806956976,
+      "grad_norm": 0.0862361267209053,
+      "learning_rate": 1.014846102843081e-07,
+      "loss": 3.0948,
+      "step": 17130
+    },
+    {
+      "epoch": 0.994516812208071,
+      "grad_norm": 0.08657976239919662,
+      "learning_rate": 8.307521347789005e-08,
+      "loss": 3.1003,
+      "step": 17140
+    },
+    {
+      "epoch": 0.9950970437204445,
+      "grad_norm": 0.0862279012799263,
+      "learning_rate": 6.650656315848602e-08,
+      "loss": 3.0992,
+      "step": 17150
+    },
+    {
+      "epoch": 0.9956772752328179,
+      "grad_norm": 0.0885181874036789,
+      "learning_rate": 5.1778720333517383e-08,
+      "loss": 3.0967,
+      "step": 17160
+    },
+    {
+      "epoch": 0.9962575067451913,
+      "grad_norm": 0.08494796603918076,
+      "learning_rate": 3.88917392325494e-08,
+      "loss": 3.1039,
+      "step": 17170
+    },
+    {
+      "epoch": 0.9968377382575647,
+      "grad_norm": 0.08858942240476608,
+      "learning_rate": 2.78456673066807e-08,
+      "loss": 3.0973,
+      "step": 17180
+    },
+    {
+      "epoch": 0.9974179697699382,
+      "grad_norm": 0.08530562371015549,
+      "learning_rate": 1.8640545228820748e-08,
+      "loss": 3.0982,
+      "step": 17190
+    },
+    {
+      "epoch": 0.9979982012823116,
+      "grad_norm": 0.08376429975032806,
+      "learning_rate": 1.1276406893079294e-08,
+      "loss": 3.0963,
+      "step": 17200
+    },
+    {
+      "epoch": 0.9985784327946851,
+      "grad_norm": 0.08597017079591751,
+      "learning_rate": 5.7532794150994e-09,
+      "loss": 3.1043,
+      "step": 17210
+    },
+    {
+      "epoch": 0.9991586643070585,
+      "grad_norm": 0.08577102422714233,
+      "learning_rate": 2.0711831315578524e-09,
+      "loss": 3.0981,
+      "step": 17220
+    },
+    {
+      "epoch": 0.999738895819432,
+      "grad_norm": 0.09527655690908432,
+      "learning_rate": 2.3013160027618442e-10,
+      "loss": 3.0959,
+      "step": 17230
+    },
+    {
+      "epoch": 0.9999709884243814,
+      "step": 17234,
+      "total_flos": 7.47219573773697e+18,
+      "train_loss": 3.3572154520448043,
+      "train_runtime": 15550.2697,
+      "train_samples_per_second": 567.438,
+      "train_steps_per_second": 1.108
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 17234,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.47219573773697e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}