diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,4793 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.09319377109279975,
+  "eval_steps": 500,
+  "global_step": 34000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00013704966337176435,
+      "grad_norm": 1.0435270071029663,
+      "learning_rate": 2e-05,
+      "loss": 1.8539,
+      "step": 50
+    },
+    {
+      "epoch": 0.0002740993267435287,
+      "grad_norm": 1.1732431650161743,
+      "learning_rate": 4e-05,
+      "loss": 1.2692,
+      "step": 100
+    },
+    {
+      "epoch": 0.00041114899011529305,
+      "grad_norm": 1.176833987236023,
+      "learning_rate": 6e-05,
+      "loss": 1.1063,
+      "step": 150
+    },
+    {
+      "epoch": 0.0005481986534870574,
+      "grad_norm": 1.2997100353240967,
+      "learning_rate": 8e-05,
+      "loss": 1.0579,
+      "step": 200
+    },
+    {
+      "epoch": 0.0006852483168588217,
+      "grad_norm": 1.2449016571044922,
+      "learning_rate": 0.0001,
+      "loss": 1.0288,
+      "step": 250
+    },
+    {
+      "epoch": 0.0008222979802305861,
+      "grad_norm": 1.1221928596496582,
+      "learning_rate": 0.00012,
+      "loss": 1.0089,
+      "step": 300
+    },
+    {
+      "epoch": 0.0009593476436023504,
+      "grad_norm": 1.4317854642868042,
+      "learning_rate": 0.00014,
+      "loss": 1.0009,
+      "step": 350
+    },
+    {
+      "epoch": 0.0010963973069741148,
+      "grad_norm": 0.8827124834060669,
+      "learning_rate": 0.00016,
+      "loss": 0.9776,
+      "step": 400
+    },
+    {
+      "epoch": 0.0012334469703458792,
+      "grad_norm": 0.7773798704147339,
+      "learning_rate": 0.00018,
+      "loss": 0.9696,
+      "step": 450
+    },
+    {
+      "epoch": 0.0013704966337176434,
+      "grad_norm": 0.7033634185791016,
+      "learning_rate": 0.0002,
+      "loss": 0.9546,
+      "step": 500
+    },
+    {
+      "epoch": 0.0015075462970894078,
+      "grad_norm": 0.7856244444847107,
+      "learning_rate": 0.0001999999907056826,
+      "loss": 0.9599,
+      "step": 550
+    },
+    {
+      "epoch": 0.0016445959604611722,
+      "grad_norm": 0.6872720122337341,
+      "learning_rate": 0.00019999996282273207,
+      "loss": 0.9439,
+      "step": 600
+    },
+    {
+      "epoch": 0.0017816456238329364,
+      "grad_norm": 0.7475189566612244,
+      "learning_rate": 0.00019999991635115367,
+      "loss": 0.9472,
+      "step": 650
+    },
+    {
+      "epoch": 0.0019186952872047008,
+      "grad_norm": 0.6948615908622742,
+      "learning_rate": 0.000199999851290956,
+      "loss": 0.9288,
+      "step": 700
+    },
+    {
+      "epoch": 0.002055744950576465,
+      "grad_norm": 0.682184636592865,
+      "learning_rate": 0.00019999976764215115,
+      "loss": 0.9259,
+      "step": 750
+    },
+    {
+      "epoch": 0.0021927946139482296,
+      "grad_norm": 0.6290674209594727,
+      "learning_rate": 0.00019999966540475462,
+      "loss": 0.933,
+      "step": 800
+    },
+    {
+      "epoch": 0.002329844277319994,
+      "grad_norm": 0.6512512564659119,
+      "learning_rate": 0.00019999954457878552,
+      "loss": 0.9161,
+      "step": 850
+    },
+    {
+      "epoch": 0.0024668939406917584,
+      "grad_norm": 0.6670469641685486,
+      "learning_rate": 0.00019999940516426623,
+      "loss": 0.921,
+      "step": 900
+    },
+    {
+      "epoch": 0.0026039436040635223,
+      "grad_norm": 0.6236344575881958,
+      "learning_rate": 0.0001999992471612227,
+      "loss": 0.9115,
+      "step": 950
+    },
+    {
+      "epoch": 0.0027409932674352868,
+      "grad_norm": 0.5856446027755737,
+      "learning_rate": 0.00019999907056968429,
+      "loss": 0.8974,
+      "step": 1000
+    },
+    {
+      "epoch": 0.002878042930807051,
+      "grad_norm": 0.6129364967346191,
+      "learning_rate": 0.00019999887538968381,
+      "loss": 0.893,
+      "step": 1050
+    },
+    {
+      "epoch": 0.0030150925941788156,
+      "grad_norm": 0.6010938286781311,
+      "learning_rate": 0.00019999866162125754,
+      "loss": 0.9043,
+      "step": 1100
+    },
+    {
+      "epoch": 0.00315214225755058,
+      "grad_norm": 0.6339828372001648,
+      "learning_rate": 0.00019999842926444528,
+      "loss": 0.9026,
+      "step": 1150
+    },
+    {
+      "epoch": 0.0032891919209223444,
+      "grad_norm": 0.6208747029304504,
+      "learning_rate": 0.00019999817831929017,
+      "loss": 0.8944,
+      "step": 1200
+    },
+    {
+      "epoch": 0.0034262415842941088,
+      "grad_norm": 0.612678587436676,
+      "learning_rate": 0.0001999979087858389,
+      "loss": 0.8858,
+      "step": 1250
+    },
+    {
+      "epoch": 0.0035632912476658727,
+      "grad_norm": 0.5846489667892456,
+      "learning_rate": 0.00019999762066414144,
+      "loss": 0.8873,
+      "step": 1300
+    },
+    {
+      "epoch": 0.003700340911037637,
+      "grad_norm": 0.6027012467384338,
+      "learning_rate": 0.00019999731395425153,
+      "loss": 0.8951,
+      "step": 1350
+    },
+    {
+      "epoch": 0.0038373905744094015,
+      "grad_norm": 0.5514549016952515,
+      "learning_rate": 0.00019999698865622606,
+      "loss": 0.8746,
+      "step": 1400
+    },
+    {
+      "epoch": 0.003974440237781166,
+      "grad_norm": 0.6226556301116943,
+      "learning_rate": 0.00019999664477012553,
+      "loss": 0.8852,
+      "step": 1450
+    },
+    {
+      "epoch": 0.00411148990115293,
+      "grad_norm": 0.5818433165550232,
+      "learning_rate": 0.00019999628229601388,
+      "loss": 0.8859,
+      "step": 1500
+    },
+    {
+      "epoch": 0.004248539564524694,
+      "grad_norm": 0.588690996170044,
+      "learning_rate": 0.0001999959012339585,
+      "loss": 0.8813,
+      "step": 1550
+    },
+    {
+      "epoch": 0.004385589227896459,
+      "grad_norm": 0.5324600338935852,
+      "learning_rate": 0.00019999550158403018,
+      "loss": 0.8643,
+      "step": 1600
+    },
+    {
+      "epoch": 0.004522638891268223,
+      "grad_norm": 0.5539863109588623,
+      "learning_rate": 0.00019999508334630323,
+      "loss": 0.8732,
+      "step": 1650
+    },
+    {
+      "epoch": 0.004659688554639988,
+      "grad_norm": 0.5579485893249512,
+      "learning_rate": 0.0001999946465208554,
+      "loss": 0.8718,
+      "step": 1700
+    },
+    {
+      "epoch": 0.004796738218011752,
+      "grad_norm": 0.5488457083702087,
+      "learning_rate": 0.0001999941911077679,
+      "loss": 0.8656,
+      "step": 1750
+    },
+    {
+      "epoch": 0.004933787881383517,
+      "grad_norm": 0.5370609164237976,
+      "learning_rate": 0.00019999371710712537,
+      "loss": 0.8729,
+      "step": 1800
+    },
+    {
+      "epoch": 0.005070837544755281,
+      "grad_norm": 0.5298507809638977,
+      "learning_rate": 0.0001999932245190159,
+      "loss": 0.8703,
+      "step": 1850
+    },
+    {
+      "epoch": 0.005207887208127045,
+      "grad_norm": 0.5601800084114075,
+      "learning_rate": 0.0001999927133435311,
+      "loss": 0.874,
+      "step": 1900
+    },
+    {
+      "epoch": 0.0053449368714988095,
+      "grad_norm": 0.547234833240509,
+      "learning_rate": 0.00019999218358076598,
+      "loss": 0.8713,
+      "step": 1950
+    },
+    {
+      "epoch": 0.0054819865348705735,
+      "grad_norm": 0.5623106360435486,
+      "learning_rate": 0.00019999163523081896,
+      "loss": 0.8742,
+      "step": 2000
+    },
+    {
+      "epoch": 0.005619036198242338,
+      "grad_norm": 0.5385792255401611,
+      "learning_rate": 0.00019999106829379207,
+      "loss": 0.8679,
+      "step": 2050
+    },
+    {
+      "epoch": 0.005756085861614102,
+      "grad_norm": 0.5167329907417297,
+      "learning_rate": 0.0001999904827697906,
+      "loss": 0.8634,
+      "step": 2100
+    },
+    {
+      "epoch": 0.005893135524985867,
+      "grad_norm": 0.5551963448524475,
+      "learning_rate": 0.00019998987865892345,
+      "loss": 0.8684,
+      "step": 2150
+    },
+    {
+      "epoch": 0.006030185188357631,
+      "grad_norm": 0.5749981999397278,
+      "learning_rate": 0.00019998925596130288,
+      "loss": 0.8593,
+      "step": 2200
+    },
+    {
+      "epoch": 0.006167234851729395,
+      "grad_norm": 0.506532609462738,
+      "learning_rate": 0.00019998861467704468,
+      "loss": 0.8511,
+      "step": 2250
+    },
+    {
+      "epoch": 0.00630428451510116,
+      "grad_norm": 0.5676366686820984,
+      "learning_rate": 0.000199987954806268,
+      "loss": 0.8704,
+      "step": 2300
+    },
+    {
+      "epoch": 0.006441334178472924,
+      "grad_norm": 0.5860298871994019,
+      "learning_rate": 0.00019998727634909557,
+      "loss": 0.8624,
+      "step": 2350
+    },
+    {
+      "epoch": 0.006578383841844689,
+      "grad_norm": 0.5202209949493408,
+      "learning_rate": 0.00019998657930565348,
+      "loss": 0.8681,
+      "step": 2400
+    },
+    {
+      "epoch": 0.006715433505216453,
+      "grad_norm": 0.5211143493652344,
+      "learning_rate": 0.00019998586367607127,
+      "loss": 0.8578,
+      "step": 2450
+    },
+    {
+      "epoch": 0.0068524831685882175,
+      "grad_norm": 0.5322254300117493,
+      "learning_rate": 0.000199985129460482,
+      "loss": 0.8595,
+      "step": 2500
+    },
+    {
+      "epoch": 0.0069895328319599815,
+      "grad_norm": 0.5367264151573181,
+      "learning_rate": 0.00019998437665902214,
+      "loss": 0.8549,
+      "step": 2550
+    },
+    {
+      "epoch": 0.0071265824953317455,
+      "grad_norm": 0.5147452354431152,
+      "learning_rate": 0.0001999836052718316,
+      "loss": 0.8608,
+      "step": 2600
+    },
+    {
+      "epoch": 0.00726363215870351,
+      "grad_norm": 0.5567029118537903,
+      "learning_rate": 0.0001999828152990538,
+      "loss": 0.8648,
+      "step": 2650
+    },
+    {
+      "epoch": 0.007400681822075274,
+      "grad_norm": 0.5113374590873718,
+      "learning_rate": 0.00019998200674083562,
+      "loss": 0.8608,
+      "step": 2700
+    },
+    {
+      "epoch": 0.007537731485447039,
+      "grad_norm": 0.5345346331596375,
+      "learning_rate": 0.0001999811795973273,
+      "loss": 0.8603,
+      "step": 2750
+    },
+    {
+      "epoch": 0.007674781148818803,
+      "grad_norm": 0.5091614723205566,
+      "learning_rate": 0.00019998033386868258,
+      "loss": 0.863,
+      "step": 2800
+    },
+    {
+      "epoch": 0.007811830812190568,
+      "grad_norm": 0.5146520137786865,
+      "learning_rate": 0.00019997946955505874,
+      "loss": 0.8555,
+      "step": 2850
+    },
+    {
+      "epoch": 0.007948880475562333,
+      "grad_norm": 0.5425053834915161,
+      "learning_rate": 0.0001999785866566164,
+      "loss": 0.847,
+      "step": 2900
+    },
+    {
+      "epoch": 0.008085930138934097,
+      "grad_norm": 0.5353215932846069,
+      "learning_rate": 0.00019997768517351967,
+      "loss": 0.8463,
+      "step": 2950
+    },
+    {
+      "epoch": 0.00822297980230586,
+      "grad_norm": 0.5260730385780334,
+      "learning_rate": 0.00019997676510593614,
+      "loss": 0.8536,
+      "step": 3000
+    },
+    {
+      "epoch": 0.008360029465677625,
+      "grad_norm": 0.5397545099258423,
+      "learning_rate": 0.00019997582645403687,
+      "loss": 0.8579,
+      "step": 3050
+    },
+    {
+      "epoch": 0.008497079129049389,
+      "grad_norm": 0.5507705211639404,
+      "learning_rate": 0.0001999748692179963,
+      "loss": 0.8429,
+      "step": 3100
+    },
+    {
+      "epoch": 0.008634128792421154,
+      "grad_norm": 0.5049775838851929,
+      "learning_rate": 0.00019997389339799235,
+      "loss": 0.8497,
+      "step": 3150
+    },
+    {
+      "epoch": 0.008771178455792918,
+      "grad_norm": 0.49926578998565674,
+      "learning_rate": 0.00019997289899420647,
+      "loss": 0.8384,
+      "step": 3200
+    },
+    {
+      "epoch": 0.008908228119164682,
+      "grad_norm": 0.5003844499588013,
+      "learning_rate": 0.00019997188600682345,
+      "loss": 0.8465,
+      "step": 3250
+    },
+    {
+      "epoch": 0.009045277782536446,
+      "grad_norm": 0.5375053286552429,
+      "learning_rate": 0.00019997085443603166,
+      "loss": 0.8496,
+      "step": 3300
+    },
+    {
+      "epoch": 0.00918232744590821,
+      "grad_norm": 0.5452185869216919,
+      "learning_rate": 0.0001999698042820228,
+      "loss": 0.8502,
+      "step": 3350
+    },
+    {
+      "epoch": 0.009319377109279976,
+      "grad_norm": 0.53727126121521,
+      "learning_rate": 0.00019996873554499204,
+      "loss": 0.8498,
+      "step": 3400
+    },
+    {
+      "epoch": 0.00945642677265174,
+      "grad_norm": 0.4958254098892212,
+      "learning_rate": 0.00019996764822513812,
+      "loss": 0.8532,
+      "step": 3450
+    },
+    {
+      "epoch": 0.009593476436023504,
+      "grad_norm": 0.5111714601516724,
+      "learning_rate": 0.00019996654232266314,
+      "loss": 0.8457,
+      "step": 3500
+    },
+    {
+      "epoch": 0.009730526099395268,
+      "grad_norm": 0.5191270112991333,
+      "learning_rate": 0.00019996541783777268,
+      "loss": 0.8555,
+      "step": 3550
+    },
+    {
+      "epoch": 0.009867575762767034,
+      "grad_norm": 0.47990381717681885,
+      "learning_rate": 0.0001999642747706757,
+      "loss": 0.8382,
+      "step": 3600
+    },
+    {
+      "epoch": 0.010004625426138797,
+      "grad_norm": 0.5000740885734558,
+      "learning_rate": 0.0001999631131215848,
+      "loss": 0.845,
+      "step": 3650
+    },
+    {
+      "epoch": 0.010141675089510561,
+      "grad_norm": 0.5065198540687561,
+      "learning_rate": 0.0001999619328907158,
+      "loss": 0.8392,
+      "step": 3700
+    },
+    {
+      "epoch": 0.010278724752882325,
+      "grad_norm": 0.5001168847084045,
+      "learning_rate": 0.00019996073407828812,
+      "loss": 0.8455,
+      "step": 3750
+    },
+    {
+      "epoch": 0.01041577441625409,
+      "grad_norm": 0.501194417476654,
+      "learning_rate": 0.00019995951668452466,
+      "loss": 0.8474,
+      "step": 3800
+    },
+    {
+      "epoch": 0.010552824079625855,
+      "grad_norm": 0.5274146795272827,
+      "learning_rate": 0.00019995828070965165,
+      "loss": 0.8494,
+      "step": 3850
+    },
+    {
+      "epoch": 0.010689873742997619,
+      "grad_norm": 0.4850250780582428,
+      "learning_rate": 0.00019995702615389885,
+      "loss": 0.8508,
+      "step": 3900
+    },
+    {
+      "epoch": 0.010826923406369383,
+      "grad_norm": 0.49130746722221375,
+      "learning_rate": 0.00019995575301749954,
+      "loss": 0.8422,
+      "step": 3950
+    },
+    {
+      "epoch": 0.010963973069741147,
+      "grad_norm": 0.5032249093055725,
+      "learning_rate": 0.00019995446130069026,
+      "loss": 0.8419,
+      "step": 4000
+    },
+    {
+      "epoch": 0.011101022733112911,
+      "grad_norm": 0.5428338646888733,
+      "learning_rate": 0.0001999531510037112,
+      "loss": 0.839,
+      "step": 4050
+    },
+    {
+      "epoch": 0.011238072396484677,
+      "grad_norm": 0.5070387125015259,
+      "learning_rate": 0.00019995182212680592,
+      "loss": 0.8372,
+      "step": 4100
+    },
+    {
+      "epoch": 0.01137512205985644,
+      "grad_norm": 0.506024956703186,
+      "learning_rate": 0.0001999504746702214,
+      "loss": 0.8525,
+      "step": 4150
+    },
+    {
+      "epoch": 0.011512171723228205,
+      "grad_norm": 0.48989248275756836,
+      "learning_rate": 0.00019994910863420815,
+      "loss": 0.846,
+      "step": 4200
+    },
+    {
+      "epoch": 0.011649221386599969,
+      "grad_norm": 0.5321078300476074,
+      "learning_rate": 0.0001999477240190201,
+      "loss": 0.8383,
+      "step": 4250
+    },
+    {
+      "epoch": 0.011786271049971734,
+      "grad_norm": 0.4978984594345093,
+      "learning_rate": 0.00019994632082491462,
+      "loss": 0.8466,
+      "step": 4300
+    },
+    {
+      "epoch": 0.011923320713343498,
+      "grad_norm": 0.5495821833610535,
+      "learning_rate": 0.00019994489905215254,
+      "loss": 0.8467,
+      "step": 4350
+    },
+    {
+      "epoch": 0.012060370376715262,
+      "grad_norm": 0.49797794222831726,
+      "learning_rate": 0.00019994345870099814,
+      "loss": 0.8314,
+      "step": 4400
+    },
+    {
+      "epoch": 0.012197420040087026,
+      "grad_norm": 0.5053208470344543,
+      "learning_rate": 0.00019994199977171922,
+      "loss": 0.835,
+      "step": 4450
+    },
+    {
+      "epoch": 0.01233446970345879,
+      "grad_norm": 0.5199260115623474,
+      "learning_rate": 0.00019994052226458687,
+      "loss": 0.8353,
+      "step": 4500
+    },
+    {
+      "epoch": 0.012471519366830556,
+      "grad_norm": 0.48166322708129883,
+      "learning_rate": 0.00019993902617987584,
+      "loss": 0.8383,
+      "step": 4550
+    },
+    {
+      "epoch": 0.01260856903020232,
+      "grad_norm": 0.49765893816947937,
+      "learning_rate": 0.00019993751151786414,
+      "loss": 0.8426,
+      "step": 4600
+    },
+    {
+      "epoch": 0.012745618693574084,
+      "grad_norm": 0.5026233792304993,
+      "learning_rate": 0.00019993597827883345,
+      "loss": 0.8303,
+      "step": 4650
+    },
+    {
+      "epoch": 0.012882668356945848,
+      "grad_norm": 0.46544715762138367,
+      "learning_rate": 0.0001999344264630686,
+      "loss": 0.8401,
+      "step": 4700
+    },
+    {
+      "epoch": 0.013019718020317612,
+      "grad_norm": 0.5394437909126282,
+      "learning_rate": 0.00019993285607085827,
+      "loss": 0.8342,
+      "step": 4750
+    },
+    {
+      "epoch": 0.013156767683689377,
+      "grad_norm": 0.5067171454429626,
+      "learning_rate": 0.00019993126710249416,
+      "loss": 0.8325,
+      "step": 4800
+    },
+    {
+      "epoch": 0.013293817347061141,
+      "grad_norm": 0.48171430826187134,
+      "learning_rate": 0.0001999296595582718,
+      "loss": 0.8336,
+      "step": 4850
+    },
+    {
+      "epoch": 0.013430867010432905,
+      "grad_norm": 0.49666112661361694,
+      "learning_rate": 0.00019992803343848992,
+      "loss": 0.8305,
+      "step": 4900
+    },
+    {
+      "epoch": 0.01356791667380467,
+      "grad_norm": 0.4750135838985443,
+      "learning_rate": 0.00019992638874345082,
+      "loss": 0.839,
+      "step": 4950
+    },
+    {
+      "epoch": 0.013704966337176435,
+      "grad_norm": 0.49714985489845276,
+      "learning_rate": 0.00019992472547346023,
+      "loss": 0.8303,
+      "step": 5000
+    },
+    {
+      "epoch": 0.013842016000548199,
+      "grad_norm": 0.48859328031539917,
+      "learning_rate": 0.0001999230436288273,
+      "loss": 0.8359,
+      "step": 5050
+    },
+    {
+      "epoch": 0.013979065663919963,
+      "grad_norm": 0.5311440229415894,
+      "learning_rate": 0.00019992134320986473,
+      "loss": 0.8295,
+      "step": 5100
+    },
+    {
+      "epoch": 0.014116115327291727,
+      "grad_norm": 0.5218192934989929,
+      "learning_rate": 0.00019991962421688855,
+      "loss": 0.833,
+      "step": 5150
+    },
+    {
+      "epoch": 0.014253164990663491,
+      "grad_norm": 0.4954502284526825,
+      "learning_rate": 0.00019991788665021828,
+      "loss": 0.835,
+      "step": 5200
+    },
+    {
+      "epoch": 0.014390214654035257,
+      "grad_norm": 0.502061665058136,
+      "learning_rate": 0.00019991613051017698,
+      "loss": 0.8305,
+      "step": 5250
+    },
+    {
+      "epoch": 0.01452726431740702,
+      "grad_norm": 0.4653523564338684,
+      "learning_rate": 0.00019991435579709102,
+      "loss": 0.838,
+      "step": 5300
+    },
+    {
+      "epoch": 0.014664313980778785,
+      "grad_norm": 0.4881741404533386,
+      "learning_rate": 0.00019991256251129035,
+      "loss": 0.827,
+      "step": 5350
+    },
+    {
+      "epoch": 0.014801363644150549,
+      "grad_norm": 0.49700623750686646,
+      "learning_rate": 0.0001999107506531083,
+      "loss": 0.8418,
+      "step": 5400
+    },
+    {
+      "epoch": 0.014938413307522314,
+      "grad_norm": 0.535879909992218,
+      "learning_rate": 0.00019990892022288164,
+      "loss": 0.8402,
+      "step": 5450
+    },
+    {
+      "epoch": 0.015075462970894078,
+      "grad_norm": 0.4582114815711975,
+      "learning_rate": 0.00019990707122095064,
+      "loss": 0.8367,
+      "step": 5500
+    },
+    {
+      "epoch": 0.015212512634265842,
+      "grad_norm": 0.5143499970436096,
+      "learning_rate": 0.00019990520364765902,
+      "loss": 0.8314,
+      "step": 5550
+    },
+    {
+      "epoch": 0.015349562297637606,
+      "grad_norm": 0.45117756724357605,
+      "learning_rate": 0.00019990331750335393,
+      "loss": 0.8224,
+      "step": 5600
+    },
+    {
+      "epoch": 0.01548661196100937,
+      "grad_norm": 0.5038974285125732,
+      "learning_rate": 0.000199901412788386,
+      "loss": 0.8379,
+      "step": 5650
+    },
+    {
+      "epoch": 0.015623661624381136,
+      "grad_norm": 0.4858607351779938,
+      "learning_rate": 0.0001998994895031092,
+      "loss": 0.8329,
+      "step": 5700
+    },
+    {
+      "epoch": 0.015760711287752898,
+      "grad_norm": 0.4778089225292206,
+      "learning_rate": 0.0001998975476478812,
+      "loss": 0.8261,
+      "step": 5750
+    },
+    {
+      "epoch": 0.015897760951124666,
+      "grad_norm": 0.47703468799591064,
+      "learning_rate": 0.00019989558722306277,
+      "loss": 0.8276,
+      "step": 5800
+    },
+    {
+      "epoch": 0.01603481061449643,
+      "grad_norm": 0.49877598881721497,
+      "learning_rate": 0.0001998936082290185,
+      "loss": 0.8374,
+      "step": 5850
+    },
+    {
+      "epoch": 0.016171860277868193,
+      "grad_norm": 0.45777496695518494,
+      "learning_rate": 0.00019989161066611617,
+      "loss": 0.8345,
+      "step": 5900
+    },
+    {
+      "epoch": 0.016308909941239957,
+      "grad_norm": 0.4969111382961273,
+      "learning_rate": 0.00019988959453472708,
+      "loss": 0.83,
+      "step": 5950
+    },
+    {
+      "epoch": 0.01644595960461172,
+      "grad_norm": 0.4316222667694092,
+      "learning_rate": 0.0001998875598352261,
+      "loss": 0.8248,
+      "step": 6000
+    },
+    {
+      "epoch": 0.016583009267983485,
+      "grad_norm": 0.4757803678512573,
+      "learning_rate": 0.00019988550656799135,
+      "loss": 0.8336,
+      "step": 6050
+    },
+    {
+      "epoch": 0.01672005893135525,
+      "grad_norm": 0.5188441872596741,
+      "learning_rate": 0.00019988343473340456,
+      "loss": 0.8336,
+      "step": 6100
+    },
+    {
+      "epoch": 0.016857108594727013,
+      "grad_norm": 0.5048016309738159,
+      "learning_rate": 0.00019988134433185083,
+      "loss": 0.8209,
+      "step": 6150
+    },
+    {
+      "epoch": 0.016994158258098777,
+      "grad_norm": 0.5321030020713806,
+      "learning_rate": 0.00019987923536371875,
+      "loss": 0.8338,
+      "step": 6200
+    },
+    {
+      "epoch": 0.01713120792147054,
+      "grad_norm": 0.5210222005844116,
+      "learning_rate": 0.00019987710782940035,
+      "loss": 0.8313,
+      "step": 6250
+    },
+    {
+      "epoch": 0.01726825758484231,
+      "grad_norm": 0.47530996799468994,
+      "learning_rate": 0.00019987496172929107,
+      "loss": 0.8271,
+      "step": 6300
+    },
+    {
+      "epoch": 0.017405307248214073,
+      "grad_norm": 0.47147926688194275,
+      "learning_rate": 0.00019987279706378992,
+      "loss": 0.8241,
+      "step": 6350
+    },
+    {
+      "epoch": 0.017542356911585837,
+      "grad_norm": 0.5165305733680725,
+      "learning_rate": 0.0001998706138332992,
+      "loss": 0.8255,
+      "step": 6400
+    },
+    {
+      "epoch": 0.0176794065749576,
+      "grad_norm": 0.5041708946228027,
+      "learning_rate": 0.00019986841203822482,
+      "loss": 0.8251,
+      "step": 6450
+    },
+    {
+      "epoch": 0.017816456238329365,
+      "grad_norm": 0.4591565430164337,
+      "learning_rate": 0.000199866191678976,
+      "loss": 0.8285,
+      "step": 6500
+    },
+    {
+      "epoch": 0.01795350590170113,
+      "grad_norm": 0.5095290541648865,
+      "learning_rate": 0.00019986395275596553,
+      "loss": 0.8325,
+      "step": 6550
+    },
+    {
+      "epoch": 0.018090555565072892,
+      "grad_norm": 0.5019941926002502,
+      "learning_rate": 0.00019986169526960953,
+      "loss": 0.8266,
+      "step": 6600
+    },
+    {
+      "epoch": 0.018227605228444656,
+      "grad_norm": 0.4563123285770416,
+      "learning_rate": 0.00019985941922032768,
+      "loss": 0.8215,
+      "step": 6650
+    },
+    {
+      "epoch": 0.01836465489181642,
+      "grad_norm": 0.46666857600212097,
+      "learning_rate": 0.00019985712460854308,
+      "loss": 0.8223,
+      "step": 6700
+    },
+    {
+      "epoch": 0.018501704555188188,
+      "grad_norm": 0.4678499102592468,
+      "learning_rate": 0.00019985481143468224,
+      "loss": 0.8319,
+      "step": 6750
+    },
+    {
+      "epoch": 0.018638754218559952,
+      "grad_norm": 0.47524577379226685,
+      "learning_rate": 0.00019985247969917511,
+      "loss": 0.826,
+      "step": 6800
+    },
+    {
+      "epoch": 0.018775803881931716,
+      "grad_norm": 0.49438291788101196,
+      "learning_rate": 0.0001998501294024552,
+      "loss": 0.8244,
+      "step": 6850
+    },
+    {
+      "epoch": 0.01891285354530348,
+      "grad_norm": 0.49879634380340576,
+      "learning_rate": 0.00019984776054495935,
+      "loss": 0.8315,
+      "step": 6900
+    },
+    {
+      "epoch": 0.019049903208675244,
+      "grad_norm": 0.44203802943229675,
+      "learning_rate": 0.00019984537312712792,
+      "loss": 0.8286,
+      "step": 6950
+    },
+    {
+      "epoch": 0.019186952872047008,
+      "grad_norm": 0.503197193145752,
+      "learning_rate": 0.00019984296714940469,
+      "loss": 0.826,
+      "step": 7000
+    },
+    {
+      "epoch": 0.01932400253541877,
+      "grad_norm": 0.47709017992019653,
+      "learning_rate": 0.0001998405426122369,
+      "loss": 0.8289,
+      "step": 7050
+    },
+    {
+      "epoch": 0.019461052198790536,
+      "grad_norm": 0.47072190046310425,
+      "learning_rate": 0.00019983809951607526,
+      "loss": 0.8241,
+      "step": 7100
+    },
+    {
+      "epoch": 0.0195981018621623,
+      "grad_norm": 0.5062088370323181,
+      "learning_rate": 0.00019983563786137387,
+      "loss": 0.8298,
+      "step": 7150
+    },
+    {
+      "epoch": 0.019735151525534067,
+      "grad_norm": 0.47417354583740234,
+      "learning_rate": 0.00019983315764859034,
+      "loss": 0.824,
+      "step": 7200
+    },
+    {
+      "epoch": 0.01987220118890583,
+      "grad_norm": 0.48068752884864807,
+      "learning_rate": 0.0001998306588781857,
+      "loss": 0.8237,
+      "step": 7250
+    },
+    {
+      "epoch": 0.020009250852277595,
+      "grad_norm": 0.4789169430732727,
+      "learning_rate": 0.00019982814155062445,
+      "loss": 0.8289,
+      "step": 7300
+    },
+    {
+      "epoch": 0.02014630051564936,
+      "grad_norm": 0.48607948422431946,
+      "learning_rate": 0.00019982560566637455,
+      "loss": 0.8241,
+      "step": 7350
+    },
+    {
+      "epoch": 0.020283350179021123,
+      "grad_norm": 0.4875280261039734,
+      "learning_rate": 0.0001998230512259073,
+      "loss": 0.8236,
+      "step": 7400
+    },
+    {
+      "epoch": 0.020420399842392887,
+      "grad_norm": 0.5202775001525879,
+      "learning_rate": 0.00019982047822969762,
+      "loss": 0.8231,
+      "step": 7450
+    },
+    {
+      "epoch": 0.02055744950576465,
+      "grad_norm": 0.4869972765445709,
+      "learning_rate": 0.00019981788667822374,
+      "loss": 0.8248,
+      "step": 7500
+    },
+    {
+      "epoch": 0.020694499169136415,
+      "grad_norm": 0.49330034852027893,
+      "learning_rate": 0.00019981527657196745,
+      "loss": 0.8358,
+      "step": 7550
+    },
+    {
+      "epoch": 0.02083154883250818,
+      "grad_norm": 0.49196910858154297,
+      "learning_rate": 0.00019981264791141387,
+      "loss": 0.8218,
+      "step": 7600
+    },
+    {
+      "epoch": 0.020968598495879943,
+      "grad_norm": 0.48113760352134705,
+      "learning_rate": 0.00019981000069705168,
+      "loss": 0.8233,
+      "step": 7650
+    },
+    {
+      "epoch": 0.02110564815925171,
+      "grad_norm": 0.47389155626296997,
+      "learning_rate": 0.00019980733492937293,
+      "loss": 0.8177,
+      "step": 7700
+    },
+    {
+      "epoch": 0.021242697822623474,
+      "grad_norm": 0.48917362093925476,
+      "learning_rate": 0.00019980465060887319,
+      "loss": 0.8261,
+      "step": 7750
+    },
+    {
+      "epoch": 0.021379747485995238,
+      "grad_norm": 0.4534808397293091,
+      "learning_rate": 0.00019980194773605141,
+      "loss": 0.8249,
+      "step": 7800
+    },
+    {
+      "epoch": 0.021516797149367002,
+      "grad_norm": 0.5190074443817139,
+      "learning_rate": 0.00019979922631141,
+      "loss": 0.834,
+      "step": 7850
+    },
+    {
+      "epoch": 0.021653846812738766,
+      "grad_norm": 0.49182918667793274,
+      "learning_rate": 0.00019979648633545487,
+      "loss": 0.8229,
+      "step": 7900
+    },
+    {
+      "epoch": 0.02179089647611053,
+      "grad_norm": 0.47965994477272034,
+      "learning_rate": 0.00019979372780869534,
+      "loss": 0.8204,
+      "step": 7950
+    },
+    {
+      "epoch": 0.021927946139482294,
+      "grad_norm": 0.48521897196769714,
+      "learning_rate": 0.00019979095073164416,
+      "loss": 0.8215,
+      "step": 8000
+    },
+    {
+      "epoch": 0.022064995802854058,
+      "grad_norm": 0.522633969783783,
+      "learning_rate": 0.00019978815510481756,
+      "loss": 0.826,
+      "step": 8050
+    },
+    {
+      "epoch": 0.022202045466225822,
+      "grad_norm": 0.47689947485923767,
+      "learning_rate": 0.00019978534092873523,
+      "loss": 0.8232,
+      "step": 8100
+    },
+    {
+      "epoch": 0.02233909512959759,
+      "grad_norm": 0.4920557737350464,
+      "learning_rate": 0.00019978250820392024,
+      "loss": 0.8271,
+      "step": 8150
+    },
+    {
+      "epoch": 0.022476144792969353,
+      "grad_norm": 0.49732133746147156,
+      "learning_rate": 0.00019977965693089922,
+      "loss": 0.8255,
+      "step": 8200
+    },
+    {
+      "epoch": 0.022613194456341117,
+      "grad_norm": 0.4798087775707245,
+      "learning_rate": 0.00019977678711020214,
+      "loss": 0.818,
+      "step": 8250
+    },
+    {
+      "epoch": 0.02275024411971288,
+      "grad_norm": 0.48783496022224426,
+      "learning_rate": 0.00019977389874236242,
+      "loss": 0.8164,
+      "step": 8300
+    },
+    {
+      "epoch": 0.022887293783084645,
+      "grad_norm": 0.48201945424079895,
+      "learning_rate": 0.00019977099182791707,
+      "loss": 0.8114,
+      "step": 8350
+    },
+    {
+      "epoch": 0.02302434344645641,
+      "grad_norm": 0.4945693016052246,
+      "learning_rate": 0.00019976806636740638,
+      "loss": 0.8213,
+      "step": 8400
+    },
+    {
+      "epoch": 0.023161393109828173,
+      "grad_norm": 0.4559716582298279,
+      "learning_rate": 0.00019976512236137418,
+      "loss": 0.8133,
+      "step": 8450
+    },
+    {
+      "epoch": 0.023298442773199937,
+      "grad_norm": 0.5023700594902039,
+      "learning_rate": 0.00019976215981036766,
+      "loss": 0.8205,
+      "step": 8500
+    },
+    {
+      "epoch": 0.0234354924365717,
+      "grad_norm": 0.49640583992004395,
+      "learning_rate": 0.00019975917871493758,
+      "loss": 0.8304,
+      "step": 8550
+    },
+    {
+      "epoch": 0.02357254209994347,
+      "grad_norm": 0.4793291389942169,
+      "learning_rate": 0.0001997561790756381,
+      "loss": 0.8312,
+      "step": 8600
+    },
+    {
+      "epoch": 0.023709591763315233,
+      "grad_norm": 0.5027498006820679,
+      "learning_rate": 0.00019975316089302674,
+      "loss": 0.8255,
+      "step": 8650
+    },
+    {
+      "epoch": 0.023846641426686997,
+      "grad_norm": 0.48330357670783997,
+      "learning_rate": 0.0001997501241676646,
+      "loss": 0.817,
+      "step": 8700
+    },
+    {
+      "epoch": 0.02398369109005876,
+      "grad_norm": 0.4686134159564972,
+      "learning_rate": 0.00019974706890011615,
+      "loss": 0.8145,
+      "step": 8750
+    },
+    {
+      "epoch": 0.024120740753430524,
+      "grad_norm": 0.5041179656982422,
+      "learning_rate": 0.00019974399509094929,
+      "loss": 0.8226,
+      "step": 8800
+    },
+    {
+      "epoch": 0.02425779041680229,
+      "grad_norm": 0.48190945386886597,
+      "learning_rate": 0.00019974090274073544,
+      "loss": 0.8174,
+      "step": 8850
+    },
+    {
+      "epoch": 0.024394840080174052,
+      "grad_norm": 0.4537580907344818,
+      "learning_rate": 0.00019973779185004941,
+      "loss": 0.8236,
+      "step": 8900
+    },
+    {
+      "epoch": 0.024531889743545816,
+      "grad_norm": 0.46414464712142944,
+      "learning_rate": 0.00019973466241946948,
+      "loss": 0.8154,
+      "step": 8950
+    },
+    {
+      "epoch": 0.02466893940691758,
+      "grad_norm": 0.4720008671283722,
+      "learning_rate": 0.00019973151444957737,
+      "loss": 0.8171,
+      "step": 9000
+    },
+    {
+      "epoch": 0.024805989070289348,
+      "grad_norm": 0.484370619058609,
+      "learning_rate": 0.0001997283479409582,
+      "loss": 0.8226,
+      "step": 9050
+    },
+    {
+      "epoch": 0.024943038733661112,
+      "grad_norm": 0.44341036677360535,
+      "learning_rate": 0.00019972516289420064,
+      "loss": 0.8214,
+      "step": 9100
+    },
+    {
+      "epoch": 0.025080088397032876,
+      "grad_norm": 0.4852900207042694,
+      "learning_rate": 0.00019972195930989675,
+      "loss": 0.8279,
+      "step": 9150
+    },
+    {
+      "epoch": 0.02521713806040464,
+      "grad_norm": 0.4795898199081421,
+      "learning_rate": 0.000199718737188642,
+      "loss": 0.8256,
+      "step": 9200
+    },
+    {
+      "epoch": 0.025354187723776404,
+      "grad_norm": 0.47716230154037476,
+      "learning_rate": 0.00019971549653103533,
+      "loss": 0.815,
+      "step": 9250
+    },
+    {
+      "epoch": 0.025491237387148168,
+      "grad_norm": 0.4899328351020813,
+      "learning_rate": 0.00019971223733767913,
+      "loss": 0.8123,
+      "step": 9300
+    },
+    {
+      "epoch": 0.02562828705051993,
+      "grad_norm": 0.48426783084869385,
+      "learning_rate": 0.00019970895960917927,
+      "loss": 0.8128,
+      "step": 9350
+    },
+    {
+      "epoch": 0.025765336713891696,
+      "grad_norm": 0.48592498898506165,
+      "learning_rate": 0.00019970566334614502,
+      "loss": 0.8117,
+      "step": 9400
+    },
+    {
+      "epoch": 0.02590238637726346,
+      "grad_norm": 0.4794251322746277,
+      "learning_rate": 0.00019970234854918914,
+      "loss": 0.8165,
+      "step": 9450
+    },
+    {
+      "epoch": 0.026039436040635223,
+      "grad_norm": 0.4640505015850067,
+      "learning_rate": 0.00019969901521892778,
+      "loss": 0.8194,
+      "step": 9500
+    },
+    {
+      "epoch": 0.02617648570400699,
+      "grad_norm": 0.4869844913482666,
+      "learning_rate": 0.00019969566335598056,
+      "loss": 0.8198,
+      "step": 9550
+    },
+    {
+      "epoch": 0.026313535367378755,
+      "grad_norm": 0.4699898362159729,
+      "learning_rate": 0.00019969229296097052,
+      "loss": 0.8145,
+      "step": 9600
+    },
+    {
+      "epoch": 0.02645058503075052,
+      "grad_norm": 0.43347305059432983,
+      "learning_rate": 0.0001996889040345242,
+      "loss": 0.826,
+      "step": 9650
+    },
+    {
+      "epoch": 0.026587634694122283,
+      "grad_norm": 0.49129053950309753,
+      "learning_rate": 0.00019968549657727155,
+      "loss": 0.8168,
+      "step": 9700
+    },
+    {
+      "epoch": 0.026724684357494047,
+      "grad_norm": 0.48174822330474854,
+      "learning_rate": 0.00019968207058984597,
+      "loss": 0.8116,
+      "step": 9750
+    },
+    {
+      "epoch": 0.02686173402086581,
+      "grad_norm": 0.4568016827106476,
+      "learning_rate": 0.00019967862607288435,
+      "loss": 0.8142,
+      "step": 9800
+    },
+    {
+      "epoch": 0.026998783684237575,
+      "grad_norm": 0.4910202622413635,
+      "learning_rate": 0.00019967516302702692,
+      "loss": 0.8181,
+      "step": 9850
+    },
+    {
+      "epoch": 0.02713583334760934,
+      "grad_norm": 0.4772759974002838,
+      "learning_rate": 0.0001996716814529174,
+      "loss": 0.8116,
+      "step": 9900
+    },
+    {
+      "epoch": 0.027272883010981103,
+      "grad_norm": 0.5179160833358765,
+      "learning_rate": 0.000199668181351203,
+      "loss": 0.8254,
+      "step": 9950
+    },
+    {
+      "epoch": 0.02740993267435287,
+      "grad_norm": 0.506245493888855,
+      "learning_rate": 0.00019966466272253435,
+      "loss": 0.8182,
+      "step": 10000
+    },
+    {
+      "epoch": 0.027546982337724634,
+      "grad_norm": 0.4551698565483093,
+      "learning_rate": 0.00019966112556756547,
+      "loss": 0.8201,
+      "step": 10050
+    },
+    {
+      "epoch": 0.027684032001096398,
+      "grad_norm": 0.4976311922073364,
+      "learning_rate": 0.00019965756988695393,
+      "loss": 0.8195,
+      "step": 10100
+    },
+    {
+      "epoch": 0.027821081664468162,
+      "grad_norm": 0.49078822135925293,
+      "learning_rate": 0.00019965399568136062,
+      "loss": 0.8181,
+      "step": 10150
+    },
+    {
+      "epoch": 0.027958131327839926,
+      "grad_norm": 0.4534152150154114,
+      "learning_rate": 0.00019965040295145,
+      "loss": 0.8093,
+      "step": 10200
+    },
+    {
+      "epoch": 0.02809518099121169,
+      "grad_norm": 0.4695248603820801,
+      "learning_rate": 0.00019964679169788986,
+      "loss": 0.8105,
+      "step": 10250
+    },
+    {
+      "epoch": 0.028232230654583454,
+      "grad_norm": 0.49516522884368896,
+      "learning_rate": 0.00019964316192135152,
+      "loss": 0.8152,
+      "step": 10300
+    },
+    {
+      "epoch": 0.028369280317955218,
+      "grad_norm": 0.4861946105957031,
+      "learning_rate": 0.00019963951362250967,
+      "loss": 0.8267,
+      "step": 10350
+    },
+    {
+      "epoch": 0.028506329981326982,
+      "grad_norm": 0.46852409839630127,
+      "learning_rate": 0.0001996358468020425,
+      "loss": 0.8187,
+      "step": 10400
+    },
+    {
+      "epoch": 0.02864337964469875,
+      "grad_norm": 0.4557548463344574,
+      "learning_rate": 0.00019963216146063158,
+      "loss": 0.808,
+      "step": 10450
+    },
+    {
+      "epoch": 0.028780429308070513,
+      "grad_norm": 0.4835500121116638,
+      "learning_rate": 0.00019962845759896207,
+      "loss": 0.8142,
+      "step": 10500
+    },
+    {
+      "epoch": 0.028917478971442277,
+      "grad_norm": 0.48590588569641113,
+      "learning_rate": 0.00019962473521772234,
+      "loss": 0.8251,
+      "step": 10550
+    },
+    {
+      "epoch": 0.02905452863481404,
+      "grad_norm": 0.46996089816093445,
+      "learning_rate": 0.00019962099431760442,
+      "loss": 0.8209,
+      "step": 10600
+    },
+    {
+      "epoch": 0.029191578298185805,
+      "grad_norm": 0.46507537364959717,
+      "learning_rate": 0.00019961723489930365,
+      "loss": 0.8124,
+      "step": 10650
+    },
+    {
+      "epoch": 0.02932862796155757,
+      "grad_norm": 0.48764947056770325,
+      "learning_rate": 0.00019961345696351888,
+      "loss": 0.8221,
+      "step": 10700
+    },
+    {
+      "epoch": 0.029465677624929333,
+      "grad_norm": 0.49930909276008606,
+      "learning_rate": 0.00019960966051095234,
+      "loss": 0.8181,
+      "step": 10750
+    },
+    {
+      "epoch": 0.029602727288301097,
+      "grad_norm": 0.5145189166069031,
+      "learning_rate": 0.00019960584554230978,
+      "loss": 0.8195,
+      "step": 10800
+    },
+    {
+      "epoch": 0.02973977695167286,
+      "grad_norm": 0.46932724118232727,
+      "learning_rate": 0.00019960201205830033,
+      "loss": 0.8208,
+      "step": 10850
+    },
+    {
+      "epoch": 0.02987682661504463,
+      "grad_norm": 0.48645126819610596,
+      "learning_rate": 0.00019959816005963657,
+      "loss": 0.8142,
+      "step": 10900
+    },
+    {
+      "epoch": 0.030013876278416392,
+      "grad_norm": 0.4872402250766754,
+      "learning_rate": 0.00019959428954703453,
+      "loss": 0.8058,
+      "step": 10950
+    },
+    {
+      "epoch": 0.030150925941788156,
+      "grad_norm": 0.49168017506599426,
+      "learning_rate": 0.00019959040052121375,
+      "loss": 0.8133,
+      "step": 11000
+    },
+    {
+      "epoch": 0.03028797560515992,
+      "grad_norm": 0.5038509964942932,
+      "learning_rate": 0.00019958649298289707,
+      "loss": 0.8218,
+      "step": 11050
+    },
+    {
+      "epoch": 0.030425025268531684,
+      "grad_norm": 0.4773401618003845,
+      "learning_rate": 0.00019958256693281088,
+      "loss": 0.8083,
+      "step": 11100
+    },
+    {
+      "epoch": 0.03056207493190345,
+      "grad_norm": 0.4657188653945923,
+      "learning_rate": 0.00019957862237168498,
+      "loss": 0.8097,
+      "step": 11150
+    },
+    {
+      "epoch": 0.030699124595275212,
+      "grad_norm": 0.45735588669776917,
+      "learning_rate": 0.0001995746593002526,
+      "loss": 0.8123,
+      "step": 11200
+    },
+    {
+      "epoch": 0.030836174258646976,
+      "grad_norm": 0.4805397689342499,
+      "learning_rate": 0.00019957067771925044,
+      "loss": 0.8178,
+      "step": 11250
+    },
+    {
+      "epoch": 0.03097322392201874,
+      "grad_norm": 0.4764343202114105,
+      "learning_rate": 0.00019956667762941862,
+      "loss": 0.8133,
+      "step": 11300
+    },
+    {
+      "epoch": 0.031110273585390504,
+      "grad_norm": 0.4501846432685852,
+      "learning_rate": 0.00019956265903150067,
+      "loss": 0.8114,
+      "step": 11350
+    },
+    {
+      "epoch": 0.03124732324876227,
+      "grad_norm": 0.46945250034332275,
+      "learning_rate": 0.00019955862192624362,
+      "loss": 0.8214,
+      "step": 11400
+    },
+    {
+      "epoch": 0.031384372912134036,
+      "grad_norm": 0.461480975151062,
+      "learning_rate": 0.00019955456631439792,
+      "loss": 0.8163,
+      "step": 11450
+    },
+    {
+      "epoch": 0.031521422575505796,
+      "grad_norm": 0.4994974434375763,
+      "learning_rate": 0.0001995504921967174,
+      "loss": 0.8192,
+      "step": 11500
+    },
+    {
+      "epoch": 0.031658472238877564,
+      "grad_norm": 0.4574553072452545,
+      "learning_rate": 0.00019954639957395947,
+      "loss": 0.8148,
+      "step": 11550
+    },
+    {
+      "epoch": 0.03179552190224933,
+      "grad_norm": 0.45984333753585815,
+      "learning_rate": 0.0001995422884468848,
+      "loss": 0.8111,
+      "step": 11600
+    },
+    {
+      "epoch": 0.03193257156562109,
+      "grad_norm": 0.5009839534759521,
+      "learning_rate": 0.00019953815881625767,
+      "loss": 0.8224,
+      "step": 11650
+    },
+    {
+      "epoch": 0.03206962122899286,
+      "grad_norm": 0.4804627597332001,
+      "learning_rate": 0.00019953401068284568,
+      "loss": 0.81,
+      "step": 11700
+    },
+    {
+      "epoch": 0.03220667089236462,
+      "grad_norm": 0.48593953251838684,
+      "learning_rate": 0.00019952984404741995,
+      "loss": 0.8061,
+      "step": 11750
+    },
+    {
+      "epoch": 0.03234372055573639,
+      "grad_norm": 0.5112223625183105,
+      "learning_rate": 0.00019952565891075494,
+      "loss": 0.8112,
+      "step": 11800
+    },
+    {
+      "epoch": 0.03248077021910815,
+      "grad_norm": 0.4945172071456909,
+      "learning_rate": 0.00019952145527362864,
+      "loss": 0.8113,
+      "step": 11850
+    },
+    {
+      "epoch": 0.032617819882479915,
+      "grad_norm": 0.4510950744152069,
+      "learning_rate": 0.00019951723313682248,
+      "loss": 0.8124,
+      "step": 11900
+    },
+    {
+      "epoch": 0.032754869545851675,
+      "grad_norm": 0.4693338871002197,
+      "learning_rate": 0.00019951299250112122,
+      "loss": 0.8095,
+      "step": 11950
+    },
+    {
+      "epoch": 0.03289191920922344,
+      "grad_norm": 0.5016142725944519,
+      "learning_rate": 0.0001995087333673132,
+      "loss": 0.8156,
+      "step": 12000
+    },
+    {
+      "epoch": 0.0330289688725952,
+      "grad_norm": 0.46924394369125366,
+      "learning_rate": 0.0001995044557361901,
+      "loss": 0.8122,
+      "step": 12050
+    },
+    {
+      "epoch": 0.03316601853596697,
+      "grad_norm": 0.49559521675109863,
+      "learning_rate": 0.00019950015960854716,
+      "loss": 0.8173,
+      "step": 12100
+    },
+    {
+      "epoch": 0.03330306819933874,
+      "grad_norm": 0.4793796241283417,
+      "learning_rate": 0.00019949584498518284,
+      "loss": 0.8241,
+      "step": 12150
+    },
+    {
+      "epoch": 0.0334401178627105,
+      "grad_norm": 0.48246756196022034,
+      "learning_rate": 0.00019949151186689928,
+      "loss": 0.8145,
+      "step": 12200
+    },
+    {
+      "epoch": 0.033577167526082266,
+      "grad_norm": 0.4593210220336914,
+      "learning_rate": 0.00019948716025450187,
+      "loss": 0.8133,
+      "step": 12250
+    },
+    {
+      "epoch": 0.03371421718945403,
+      "grad_norm": 0.49932458996772766,
+      "learning_rate": 0.00019948279014879957,
+      "loss": 0.814,
+      "step": 12300
+    },
+    {
+      "epoch": 0.033851266852825794,
+      "grad_norm": 0.48580220341682434,
+      "learning_rate": 0.00019947840155060467,
+      "loss": 0.8045,
+      "step": 12350
+    },
+    {
+      "epoch": 0.033988316516197555,
+      "grad_norm": 0.47070175409317017,
+      "learning_rate": 0.00019947399446073298,
+      "loss": 0.809,
+      "step": 12400
+    },
+    {
+      "epoch": 0.03412536617956932,
+      "grad_norm": 0.472607284784317,
+      "learning_rate": 0.00019946956888000373,
+      "loss": 0.8041,
+      "step": 12450
+    },
+    {
+      "epoch": 0.03426241584294108,
+      "grad_norm": 0.5041372776031494,
+      "learning_rate": 0.0001994651248092396,
+      "loss": 0.8129,
+      "step": 12500
+    },
+    {
+      "epoch": 0.03439946550631285,
+      "grad_norm": 0.4530751705169678,
+      "learning_rate": 0.00019946066224926657,
+      "loss": 0.8134,
+      "step": 12550
+    },
+    {
+      "epoch": 0.03453651516968462,
+      "grad_norm": 0.512209951877594,
+      "learning_rate": 0.00019945618120091428,
+      "loss": 0.8112,
+      "step": 12600
+    },
+    {
+      "epoch": 0.03467356483305638,
+      "grad_norm": 0.4971703588962555,
+      "learning_rate": 0.00019945168166501568,
+      "loss": 0.8138,
+      "step": 12650
+    },
+    {
+      "epoch": 0.034810614496428145,
+      "grad_norm": 0.46569451689720154,
+      "learning_rate": 0.0001994471636424071,
+      "loss": 0.816,
+      "step": 12700
+    },
+    {
+      "epoch": 0.034947664159799906,
+      "grad_norm": 0.46872833371162415,
+      "learning_rate": 0.00019944262713392848,
+      "loss": 0.813,
+      "step": 12750
+    },
+    {
+      "epoch": 0.03508471382317167,
+      "grad_norm": 0.4578431248664856,
+      "learning_rate": 0.00019943807214042303,
+      "loss": 0.8109,
+      "step": 12800
+    },
+    {
+      "epoch": 0.035221763486543434,
+      "grad_norm": 0.46130719780921936,
+      "learning_rate": 0.00019943349866273746,
+      "loss": 0.8109,
+      "step": 12850
+    },
+    {
+      "epoch": 0.0353588131499152,
+      "grad_norm": 0.5125893354415894,
+      "learning_rate": 0.00019942890670172195,
+      "loss": 0.8066,
+      "step": 12900
+    },
+    {
+      "epoch": 0.03549586281328696,
+      "grad_norm": 0.47778981924057007,
+      "learning_rate": 0.00019942429625823005,
+      "loss": 0.8159,
+      "step": 12950
+    },
+    {
+      "epoch": 0.03563291247665873,
+      "grad_norm": 0.4936039447784424,
+      "learning_rate": 0.0001994196673331188,
+      "loss": 0.7937,
+      "step": 13000
+    },
+    {
+      "epoch": 0.035769962140030497,
+      "grad_norm": 0.4861447513103485,
+      "learning_rate": 0.00019941501992724867,
+      "loss": 0.8097,
+      "step": 13050
+    },
+    {
+      "epoch": 0.03590701180340226,
+      "grad_norm": 0.47155696153640747,
+      "learning_rate": 0.0001994103540414835,
+      "loss": 0.813,
+      "step": 13100
+    },
+    {
+      "epoch": 0.036044061466774024,
+      "grad_norm": 0.4665374755859375,
+      "learning_rate": 0.00019940566967669067,
+      "loss": 0.8024,
+      "step": 13150
+    },
+    {
+      "epoch": 0.036181111130145785,
+      "grad_norm": 0.5167489051818848,
+      "learning_rate": 0.0001994009668337409,
+      "loss": 0.8117,
+      "step": 13200
+    },
+    {
+      "epoch": 0.03631816079351755,
+      "grad_norm": 0.4830285608768463,
+      "learning_rate": 0.00019939624551350838,
+      "loss": 0.8239,
+      "step": 13250
+    },
+    {
+      "epoch": 0.03645521045688931,
+      "grad_norm": 0.46591734886169434,
+      "learning_rate": 0.00019939150571687077,
+      "loss": 0.8075,
+      "step": 13300
+    },
+    {
+      "epoch": 0.03659226012026108,
+      "grad_norm": 0.46838170289993286,
+      "learning_rate": 0.00019938674744470914,
+      "loss": 0.803,
+      "step": 13350
+    },
+    {
+      "epoch": 0.03672930978363284,
+      "grad_norm": 0.4891092777252197,
+      "learning_rate": 0.00019938197069790793,
+      "loss": 0.8098,
+      "step": 13400
+    },
+    {
+      "epoch": 0.03686635944700461,
+      "grad_norm": 0.4904557466506958,
+      "learning_rate": 0.00019937717547735513,
+      "loss": 0.8216,
+      "step": 13450
+    },
+    {
+      "epoch": 0.037003409110376376,
+      "grad_norm": 0.4622911810874939,
+      "learning_rate": 0.00019937236178394207,
+      "loss": 0.8051,
+      "step": 13500
+    },
+    {
+      "epoch": 0.037140458773748136,
+      "grad_norm": 0.48750799894332886,
+      "learning_rate": 0.00019936752961856357,
+      "loss": 0.8044,
+      "step": 13550
+    },
+    {
+      "epoch": 0.037277508437119904,
+      "grad_norm": 0.510019063949585,
+      "learning_rate": 0.00019936267898211786,
+      "loss": 0.8098,
+      "step": 13600
+    },
+    {
+      "epoch": 0.037414558100491664,
+      "grad_norm": 0.49900510907173157,
+      "learning_rate": 0.0001993578098755066,
+      "loss": 0.806,
+      "step": 13650
+    },
+    {
+      "epoch": 0.03755160776386343,
+      "grad_norm": 0.4632837474346161,
+      "learning_rate": 0.0001993529222996349,
+      "loss": 0.8013,
+      "step": 13700
+    },
+    {
+      "epoch": 0.03768865742723519,
+      "grad_norm": 0.4744565188884735,
+      "learning_rate": 0.00019934801625541129,
+      "loss": 0.8082,
+      "step": 13750
+    },
+    {
+      "epoch": 0.03782570709060696,
+      "grad_norm": 0.4971522092819214,
+      "learning_rate": 0.00019934309174374774,
+      "loss": 0.8056,
+      "step": 13800
+    },
+    {
+      "epoch": 0.03796275675397872,
+      "grad_norm": 0.4654538333415985,
+      "learning_rate": 0.00019933814876555963,
+      "loss": 0.8119,
+      "step": 13850
+    },
+    {
+      "epoch": 0.03809980641735049,
+      "grad_norm": 0.47612524032592773,
+      "learning_rate": 0.00019933318732176582,
+      "loss": 0.8196,
+      "step": 13900
+    },
+    {
+      "epoch": 0.038236856080722255,
+      "grad_norm": 0.5004960298538208,
+      "learning_rate": 0.00019932820741328856,
+      "loss": 0.8082,
+      "step": 13950
+    },
+    {
+      "epoch": 0.038373905744094015,
+      "grad_norm": 0.4597856104373932,
+      "learning_rate": 0.00019932320904105354,
+      "loss": 0.7997,
+      "step": 14000
+    },
+    {
+      "epoch": 0.03851095540746578,
+      "grad_norm": 0.46188652515411377,
+      "learning_rate": 0.00019931819220598993,
+      "loss": 0.8051,
+      "step": 14050
+    },
+    {
+      "epoch": 0.03864800507083754,
+      "grad_norm": 0.477914959192276,
+      "learning_rate": 0.00019931315690903026,
+      "loss": 0.8115,
+      "step": 14100
+    },
+    {
+      "epoch": 0.03878505473420931,
+      "grad_norm": 0.511289119720459,
+      "learning_rate": 0.0001993081031511105,
+      "loss": 0.8127,
+      "step": 14150
+    },
+    {
+      "epoch": 0.03892210439758107,
+      "grad_norm": 0.4742279052734375,
+      "learning_rate": 0.0001993030309331701,
+      "loss": 0.8113,
+      "step": 14200
+    },
+    {
+      "epoch": 0.03905915406095284,
+      "grad_norm": 0.5221574306488037,
+      "learning_rate": 0.00019929794025615188,
+      "loss": 0.8122,
+      "step": 14250
+    },
+    {
+      "epoch": 0.0391962037243246,
+      "grad_norm": 0.4802887737751007,
+      "learning_rate": 0.00019929283112100218,
+      "loss": 0.8001,
+      "step": 14300
+    },
+    {
+      "epoch": 0.03933325338769637,
+      "grad_norm": 0.5080459117889404,
+      "learning_rate": 0.00019928770352867074,
+      "loss": 0.811,
+      "step": 14350
+    },
+    {
+      "epoch": 0.039470303051068134,
+      "grad_norm": 0.4794371426105499,
+      "learning_rate": 0.0001992825574801106,
+      "loss": 0.801,
+      "step": 14400
+    },
+    {
+      "epoch": 0.039607352714439895,
+      "grad_norm": 0.49599120020866394,
+      "learning_rate": 0.00019927739297627848,
+      "loss": 0.8099,
+      "step": 14450
+    },
+    {
+      "epoch": 0.03974440237781166,
+      "grad_norm": 0.4951411187648773,
+      "learning_rate": 0.0001992722100181343,
+      "loss": 0.8149,
+      "step": 14500
+    },
+    {
+      "epoch": 0.03988145204118342,
+      "grad_norm": 0.5010212659835815,
+      "learning_rate": 0.00019926700860664148,
+      "loss": 0.8043,
+      "step": 14550
+    },
+    {
+      "epoch": 0.04001850170455519,
+      "grad_norm": 0.4742908477783203,
+      "learning_rate": 0.00019926178874276698,
+      "loss": 0.8116,
+      "step": 14600
+    },
+    {
+      "epoch": 0.04015555136792695,
+      "grad_norm": 0.5191108584403992,
+      "learning_rate": 0.00019925655042748102,
+      "loss": 0.8024,
+      "step": 14650
+    },
+    {
+      "epoch": 0.04029260103129872,
+      "grad_norm": 0.50883948802948,
+      "learning_rate": 0.00019925129366175738,
+      "loss": 0.7936,
+      "step": 14700
+    },
+    {
+      "epoch": 0.04042965069467048,
+      "grad_norm": 0.47401902079582214,
+      "learning_rate": 0.0001992460184465732,
+      "loss": 0.8082,
+      "step": 14750
+    },
+    {
+      "epoch": 0.040566700358042246,
+      "grad_norm": 0.4578768014907837,
+      "learning_rate": 0.00019924072478290906,
+      "loss": 0.8068,
+      "step": 14800
+    },
+    {
+      "epoch": 0.04070375002141401,
+      "grad_norm": 0.4548453092575073,
+      "learning_rate": 0.00019923541267174907,
+      "loss": 0.8083,
+      "step": 14850
+    },
+    {
+      "epoch": 0.040840799684785774,
+      "grad_norm": 0.45983466506004333,
+      "learning_rate": 0.0001992300821140805,
+      "loss": 0.8065,
+      "step": 14900
+    },
+    {
+      "epoch": 0.04097784934815754,
+      "grad_norm": 0.48265522718429565,
+      "learning_rate": 0.0001992247331108944,
+      "loss": 0.8122,
+      "step": 14950
+    },
+    {
+      "epoch": 0.0411148990115293,
+      "grad_norm": 0.4878624975681305,
+      "learning_rate": 0.000199219365663185,
+      "loss": 0.8101,
+      "step": 15000
+    },
+    {
+      "epoch": 0.04125194867490107,
+      "grad_norm": 0.5351042151451111,
+      "learning_rate": 0.00019921397977195002,
+      "loss": 0.8092,
+      "step": 15050
+    },
+    {
+      "epoch": 0.04138899833827283,
+      "grad_norm": 0.47971010208129883,
+      "learning_rate": 0.00019920857543819068,
+      "loss": 0.8111,
+      "step": 15100
+    },
+    {
+      "epoch": 0.0415260480016446,
+      "grad_norm": 0.4985937774181366,
+      "learning_rate": 0.00019920315266291154,
+      "loss": 0.8131,
+      "step": 15150
+    },
+    {
+      "epoch": 0.04166309766501636,
+      "grad_norm": 0.4487249255180359,
+      "learning_rate": 0.0001991977114471206,
+      "loss": 0.8171,
+      "step": 15200
+    },
+    {
+      "epoch": 0.041800147328388125,
+      "grad_norm": 0.4501367211341858,
+      "learning_rate": 0.00019919225179182933,
+      "loss": 0.8041,
+      "step": 15250
+    },
+    {
+      "epoch": 0.041937196991759886,
+      "grad_norm": 0.4651374816894531,
+      "learning_rate": 0.0001991867736980526,
+      "loss": 0.7995,
+      "step": 15300
+    },
+    {
+      "epoch": 0.04207424665513165,
+      "grad_norm": 0.5046921372413635,
+      "learning_rate": 0.00019918127716680873,
+      "loss": 0.8076,
+      "step": 15350
+    },
+    {
+      "epoch": 0.04221129631850342,
+      "grad_norm": 0.4766380488872528,
+      "learning_rate": 0.00019917576219911942,
+      "loss": 0.808,
+      "step": 15400
+    },
+    {
+      "epoch": 0.04234834598187518,
+      "grad_norm": 0.5033254623413086,
+      "learning_rate": 0.00019917022879600987,
+      "loss": 0.8092,
+      "step": 15450
+    },
+    {
+      "epoch": 0.04248539564524695,
+      "grad_norm": 0.5111039280891418,
+      "learning_rate": 0.0001991646769585086,
+      "loss": 0.8028,
+      "step": 15500
+    },
+    {
+      "epoch": 0.04262244530861871,
+      "grad_norm": 0.5142612457275391,
+      "learning_rate": 0.00019915910668764767,
+      "loss": 0.8091,
+      "step": 15550
+    },
+    {
+      "epoch": 0.042759494971990476,
+      "grad_norm": 0.5041571259498596,
+      "learning_rate": 0.00019915351798446254,
+      "loss": 0.8085,
+      "step": 15600
+    },
+    {
+      "epoch": 0.04289654463536224,
+      "grad_norm": 0.5112966299057007,
+      "learning_rate": 0.000199147910849992,
+      "loss": 0.8045,
+      "step": 15650
+    },
+    {
+      "epoch": 0.043033594298734004,
+      "grad_norm": 0.5248098373413086,
+      "learning_rate": 0.0001991422852852784,
+      "loss": 0.8012,
+      "step": 15700
+    },
+    {
+      "epoch": 0.043170643962105765,
+      "grad_norm": 0.47679683566093445,
+      "learning_rate": 0.00019913664129136743,
+      "loss": 0.8116,
+      "step": 15750
+    },
+    {
+      "epoch": 0.04330769362547753,
+      "grad_norm": 0.46716034412384033,
+      "learning_rate": 0.00019913097886930823,
+      "loss": 0.7983,
+      "step": 15800
+    },
+    {
+      "epoch": 0.0434447432888493,
+      "grad_norm": 0.5045149326324463,
+      "learning_rate": 0.00019912529802015337,
+      "loss": 0.8066,
+      "step": 15850
+    },
+    {
+      "epoch": 0.04358179295222106,
+      "grad_norm": 0.44902193546295166,
+      "learning_rate": 0.00019911959874495885,
+      "loss": 0.8005,
+      "step": 15900
+    },
+    {
+      "epoch": 0.04371884261559283,
+      "grad_norm": 0.46237778663635254,
+      "learning_rate": 0.0001991138810447841,
+      "loss": 0.7956,
+      "step": 15950
+    },
+    {
+      "epoch": 0.04385589227896459,
+      "grad_norm": 0.4876990020275116,
+      "learning_rate": 0.0001991081449206919,
+      "loss": 0.8045,
+      "step": 16000
+    },
+    {
+      "epoch": 0.043992941942336355,
+      "grad_norm": 0.4410344362258911,
+      "learning_rate": 0.0001991023903737486,
+      "loss": 0.8022,
+      "step": 16050
+    },
+    {
+      "epoch": 0.044129991605708116,
+      "grad_norm": 0.44994667172431946,
+      "learning_rate": 0.00019909661740502387,
+      "loss": 0.7971,
+      "step": 16100
+    },
+    {
+      "epoch": 0.04426704126907988,
+      "grad_norm": 0.49519991874694824,
+      "learning_rate": 0.00019909082601559077,
+      "loss": 0.8005,
+      "step": 16150
+    },
+    {
+      "epoch": 0.044404090932451644,
+      "grad_norm": 0.48117753863334656,
+      "learning_rate": 0.0001990850162065259,
+      "loss": 0.8029,
+      "step": 16200
+    },
+    {
+      "epoch": 0.04454114059582341,
+      "grad_norm": 0.4871579706668854,
+      "learning_rate": 0.0001990791879789092,
+      "loss": 0.8059,
+      "step": 16250
+    },
+    {
+      "epoch": 0.04467819025919518,
+      "grad_norm": 0.489726722240448,
+      "learning_rate": 0.00019907334133382405,
+      "loss": 0.799,
+      "step": 16300
+    },
+    {
+      "epoch": 0.04481523992256694,
+      "grad_norm": 0.5114454030990601,
+      "learning_rate": 0.00019906747627235728,
+      "loss": 0.8076,
+      "step": 16350
+    },
+    {
+      "epoch": 0.04495228958593871,
+      "grad_norm": 0.4839656949043274,
+      "learning_rate": 0.00019906159279559912,
+      "loss": 0.8107,
+      "step": 16400
+    },
+    {
+      "epoch": 0.04508933924931047,
+      "grad_norm": 0.5057470202445984,
+      "learning_rate": 0.00019905569090464324,
+      "loss": 0.8061,
+      "step": 16450
+    },
+    {
+      "epoch": 0.045226388912682235,
+      "grad_norm": 0.4629577100276947,
+      "learning_rate": 0.0001990497706005867,
+      "loss": 0.8076,
+      "step": 16500
+    },
+    {
+      "epoch": 0.045363438576053995,
+      "grad_norm": 0.5014634132385254,
+      "learning_rate": 0.00019904383188453002,
+      "loss": 0.811,
+      "step": 16550
+    },
+    {
+      "epoch": 0.04550048823942576,
+      "grad_norm": 0.47154685854911804,
+      "learning_rate": 0.00019903787475757708,
+      "loss": 0.7882,
+      "step": 16600
+    },
+    {
+      "epoch": 0.04563753790279752,
+      "grad_norm": 0.4919736981391907,
+      "learning_rate": 0.0001990318992208353,
+      "loss": 0.8136,
+      "step": 16650
+    },
+    {
+      "epoch": 0.04577458756616929,
+      "grad_norm": 0.4639228880405426,
+      "learning_rate": 0.00019902590527541545,
+      "loss": 0.8056,
+      "step": 16700
+    },
+    {
+      "epoch": 0.04591163722954106,
+      "grad_norm": 0.42496591806411743,
+      "learning_rate": 0.00019901989292243165,
+      "loss": 0.8116,
+      "step": 16750
+    },
+    {
+      "epoch": 0.04604868689291282,
+      "grad_norm": 0.4563073217868805,
+      "learning_rate": 0.00019901386216300155,
+      "loss": 0.7984,
+      "step": 16800
+    },
+    {
+      "epoch": 0.046185736556284586,
+      "grad_norm": 0.4917526841163635,
+      "learning_rate": 0.00019900781299824619,
+      "loss": 0.8067,
+      "step": 16850
+    },
+    {
+      "epoch": 0.046322786219656346,
+      "grad_norm": 0.47236648201942444,
+      "learning_rate": 0.00019900174542929005,
+      "loss": 0.7997,
+      "step": 16900
+    },
+    {
+      "epoch": 0.046459835883028114,
+      "grad_norm": 0.4592929780483246,
+      "learning_rate": 0.00019899565945726098,
+      "loss": 0.8128,
+      "step": 16950
+    },
+    {
+      "epoch": 0.046596885546399874,
+      "grad_norm": 0.4753207862377167,
+      "learning_rate": 0.00019898955508329023,
+      "loss": 0.8122,
+      "step": 17000
+    },
+    {
+      "epoch": 0.04673393520977164,
+      "grad_norm": 0.47833189368247986,
+      "learning_rate": 0.00019898343230851265,
+      "loss": 0.799,
+      "step": 17050
+    },
+    {
+      "epoch": 0.0468709848731434,
+      "grad_norm": 0.498756468296051,
+      "learning_rate": 0.00019897729113406624,
+      "loss": 0.8033,
+      "step": 17100
+    },
+    {
+      "epoch": 0.04700803453651517,
+      "grad_norm": 0.5029916167259216,
+      "learning_rate": 0.00019897113156109268,
+      "loss": 0.8033,
+      "step": 17150
+    },
+    {
+      "epoch": 0.04714508419988694,
+      "grad_norm": 0.45970985293388367,
+      "learning_rate": 0.00019896495359073688,
+      "loss": 0.7994,
+      "step": 17200
+    },
+    {
+      "epoch": 0.0472821338632587,
+      "grad_norm": 0.47940370440483093,
+      "learning_rate": 0.00019895875722414724,
+      "loss": 0.8016,
+      "step": 17250
+    },
+    {
+      "epoch": 0.047419183526630465,
+      "grad_norm": 0.46687984466552734,
+      "learning_rate": 0.0001989525424624756,
+      "loss": 0.8115,
+      "step": 17300
+    },
+    {
+      "epoch": 0.047556233190002226,
+      "grad_norm": 0.5174044966697693,
+      "learning_rate": 0.0001989463093068772,
+      "loss": 0.8037,
+      "step": 17350
+    },
+    {
+      "epoch": 0.04769328285337399,
+      "grad_norm": 0.5020077228546143,
+      "learning_rate": 0.0001989400577585107,
+      "loss": 0.7974,
+      "step": 17400
+    },
+    {
+      "epoch": 0.047830332516745754,
+      "grad_norm": 0.504945695400238,
+      "learning_rate": 0.00019893378781853818,
+      "loss": 0.8093,
+      "step": 17450
+    },
+    {
+      "epoch": 0.04796738218011752,
+      "grad_norm": 0.4759848415851593,
+      "learning_rate": 0.00019892749948812507,
+      "loss": 0.8098,
+      "step": 17500
+    },
+    {
+      "epoch": 0.04810443184348928,
+      "grad_norm": 0.4772614538669586,
+      "learning_rate": 0.0001989211927684404,
+      "loss": 0.8054,
+      "step": 17550
+    },
+    {
+      "epoch": 0.04824148150686105,
+      "grad_norm": 0.47073760628700256,
+      "learning_rate": 0.00019891486766065644,
+      "loss": 0.8077,
+      "step": 17600
+    },
+    {
+      "epoch": 0.048378531170232816,
+      "grad_norm": 0.47336697578430176,
+      "learning_rate": 0.00019890852416594893,
+      "loss": 0.8043,
+      "step": 17650
+    },
+    {
+      "epoch": 0.04851558083360458,
+      "grad_norm": 0.45777302980422974,
+      "learning_rate": 0.00019890216228549704,
+      "loss": 0.8008,
+      "step": 17700
+    },
+    {
+      "epoch": 0.048652630496976344,
+      "grad_norm": 0.4561302661895752,
+      "learning_rate": 0.0001988957820204834,
+      "loss": 0.8044,
+      "step": 17750
+    },
+    {
+      "epoch": 0.048789680160348105,
+      "grad_norm": 0.4956313669681549,
+      "learning_rate": 0.000198889383372094,
+      "loss": 0.8002,
+      "step": 17800
+    },
+    {
+      "epoch": 0.04892672982371987,
+      "grad_norm": 0.485945463180542,
+      "learning_rate": 0.00019888296634151822,
+      "loss": 0.7988,
+      "step": 17850
+    },
+    {
+      "epoch": 0.04906377948709163,
+      "grad_norm": 0.46995940804481506,
+      "learning_rate": 0.00019887653092994894,
+      "loss": 0.7973,
+      "step": 17900
+    },
+    {
+      "epoch": 0.0492008291504634,
+      "grad_norm": 0.46722331643104553,
+      "learning_rate": 0.00019887007713858239,
+      "loss": 0.8108,
+      "step": 17950
+    },
+    {
+      "epoch": 0.04933787881383516,
+      "grad_norm": 0.47116202116012573,
+      "learning_rate": 0.00019886360496861825,
+      "loss": 0.8062,
+      "step": 18000
+    },
+    {
+      "epoch": 0.04947492847720693,
+      "grad_norm": 0.48090749979019165,
+      "learning_rate": 0.00019885711442125961,
+      "loss": 0.8006,
+      "step": 18050
+    },
+    {
+      "epoch": 0.049611978140578696,
+      "grad_norm": 0.46765968203544617,
+      "learning_rate": 0.00019885060549771299,
+      "loss": 0.7977,
+      "step": 18100
+    },
+    {
+      "epoch": 0.049749027803950456,
+      "grad_norm": 0.482744961977005,
+      "learning_rate": 0.00019884407819918828,
+      "loss": 0.8017,
+      "step": 18150
+    },
+    {
+      "epoch": 0.049886077467322223,
+      "grad_norm": 0.5016223192214966,
+      "learning_rate": 0.00019883753252689885,
+      "loss": 0.8049,
+      "step": 18200
+    },
+    {
+      "epoch": 0.050023127130693984,
+      "grad_norm": 0.4829504191875458,
+      "learning_rate": 0.0001988309684820614,
+      "loss": 0.8061,
+      "step": 18250
+    },
+    {
+      "epoch": 0.05016017679406575,
+      "grad_norm": 0.500898003578186,
+      "learning_rate": 0.00019882438606589616,
+      "loss": 0.8018,
+      "step": 18300
+    },
+    {
+      "epoch": 0.05029722645743751,
+      "grad_norm": 0.4587385058403015,
+      "learning_rate": 0.0001988177852796267,
+      "loss": 0.8035,
+      "step": 18350
+    },
+    {
+      "epoch": 0.05043427612080928,
+      "grad_norm": 0.49064910411834717,
+      "learning_rate": 0.00019881116612447994,
+      "loss": 0.8069,
+      "step": 18400
+    },
+    {
+      "epoch": 0.05057132578418104,
+      "grad_norm": 0.476414293050766,
+      "learning_rate": 0.00019880452860168636,
+      "loss": 0.8059,
+      "step": 18450
+    },
+    {
+      "epoch": 0.05070837544755281,
+      "grad_norm": 0.49723196029663086,
+      "learning_rate": 0.0001987978727124798,
+      "loss": 0.7962,
+      "step": 18500
+    },
+    {
+      "epoch": 0.050845425110924575,
+      "grad_norm": 0.47163042426109314,
+      "learning_rate": 0.00019879119845809745,
+      "loss": 0.7943,
+      "step": 18550
+    },
+    {
+      "epoch": 0.050982474774296335,
+      "grad_norm": 0.4651722311973572,
+      "learning_rate": 0.00019878450583978,
+      "loss": 0.7956,
+      "step": 18600
+    },
+    {
+      "epoch": 0.0511195244376681,
+      "grad_norm": 0.44886264204978943,
+      "learning_rate": 0.0001987777948587715,
+      "loss": 0.8028,
+      "step": 18650
+    },
+    {
+      "epoch": 0.05125657410103986,
+      "grad_norm": 0.4783935248851776,
+      "learning_rate": 0.00019877106551631938,
+      "loss": 0.8072,
+      "step": 18700
+    },
+    {
+      "epoch": 0.05139362376441163,
+      "grad_norm": 0.5192398428916931,
+      "learning_rate": 0.0001987643178136746,
+      "loss": 0.8029,
+      "step": 18750
+    },
+    {
+      "epoch": 0.05153067342778339,
+      "grad_norm": 0.5124772787094116,
+      "learning_rate": 0.00019875755175209148,
+      "loss": 0.7977,
+      "step": 18800
+    },
+    {
+      "epoch": 0.05166772309115516,
+      "grad_norm": 0.46866002678871155,
+      "learning_rate": 0.0001987507673328277,
+      "loss": 0.8019,
+      "step": 18850
+    },
+    {
+      "epoch": 0.05180477275452692,
+      "grad_norm": 0.496614933013916,
+      "learning_rate": 0.0001987439645571444,
+      "loss": 0.8074,
+      "step": 18900
+    },
+    {
+      "epoch": 0.051941822417898686,
+      "grad_norm": 0.46783682703971863,
+      "learning_rate": 0.0001987371434263061,
+      "loss": 0.7904,
+      "step": 18950
+    },
+    {
+      "epoch": 0.05207887208127045,
+      "grad_norm": 0.5020947456359863,
+      "learning_rate": 0.0001987303039415808,
+      "loss": 0.7959,
+      "step": 19000
+    },
+    {
+      "epoch": 0.052215921744642214,
+      "grad_norm": 0.463768869638443,
+      "learning_rate": 0.0001987234461042398,
+      "loss": 0.8069,
+      "step": 19050
+    },
+    {
+      "epoch": 0.05235297140801398,
+      "grad_norm": 0.4553416967391968,
+      "learning_rate": 0.00019871656991555793,
+      "loss": 0.7983,
+      "step": 19100
+    },
+    {
+      "epoch": 0.05249002107138574,
+      "grad_norm": 0.4670025706291199,
+      "learning_rate": 0.0001987096753768134,
+      "loss": 0.7906,
+      "step": 19150
+    },
+    {
+      "epoch": 0.05262707073475751,
+      "grad_norm": 0.4533403813838959,
+      "learning_rate": 0.00019870276248928776,
+      "loss": 0.8072,
+      "step": 19200
+    },
+    {
+      "epoch": 0.05276412039812927,
+      "grad_norm": 0.5267584323883057,
+      "learning_rate": 0.00019869583125426606,
+      "loss": 0.8001,
+      "step": 19250
+    },
+    {
+      "epoch": 0.05290117006150104,
+      "grad_norm": 0.4778296947479248,
+      "learning_rate": 0.0001986888816730367,
+      "loss": 0.8038,
+      "step": 19300
+    },
+    {
+      "epoch": 0.0530382197248728,
+      "grad_norm": 0.48750537633895874,
+      "learning_rate": 0.00019868191374689152,
+      "loss": 0.7952,
+      "step": 19350
+    },
+    {
+      "epoch": 0.053175269388244566,
+      "grad_norm": 0.4721476435661316,
+      "learning_rate": 0.00019867492747712573,
+      "loss": 0.8035,
+      "step": 19400
+    },
+    {
+      "epoch": 0.053312319051616326,
+      "grad_norm": 0.471542626619339,
+      "learning_rate": 0.00019866792286503802,
+      "loss": 0.7943,
+      "step": 19450
+    },
+    {
+      "epoch": 0.053449368714988094,
+      "grad_norm": 0.4602237343788147,
+      "learning_rate": 0.00019866089991193045,
+      "loss": 0.7999,
+      "step": 19500
+    },
+    {
+      "epoch": 0.05358641837835986,
+      "grad_norm": 0.4979691505432129,
+      "learning_rate": 0.0001986538586191085,
+      "loss": 0.8028,
+      "step": 19550
+    },
+    {
+      "epoch": 0.05372346804173162,
+      "grad_norm": 0.4573438763618469,
+      "learning_rate": 0.000198646798987881,
+      "loss": 0.801,
+      "step": 19600
+    },
+    {
+      "epoch": 0.05386051770510339,
+      "grad_norm": 0.4607889950275421,
+      "learning_rate": 0.0001986397210195603,
+      "loss": 0.7963,
+      "step": 19650
+    },
+    {
+      "epoch": 0.05399756736847515,
+      "grad_norm": 0.489907443523407,
+      "learning_rate": 0.00019863262471546205,
+      "loss": 0.8003,
+      "step": 19700
+    },
+    {
+      "epoch": 0.05413461703184692,
+      "grad_norm": 0.47025689482688904,
+      "learning_rate": 0.0001986255100769054,
+      "loss": 0.791,
+      "step": 19750
+    },
+    {
+      "epoch": 0.05427166669521868,
+      "grad_norm": 0.4787987172603607,
+      "learning_rate": 0.00019861837710521282,
+      "loss": 0.7966,
+      "step": 19800
+    },
+    {
+      "epoch": 0.054408716358590445,
+      "grad_norm": 0.4853837788105011,
+      "learning_rate": 0.00019861122580171027,
+      "loss": 0.8056,
+      "step": 19850
+    },
+    {
+      "epoch": 0.054545766021962205,
+      "grad_norm": 0.5139968991279602,
+      "learning_rate": 0.00019860405616772706,
+      "loss": 0.7907,
+      "step": 19900
+    },
+    {
+      "epoch": 0.05468281568533397,
+      "grad_norm": 0.5047359466552734,
+      "learning_rate": 0.00019859686820459594,
+      "loss": 0.7988,
+      "step": 19950
+    },
+    {
+      "epoch": 0.05481986534870574,
+      "grad_norm": 0.48581087589263916,
+      "learning_rate": 0.000198589661913653,
+      "loss": 0.8095,
+      "step": 20000
+    },
+    {
+      "epoch": 0.0549569150120775,
+      "grad_norm": 0.46513304114341736,
+      "learning_rate": 0.0001985824372962379,
+      "loss": 0.8026,
+      "step": 20050
+    },
+    {
+      "epoch": 0.05509396467544927,
+      "grad_norm": 0.46750718355178833,
+      "learning_rate": 0.0001985751943536935,
+      "loss": 0.8037,
+      "step": 20100
+    },
+    {
+      "epoch": 0.05523101433882103,
+      "grad_norm": 0.49538227915763855,
+      "learning_rate": 0.00019856793308736616,
+      "loss": 0.7956,
+      "step": 20150
+    },
+    {
+      "epoch": 0.055368064002192796,
+      "grad_norm": 0.46910130977630615,
+      "learning_rate": 0.00019856065349860576,
+      "loss": 0.7893,
+      "step": 20200
+    },
+    {
+      "epoch": 0.05550511366556456,
+      "grad_norm": 0.4639648199081421,
+      "learning_rate": 0.00019855335558876535,
+      "loss": 0.8017,
+      "step": 20250
+    },
+    {
+      "epoch": 0.055642163328936324,
+      "grad_norm": 0.43415069580078125,
+      "learning_rate": 0.00019854603935920157,
+      "loss": 0.7954,
+      "step": 20300
+    },
+    {
+      "epoch": 0.055779212992308085,
+      "grad_norm": 0.44691941142082214,
+      "learning_rate": 0.00019853870481127442,
+      "loss": 0.8008,
+      "step": 20350
+    },
+    {
+      "epoch": 0.05591626265567985,
+      "grad_norm": 0.4980349838733673,
+      "learning_rate": 0.00019853135194634726,
+      "loss": 0.8039,
+      "step": 20400
+    },
+    {
+      "epoch": 0.05605331231905162,
+      "grad_norm": 0.4900018870830536,
+      "learning_rate": 0.0001985239807657869,
+      "loss": 0.797,
+      "step": 20450
+    },
+    {
+      "epoch": 0.05619036198242338,
+      "grad_norm": 0.48318371176719666,
+      "learning_rate": 0.00019851659127096357,
+      "loss": 0.8072,
+      "step": 20500
+    },
+    {
+      "epoch": 0.05632741164579515,
+      "grad_norm": 0.47676709294319153,
+      "learning_rate": 0.00019850918346325084,
+      "loss": 0.7995,
+      "step": 20550
+    },
+    {
+      "epoch": 0.05646446130916691,
+      "grad_norm": 0.4761766493320465,
+      "learning_rate": 0.00019850175734402572,
+      "loss": 0.8024,
+      "step": 20600
+    },
+    {
+      "epoch": 0.056601510972538675,
+      "grad_norm": 0.4747091233730316,
+      "learning_rate": 0.00019849431291466864,
+      "loss": 0.8041,
+      "step": 20650
+    },
+    {
+      "epoch": 0.056738560635910436,
+      "grad_norm": 0.49748796224594116,
+      "learning_rate": 0.00019848685017656342,
+      "loss": 0.8072,
+      "step": 20700
+    },
+    {
+      "epoch": 0.0568756102992822,
+      "grad_norm": 0.4664687216281891,
+      "learning_rate": 0.00019847936913109727,
+      "loss": 0.8037,
+      "step": 20750
+    },
+    {
+      "epoch": 0.057012659962653964,
+      "grad_norm": 0.4881364703178406,
+      "learning_rate": 0.00019847186977966082,
+      "loss": 0.7898,
+      "step": 20800
+    },
+    {
+      "epoch": 0.05714970962602573,
+      "grad_norm": 0.46295851469039917,
+      "learning_rate": 0.00019846435212364806,
+      "loss": 0.8069,
+      "step": 20850
+    },
+    {
+      "epoch": 0.0572867592893975,
+      "grad_norm": 0.48034432530403137,
+      "learning_rate": 0.00019845681616445648,
+      "loss": 0.7978,
+      "step": 20900
+    },
+    {
+      "epoch": 0.05742380895276926,
+      "grad_norm": 0.4847252666950226,
+      "learning_rate": 0.00019844926190348692,
+      "loss": 0.7984,
+      "step": 20950
+    },
+    {
+      "epoch": 0.05756085861614103,
+      "grad_norm": 0.4705177843570709,
+      "learning_rate": 0.00019844168934214353,
+      "loss": 0.8,
+      "step": 21000
+    },
+    {
+      "epoch": 0.05769790827951279,
+      "grad_norm": 0.4897174835205078,
+      "learning_rate": 0.00019843409848183403,
+      "loss": 0.7983,
+      "step": 21050
+    },
+    {
+      "epoch": 0.057834957942884554,
+      "grad_norm": 0.46888941526412964,
+      "learning_rate": 0.0001984264893239694,
+      "loss": 0.8026,
+      "step": 21100
+    },
+    {
+      "epoch": 0.057972007606256315,
+      "grad_norm": 0.4739408493041992,
+      "learning_rate": 0.0001984188618699641,
+      "loss": 0.7953,
+      "step": 21150
+    },
+    {
+      "epoch": 0.05810905726962808,
+      "grad_norm": 0.45465242862701416,
+      "learning_rate": 0.000198411216121236,
+      "loss": 0.7972,
+      "step": 21200
+    },
+    {
+      "epoch": 0.05824610693299984,
+      "grad_norm": 0.4990096688270569,
+      "learning_rate": 0.0001984035520792063,
+      "loss": 0.8089,
+      "step": 21250
+    },
+    {
+      "epoch": 0.05838315659637161,
+      "grad_norm": 0.48028653860092163,
+      "learning_rate": 0.00019839586974529963,
+      "loss": 0.798,
+      "step": 21300
+    },
+    {
+      "epoch": 0.05852020625974338,
+      "grad_norm": 0.48342519998550415,
+      "learning_rate": 0.00019838816912094405,
+      "loss": 0.8078,
+      "step": 21350
+    },
+    {
+      "epoch": 0.05865725592311514,
+      "grad_norm": 0.5085064768791199,
+      "learning_rate": 0.00019838045020757105,
+      "loss": 0.8079,
+      "step": 21400
+    },
+    {
+      "epoch": 0.058794305586486906,
+      "grad_norm": 0.47484463453292847,
+      "learning_rate": 0.00019837271300661538,
+      "loss": 0.7995,
+      "step": 21450
+    },
+    {
+      "epoch": 0.058931355249858666,
+      "grad_norm": 0.4731627404689789,
+      "learning_rate": 0.00019836495751951536,
+      "loss": 0.8001,
+      "step": 21500
+    },
+    {
+      "epoch": 0.059068404913230434,
+      "grad_norm": 0.4548656940460205,
+      "learning_rate": 0.00019835718374771257,
+      "loss": 0.8092,
+      "step": 21550
+    },
+    {
+      "epoch": 0.059205454576602194,
+      "grad_norm": 0.5077337026596069,
+      "learning_rate": 0.00019834939169265206,
+      "loss": 0.8032,
+      "step": 21600
+    },
+    {
+      "epoch": 0.05934250423997396,
+      "grad_norm": 0.46409091353416443,
+      "learning_rate": 0.00019834158135578233,
+      "loss": 0.8005,
+      "step": 21650
+    },
+    {
+      "epoch": 0.05947955390334572,
+      "grad_norm": 0.4751618206501007,
+      "learning_rate": 0.00019833375273855514,
+      "loss": 0.7954,
+      "step": 21700
+    },
+    {
+      "epoch": 0.05961660356671749,
+      "grad_norm": 0.4541051983833313,
+      "learning_rate": 0.00019832590584242574,
+      "loss": 0.8003,
+      "step": 21750
+    },
+    {
+      "epoch": 0.05975365323008926,
+      "grad_norm": 0.5068663954734802,
+      "learning_rate": 0.0001983180406688528,
+      "loss": 0.7951,
+      "step": 21800
+    },
+    {
+      "epoch": 0.05989070289346102,
+      "grad_norm": 0.4474228322505951,
+      "learning_rate": 0.00019831015721929825,
+      "loss": 0.7793,
+      "step": 21850
+    },
+    {
+      "epoch": 0.060027752556832785,
+      "grad_norm": 0.4839327037334442,
+      "learning_rate": 0.00019830225549522762,
+      "loss": 0.8067,
+      "step": 21900
+    },
+    {
+      "epoch": 0.060164802220204545,
+      "grad_norm": 0.47900480031967163,
+      "learning_rate": 0.0001982943354981097,
+      "loss": 0.7976,
+      "step": 21950
+    },
+    {
+      "epoch": 0.06030185188357631,
+      "grad_norm": 0.49303489923477173,
+      "learning_rate": 0.0001982863972294167,
+      "loss": 0.8012,
+      "step": 22000
+    },
+    {
+      "epoch": 0.06043890154694807,
+      "grad_norm": 0.4726907014846802,
+      "learning_rate": 0.0001982784406906242,
+      "loss": 0.799,
+      "step": 22050
+    },
+    {
+      "epoch": 0.06057595121031984,
+      "grad_norm": 0.4976119101047516,
+      "learning_rate": 0.00019827046588321133,
+      "loss": 0.7957,
+      "step": 22100
+    },
+    {
+      "epoch": 0.0607130008736916,
+      "grad_norm": 0.4890015423297882,
+      "learning_rate": 0.00019826247280866038,
+      "loss": 0.7927,
+      "step": 22150
+    },
+    {
+      "epoch": 0.06085005053706337,
+      "grad_norm": 0.4718722701072693,
+      "learning_rate": 0.00019825446146845717,
+      "loss": 0.7993,
+      "step": 22200
+    },
+    {
+      "epoch": 0.06098710020043513,
+      "grad_norm": 0.4812386631965637,
+      "learning_rate": 0.00019824643186409094,
+      "loss": 0.7984,
+      "step": 22250
+    },
+    {
+      "epoch": 0.0611241498638069,
+      "grad_norm": 0.5093216896057129,
+      "learning_rate": 0.0001982383839970543,
+      "loss": 0.794,
+      "step": 22300
+    },
+    {
+      "epoch": 0.061261199527178664,
+      "grad_norm": 0.45676031708717346,
+      "learning_rate": 0.00019823031786884315,
+      "loss": 0.8003,
+      "step": 22350
+    },
+    {
+      "epoch": 0.061398249190550425,
+      "grad_norm": 0.47176119685173035,
+      "learning_rate": 0.00019822223348095697,
+      "loss": 0.7922,
+      "step": 22400
+    },
+    {
+      "epoch": 0.06153529885392219,
+      "grad_norm": 0.485170841217041,
+      "learning_rate": 0.00019821413083489847,
+      "loss": 0.8026,
+      "step": 22450
+    },
+    {
+      "epoch": 0.06167234851729395,
+      "grad_norm": 0.5021151900291443,
+      "learning_rate": 0.00019820600993217385,
+      "loss": 0.7926,
+      "step": 22500
+    },
+    {
+      "epoch": 0.06180939818066572,
+      "grad_norm": 0.4861471951007843,
+      "learning_rate": 0.00019819787077429268,
+      "loss": 0.8004,
+      "step": 22550
+    },
+    {
+      "epoch": 0.06194644784403748,
+      "grad_norm": 0.48229849338531494,
+      "learning_rate": 0.00019818971336276787,
+      "loss": 0.7963,
+      "step": 22600
+    },
+    {
+      "epoch": 0.06208349750740925,
+      "grad_norm": 0.45228251814842224,
+      "learning_rate": 0.00019818153769911586,
+      "loss": 0.8017,
+      "step": 22650
+    },
+    {
+      "epoch": 0.06222054717078101,
+      "grad_norm": 0.48165270686149597,
+      "learning_rate": 0.00019817334378485635,
+      "loss": 0.7904,
+      "step": 22700
+    },
+    {
+      "epoch": 0.062357596834152776,
+      "grad_norm": 0.46487587690353394,
+      "learning_rate": 0.00019816513162151242,
+      "loss": 0.8009,
+      "step": 22750
+    },
+    {
+      "epoch": 0.06249464649752454,
+      "grad_norm": 0.4883232116699219,
+      "learning_rate": 0.00019815690121061067,
+      "loss": 0.7981,
+      "step": 22800
+    },
+    {
+      "epoch": 0.06263169616089631,
+      "grad_norm": 0.4995619058609009,
+      "learning_rate": 0.00019814865255368105,
+      "loss": 0.7978,
+      "step": 22850
+    },
+    {
+      "epoch": 0.06276874582426807,
+      "grad_norm": 0.4523701071739197,
+      "learning_rate": 0.0001981403856522568,
+      "loss": 0.795,
+      "step": 22900
+    },
+    {
+      "epoch": 0.06290579548763983,
+      "grad_norm": 0.5012710094451904,
+      "learning_rate": 0.0001981321005078746,
+      "loss": 0.7942,
+      "step": 22950
+    },
+    {
+      "epoch": 0.06304284515101159,
+      "grad_norm": 0.47123944759368896,
+      "learning_rate": 0.00019812379712207462,
+      "loss": 0.8057,
+      "step": 23000
+    },
+    {
+      "epoch": 0.06317989481438337,
+      "grad_norm": 0.4805154800415039,
+      "learning_rate": 0.00019811547549640035,
+      "loss": 0.8059,
+      "step": 23050
+    },
+    {
+      "epoch": 0.06331694447775513,
+      "grad_norm": 0.4887898564338684,
+      "learning_rate": 0.0001981071356323986,
+      "loss": 0.7988,
+      "step": 23100
+    },
+    {
+      "epoch": 0.06345399414112689,
+      "grad_norm": 0.4898166358470917,
+      "learning_rate": 0.0001980987775316197,
+      "loss": 0.7993,
+      "step": 23150
+    },
+    {
+      "epoch": 0.06359104380449866,
+      "grad_norm": 0.4890003800392151,
+      "learning_rate": 0.00019809040119561728,
+      "loss": 0.7926,
+      "step": 23200
+    },
+    {
+      "epoch": 0.06372809346787042,
+      "grad_norm": 0.47351571917533875,
+      "learning_rate": 0.00019808200662594838,
+      "loss": 0.7928,
+      "step": 23250
+    },
+    {
+      "epoch": 0.06386514313124218,
+      "grad_norm": 0.4690215587615967,
+      "learning_rate": 0.00019807359382417343,
+      "loss": 0.7894,
+      "step": 23300
+    },
+    {
+      "epoch": 0.06400219279461394,
+      "grad_norm": 0.4408493638038635,
+      "learning_rate": 0.00019806516279185628,
+      "loss": 0.8,
+      "step": 23350
+    },
+    {
+      "epoch": 0.06413924245798572,
+      "grad_norm": 0.4749327003955841,
+      "learning_rate": 0.00019805671353056412,
+      "loss": 0.7966,
+      "step": 23400
+    },
+    {
+      "epoch": 0.06427629212135748,
+      "grad_norm": 0.4740825593471527,
+      "learning_rate": 0.00019804824604186758,
+      "loss": 0.7984,
+      "step": 23450
+    },
+    {
+      "epoch": 0.06441334178472924,
+      "grad_norm": 0.45539775490760803,
+      "learning_rate": 0.00019803976032734064,
+      "loss": 0.8011,
+      "step": 23500
+    },
+    {
+      "epoch": 0.064550391448101,
+      "grad_norm": 0.47875672578811646,
+      "learning_rate": 0.00019803125638856063,
+      "loss": 0.794,
+      "step": 23550
+    },
+    {
+      "epoch": 0.06468744111147277,
+      "grad_norm": 0.4581809639930725,
+      "learning_rate": 0.00019802273422710843,
+      "loss": 0.7968,
+      "step": 23600
+    },
+    {
+      "epoch": 0.06482449077484453,
+      "grad_norm": 0.502196729183197,
+      "learning_rate": 0.00019801419384456805,
+      "loss": 0.7994,
+      "step": 23650
+    },
+    {
+      "epoch": 0.0649615404382163,
+      "grad_norm": 0.4565235674381256,
+      "learning_rate": 0.00019800563524252716,
+      "loss": 0.8012,
+      "step": 23700
+    },
+    {
+      "epoch": 0.06509859010158807,
+      "grad_norm": 0.49176493287086487,
+      "learning_rate": 0.00019799705842257659,
+      "loss": 0.7974,
+      "step": 23750
+    },
+    {
+      "epoch": 0.06523563976495983,
+      "grad_norm": 0.47653689980506897,
+      "learning_rate": 0.00019798846338631073,
+      "loss": 0.7912,
+      "step": 23800
+    },
+    {
+      "epoch": 0.06537268942833159,
+      "grad_norm": 0.48957517743110657,
+      "learning_rate": 0.00019797985013532722,
+      "loss": 0.8003,
+      "step": 23850
+    },
+    {
+      "epoch": 0.06550973909170335,
+      "grad_norm": 0.4694381058216095,
+      "learning_rate": 0.00019797121867122717,
+      "loss": 0.7942,
+      "step": 23900
+    },
+    {
+      "epoch": 0.06564678875507513,
+      "grad_norm": 0.435002863407135,
+      "learning_rate": 0.00019796256899561504,
+      "loss": 0.8081,
+      "step": 23950
+    },
+    {
+      "epoch": 0.06578383841844689,
+      "grad_norm": 0.48766547441482544,
+      "learning_rate": 0.0001979539011100987,
+      "loss": 0.7908,
+      "step": 24000
+    },
+    {
+      "epoch": 0.06592088808181865,
+      "grad_norm": 0.456478476524353,
+      "learning_rate": 0.0001979452150162894,
+      "loss": 0.8038,
+      "step": 24050
+    },
+    {
+      "epoch": 0.0660579377451904,
+      "grad_norm": 0.4724636375904083,
+      "learning_rate": 0.00019793651071580177,
+      "loss": 0.7906,
+      "step": 24100
+    },
+    {
+      "epoch": 0.06619498740856218,
+      "grad_norm": 0.48072385787963867,
+      "learning_rate": 0.0001979277882102538,
+      "loss": 0.7936,
+      "step": 24150
+    },
+    {
+      "epoch": 0.06633203707193394,
+      "grad_norm": 0.4881296455860138,
+      "learning_rate": 0.00019791904750126688,
+      "loss": 0.7992,
+      "step": 24200
+    },
+    {
+      "epoch": 0.0664690867353057,
+      "grad_norm": 0.4786093831062317,
+      "learning_rate": 0.0001979102885904658,
+      "loss": 0.8022,
+      "step": 24250
+    },
+    {
+      "epoch": 0.06660613639867748,
+      "grad_norm": 0.4414425194263458,
+      "learning_rate": 0.0001979015114794787,
+      "loss": 0.7908,
+      "step": 24300
+    },
+    {
+      "epoch": 0.06674318606204924,
+      "grad_norm": 0.4660377502441406,
+      "learning_rate": 0.00019789271616993718,
+      "loss": 0.8056,
+      "step": 24350
+    },
+    {
+      "epoch": 0.066880235725421,
+      "grad_norm": 0.4839298725128174,
+      "learning_rate": 0.00019788390266347613,
+      "loss": 0.8041,
+      "step": 24400
+    },
+    {
+      "epoch": 0.06701728538879276,
+      "grad_norm": 0.4496074914932251,
+      "learning_rate": 0.00019787507096173386,
+      "loss": 0.7869,
+      "step": 24450
+    },
+    {
+      "epoch": 0.06715433505216453,
+      "grad_norm": 0.43039125204086304,
+      "learning_rate": 0.00019786622106635207,
+      "loss": 0.7886,
+      "step": 24500
+    },
+    {
+      "epoch": 0.06729138471553629,
+      "grad_norm": 0.4781733751296997,
+      "learning_rate": 0.00019785735297897584,
+      "loss": 0.8016,
+      "step": 24550
+    },
+    {
+      "epoch": 0.06742843437890805,
+      "grad_norm": 0.4739139974117279,
+      "learning_rate": 0.00019784846670125358,
+      "loss": 0.7987,
+      "step": 24600
+    },
+    {
+      "epoch": 0.06756548404227983,
+      "grad_norm": 0.47873392701148987,
+      "learning_rate": 0.0001978395622348372,
+      "loss": 0.7957,
+      "step": 24650
+    },
+    {
+      "epoch": 0.06770253370565159,
+      "grad_norm": 0.4496181905269623,
+      "learning_rate": 0.0001978306395813819,
+      "loss": 0.7966,
+      "step": 24700
+    },
+    {
+      "epoch": 0.06783958336902335,
+      "grad_norm": 0.48717474937438965,
+      "learning_rate": 0.00019782169874254623,
+      "loss": 0.7961,
+      "step": 24750
+    },
+    {
+      "epoch": 0.06797663303239511,
+      "grad_norm": 0.4960111379623413,
+      "learning_rate": 0.00019781273971999222,
+      "loss": 0.8033,
+      "step": 24800
+    },
+    {
+      "epoch": 0.06811368269576688,
+      "grad_norm": 0.45468300580978394,
+      "learning_rate": 0.00019780376251538523,
+      "loss": 0.7954,
+      "step": 24850
+    },
+    {
+      "epoch": 0.06825073235913864,
+      "grad_norm": 0.4620811939239502,
+      "learning_rate": 0.00019779476713039395,
+      "loss": 0.7884,
+      "step": 24900
+    },
+    {
+      "epoch": 0.0683877820225104,
+      "grad_norm": 0.5242409110069275,
+      "learning_rate": 0.00019778575356669057,
+      "loss": 0.8074,
+      "step": 24950
+    },
+    {
+      "epoch": 0.06852483168588216,
+      "grad_norm": 0.4887278378009796,
+      "learning_rate": 0.00019777672182595053,
+      "loss": 0.7938,
+      "step": 25000
+    },
+    {
+      "epoch": 0.06866188134925394,
+      "grad_norm": 0.48667004704475403,
+      "learning_rate": 0.0001977676719098527,
+      "loss": 0.8002,
+      "step": 25050
+    },
+    {
+      "epoch": 0.0687989310126257,
+      "grad_norm": 0.46475672721862793,
+      "learning_rate": 0.0001977586038200794,
+      "loss": 0.8023,
+      "step": 25100
+    },
+    {
+      "epoch": 0.06893598067599746,
+      "grad_norm": 0.4685113728046417,
+      "learning_rate": 0.00019774951755831627,
+      "loss": 0.7976,
+      "step": 25150
+    },
+    {
+      "epoch": 0.06907303033936923,
+      "grad_norm": 0.47015681862831116,
+      "learning_rate": 0.00019774041312625222,
+      "loss": 0.7936,
+      "step": 25200
+    },
+    {
+      "epoch": 0.069210080002741,
+      "grad_norm": 0.4932703375816345,
+      "learning_rate": 0.00019773129052557973,
+      "loss": 0.7985,
+      "step": 25250
+    },
+    {
+      "epoch": 0.06934712966611276,
+      "grad_norm": 0.4694651663303375,
+      "learning_rate": 0.00019772214975799453,
+      "loss": 0.7955,
+      "step": 25300
+    },
+    {
+      "epoch": 0.06948417932948452,
+      "grad_norm": 0.44623926281929016,
+      "learning_rate": 0.00019771299082519574,
+      "loss": 0.7993,
+      "step": 25350
+    },
+    {
+      "epoch": 0.06962122899285629,
+      "grad_norm": 0.4828040599822998,
+      "learning_rate": 0.00019770381372888595,
+      "loss": 0.7941,
+      "step": 25400
+    },
+    {
+      "epoch": 0.06975827865622805,
+      "grad_norm": 0.4696119427680969,
+      "learning_rate": 0.000197694618470771,
+      "loss": 0.798,
+      "step": 25450
+    },
+    {
+      "epoch": 0.06989532831959981,
+      "grad_norm": 0.49441754817962646,
+      "learning_rate": 0.00019768540505256022,
+      "loss": 0.8041,
+      "step": 25500
+    },
+    {
+      "epoch": 0.07003237798297159,
+      "grad_norm": 0.5214009881019592,
+      "learning_rate": 0.00019767617347596619,
+      "loss": 0.7859,
+      "step": 25550
+    },
+    {
+      "epoch": 0.07016942764634335,
+      "grad_norm": 0.46143990755081177,
+      "learning_rate": 0.00019766692374270496,
+      "loss": 0.7938,
+      "step": 25600
+    },
+    {
+      "epoch": 0.0703064773097151,
+      "grad_norm": 0.5268940925598145,
+      "learning_rate": 0.00019765765585449594,
+      "loss": 0.8088,
+      "step": 25650
+    },
+    {
+      "epoch": 0.07044352697308687,
+      "grad_norm": 0.4896582365036011,
+      "learning_rate": 0.00019764836981306193,
+      "loss": 0.7983,
+      "step": 25700
+    },
+    {
+      "epoch": 0.07058057663645864,
+      "grad_norm": 0.45617246627807617,
+      "learning_rate": 0.000197639065620129,
+      "loss": 0.7983,
+      "step": 25750
+    },
+    {
+      "epoch": 0.0707176262998304,
+      "grad_norm": 0.43470898270606995,
+      "learning_rate": 0.00019762974327742675,
+      "loss": 0.7981,
+      "step": 25800
+    },
+    {
+      "epoch": 0.07085467596320216,
+      "grad_norm": 0.4943329393863678,
+      "learning_rate": 0.000197620402786688,
+      "loss": 0.7973,
+      "step": 25850
+    },
+    {
+      "epoch": 0.07099172562657392,
+      "grad_norm": 0.4778243899345398,
+      "learning_rate": 0.00019761104414964912,
+      "loss": 0.796,
+      "step": 25900
+    },
+    {
+      "epoch": 0.0711287752899457,
+      "grad_norm": 0.4903486669063568,
+      "learning_rate": 0.00019760166736804968,
+      "loss": 0.7967,
+      "step": 25950
+    },
+    {
+      "epoch": 0.07126582495331746,
+      "grad_norm": 0.4814266562461853,
+      "learning_rate": 0.0001975922724436327,
+      "loss": 0.7989,
+      "step": 26000
+    },
+    {
+      "epoch": 0.07140287461668922,
+      "grad_norm": 0.4753884971141815,
+      "learning_rate": 0.00019758285937814457,
+      "loss": 0.7977,
+      "step": 26050
+    },
+    {
+      "epoch": 0.07153992428006099,
+      "grad_norm": 0.4597572684288025,
+      "learning_rate": 0.0001975734281733351,
+      "loss": 0.7962,
+      "step": 26100
+    },
+    {
+      "epoch": 0.07167697394343275,
+      "grad_norm": 0.4517659544944763,
+      "learning_rate": 0.00019756397883095736,
+      "loss": 0.7938,
+      "step": 26150
+    },
+    {
+      "epoch": 0.07181402360680451,
+      "grad_norm": 0.48652228713035583,
+      "learning_rate": 0.00019755451135276787,
+      "loss": 0.7902,
+      "step": 26200
+    },
+    {
+      "epoch": 0.07195107327017627,
+      "grad_norm": 0.4702857732772827,
+      "learning_rate": 0.00019754502574052655,
+      "loss": 0.7999,
+      "step": 26250
+    },
+    {
+      "epoch": 0.07208812293354805,
+      "grad_norm": 0.47835201025009155,
+      "learning_rate": 0.0001975355219959966,
+      "loss": 0.7982,
+      "step": 26300
+    },
+    {
+      "epoch": 0.07222517259691981,
+      "grad_norm": 0.4716067314147949,
+      "learning_rate": 0.0001975260001209446,
+      "loss": 0.7959,
+      "step": 26350
+    },
+    {
+      "epoch": 0.07236222226029157,
+      "grad_norm": 0.4469781517982483,
+      "learning_rate": 0.0001975164601171406,
+      "loss": 0.7946,
+      "step": 26400
+    },
+    {
+      "epoch": 0.07249927192366334,
+      "grad_norm": 0.4652368724346161,
+      "learning_rate": 0.00019750690198635796,
+      "loss": 0.794,
+      "step": 26450
+    },
+    {
+      "epoch": 0.0726363215870351,
+      "grad_norm": 0.47072771191596985,
+      "learning_rate": 0.00019749732573037338,
+      "loss": 0.8008,
+      "step": 26500
+    },
+    {
+      "epoch": 0.07277337125040687,
+      "grad_norm": 0.48966971039772034,
+      "learning_rate": 0.00019748773135096694,
+      "loss": 0.8039,
+      "step": 26550
+    },
+    {
+      "epoch": 0.07291042091377863,
+      "grad_norm": 0.4906329810619354,
+      "learning_rate": 0.00019747811884992213,
+      "loss": 0.7942,
+      "step": 26600
+    },
+    {
+      "epoch": 0.0730474705771504,
+      "grad_norm": 0.4698156714439392,
+      "learning_rate": 0.0001974684882290258,
+      "loss": 0.7966,
+      "step": 26650
+    },
+    {
+      "epoch": 0.07318452024052216,
+      "grad_norm": 0.4471251964569092,
+      "learning_rate": 0.00019745883949006808,
+      "loss": 0.7853,
+      "step": 26700
+    },
+    {
+      "epoch": 0.07332156990389392,
+      "grad_norm": 0.4238675832748413,
+      "learning_rate": 0.00019744917263484263,
+      "loss": 0.7954,
+      "step": 26750
+    },
+    {
+      "epoch": 0.07345861956726568,
+      "grad_norm": 0.48359739780426025,
+      "learning_rate": 0.00019743948766514632,
+      "loss": 0.788,
+      "step": 26800
+    },
+    {
+      "epoch": 0.07359566923063746,
+      "grad_norm": 0.4774411916732788,
+      "learning_rate": 0.0001974297845827795,
+      "loss": 0.7928,
+      "step": 26850
+    },
+    {
+      "epoch": 0.07373271889400922,
+      "grad_norm": 0.46250951290130615,
+      "learning_rate": 0.0001974200633895458,
+      "loss": 0.7981,
+      "step": 26900
+    },
+    {
+      "epoch": 0.07386976855738098,
+      "grad_norm": 0.4583037495613098,
+      "learning_rate": 0.00019741032408725226,
+      "loss": 0.7907,
+      "step": 26950
+    },
+    {
+      "epoch": 0.07400681822075275,
+      "grad_norm": 0.46894222497940063,
+      "learning_rate": 0.00019740056667770932,
+      "loss": 0.7959,
+      "step": 27000
+    },
+    {
+      "epoch": 0.07414386788412451,
+      "grad_norm": 0.48790639638900757,
+      "learning_rate": 0.0001973907911627307,
+      "loss": 0.791,
+      "step": 27050
+    },
+    {
+      "epoch": 0.07428091754749627,
+      "grad_norm": 0.4991588592529297,
+      "learning_rate": 0.0001973809975441336,
+      "loss": 0.8112,
+      "step": 27100
+    },
+    {
+      "epoch": 0.07441796721086803,
+      "grad_norm": 0.4748496413230896,
+      "learning_rate": 0.00019737118582373845,
+      "loss": 0.7932,
+      "step": 27150
+    },
+    {
+      "epoch": 0.07455501687423981,
+      "grad_norm": 0.47536593675613403,
+      "learning_rate": 0.00019736135600336916,
+      "loss": 0.8025,
+      "step": 27200
+    },
+    {
+      "epoch": 0.07469206653761157,
+      "grad_norm": 0.48133620619773865,
+      "learning_rate": 0.00019735150808485293,
+      "loss": 0.7986,
+      "step": 27250
+    },
+    {
+      "epoch": 0.07482911620098333,
+      "grad_norm": 0.4730217754840851,
+      "learning_rate": 0.00019734164207002038,
+      "loss": 0.7917,
+      "step": 27300
+    },
+    {
+      "epoch": 0.07496616586435509,
+      "grad_norm": 0.4878595471382141,
+      "learning_rate": 0.00019733175796070546,
+      "loss": 0.7946,
+      "step": 27350
+    },
+    {
+      "epoch": 0.07510321552772686,
+      "grad_norm": 0.4788990616798401,
+      "learning_rate": 0.00019732185575874547,
+      "loss": 0.796,
+      "step": 27400
+    },
+    {
+      "epoch": 0.07524026519109862,
+      "grad_norm": 0.4414103627204895,
+      "learning_rate": 0.00019731193546598114,
+      "loss": 0.7901,
+      "step": 27450
+    },
+    {
+      "epoch": 0.07537731485447038,
+      "grad_norm": 0.47307801246643066,
+      "learning_rate": 0.00019730199708425646,
+      "loss": 0.8015,
+      "step": 27500
+    },
+    {
+      "epoch": 0.07551436451784216,
+      "grad_norm": 0.490078866481781,
+      "learning_rate": 0.00019729204061541889,
+      "loss": 0.8079,
+      "step": 27550
+    },
+    {
+      "epoch": 0.07565141418121392,
+      "grad_norm": 0.4815292954444885,
+      "learning_rate": 0.00019728206606131916,
+      "loss": 0.8044,
+      "step": 27600
+    },
+    {
+      "epoch": 0.07578846384458568,
+      "grad_norm": 0.47046026587486267,
+      "learning_rate": 0.00019727207342381143,
+      "loss": 0.7982,
+      "step": 27650
+    },
+    {
+      "epoch": 0.07592551350795744,
+      "grad_norm": 0.488438218832016,
+      "learning_rate": 0.0001972620627047532,
+      "loss": 0.7939,
+      "step": 27700
+    },
+    {
+      "epoch": 0.07606256317132921,
+      "grad_norm": 0.5354650020599365,
+      "learning_rate": 0.0001972520339060053,
+      "loss": 0.7929,
+      "step": 27750
+    },
+    {
+      "epoch": 0.07619961283470097,
+      "grad_norm": 0.461015909910202,
+      "learning_rate": 0.00019724198702943197,
+      "loss": 0.7995,
+      "step": 27800
+    },
+    {
+      "epoch": 0.07633666249807274,
+      "grad_norm": 0.5413848161697388,
+      "learning_rate": 0.00019723192207690078,
+      "loss": 0.7895,
+      "step": 27850
+    },
+    {
+      "epoch": 0.07647371216144451,
+      "grad_norm": 0.46962863206863403,
+      "learning_rate": 0.00019722183905028265,
+      "loss": 0.7885,
+      "step": 27900
+    },
+    {
+      "epoch": 0.07661076182481627,
+      "grad_norm": 0.4755912125110626,
+      "learning_rate": 0.00019721173795145188,
+      "loss": 0.7883,
+      "step": 27950
+    },
+    {
+      "epoch": 0.07674781148818803,
+      "grad_norm": 0.4535827040672302,
+      "learning_rate": 0.0001972016187822862,
+      "loss": 0.7905,
+      "step": 28000
+    },
+    {
+      "epoch": 0.07688486115155979,
+      "grad_norm": 0.4831449091434479,
+      "learning_rate": 0.0001971914815446665,
+      "loss": 0.7918,
+      "step": 28050
+    },
+    {
+      "epoch": 0.07702191081493157,
+      "grad_norm": 0.49459129571914673,
+      "learning_rate": 0.00019718132624047725,
+      "loss": 0.7905,
+      "step": 28100
+    },
+    {
+      "epoch": 0.07715896047830333,
+      "grad_norm": 0.46750736236572266,
+      "learning_rate": 0.00019717115287160613,
+      "loss": 0.7954,
+      "step": 28150
+    },
+    {
+      "epoch": 0.07729601014167509,
+      "grad_norm": 0.5005916953086853,
+      "learning_rate": 0.00019716096143994428,
+      "loss": 0.7912,
+      "step": 28200
+    },
+    {
+      "epoch": 0.07743305980504685,
+      "grad_norm": 0.5159142017364502,
+      "learning_rate": 0.00019715075194738608,
+      "loss": 0.7944,
+      "step": 28250
+    },
+    {
+      "epoch": 0.07757010946841862,
+      "grad_norm": 0.4624069929122925,
+      "learning_rate": 0.00019714052439582939,
+      "loss": 0.7898,
+      "step": 28300
+    },
+    {
+      "epoch": 0.07770715913179038,
+      "grad_norm": 0.4599010646343231,
+      "learning_rate": 0.00019713027878717537,
+      "loss": 0.7904,
+      "step": 28350
+    },
+    {
+      "epoch": 0.07784420879516214,
+      "grad_norm": 0.5177801847457886,
+      "learning_rate": 0.00019712001512332852,
+      "loss": 0.7961,
+      "step": 28400
+    },
+    {
+      "epoch": 0.07798125845853392,
+      "grad_norm": 0.494656503200531,
+      "learning_rate": 0.00019710973340619675,
+      "loss": 0.798,
+      "step": 28450
+    },
+    {
+      "epoch": 0.07811830812190568,
+      "grad_norm": 0.46997737884521484,
+      "learning_rate": 0.0001970994336376912,
+      "loss": 0.7912,
+      "step": 28500
+    },
+    {
+      "epoch": 0.07825535778527744,
+      "grad_norm": 0.47283247113227844,
+      "learning_rate": 0.00019708911581972657,
+      "loss": 0.7986,
+      "step": 28550
+    },
+    {
+      "epoch": 0.0783924074486492,
+      "grad_norm": 0.47213488817214966,
+      "learning_rate": 0.0001970787799542207,
+      "loss": 0.7926,
+      "step": 28600
+    },
+    {
+      "epoch": 0.07852945711202097,
+      "grad_norm": 0.4525822401046753,
+      "learning_rate": 0.00019706842604309496,
+      "loss": 0.7946,
+      "step": 28650
+    },
+    {
+      "epoch": 0.07866650677539273,
+      "grad_norm": 0.4752897024154663,
+      "learning_rate": 0.000197058054088274,
+      "loss": 0.7888,
+      "step": 28700
+    },
+    {
+      "epoch": 0.0788035564387645,
+      "grad_norm": 0.47640207409858704,
+      "learning_rate": 0.00019704766409168575,
+      "loss": 0.8016,
+      "step": 28750
+    },
+    {
+      "epoch": 0.07894060610213627,
+      "grad_norm": 0.49076226353645325,
+      "learning_rate": 0.00019703725605526166,
+      "loss": 0.7891,
+      "step": 28800
+    },
+    {
+      "epoch": 0.07907765576550803,
+      "grad_norm": 0.5122234225273132,
+      "learning_rate": 0.00019702682998093638,
+      "loss": 0.7914,
+      "step": 28850
+    },
+    {
+      "epoch": 0.07921470542887979,
+      "grad_norm": 0.4687149226665497,
+      "learning_rate": 0.00019701638587064801,
+      "loss": 0.7876,
+      "step": 28900
+    },
+    {
+      "epoch": 0.07935175509225155,
+      "grad_norm": 0.48135191202163696,
+      "learning_rate": 0.00019700592372633792,
+      "loss": 0.797,
+      "step": 28950
+    },
+    {
+      "epoch": 0.07948880475562332,
+      "grad_norm": 0.457065612077713,
+      "learning_rate": 0.00019699544354995097,
+      "loss": 0.7903,
+      "step": 29000
+    },
+    {
+      "epoch": 0.07962585441899508,
+      "grad_norm": 0.49080437421798706,
+      "learning_rate": 0.0001969849453434352,
+      "loss": 0.7941,
+      "step": 29050
+    },
+    {
+      "epoch": 0.07976290408236685,
+      "grad_norm": 0.47728291153907776,
+      "learning_rate": 0.0001969744291087421,
+      "loss": 0.7973,
+      "step": 29100
+    },
+    {
+      "epoch": 0.0798999537457386,
+      "grad_norm": 0.4719684422016144,
+      "learning_rate": 0.00019696389484782649,
+      "loss": 0.7931,
+      "step": 29150
+    },
+    {
+      "epoch": 0.08003700340911038,
+      "grad_norm": 0.4797501862049103,
+      "learning_rate": 0.00019695334256264658,
+      "loss": 0.7852,
+      "step": 29200
+    },
+    {
+      "epoch": 0.08017405307248214,
+      "grad_norm": 0.4938788115978241,
+      "learning_rate": 0.00019694277225516387,
+      "loss": 0.7949,
+      "step": 29250
+    },
+    {
+      "epoch": 0.0803111027358539,
+      "grad_norm": 0.5137888193130493,
+      "learning_rate": 0.00019693218392734325,
+      "loss": 0.7933,
+      "step": 29300
+    },
+    {
+      "epoch": 0.08044815239922568,
+      "grad_norm": 0.4900031089782715,
+      "learning_rate": 0.0001969215775811529,
+      "loss": 0.8004,
+      "step": 29350
+    },
+    {
+      "epoch": 0.08058520206259744,
+      "grad_norm": 0.4935697317123413,
+      "learning_rate": 0.00019691095321856445,
+      "loss": 0.7966,
+      "step": 29400
+    },
+    {
+      "epoch": 0.0807222517259692,
+      "grad_norm": 0.48324236273765564,
+      "learning_rate": 0.00019690031084155282,
+      "loss": 0.802,
+      "step": 29450
+    },
+    {
+      "epoch": 0.08085930138934096,
+      "grad_norm": 0.44104790687561035,
+      "learning_rate": 0.00019688965045209624,
+      "loss": 0.7906,
+      "step": 29500
+    },
+    {
+      "epoch": 0.08099635105271273,
+      "grad_norm": 0.4910528361797333,
+      "learning_rate": 0.00019687897205217638,
+      "loss": 0.7939,
+      "step": 29550
+    },
+    {
+      "epoch": 0.08113340071608449,
+      "grad_norm": 0.4602694809436798,
+      "learning_rate": 0.0001968682756437782,
+      "loss": 0.7927,
+      "step": 29600
+    },
+    {
+      "epoch": 0.08127045037945625,
+      "grad_norm": 0.47206035256385803,
+      "learning_rate": 0.00019685756122888996,
+      "loss": 0.7845,
+      "step": 29650
+    },
+    {
+      "epoch": 0.08140750004282803,
+      "grad_norm": 0.471284419298172,
+      "learning_rate": 0.00019684682880950337,
+      "loss": 0.7878,
+      "step": 29700
+    },
+    {
+      "epoch": 0.08154454970619979,
+      "grad_norm": 0.4756605625152588,
+      "learning_rate": 0.0001968360783876135,
+      "loss": 0.7915,
+      "step": 29750
+    },
+    {
+      "epoch": 0.08168159936957155,
+      "grad_norm": 0.5032472014427185,
+      "learning_rate": 0.00019682530996521859,
+      "loss": 0.7811,
+      "step": 29800
+    },
+    {
+      "epoch": 0.08181864903294331,
+      "grad_norm": 0.49361568689346313,
+      "learning_rate": 0.0001968145235443204,
+      "loss": 0.8027,
+      "step": 29850
+    },
+    {
+      "epoch": 0.08195569869631508,
+      "grad_norm": 0.4601605236530304,
+      "learning_rate": 0.00019680371912692395,
+      "loss": 0.7915,
+      "step": 29900
+    },
+    {
+      "epoch": 0.08209274835968684,
+      "grad_norm": 0.46172550320625305,
+      "learning_rate": 0.0001967928967150377,
+      "loss": 0.8005,
+      "step": 29950
+    },
+    {
+      "epoch": 0.0822297980230586,
+      "grad_norm": 0.4655384123325348,
+      "learning_rate": 0.00019678205631067333,
+      "loss": 0.7884,
+      "step": 30000
+    },
+    {
+      "epoch": 0.08236684768643036,
+      "grad_norm": 0.49141067266464233,
+      "learning_rate": 0.00019677119791584592,
+      "loss": 0.7914,
+      "step": 30050
+    },
+    {
+      "epoch": 0.08250389734980214,
+      "grad_norm": 0.5075825452804565,
+      "learning_rate": 0.00019676032153257396,
+      "loss": 0.7973,
+      "step": 30100
+    },
+    {
+      "epoch": 0.0826409470131739,
+      "grad_norm": 0.4856976866722107,
+      "learning_rate": 0.00019674942716287915,
+      "loss": 0.7892,
+      "step": 30150
+    },
+    {
+      "epoch": 0.08277799667654566,
+      "grad_norm": 0.4587472975254059,
+      "learning_rate": 0.00019673851480878665,
+      "loss": 0.79,
+      "step": 30200
+    },
+    {
+      "epoch": 0.08291504633991743,
+      "grad_norm": 0.4883078932762146,
+      "learning_rate": 0.00019672758447232487,
+      "loss": 0.7917,
+      "step": 30250
+    },
+    {
+      "epoch": 0.0830520960032892,
+      "grad_norm": 0.46209436655044556,
+      "learning_rate": 0.00019671663615552566,
+      "loss": 0.7957,
+      "step": 30300
+    },
+    {
+      "epoch": 0.08318914566666095,
+      "grad_norm": 0.48940733075141907,
+      "learning_rate": 0.00019670566986042416,
+      "loss": 0.7883,
+      "step": 30350
+    },
+    {
+      "epoch": 0.08332619533003272,
+      "grad_norm": 0.44892752170562744,
+      "learning_rate": 0.0001966946855890588,
+      "loss": 0.7926,
+      "step": 30400
+    },
+    {
+      "epoch": 0.08346324499340449,
+      "grad_norm": 0.4406869113445282,
+      "learning_rate": 0.0001966836833434715,
+      "loss": 0.7937,
+      "step": 30450
+    },
+    {
+      "epoch": 0.08360029465677625,
+      "grad_norm": 0.47302696108818054,
+      "learning_rate": 0.00019667266312570732,
+      "loss": 0.7966,
+      "step": 30500
+    },
+    {
+      "epoch": 0.08373734432014801,
+      "grad_norm": 0.4698280990123749,
+      "learning_rate": 0.00019666162493781484,
+      "loss": 0.7959,
+      "step": 30550
+    },
+    {
+      "epoch": 0.08387439398351977,
+      "grad_norm": 0.46622198820114136,
+      "learning_rate": 0.00019665056878184588,
+      "loss": 0.7848,
+      "step": 30600
+    },
+    {
+      "epoch": 0.08401144364689155,
+      "grad_norm": 0.44060927629470825,
+      "learning_rate": 0.00019663949465985564,
+      "loss": 0.7901,
+      "step": 30650
+    },
+    {
+      "epoch": 0.0841484933102633,
+      "grad_norm": 0.4829963743686676,
+      "learning_rate": 0.00019662840257390264,
+      "loss": 0.7857,
+      "step": 30700
+    },
+    {
+      "epoch": 0.08428554297363507,
+      "grad_norm": 0.4828779399394989,
+      "learning_rate": 0.00019661729252604877,
+      "loss": 0.7991,
+      "step": 30750
+    },
+    {
+      "epoch": 0.08442259263700684,
+      "grad_norm": 0.4656374156475067,
+      "learning_rate": 0.0001966061645183592,
+      "loss": 0.8017,
+      "step": 30800
+    },
+    {
+      "epoch": 0.0845596423003786,
+      "grad_norm": 0.43232300877571106,
+      "learning_rate": 0.0001965950185529025,
+      "loss": 0.7916,
+      "step": 30850
+    },
+    {
+      "epoch": 0.08469669196375036,
+      "grad_norm": 0.483804315328598,
+      "learning_rate": 0.00019658385463175053,
+      "loss": 0.7831,
+      "step": 30900
+    },
+    {
+      "epoch": 0.08483374162712212,
+      "grad_norm": 0.4652833938598633,
+      "learning_rate": 0.00019657267275697854,
+      "loss": 0.7963,
+      "step": 30950
+    },
+    {
+      "epoch": 0.0849707912904939,
+      "grad_norm": 0.4923244118690491,
+      "learning_rate": 0.00019656147293066508,
+      "loss": 0.7988,
+      "step": 31000
+    },
+    {
+      "epoch": 0.08510784095386566,
+      "grad_norm": 0.4675879180431366,
+      "learning_rate": 0.00019655025515489201,
+      "loss": 0.7933,
+      "step": 31050
+    },
+    {
+      "epoch": 0.08524489061723742,
+      "grad_norm": 0.46415793895721436,
+      "learning_rate": 0.00019653901943174462,
+      "loss": 0.7928,
+      "step": 31100
+    },
+    {
+      "epoch": 0.08538194028060919,
+      "grad_norm": 0.4991393983364105,
+      "learning_rate": 0.00019652776576331146,
+      "loss": 0.7948,
+      "step": 31150
+    },
+    {
+      "epoch": 0.08551898994398095,
+      "grad_norm": 0.46876922249794006,
+      "learning_rate": 0.00019651649415168437,
+      "loss": 0.7875,
+      "step": 31200
+    },
+    {
+      "epoch": 0.08565603960735271,
+      "grad_norm": 0.4724014699459076,
+      "learning_rate": 0.00019650520459895868,
+      "loss": 0.7958,
+      "step": 31250
+    },
+    {
+      "epoch": 0.08579308927072447,
+      "grad_norm": 0.5036382079124451,
+      "learning_rate": 0.0001964938971072329,
+      "loss": 0.8053,
+      "step": 31300
+    },
+    {
+      "epoch": 0.08593013893409625,
+      "grad_norm": 0.4702759087085724,
+      "learning_rate": 0.00019648257167860899,
+      "loss": 0.7869,
+      "step": 31350
+    },
+    {
+      "epoch": 0.08606718859746801,
+      "grad_norm": 0.48790547251701355,
+      "learning_rate": 0.00019647122831519215,
+      "loss": 0.7887,
+      "step": 31400
+    },
+    {
+      "epoch": 0.08620423826083977,
+      "grad_norm": 0.4691486954689026,
+      "learning_rate": 0.00019645986701909097,
+      "loss": 0.7939,
+      "step": 31450
+    },
+    {
+      "epoch": 0.08634128792421153,
+      "grad_norm": 0.48565474152565,
+      "learning_rate": 0.00019644848779241735,
+      "loss": 0.7927,
+      "step": 31500
+    },
+    {
+      "epoch": 0.0864783375875833,
+      "grad_norm": 0.4843965172767639,
+      "learning_rate": 0.00019643709063728654,
+      "loss": 0.7876,
+      "step": 31550
+    },
+    {
+      "epoch": 0.08661538725095506,
+      "grad_norm": 0.49629899859428406,
+      "learning_rate": 0.00019642567555581714,
+      "loss": 0.7887,
+      "step": 31600
+    },
+    {
+      "epoch": 0.08675243691432682,
+      "grad_norm": 0.49302300810813904,
+      "learning_rate": 0.00019641424255013106,
+      "loss": 0.798,
+      "step": 31650
+    },
+    {
+      "epoch": 0.0868894865776986,
+      "grad_norm": 0.510237455368042,
+      "learning_rate": 0.00019640279162235348,
+      "loss": 0.7981,
+      "step": 31700
+    },
+    {
+      "epoch": 0.08702653624107036,
+      "grad_norm": 0.4617491662502289,
+      "learning_rate": 0.000196391322774613,
+      "loss": 0.8049,
+      "step": 31750
+    },
+    {
+      "epoch": 0.08716358590444212,
+      "grad_norm": 0.4670426547527313,
+      "learning_rate": 0.00019637983600904156,
+      "loss": 0.7886,
+      "step": 31800
+    },
+    {
+      "epoch": 0.08730063556781388,
+      "grad_norm": 0.5087895393371582,
+      "learning_rate": 0.00019636833132777434,
+      "loss": 0.7899,
+      "step": 31850
+    },
+    {
+      "epoch": 0.08743768523118566,
+      "grad_norm": 0.5074539184570312,
+      "learning_rate": 0.00019635680873294993,
+      "loss": 0.7868,
+      "step": 31900
+    },
+    {
+      "epoch": 0.08757473489455742,
+      "grad_norm": 0.4760209023952484,
+      "learning_rate": 0.00019634526822671022,
+      "loss": 0.7927,
+      "step": 31950
+    },
+    {
+      "epoch": 0.08771178455792918,
+      "grad_norm": 0.4537639319896698,
+      "learning_rate": 0.00019633370981120044,
+      "loss": 0.7858,
+      "step": 32000
+    },
+    {
+      "epoch": 0.08784883422130095,
+      "grad_norm": 0.48078107833862305,
+      "learning_rate": 0.00019632213348856912,
+      "loss": 0.7918,
+      "step": 32050
+    },
+    {
+      "epoch": 0.08798588388467271,
+      "grad_norm": 0.47621411085128784,
+      "learning_rate": 0.00019631053926096815,
+      "loss": 0.7978,
+      "step": 32100
+    },
+    {
+      "epoch": 0.08812293354804447,
+      "grad_norm": 0.4982161819934845,
+      "learning_rate": 0.00019629892713055275,
+      "loss": 0.7901,
+      "step": 32150
+    },
+    {
+      "epoch": 0.08825998321141623,
+      "grad_norm": 0.4935854971408844,
+      "learning_rate": 0.00019628729709948145,
+      "loss": 0.7851,
+      "step": 32200
+    },
+    {
+      "epoch": 0.088397032874788,
+      "grad_norm": 0.49464720487594604,
+      "learning_rate": 0.00019627564916991613,
+      "loss": 0.7949,
+      "step": 32250
+    },
+    {
+      "epoch": 0.08853408253815977,
+      "grad_norm": 0.4959528148174286,
+      "learning_rate": 0.00019626398334402195,
+      "loss": 0.7926,
+      "step": 32300
+    },
+    {
+      "epoch": 0.08867113220153153,
+      "grad_norm": 0.45773616433143616,
+      "learning_rate": 0.00019625229962396743,
+      "loss": 0.7868,
+      "step": 32350
+    },
+    {
+      "epoch": 0.08880818186490329,
+      "grad_norm": 0.48498624563217163,
+      "learning_rate": 0.00019624059801192444,
+      "loss": 0.7811,
+      "step": 32400
+    },
+    {
+      "epoch": 0.08894523152827506,
+      "grad_norm": 0.4527600109577179,
+      "learning_rate": 0.0001962288785100681,
+      "loss": 0.789,
+      "step": 32450
+    },
+    {
+      "epoch": 0.08908228119164682,
+      "grad_norm": 0.46126919984817505,
+      "learning_rate": 0.000196217141120577,
+      "loss": 0.7894,
+      "step": 32500
+    },
+    {
+      "epoch": 0.08921933085501858,
+      "grad_norm": 0.4673360586166382,
+      "learning_rate": 0.00019620538584563284,
+      "loss": 0.7866,
+      "step": 32550
+    },
+    {
+      "epoch": 0.08935638051839036,
+      "grad_norm": 0.4808477461338043,
+      "learning_rate": 0.00019619361268742087,
+      "loss": 0.7941,
+      "step": 32600
+    },
+    {
+      "epoch": 0.08949343018176212,
+      "grad_norm": 0.4685232639312744,
+      "learning_rate": 0.0001961818216481295,
+      "loss": 0.7915,
+      "step": 32650
+    },
+    {
+      "epoch": 0.08963047984513388,
+      "grad_norm": 0.45769527554512024,
+      "learning_rate": 0.00019617001272995054,
+      "loss": 0.7996,
+      "step": 32700
+    },
+    {
+      "epoch": 0.08976752950850564,
+      "grad_norm": 0.46215105056762695,
+      "learning_rate": 0.00019615818593507914,
+      "loss": 0.7873,
+      "step": 32750
+    },
+    {
+      "epoch": 0.08990457917187741,
+      "grad_norm": 0.4922528564929962,
+      "learning_rate": 0.00019614634126571365,
+      "loss": 0.7899,
+      "step": 32800
+    },
+    {
+      "epoch": 0.09004162883524917,
+      "grad_norm": 0.486177921295166,
+      "learning_rate": 0.0001961344787240559,
+      "loss": 0.7896,
+      "step": 32850
+    },
+    {
+      "epoch": 0.09017867849862093,
+      "grad_norm": 0.47860997915267944,
+      "learning_rate": 0.00019612259831231098,
+      "loss": 0.793,
+      "step": 32900
+    },
+    {
+      "epoch": 0.09031572816199271,
+      "grad_norm": 0.47204136848449707,
+      "learning_rate": 0.0001961107000326873,
+      "loss": 0.785,
+      "step": 32950
+    },
+    {
+      "epoch": 0.09045277782536447,
+      "grad_norm": 0.4928169250488281,
+      "learning_rate": 0.00019609878388739653,
+      "loss": 0.7896,
+      "step": 33000
+    },
+    {
+      "epoch": 0.09058982748873623,
+      "grad_norm": 0.5053184032440186,
+      "learning_rate": 0.00019608684987865375,
+      "loss": 0.7901,
+      "step": 33050
+    },
+    {
+      "epoch": 0.09072687715210799,
+      "grad_norm": 0.4610320031642914,
+      "learning_rate": 0.00019607489800867737,
+      "loss": 0.7888,
+      "step": 33100
+    },
+    {
+      "epoch": 0.09086392681547976,
+      "grad_norm": 0.48464083671569824,
+      "learning_rate": 0.000196062928279689,
+      "loss": 0.7977,
+      "step": 33150
+    },
+    {
+      "epoch": 0.09100097647885153,
+      "grad_norm": 0.4782997965812683,
+      "learning_rate": 0.0001960509406939137,
+      "loss": 0.7971,
+      "step": 33200
+    },
+    {
+      "epoch": 0.09113802614222329,
+      "grad_norm": 0.5001197457313538,
+      "learning_rate": 0.00019603893525357984,
+      "loss": 0.7911,
+      "step": 33250
+    },
+    {
+      "epoch": 0.09127507580559505,
+      "grad_norm": 0.4851212203502655,
+      "learning_rate": 0.000196026911960919,
+      "loss": 0.7976,
+      "step": 33300
+    },
+    {
+      "epoch": 0.09141212546896682,
+      "grad_norm": 0.5041796565055847,
+      "learning_rate": 0.00019601487081816616,
+      "loss": 0.7862,
+      "step": 33350
+    },
+    {
+      "epoch": 0.09154917513233858,
+      "grad_norm": 0.458372563123703,
+      "learning_rate": 0.0001960028118275596,
+      "loss": 0.7859,
+      "step": 33400
+    },
+    {
+      "epoch": 0.09168622479571034,
+      "grad_norm": 0.4743143618106842,
+      "learning_rate": 0.00019599073499134093,
+      "loss": 0.7898,
+      "step": 33450
+    },
+    {
+      "epoch": 0.09182327445908212,
+      "grad_norm": 0.5056654214859009,
+      "learning_rate": 0.0001959786403117551,
+      "loss": 0.7998,
+      "step": 33500
+    },
+    {
+      "epoch": 0.09196032412245388,
+      "grad_norm": 0.49195945262908936,
+      "learning_rate": 0.00019596652779105026,
+      "loss": 0.7912,
+      "step": 33550
+    },
+    {
+      "epoch": 0.09209737378582564,
+      "grad_norm": 0.4621276557445526,
+      "learning_rate": 0.00019595439743147806,
+      "loss": 0.8009,
+      "step": 33600
+    },
+    {
+      "epoch": 0.0922344234491974,
+      "grad_norm": 0.5244962573051453,
+      "learning_rate": 0.00019594224923529332,
+      "loss": 0.7926,
+      "step": 33650
+    },
+    {
+      "epoch": 0.09237147311256917,
+      "grad_norm": 0.48035624623298645,
+      "learning_rate": 0.0001959300832047542,
+      "loss": 0.7893,
+      "step": 33700
+    },
+    {
+      "epoch": 0.09250852277594093,
+      "grad_norm": 0.48420900106430054,
+      "learning_rate": 0.00019591789934212225,
+      "loss": 0.7889,
+      "step": 33750
+    },
+    {
+      "epoch": 0.09264557243931269,
+      "grad_norm": 0.46187272667884827,
+      "learning_rate": 0.00019590569764966226,
+      "loss": 0.7873,
+      "step": 33800
+    },
+    {
+      "epoch": 0.09278262210268447,
+      "grad_norm": 0.48740679025650024,
+      "learning_rate": 0.0001958934781296424,
+      "loss": 0.786,
+      "step": 33850
+    },
+    {
+      "epoch": 0.09291967176605623,
+      "grad_norm": 0.49153274297714233,
+      "learning_rate": 0.00019588124078433403,
+      "loss": 0.7906,
+      "step": 33900
+    },
+    {
+      "epoch": 0.09305672142942799,
+      "grad_norm": 0.4947867691516876,
+      "learning_rate": 0.00019586898561601196,
+      "loss": 0.7856,
+      "step": 33950
+    },
+    {
+      "epoch": 0.09319377109279975,
+      "grad_norm": 0.48114946484565735,
+      "learning_rate": 0.00019585671262695425,
+      "loss": 0.7914,
+      "step": 34000
+    }
+  ],
+  "logging_steps": 50,
+  "max_steps": 364831,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 2000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.376731557311283e+19,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}